Discovery CPU details for default thread selection (#6264)

On Windows, detect large multi-socket systems and reduce the default thread count to the number of cores in one socket for best performance.
2024-10-15 11:36:08 -07:00 · 2024-10-15 11:36:08 -07:00 · 24636dfa87
commit 24636dfa87
parent 1d7fa3ad2d
7 changed files with 408 additions and 24 deletions
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@ -229,7 +229,10 @@ func GetGPUInfo() GpuInfoList {
 			slog.Warn("error looking up system memory", "error", err)
 		}
 		depPath := LibraryDir()
-
+		details, err := GetCPUDetails()
 		if err != nil {
 			slog.Warn("failed to lookup CPU details", "error", err)
 		}
 		cpus = []CPUInfo{
 			{
 				GpuInfo: GpuInfo{
@ -239,6 +242,7 @@ func GetGPUInfo() GpuInfoList {
 					ID:             "0",
 					DependencyPath: depPath,
 				},
 				CPUs: details,
 			},
 		}
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@ -10,7 +10,9 @@ package gpu
 import "C"
 import (
 	"log/slog"
 	"runtime"
 	"syscall"
 	"github.com/ollama/ollama/format"
 )
@ -69,11 +71,30 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 func GetSystemInfo() SystemInfo {
 	mem, _ := GetCPUMem()
 	query := "hw.perflevel0.physicalcpu"
 	perfCores, err := syscall.SysctlUint32(query)
 	if err != nil {
 		slog.Warn("failed to discover physical CPU details", "query", query, "error", err)
 	}
 	query = "hw.perflevel1.physicalcpu"
 	efficiencyCores, _ := syscall.SysctlUint32(query) // On x86 xeon this wont return data
 	// Determine thread count
 	query = "hw.logicalcpu"
 	logicalCores, _ := syscall.SysctlUint32(query)
 	return SystemInfo{
 		System: CPUInfo{
 			GpuInfo: GpuInfo{
 				memInfo: mem,
 			},
 			CPUs: []CPU{
 				{
 					CoreCount:           int(perfCores + efficiencyCores),
 					EfficiencyCoreCount: int(efficiencyCores),
 					ThreadCount:         int(logicalCores),
 				},
 			},
 		},
 		GPUs: GetGPUInfo(),
 	}
--- a/gpu/gpu_linux.go
+++ b/gpu/gpu_linux.go
@ -4,6 +4,8 @@ import (
 	"bufio"
 	"fmt"
 	"os"
 	"reflect"
 	"regexp"
 	"strings"
 	"github.com/ollama/ollama/format"
@ -90,3 +92,95 @@ func GetCPUMem() (memInfo, error) {
 	}
 	return mem, nil
 }
 // CpuInfoFilename is the path GetCPUDetails opens to read per-processor details.
 const CpuInfoFilename = "/proc/cpuinfo"
 // linuxCpuInfo holds the values of interest from one processor stanza of the
 // cpuinfo file. The `cpuinfo` struct tag on each field is the key, as written
 // in the file, whose value GetCPUDetails copies into that field.
 type linuxCpuInfo struct {
 	ID         string `cpuinfo:"processor"` // a stanza is only recorded once this is set; also the core key when CoreID is empty
 	VendorID   string `cpuinfo:"vendor_id"`
 	ModelName  string `cpuinfo:"model name"`
 	PhysicalID string `cpuinfo:"physical id"` // groups stanzas into sockets
 	Siblings   string `cpuinfo:"siblings"` // parsed, but not consulted by GetCPUDetails
 	CoreID     string `cpuinfo:"core id"` // groups stanzas of one socket into cores
 }
 // GetCPUDetails reads CpuInfoFilename and summarizes it as one CPU entry per
 // socket (distinct "physical id"), in the order the sockets first appear in
 // the file so that the result is stable from call to call.
 //
 // For each socket it reports the number of distinct cores, the total number
 // of threads (processor stanzas), and an estimate of the efficiency cores:
 // cores exposing exactly one thread on a socket where at least one other core
 // exposes more. If every core exposes one thread the estimate is zero.
 //
 // An error is returned when the file cannot be opened or read.
 func GetCPUDetails() ([]CPU, error) {
 	file, err := os.Open(CpuInfoFilename)
 	if err != nil {
 		return nil, err
 	}
 	defer file.Close()
 	reColumns := regexp.MustCompile("\t+: ")
 	infoType := reflect.TypeOf(linuxCpuInfo{})
 	cpuInfos := []linuxCpuInfo{}
 	// cpu is reset in place between stanzas, so fields stays bound to it.
 	var cpu linuxCpuInfo
 	fields := reflect.ValueOf(&cpu).Elem()
 	scanner := bufio.NewScanner(file)
 	for scanner.Scan() {
 		line := scanner.Text()
 		if sl := reColumns.Split(line, 2); len(sl) > 1 {
 			for i := range infoType.NumField() {
 				if infoType.Field(i).Tag.Get("cpuinfo") == sl[0] {
 					fields.Field(i).SetString(sl[1])
 					break
 				}
 			}
 		} else if strings.TrimSpace(line) == "" && cpu.ID != "" {
 			cpuInfos = append(cpuInfos, cpu)
 			cpu = linuxCpuInfo{}
 		}
 	}
 	if err := scanner.Err(); err != nil {
 		return nil, err
 	}
 	// The final stanza is not necessarily followed by a blank line.
 	if cpu.ID != "" {
 		cpuInfos = append(cpuInfos, cpu)
 	}
 	// Process the sockets/cores/threads
 	sockets := []CPU{}
 	threadsByCoreBySocket := map[string]map[string]int{}
 	for _, c := range cpuInfos {
 		threadsByCore, found := threadsByCoreBySocket[c.PhysicalID]
 		if !found {
 			sockets = append(sockets, CPU{
 				ID:        c.PhysicalID,
 				VendorID:  c.VendorID,
 				ModelName: c.ModelName,
 			})
 			threadsByCore = map[string]int{}
 			threadsByCoreBySocket[c.PhysicalID] = threadsByCore
 		}
 		// Without a core id, treat every processor entry as its own core.
 		coreKey := c.CoreID
 		if coreKey == "" {
 			coreKey = c.ID
 		}
 		threadsByCore[coreKey]++
 	}
 	// Tally up the values from the tracking maps
 	for i := range sockets {
 		s := &sockets[i]
 		threadsByCore := threadsByCoreBySocket[s.ID]
 		s.CoreCount = len(threadsByCore)
 		// This only works if HT is enabled, consider a more reliable model, maybe cache size comparisons?
 		singleThreadCores := 0
 		for _, threads := range threadsByCore {
 			s.ThreadCount += threads
 			if threads == 1 {
 				singleThreadCores++
 			}
 		}
 		// 1:1 mapping means they're not actually efficiency cores, but regular cores
 		if singleThreadCores != s.CoreCount {
 			s.EfficiencyCoreCount = singleThreadCores
 		}
 	}
 	return sockets, nil
 }
--- a/gpu/gpu_windows.go
+++ b/gpu/gpu_windows.go
@ -2,6 +2,7 @@ package gpu
 import (
 	"fmt"
 	"log/slog"
 	"syscall"
 	"unsafe"
 )
@ -22,6 +23,7 @@ var (
 	k32                              = syscall.NewLazyDLL("kernel32.dll")
 	globalMemoryStatusExProc         = k32.NewProc("GlobalMemoryStatusEx")
 	sizeofMemoryStatusEx             = uint32(unsafe.Sizeof(MEMORYSTATUSEX{}))
 	GetLogicalProcessorInformationEx = k32.NewProc("GetLogicalProcessorInformationEx")
 )
 var CudartGlobs = []string{
@ -55,3 +57,178 @@ func GetCPUMem() (memInfo, error) {
 	}
 	return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys, FreeSwap: memStatus.AvailPageFile}, nil
 }
 // LOGICAL_PROCESSOR_RELATIONSHIP is the relationship kind stored in the
 // Relationship field of a SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX record and
 // passed as the first argument to GetLogicalProcessorInformationEx.
 // The names follow the Win32 enumeration of the same name.
 type LOGICAL_PROCESSOR_RELATIONSHIP uint32
 // Relationship kinds, numbered consecutively from zero in declaration order.
 // Only RelationProcessorCore and RelationProcessorPackage are inspected when
 // the processor information list is processed.
 const (
 	RelationProcessorCore LOGICAL_PROCESSOR_RELATIONSHIP = iota
 	RelationNumaNode
 	RelationCache
 	RelationProcessorPackage
 	RelationGroup
 	RelationProcessorDie
 	RelationNumaNodeEx
 	RelationProcessorModule
 )
 // RelationAll is the relationship value this file passes when querying
 // processor information; per the Win32 documentation it requests records for
 // every relationship kind.
 const RelationAll LOGICAL_PROCESSOR_RELATIONSHIP = 0xffff
 // GROUP_AFFINITY pairs a processor bitmask with a processor group number. It
 // is overlaid on the GroupMask entries of a PROCESSOR_RELATIONSHIP record.
 // NOTE(review): per the Win32 definition the Mask bits are relative to Group — confirm callers account for that.
 type GROUP_AFFINITY struct {
 	Mask     uintptr // KAFFINITY
 	Group    uint16
 	Reserved [3]uint16
 }
 // PROCESSOR_RELATIONSHIP is the payload overlaid on the union of a
 // SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX record for core and package
 // relationships.
 type PROCESSOR_RELATIONSHIP struct {
 	Flags           byte // a non-zero value is counted as two threads per core, zero as one
 	EfficiencyClass byte // compared against the maximum seen across cores; lower values are counted as efficiency cores
 	Reserved        [20]byte
 	GroupCount      uint16 // number of GROUP_AFFINITY entries that follow
 	GroupMask       [1]GROUP_AFFINITY // len GroupCount
 }
 // Omitted unused structs: NUMA_NODE_RELATIONSHIP CACHE_RELATIONSHIP GROUP_RELATIONSHIP
 //
 // SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX is the header of one variable-length
 // record in the processor information buffer. Size is the distance in bytes
 // from the start of this record to the start of the next one; U marks where
 // the relationship-specific payload begins.
 type SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX struct {
 	Relationship LOGICAL_PROCESSOR_RELATIONSHIP
 	Size         uint32
 	U            [1]byte // Union len Size
 	// PROCESSOR_RELATIONSHIP
 	// NUMA_NODE_RELATIONSHIP
 	// CACHE_RELATIONSHIP
 	// GROUP_RELATIONSHIP
 }
 // IsMember reports whether target shares at least one logical processor with
 // group. An affinity mask only has meaning relative to its processor group,
 // so both values must name the same Group before their Masks are compared;
 // otherwise identical bit positions in different groups would be mistaken for
 // the same processors. A nil receiver or nil target is never a member.
 func (group *GROUP_AFFINITY) IsMember(target *GROUP_AFFINITY) bool {
 	if group == nil || target == nil {
 		return false
 	}
 	return group.Group == target.Group && group.Mask&target.Mask != 0
 }
 // winPackage accumulates the totals for one processor package together with
 // the group affinities that identify which logical processors belong to it.
 type winPackage struct {
 	groups              []*GROUP_AFFINITY
 	coreCount           int // performance cores = coreCount - efficiencyCoreCount
 	efficiencyCoreCount int
 	threadCount         int
 }
 // IsMember reports whether target is a member of any of the package's group
 // affinities.
 func (pkg *winPackage) IsMember(target *GROUP_AFFINITY) bool {
 	for i := 0; i < len(pkg.groups); i++ {
 		if pkg.groups[i].IsMember(target) {
 			return true
 		}
 	}
 	return false
 }
 // getLogicalProcessorInformationEx returns the raw buffer of processor
 // relationship records reported by the GetLogicalProcessorInformationEx
 // procedure for RelationAll.
 //
 // The procedure is called twice: first with a deliberately undersized buffer
 // so that it reports the required length, then with a buffer of that length.
 // An error is returned if the sizing call unexpectedly succeeds, reports a
 // zero length, or if the second call fails.
 func getLogicalProcessorInformationEx() ([]byte, error) {
 	// The length parameter is a pointer to a 32-bit DWORD, so it must not be
 	// backed by a Go int, whose width depends on the platform.
 	var bufSize uint32 = 1
 	buf := make([]byte, bufSize)
 	ret, _, err := GetLogicalProcessorInformationEx.Call(
 		uintptr(RelationAll),
 		uintptr(unsafe.Pointer(&buf[0])),
 		uintptr(unsafe.Pointer(&bufSize)),
 	)
 	if ret != 0 {
 		return nil, fmt.Errorf("failed to determine size info ret:%d %w", ret, err)
 	}
 	if bufSize == 0 {
 		// Guard the &buf[0] below, which would panic on an empty slice.
 		return nil, fmt.Errorf("failed to determine size info buflen:%d %w", bufSize, err)
 	}
 	buf = make([]byte, bufSize)
 	ret, _, err = GetLogicalProcessorInformationEx.Call(
 		uintptr(RelationAll),
 		uintptr(unsafe.Pointer(&buf[0])),
 		uintptr(unsafe.Pointer(&bufSize)),
 	)
 	if ret == 0 {
 		return nil, fmt.Errorf("failed to gather processor information ret:%d buflen:%d %w", ret, bufSize, err)
 	}
 	// Only hand back the bytes the call reports as written.
 	if int(bufSize) < len(buf) {
 		buf = buf[:bufSize]
 	}
 	return buf, nil
 }
 func processSystemLogicalProcessorInforationList(buf []byte) []*winPackage {
 	var slpi *SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX
 	// Find all the packages first
 	packages := []*winPackage{}
 	for bufOffset := 0; bufOffset < len(buf); bufOffset += int(slpi.Size) {
 		slpi = (*SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)(unsafe.Pointer(&buf[bufOffset]))
 		if slpi.Relationship != RelationProcessorPackage {
 			continue
 		}
 		pr := (*PROCESSOR_RELATIONSHIP)(unsafe.Pointer(&slpi.U[0]))
 		pkg := &winPackage{}
 		ga0 := unsafe.Pointer(&pr.GroupMask[0])
 		for j := range pr.GroupCount {
 			gm := (*GROUP_AFFINITY)(unsafe.Pointer(uintptr(ga0) + uintptr(j)*unsafe.Sizeof(GROUP_AFFINITY{})))
 			pkg.groups = append(pkg.groups, gm)
 		}
 		packages = append(packages, pkg)
 	}
 	slog.Info("packages", "count", len(packages))
 	// To identify efficiency cores we have to compare the relative values
 	// Larger values are "less efficient" (aka, more performant)
 	var maxEfficiencyClass byte
 	for bufOffset := 0; bufOffset < len(buf); bufOffset += int(slpi.Size) {
 		slpi = (*SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)(unsafe.Pointer(&buf[bufOffset]))
 		if slpi.Relationship != RelationProcessorCore {
 			continue
 		}
 		pr := (*PROCESSOR_RELATIONSHIP)(unsafe.Pointer(&slpi.U[0]))
 		if pr.EfficiencyClass > maxEfficiencyClass {
 			maxEfficiencyClass = pr.EfficiencyClass
 		}
 	}
 	if maxEfficiencyClass > 0 {
 		slog.Info("efficiency cores detected", "maxEfficiencyClass", maxEfficiencyClass)
 	}
 	// then match up the Cores to the Packages, count up cores, threads and efficiency cores
 	for bufOffset := 0; bufOffset < len(buf); bufOffset += int(slpi.Size) {
 		slpi = (*SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)(unsafe.Pointer(&buf[bufOffset]))
 		if slpi.Relationship != RelationProcessorCore {
 			continue
 		}
 		pr := (*PROCESSOR_RELATIONSHIP)(unsafe.Pointer(&slpi.U[0]))
 		ga0 := unsafe.Pointer(&pr.GroupMask[0])
 		for j := range pr.GroupCount {
 			gm := (*GROUP_AFFINITY)(unsafe.Pointer(uintptr(ga0) + uintptr(j)*unsafe.Sizeof(GROUP_AFFINITY{})))
 			for _, pkg := range packages {
 				if pkg.IsMember(gm) {
 					pkg.coreCount++
 					if pr.Flags == 0 {
 						pkg.threadCount++
 					} else {
 						pkg.threadCount += 2
 					}
 					if pr.EfficiencyClass < maxEfficiencyClass {
 						pkg.efficiencyCoreCount++
 					}
 				}
 			}
 		}
 	}
 	// Sumarize the results
 	for i, pkg := range packages {
 		slog.Info("", "package", i, "cores", pkg.coreCount, "efficiency", pkg.efficiencyCoreCount, "threads", pkg.threadCount)
 	}
 	return packages
 }
 // GetCPUDetails queries the processor information buffer and returns one CPU
 // entry per package found in it, carrying that package's core, efficiency
 // core and thread counts. The error from the query is returned unchanged.
 func GetCPUDetails() ([]CPU, error) {
 	raw, err := getLogicalProcessorInformationEx()
 	if err != nil {
 		return nil, err
 	}
 	pkgs := processSystemLogicalProcessorInforationList(raw)
 	details := make([]CPU, 0, len(pkgs))
 	for _, p := range pkgs {
 		details = append(details, CPU{
 			CoreCount:           p.coreCount,
 			EfficiencyCoreCount: p.efficiencyCoreCount,
 			ThreadCount:         p.threadCount,
 		})
 	}
 	return details, nil
 }
--- a/gpu/gpu_windows_test.go
+++ b/gpu/gpu_windows_test.go
--- a/gpu/types.go
+++ b/gpu/types.go
@ -10,11 +10,11 @@ import (
 type memInfo struct {
 	TotalMemory uint64 `json:"total_memory,omitempty"`
 	FreeMemory  uint64 `json:"free_memory,omitempty"`
-	FreeSwap    uint64 `json:"free_swap,omitempty"`
+	FreeSwap    uint64 `json:"free_swap,omitempty"` // TODO split this out for system only
 }
 // Beginning of an `ollama info` command
-type GpuInfo struct {
+type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	memInfo
 	Library string `json:"library,omitempty"`
@ -49,6 +49,17 @@ type GpuInfo struct {
 // CPUInfo describes the host CPU side of the system: the embedded GpuInfo
 // fields plus one CPU entry per discovered package.
 type CPUInfo struct {
 	GpuInfo
 	CPUs []CPU // GetOptimalThreadCount only consults the first entry
 }
 // CPU type represents a CPU Package occupying a socket
 // NOTE(review): the `cpuinfo` tags below are not read from this type by any
 // code visible here (the Linux parser reflects over its own struct) — confirm
 // whether they are still needed.
 type CPU struct {
 	ID                  string `cpuinfo:"processor"`
 	VendorID            string `cpuinfo:"vendor_id"`
 	ModelName           string `cpuinfo:"model name"`
 	CoreCount           int // total cores in the package, efficiency cores included
 	EfficiencyCoreCount int // Performance = CoreCount - Efficiency
 	ThreadCount         int // logical processors in the package
 }
 type CudaGPUInfo struct {
@ -158,3 +169,12 @@ type SystemInfo struct {
 	UnsupportedGPUs []UnsupportedGPUInfo `json:"unsupported_gpus"`
 	DiscoveryErrors []string             `json:"discovery_errors"`
 }
 // GetOptimalThreadCount returns the number of threads to use for inference:
 // the performance cores (total cores minus efficiency cores) of the first CPU
 // package only. It returns 0 when no CPU details are available.
 func (si SystemInfo) GetOptimalThreadCount() int {
 	cpus := si.System.CPUs
 	if len(cpus) > 0 {
 		// Only a single socket is considered, never the sum over all of them.
 		first := cpus[0]
 		return first.CoreCount - first.EfficiencyCoreCount
 	}
 	return 0
 }
--- a/llm/server.go
+++ b/llm/server.go
@ -98,15 +98,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	var systemFreeMemory uint64
 	var systemSwapFreeMemory uint64
-	systemMemInfo, err := gpu.GetCPUMem()
+	systemInfo := gpu.GetSystemInfo()
-	if err != nil {
+	systemTotalMemory = systemInfo.System.TotalMemory
-		slog.Error("failed to lookup system memory", "error", err)
+	systemFreeMemory = systemInfo.System.FreeMemory
-	} else {
+	systemSwapFreeMemory = systemInfo.System.FreeSwap
 		systemTotalMemory = systemMemInfo.TotalMemory
 		systemFreeMemory = systemMemInfo.FreeMemory
 		systemSwapFreeMemory = systemMemInfo.FreeSwap
 	slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
 	}
 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
 	if opts.NumGPU == 0 {
@ -217,8 +213,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--mmproj", projectors[0])
 	}
 	defaultThreads := systemInfo.GetOptimalThreadCount()
 	if opts.NumThread > 0 {
 		params = append(params, "--threads", strconv.Itoa(opts.NumThread))
 	} else if defaultThreads > 0 {
 		params = append(params, "--threads", strconv.Itoa(defaultThreads))
 	}
 	if !opts.F16KV {
@ -260,15 +259,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--mlock")
 	}
-	if gpu.IsNUMA() && gpus[0].Library == "cpu" {
+	// TODO - NUMA support currently doesn't work properly
 		numaMode := "distribute"
 		if runtime.GOOS == "linux" {
 			if _, err := exec.LookPath("numactl"); err == nil {
 				numaMode = "numactl"
 			}
 		}
 		params = append(params, "--numa", numaMode)
 	}
 	params = append(params, "--parallel", strconv.Itoa(numParallel))