Discover CPU details for default thread selection (#6264)
On Windows, detect large multi-socket systems and reduce the default thread count to the number of cores in one socket for best performance.
This commit is contained in:
parent
1d7fa3ad2d
commit
24636dfa87
7 changed files with 408 additions and 24 deletions
|
@ -229,7 +229,10 @@ func GetGPUInfo() GpuInfoList {
|
|||
slog.Warn("error looking up system memory", "error", err)
|
||||
}
|
||||
depPath := LibraryDir()
|
||||
|
||||
details, err := GetCPUDetails()
|
||||
if err != nil {
|
||||
slog.Warn("failed to lookup CPU details", "error", err)
|
||||
}
|
||||
cpus = []CPUInfo{
|
||||
{
|
||||
GpuInfo: GpuInfo{
|
||||
|
@ -239,6 +242,7 @@ func GetGPUInfo() GpuInfoList {
|
|||
ID: "0",
|
||||
DependencyPath: depPath,
|
||||
},
|
||||
CPUs: details,
|
||||
},
|
||||
}
|
||||
|
||||
|
|
|
@ -10,7 +10,9 @@ package gpu
|
|||
import "C"
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"runtime"
|
||||
"syscall"
|
||||
|
||||
"github.com/ollama/ollama/format"
|
||||
)
|
||||
|
@ -69,11 +71,30 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
|
|||
|
||||
func GetSystemInfo() SystemInfo {
|
||||
mem, _ := GetCPUMem()
|
||||
query := "hw.perflevel0.physicalcpu"
|
||||
perfCores, err := syscall.SysctlUint32(query)
|
||||
if err != nil {
|
||||
slog.Warn("failed to discover physical CPU details", "query", query, "error", err)
|
||||
}
|
||||
query = "hw.perflevel1.physicalcpu"
|
||||
efficiencyCores, _ := syscall.SysctlUint32(query) // On x86 xeon this wont return data
|
||||
|
||||
// Determine thread count
|
||||
query = "hw.logicalcpu"
|
||||
logicalCores, _ := syscall.SysctlUint32(query)
|
||||
|
||||
return SystemInfo{
|
||||
System: CPUInfo{
|
||||
GpuInfo: GpuInfo{
|
||||
memInfo: mem,
|
||||
},
|
||||
CPUs: []CPU{
|
||||
{
|
||||
CoreCount: int(perfCores + efficiencyCores),
|
||||
EfficiencyCoreCount: int(efficiencyCores),
|
||||
ThreadCount: int(logicalCores),
|
||||
},
|
||||
},
|
||||
},
|
||||
GPUs: GetGPUInfo(),
|
||||
}
|
||||
|
|
|
@ -4,6 +4,8 @@ import (
|
|||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"reflect"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/format"
|
||||
|
@ -90,3 +92,95 @@ func GetCPUMem() (memInfo, error) {
|
|||
}
|
||||
return mem, nil
|
||||
}
|
||||
|
||||
// CpuInfoFilename is the path GetCPUDetails opens to read per-processor
// information on Linux.
const CpuInfoFilename = "/proc/cpuinfo"

// linuxCpuInfo holds the values captured from one stanza of CpuInfoFilename.
// Each field's `cpuinfo` tag is the exact key text that GetCPUDetails matches
// against the left-hand side of a "key<TAB>: value" line; values are stored
// verbatim as strings.
type linuxCpuInfo struct {
	// ID is the value of the "processor" key; a stanza is only kept once
	// this is non-empty.
	ID string `cpuinfo:"processor"`

	// VendorID is the value of the "vendor_id" key.
	VendorID string `cpuinfo:"vendor_id"`

	// ModelName is the value of the "model name" key.
	ModelName string `cpuinfo:"model name"`

	// PhysicalID is the value of the "physical id" key; GetCPUDetails groups
	// stanzas into sockets by this value.
	PhysicalID string `cpuinfo:"physical id"`

	// Siblings is the value of the "siblings" key.
	Siblings string `cpuinfo:"siblings"`

	// CoreID is the value of the "core id" key; when empty, GetCPUDetails
	// falls back to ID to identify the core.
	CoreID string `cpuinfo:"core id"`
}
|
||||
|
||||
func GetCPUDetails() ([]CPU, error) {
|
||||
file, err := os.Open(CpuInfoFilename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
reColumns := regexp.MustCompile("\t+: ")
|
||||
scanner := bufio.NewScanner(file)
|
||||
cpuInfos := []linuxCpuInfo{}
|
||||
cpu := &linuxCpuInfo{}
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if sl := reColumns.Split(line, 2); len(sl) > 1 {
|
||||
t := reflect.TypeOf(cpu).Elem()
|
||||
s := reflect.ValueOf(cpu).Elem()
|
||||
for i := range t.NumField() {
|
||||
field := t.Field(i)
|
||||
tag := field.Tag.Get("cpuinfo")
|
||||
if tag == sl[0] {
|
||||
s.FieldByName(field.Name).SetString(sl[1])
|
||||
break
|
||||
}
|
||||
}
|
||||
} else if strings.TrimSpace(line) == "" && cpu.ID != "" {
|
||||
cpuInfos = append(cpuInfos, *cpu)
|
||||
cpu = &linuxCpuInfo{}
|
||||
}
|
||||
}
|
||||
|
||||
// Process the sockets/cores/threads
|
||||
socketByID := map[string]*CPU{}
|
||||
coreBySocket := map[string]map[string]struct{}{}
|
||||
threadsByCoreBySocket := map[string]map[string]int{}
|
||||
for _, c := range cpuInfos {
|
||||
if _, found := socketByID[c.PhysicalID]; !found {
|
||||
socketByID[c.PhysicalID] = &CPU{
|
||||
ID: c.PhysicalID,
|
||||
VendorID: c.VendorID,
|
||||
ModelName: c.ModelName,
|
||||
}
|
||||
coreBySocket[c.PhysicalID] = map[string]struct{}{}
|
||||
threadsByCoreBySocket[c.PhysicalID] = map[string]int{}
|
||||
}
|
||||
if c.CoreID != "" {
|
||||
coreBySocket[c.PhysicalID][c.PhysicalID+":"+c.CoreID] = struct{}{}
|
||||
threadsByCoreBySocket[c.PhysicalID][c.PhysicalID+":"+c.CoreID]++
|
||||
} else {
|
||||
coreBySocket[c.PhysicalID][c.PhysicalID+":"+c.ID] = struct{}{}
|
||||
threadsByCoreBySocket[c.PhysicalID][c.PhysicalID+":"+c.ID]++
|
||||
}
|
||||
}
|
||||
|
||||
// Tally up the values from the tracking maps
|
||||
for id, s := range socketByID {
|
||||
s.CoreCount = len(coreBySocket[id])
|
||||
s.ThreadCount = 0
|
||||
for _, tc := range threadsByCoreBySocket[id] {
|
||||
s.ThreadCount += tc
|
||||
}
|
||||
|
||||
// This only works if HT is enabled, consider a more reliable model, maybe cache size comparisons?
|
||||
efficiencyCoreCount := 0
|
||||
for _, threads := range threadsByCoreBySocket[id] {
|
||||
if threads == 1 {
|
||||
efficiencyCoreCount++
|
||||
}
|
||||
}
|
||||
if efficiencyCoreCount == s.CoreCount {
|
||||
// 1:1 mapping means they're not actually efficiency cores, but regular cores
|
||||
s.EfficiencyCoreCount = 0
|
||||
} else {
|
||||
s.EfficiencyCoreCount = efficiencyCoreCount
|
||||
}
|
||||
}
|
||||
|
||||
result := []CPU{}
|
||||
for _, c := range socketByID {
|
||||
result = append(result, *c)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
|
|
@ -2,6 +2,7 @@ package gpu
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
)
|
||||
|
@ -22,6 +23,7 @@ var (
|
|||
k32 = syscall.NewLazyDLL("kernel32.dll")
|
||||
globalMemoryStatusExProc = k32.NewProc("GlobalMemoryStatusEx")
|
||||
sizeofMemoryStatusEx = uint32(unsafe.Sizeof(MEMORYSTATUSEX{}))
|
||||
GetLogicalProcessorInformationEx = k32.NewProc("GetLogicalProcessorInformationEx")
|
||||
)
|
||||
|
||||
var CudartGlobs = []string{
|
||||
|
@ -55,3 +57,178 @@ func GetCPUMem() (memInfo, error) {
|
|||
}
|
||||
return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys, FreeSwap: memStatus.AvailPageFile}, nil
|
||||
}
|
||||
|
||||
// LOGICAL_PROCESSOR_RELATIONSHIP identifies the kind of relationship described
// by a SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX record. The names follow the
// Windows API rather than Go naming conventions.
type LOGICAL_PROCESSOR_RELATIONSHIP uint32

const (
	RelationProcessorCore LOGICAL_PROCESSOR_RELATIONSHIP = iota
	RelationNumaNode
	RelationCache
	RelationProcessorPackage
	RelationGroup
	RelationProcessorDie
	RelationNumaNodeEx
	RelationProcessorModule
)

// RelationAll requests records for every relationship type.
const RelationAll LOGICAL_PROCESSOR_RELATIONSHIP = 0xffff

// GROUP_AFFINITY is a set of logical processors within one processor group.
// Mask bits are relative to Group, so two masks are only comparable when
// their Group values are equal.
type GROUP_AFFINITY struct {
	Mask     uintptr // KAFFINITY
	Group    uint16
	Reserved [3]uint16
}

// PROCESSOR_RELATIONSHIP is the payload of a core or package record. GroupMask
// is declared with one element but is followed in memory by GroupCount
// entries.
type PROCESSOR_RELATIONSHIP struct {
	Flags           byte
	EfficiencyClass byte
	Reserved        [20]byte
	GroupCount      uint16
	GroupMask       [1]GROUP_AFFINITY // len GroupCount
}

// Omitted unused structs: NUMA_NODE_RELATIONSHIP CACHE_RELATIONSHIP GROUP_RELATIONSHIP

// SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX is the variable-length record header
// returned by GetLogicalProcessorInformationEx. Size is the length in bytes of
// the whole record, and U marks the start of the relationship-specific payload.
type SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX struct {
	Relationship LOGICAL_PROCESSOR_RELATIONSHIP
	Size         uint32
	U            [1]byte // Union len Size
	// PROCESSOR_RELATIONSHIP
	// NUMA_NODE_RELATIONSHIP
	// CACHE_RELATIONSHIP
	// GROUP_RELATIONSHIP
}

// IsMember reports whether target shares at least one logical processor with
// group. Both affinities must refer to the same processor group: mask bits are
// group-relative, so equal bits in different groups denote different
// processors. A nil receiver or nil target is never a member.
func (group *GROUP_AFFINITY) IsMember(target *GROUP_AFFINITY) bool {
	if group == nil || target == nil {
		return false
	}
	return group.Group == target.Group && group.Mask&target.Mask != 0
}

// winPackage accumulates the core, efficiency-core and thread counts for one
// processor package (socket), along with the group affinities it spans.
type winPackage struct {
	groups              []*GROUP_AFFINITY
	coreCount           int // performance cores = coreCount - efficiencyCoreCount
	efficiencyCoreCount int
	threadCount         int
}

// IsMember reports whether target overlaps any of the package's group
// affinities.
func (pkg *winPackage) IsMember(target *GROUP_AFFINITY) bool {
	for _, group := range pkg.groups {
		if group.IsMember(target) {
			return true
		}
	}
	return false
}
|
||||
|
||||
func getLogicalProcessorInformationEx() ([]byte, error) {
|
||||
buf := make([]byte, 1)
|
||||
bufSize := len(buf)
|
||||
ret, _, err := GetLogicalProcessorInformationEx.Call(
|
||||
uintptr(RelationAll),
|
||||
uintptr(unsafe.Pointer(&buf[0])),
|
||||
uintptr(unsafe.Pointer(&bufSize)),
|
||||
)
|
||||
if ret != 0 {
|
||||
return nil, fmt.Errorf("failed to determine size info ret:%d %w", ret, err)
|
||||
}
|
||||
|
||||
buf = make([]byte, bufSize)
|
||||
ret, _, err = GetLogicalProcessorInformationEx.Call(
|
||||
uintptr(RelationAll),
|
||||
uintptr(unsafe.Pointer(&buf[0])),
|
||||
uintptr(unsafe.Pointer(&bufSize)),
|
||||
)
|
||||
if ret == 0 {
|
||||
return nil, fmt.Errorf("failed to gather processor information ret:%d buflen:%d %w", ret, bufSize, err)
|
||||
}
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
func processSystemLogicalProcessorInforationList(buf []byte) []*winPackage {
|
||||
var slpi *SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX
|
||||
// Find all the packages first
|
||||
packages := []*winPackage{}
|
||||
for bufOffset := 0; bufOffset < len(buf); bufOffset += int(slpi.Size) {
|
||||
slpi = (*SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)(unsafe.Pointer(&buf[bufOffset]))
|
||||
if slpi.Relationship != RelationProcessorPackage {
|
||||
continue
|
||||
}
|
||||
pr := (*PROCESSOR_RELATIONSHIP)(unsafe.Pointer(&slpi.U[0]))
|
||||
pkg := &winPackage{}
|
||||
ga0 := unsafe.Pointer(&pr.GroupMask[0])
|
||||
for j := range pr.GroupCount {
|
||||
gm := (*GROUP_AFFINITY)(unsafe.Pointer(uintptr(ga0) + uintptr(j)*unsafe.Sizeof(GROUP_AFFINITY{})))
|
||||
pkg.groups = append(pkg.groups, gm)
|
||||
}
|
||||
packages = append(packages, pkg)
|
||||
}
|
||||
|
||||
slog.Info("packages", "count", len(packages))
|
||||
|
||||
// To identify efficiency cores we have to compare the relative values
|
||||
// Larger values are "less efficient" (aka, more performant)
|
||||
var maxEfficiencyClass byte
|
||||
for bufOffset := 0; bufOffset < len(buf); bufOffset += int(slpi.Size) {
|
||||
slpi = (*SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)(unsafe.Pointer(&buf[bufOffset]))
|
||||
if slpi.Relationship != RelationProcessorCore {
|
||||
continue
|
||||
}
|
||||
pr := (*PROCESSOR_RELATIONSHIP)(unsafe.Pointer(&slpi.U[0]))
|
||||
if pr.EfficiencyClass > maxEfficiencyClass {
|
||||
maxEfficiencyClass = pr.EfficiencyClass
|
||||
}
|
||||
}
|
||||
if maxEfficiencyClass > 0 {
|
||||
slog.Info("efficiency cores detected", "maxEfficiencyClass", maxEfficiencyClass)
|
||||
}
|
||||
|
||||
// then match up the Cores to the Packages, count up cores, threads and efficiency cores
|
||||
for bufOffset := 0; bufOffset < len(buf); bufOffset += int(slpi.Size) {
|
||||
slpi = (*SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)(unsafe.Pointer(&buf[bufOffset]))
|
||||
if slpi.Relationship != RelationProcessorCore {
|
||||
continue
|
||||
}
|
||||
pr := (*PROCESSOR_RELATIONSHIP)(unsafe.Pointer(&slpi.U[0]))
|
||||
ga0 := unsafe.Pointer(&pr.GroupMask[0])
|
||||
for j := range pr.GroupCount {
|
||||
gm := (*GROUP_AFFINITY)(unsafe.Pointer(uintptr(ga0) + uintptr(j)*unsafe.Sizeof(GROUP_AFFINITY{})))
|
||||
for _, pkg := range packages {
|
||||
if pkg.IsMember(gm) {
|
||||
pkg.coreCount++
|
||||
if pr.Flags == 0 {
|
||||
pkg.threadCount++
|
||||
} else {
|
||||
pkg.threadCount += 2
|
||||
}
|
||||
if pr.EfficiencyClass < maxEfficiencyClass {
|
||||
pkg.efficiencyCoreCount++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sumarize the results
|
||||
for i, pkg := range packages {
|
||||
slog.Info("", "package", i, "cores", pkg.coreCount, "efficiency", pkg.efficiencyCoreCount, "threads", pkg.threadCount)
|
||||
}
|
||||
|
||||
return packages
|
||||
}
|
||||
|
||||
func GetCPUDetails() ([]CPU, error) {
|
||||
buf, err := getLogicalProcessorInformationEx()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
packages := processSystemLogicalProcessorInforationList(buf)
|
||||
cpus := make([]CPU, len(packages))
|
||||
|
||||
for i, pkg := range packages {
|
||||
cpus[i].CoreCount = pkg.coreCount
|
||||
cpus[i].EfficiencyCoreCount = pkg.efficiencyCoreCount
|
||||
cpus[i].ThreadCount = pkg.threadCount
|
||||
}
|
||||
return cpus, nil
|
||||
}
|
||||
|
|
77
gpu/gpu_windows_test.go
Normal file
77
gpu/gpu_windows_test.go
Normal file
File diff suppressed because one or more lines are too long
24
gpu/types.go
24
gpu/types.go
|
@ -10,11 +10,11 @@ import (
|
|||
// memInfo carries memory totals in bytes as unsigned 64-bit counters. Zero
// values are omitted from the JSON encoding.
type memInfo struct {
	TotalMemory uint64 `json:"total_memory,omitempty"`
	FreeMemory  uint64 `json:"free_memory,omitempty"`
	FreeSwap    uint64 `json:"free_swap,omitempty"` // TODO split this out for system only
}
|
||||
|
||||
// Beginning of an `ollama info` command
|
||||
type GpuInfo struct {
|
||||
type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
|
||||
memInfo
|
||||
Library string `json:"library,omitempty"`
|
||||
|
||||
|
@ -49,6 +49,17 @@ type GpuInfo struct {
|
|||
|
||||
type CPUInfo struct {
|
||||
GpuInfo
|
||||
CPUs []CPU
|
||||
}
|
||||
|
||||
// CPU type represents a CPU Package occupying a socket
type CPU struct {
	// ID identifies the package.
	// NOTE(review): the cpuinfo tags on this struct do not appear to be read
	// anywhere in view; confirm before relying on or removing them.
	ID string `cpuinfo:"processor"`

	// VendorID is the vendor string reported for the package.
	VendorID string `cpuinfo:"vendor_id"`

	// ModelName is the model string reported for the package.
	ModelName string `cpuinfo:"model name"`

	// CoreCount is the total number of cores in the package.
	CoreCount int

	// EfficiencyCoreCount is the number of those cores classed as efficiency
	// cores. Performance = CoreCount - Efficiency
	EfficiencyCoreCount int

	// ThreadCount is the number of logical processors in the package.
	ThreadCount int
}
|
||||
|
||||
type CudaGPUInfo struct {
|
||||
|
@ -158,3 +169,12 @@ type SystemInfo struct {
|
|||
UnsupportedGPUs []UnsupportedGPUInfo `json:"unsupported_gpus"`
|
||||
DiscoveryErrors []string `json:"discovery_errors"`
|
||||
}
|
||||
|
||||
// Return the optimal number of threads to use for inference
|
||||
func (si SystemInfo) GetOptimalThreadCount() int {
|
||||
if len(si.System.CPUs) == 0 {
|
||||
return 0
|
||||
}
|
||||
// Allocate thread count matching the performance cores on a single socket
|
||||
return si.System.CPUs[0].CoreCount - si.System.CPUs[0].EfficiencyCoreCount
|
||||
}
|
||||
|
|
|
@ -98,15 +98,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||
var systemFreeMemory uint64
|
||||
var systemSwapFreeMemory uint64
|
||||
|
||||
systemMemInfo, err := gpu.GetCPUMem()
|
||||
if err != nil {
|
||||
slog.Error("failed to lookup system memory", "error", err)
|
||||
} else {
|
||||
systemTotalMemory = systemMemInfo.TotalMemory
|
||||
systemFreeMemory = systemMemInfo.FreeMemory
|
||||
systemSwapFreeMemory = systemMemInfo.FreeSwap
|
||||
systemInfo := gpu.GetSystemInfo()
|
||||
systemTotalMemory = systemInfo.System.TotalMemory
|
||||
systemFreeMemory = systemInfo.System.FreeMemory
|
||||
systemSwapFreeMemory = systemInfo.System.FreeSwap
|
||||
slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
|
||||
}
|
||||
|
||||
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
|
||||
if opts.NumGPU == 0 {
|
||||
|
@ -217,8 +213,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||
params = append(params, "--mmproj", projectors[0])
|
||||
}
|
||||
|
||||
defaultThreads := systemInfo.GetOptimalThreadCount()
|
||||
if opts.NumThread > 0 {
|
||||
params = append(params, "--threads", strconv.Itoa(opts.NumThread))
|
||||
} else if defaultThreads > 0 {
|
||||
params = append(params, "--threads", strconv.Itoa(defaultThreads))
|
||||
}
|
||||
|
||||
if !opts.F16KV {
|
||||
|
@ -260,15 +259,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||
params = append(params, "--mlock")
|
||||
}
|
||||
|
||||
if gpu.IsNUMA() && gpus[0].Library == "cpu" {
|
||||
numaMode := "distribute"
|
||||
if runtime.GOOS == "linux" {
|
||||
if _, err := exec.LookPath("numactl"); err == nil {
|
||||
numaMode = "numactl"
|
||||
}
|
||||
}
|
||||
params = append(params, "--numa", numaMode)
|
||||
}
|
||||
// TODO - NUMA support currently doesn't work properly
|
||||
|
||||
params = append(params, "--parallel", strconv.Itoa(numParallel))
|
||||
|
||||
|
|
Loading…
Reference in a new issue