Merge pull request #4153 from dhiltgen/gpu_verbose_response
Add GPU usage
commit ee49844d09
3 changed files with 40 additions and 20 deletions

format/bytes.go

@@ -53,6 +53,8 @@ func HumanBytes(b int64) string {
 func HumanBytes2(b uint64) string {
     switch {
+    case b >= GibiByte:
+        return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
     case b >= MebiByte:
         return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
     case b >= KibiByte:
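
The two added lines give HumanBytes2 a GiB branch, which the rest of this PR relies on to print VRAM estimates. A minimal runnable sketch, assuming the format package's binary-unit constants (KibiByte = 1024 and so on) and assuming the KiB/default tail of the function is unchanged:

package main

import "fmt"

// Binary-unit constants assumed to match the format package.
const (
    KibiByte = 1024
    MebiByte = 1024 * KibiByte
    GibiByte = 1024 * MebiByte
)

func HumanBytes2(b uint64) string {
    switch {
    case b >= GibiByte: // new branch from this PR
        return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
    case b >= MebiByte:
        return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
    case b >= KibiByte:
        return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte)
    default:
        return fmt.Sprintf("%d B", b)
    }
}

func main() {
    fmt.Println(HumanBytes2(5 << 30))   // 5.0 GiB
    fmt.Println(HumanBytes2(512 << 10)) // 512.0 KiB
}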
llm/memory.go

@@ -25,7 +25,7 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
     // Split up the GPUs by type and try them
     for _, gpus := range allGpus.ByLibrary() {
         var layerCount int
-        layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+        layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
         if opts.NumGPU < 0 {
             if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
                 return true, estimatedVRAM
@@ -39,12 +39,9 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
     return false, estimatedVRAM
 }

-// Given a model and one or more GPU targets, predict how many layers and bytes we can load
+// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
-    if gpus[0].Library == "cpu" {
-        return 0, 0
-    }
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
     var memoryAvailable uint64
     for _, info := range gpus {
         memoryAvailable += info.FreeMemory
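
With the "cpu" early-return gone, EstimateGPULayers always runs its estimate, starting by pooling free memory across every GPU of the library. A self-contained sketch of that accumulation, with a trimmed-down stand-in for gpu.GpuInfo:

package main

import "fmt"

// GpuInfo is a stand-in carrying only the fields this loop reads;
// the real gpu.GpuInfo has many more.
type GpuInfo struct {
    Library    string
    FreeMemory uint64
}

// poolFreeMemory mirrors the accumulation at the top of
// EstimateGPULayers: same-library GPUs share one memory budget.
func poolFreeMemory(gpus []GpuInfo) uint64 {
    var memoryAvailable uint64
    for _, info := range gpus {
        memoryAvailable += info.FreeMemory
    }
    return memoryAvailable
}

func main() {
    gpus := []GpuInfo{
        {Library: "cuda", FreeMemory: 8 << 30},
        {Library: "cuda", FreeMemory: 6 << 30},
    }
    fmt.Println(poolFreeMemory(gpus)>>30, "GiB available") // 14 GiB available
}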
@@ -93,11 +90,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
     // memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
     memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()

-    if memoryRequiredPartial > memoryAvailable {
-        slog.Debug("insufficient VRAM to load any model layers")
-        return 0, 0
-    }
-
     var memoryLayerOutput uint64
     if layer, ok := layers["output_norm"]; ok {
         memoryLayerOutput += layer.size()
@@ -181,5 +173,13 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
         ),
         ),
     )
-    return layerCount, uint64(memoryRequiredPartial)
+    if gpus[0].Library == "cpu" {
+        return 0, 0, memoryRequiredTotal
+    }
+    if memoryRequiredPartial > memoryAvailable {
+        slog.Debug("insufficient VRAM to load any model layers")
+        return 0, 0, memoryRequiredTotal
+    }
+
+    return layerCount, memoryRequiredPartial, memoryRequiredTotal
 }
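
Taken together with the deletion in the previous hunk, the two guards have moved from before the estimate to after it, so memoryRequiredTotal is computed on every path. A condensed sketch of the resulting exit discipline, with plain parameters standing in for the function's locals:

// estimateExits condenses the three returns above: every exit path
// now reports the model's total size as the third value.
func estimateExits(library string, memoryAvailable, memoryRequiredPartial, memoryRequiredTotal uint64, layerCount int) (int, uint64, uint64) {
    if library == "cpu" {
        return 0, 0, memoryRequiredTotal // CPU-only: no layers offloaded
    }
    if memoryRequiredPartial > memoryAvailable {
        return 0, 0, memoryRequiredTotal // nothing fits, but total is still known
    }
    return layerCount, memoryRequiredPartial, memoryRequiredTotal
}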
llm/server.go

@@ -50,6 +50,9 @@ type llmServer struct {

     // TODO - this should be broken down by GPU
     estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
+    estimatedTotal uint64 // Total size of model
+    totalLayers    uint64
+    gpuCount       int

     sem *semaphore.Weighted
 }
@@ -83,12 +86,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr

     cpuRunner := ""
     var estimatedVRAM uint64
+    var estimatedTotal uint64
     var systemMemory uint64
+    gpuCount := len(gpus)
     if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {

         // TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner

         cpuRunner = serverForCpu()
+        gpuCount = 0
     } else {
         if gpus[0].Library == "metal" {
             memInfo, err := gpu.GetCPUMem()
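
gpuCount starts at len(gpus) and is zeroed whenever the CPU runner is chosen. A compact sketch of just that decision, reusing the GpuInfo stand-in from the memory.go sketch above:

// countActiveGPUs condenses the branch above: a single "cpu"
// GpuInfo, or an explicit NumGPU of 0, means no GPUs are in play.
func countActiveGPUs(gpus []GpuInfo, numGPU int) int {
    if (len(gpus) == 1 && gpus[0].Library == "cpu") || numGPU == 0 {
        return 0
    }
    return len(gpus)
}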
@@ -100,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         }
     }
     var layers int
-    layers, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+    layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)

     if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
         // disable partial offloading when model is greater than total system memory as this
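
The metal check exists because Apple silicon GPUs draw from unified memory: an estimate larger than physical RAM cannot be rescued by partial offload. A sketch of the guard's intent; the hunk cuts off before the body, so the assignment shown here is an assumption:

// Assumed continuation of the guard above: fall back to CPU-only
// rather than risk exhausting unified memory.
if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
    opts.NumGPU = 0
}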
@@ -133,6 +139,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     } else {
         slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
         servers = []string{demandLib}
+        if strings.HasPrefix(demandLib, "cpu") {
+            // Omit the GPU flag to silence the warning
+            opts.NumGPU = -1
+        }
     }
 }

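
The prefix test catches every CPU variant runner, since Go's strings.HasPrefix is a plain byte-wise comparison. A quick runnable check against some illustrative library names; the exact variant list the project ships may differ:

package main

import (
    "fmt"
    "strings"
)

func main() {
    // Illustrative runner names; all "cpu" variants match the prefix.
    for _, lib := range []string{"cpu", "cpu_avx", "cpu_avx2", "cuda_v11", "rocm"} {
        fmt.Printf("%-10s cpu? %v\n", lib, strings.HasPrefix(lib, "cpu"))
    }
}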
@@ -214,6 +224,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
             continue
         }

+        if strings.HasPrefix(servers[i], "cpu") {
+            // TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
+            gpuCount = 0
+        }
+
         // Find an available port, retry on each iteration in case the failure was a port conflict race
         port := 0
         if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
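
The port lookup uses the standard port-0 trick: bind an ephemeral port, read back what the OS assigned, and release it for the subprocess to bind. A self-contained sketch of the pattern; the retry mentioned in the comment exists because another process can grab the port between the close and the subprocess's bind:

package main

import (
    "fmt"
    "net"
)

// freePort asks the OS for an unused TCP port on localhost.
func freePort() (int, error) {
    a, err := net.ResolveTCPAddr("tcp", "localhost:0")
    if err != nil {
        return 0, err
    }
    l, err := net.ListenTCP("tcp", a)
    if err != nil {
        return 0, err
    }
    defer l.Close()
    return l.Addr().(*net.TCPAddr).Port, nil
}

func main() {
    port, err := freePort()
    fmt.Println(port, err)
}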
@@ -272,7 +287,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         status:  NewStatusWriter(os.Stderr),
         options: opts,
         estimatedVRAM:  estimatedVRAM,
+        estimatedTotal: estimatedTotal,
         sem: semaphore.NewWeighted(int64(numParallel)),
+        totalLayers: ggml.KV().BlockCount() + 1,
+        gpuCount:    gpuCount,
     }

     s.cmd.Env = os.Environ()
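
Note that totalLayers is BlockCount()+1, matching the layerCount >= BlockCount()+1 test in PredictServerFit: the repeating blocks plus the output layer. With estimatedVRAM and estimatedTotal both stored on the server, a consumer can also derive how much of the model sits on GPU; a hypothetical helper, not part of this PR, showing the arithmetic:

// offloadFraction reports the share of the model held in VRAM;
// hypothetical consumer code, guarding against a zero total.
func offloadFraction(estimatedVRAM, estimatedTotal uint64) float64 {
    if estimatedTotal == 0 {
        return 0
    }
    return float64(estimatedVRAM) / float64(estimatedTotal)
}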