Merge pull request #5106 from dhiltgen/clean_logs
Tighten up memory prediction logging
commit b55958a587
2 changed files with 66 additions and 44 deletions
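
The refactor follows one pattern throughout: values that EstimateGPULayers previously logged inline are captured as unexported fields on MemoryEstimate, and the slog.Info call moves into a new log() method that the caller (NewLlamaServer, in the final hunk) invokes when it actually wants the output. Below is a minimal, self-contained sketch of that pattern; the field set is abbreviated from the diff and the ollama-specific plumbing is omitted, so treat it as an illustration rather than the shipped code.

package main

import "log/slog"

// Trimmed-down memoryEstimate: the exported field is the part callers
// consume; the unexported fields exist only so log() can report how
// the estimate was derived.
type memoryEstimate struct {
	Layers int // layers we expect to offload

	inferenceLibrary string // e.g. "cuda", "rocm", or "cpu"
	layersRequested  int    // what the user asked for (-1 means auto)
	layersModel      int    // layers the model has, including output
}

func (m memoryEstimate) log() {
	slog.Info(
		"offload to "+m.inferenceLibrary,
		slog.Group(
			"layers",
			"requested", m.layersRequested,
			"model", m.layersModel,
			"offload", m.Layers,
		),
	)
}

func main() {
	e := memoryEstimate{
		Layers:           28,
		inferenceLibrary: "cuda",
		layersRequested:  -1,
		layersModel:      33,
	}
	e.log() // the caller now decides when the estimate is reported
}
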
llm/memory.go   108 changes (64 additions, 44 deletions)
llm/server.go   2 changes (2 additions, 0 deletions)

llm/memory.go
@@ -49,6 +49,18 @@ type MemoryEstimate struct {
 
 	// For multi-GPU scenarios, this is the size in bytes per GPU
 	GPUSizes []uint64
+
+	// internal fields for logging purposes
+	inferenceLibrary    string
+	layersRequested     int
+	layersModel         int
+	availableList       []string
+	kv                  uint64
+	allocationsList     []string
+	memoryWeights       uint64
+	memoryLayerOutput   uint64
+	graphFullOffload    uint64
+	graphPartialOffload uint64
 }
 
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
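
One side effect of making the new logging fields unexported (the diff does not say whether this was a goal, so this is an observation rather than the stated intent): reflection-based marshalers such as encoding/json skip unexported fields, so serializing a MemoryEstimate is unaffected by the extra bookkeeping. A tiny stand-in demonstrates this:

package main

import (
	"encoding/json"
	"fmt"
)

// Stand-in for MemoryEstimate: one exported field, one unexported
// logging-only field.
type memoryEstimate struct {
	TotalSize uint64

	kv uint64 // unexported: skipped by encoding/json
}

func main() {
	b, err := json.Marshal(memoryEstimate{TotalSize: 1 << 30, kv: 512 << 20})
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b)) // {"TotalSize":1073741824}; kv is omitted
}
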
@@ -252,78 +264,86 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		allocationsList = append(allocationsList, format.HumanBytes2(a))
 	}
 
+	estimate := MemoryEstimate{
+		TotalSize: memoryRequiredTotal,
+		Layers:    0,
+		Graph:     0,
+		VRAMSize:  0,
+		GPUSizes:  []uint64{},
+
+		inferenceLibrary:    gpus[0].Library,
+		layersRequested:     opts.NumGPU,
+		layersModel:         int(ggml.KV().BlockCount()) + 1,
+		availableList:       availableList,
+		kv:                  kv,
+		allocationsList:     allocationsList,
+		memoryWeights:       memoryWeights,
+		memoryLayerOutput:   memoryLayerOutput,
+		graphFullOffload:    graphFullOffload,
+		graphPartialOffload: graphPartialOffload,
+	}
+
+	if gpus[0].Library == "cpu" {
+		return estimate
+	}
+	if layerCount == 0 {
+		slog.Debug("insufficient VRAM to load any model layers")
+		return estimate
+	}
+	estimate.Layers = layerCount
+	estimate.Graph = graphOffload
+	estimate.VRAMSize = memoryRequiredPartial
+	estimate.TotalSize = memoryRequiredTotal
+	estimate.TensorSplit = tensorSplit
+	estimate.GPUSizes = gpuAllocations
+
+	return estimate
+}
+
+func (m MemoryEstimate) log() {
 	slog.Info(
-		"offload to gpu",
+		"offload to "+m.inferenceLibrary,
 		slog.Group(
 			"layers",
 			// requested number of layers to offload
-			"requested", opts.NumGPU,
+			"requested", m.layersRequested,
 			// The number of layers the model has (including output)
-			"model", int(ggml.KV().BlockCount())+1,
+			"model", m.layersModel,
 			// estimated number of layers that can be offloaded
-			"offload", layerCount,
+			"offload", m.Layers,
-			// multi-gpu split for tesnors
-			"split", tensorSplit,
+			// multi-gpu split for tensors
+			"split", m.TensorSplit,
 		),
 		slog.Group(
 			"memory",
 			// memory available by GPU for offloading
-			"available", availableList,
+			"available", m.availableList,
 			slog.Group(
 				"required",
 				// memory required for full offloading
-				"full", format.HumanBytes2(memoryRequiredTotal),
+				"full", format.HumanBytes2(m.TotalSize),
 				// memory required to offload layers.estimate layers
-				"partial", format.HumanBytes2(memoryRequiredPartial),
+				"partial", format.HumanBytes2(m.VRAMSize),
 				// memory of KV cache
-				"kv", format.HumanBytes2(kv),
+				"kv", format.HumanBytes2(m.kv),
 				// Allocations across the GPUs
-				"allocations", allocationsList,
+				"allocations", m.allocationsList,
 			),
 			slog.Group(
 				"weights",
 				// memory of the weights
-				"total", format.HumanBytes2(memoryWeights),
+				"total", format.HumanBytes2(m.memoryWeights),
 				// memory of repeating layers
-				"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
+				"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
 				// memory of non-repeating layers
-				"nonrepeating", format.HumanBytes2(memoryLayerOutput),
+				"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
 			),
 			slog.Group(
 				"graph",
 				// memory of graph when fully offloaded
-				"full", format.HumanBytes2(graphFullOffload),
+				"full", format.HumanBytes2(m.graphFullOffload),
 				// memory of graph when not fully offloaded
-				"partial", format.HumanBytes2(graphPartialOffload),
+				"partial", format.HumanBytes2(m.graphPartialOffload),
 			),
 		),
 	)
-
-	if gpus[0].Library == "cpu" {
-		return MemoryEstimate{
-			Layers:    0,
-			Graph:     0,
-			VRAMSize:  0,
-			TotalSize: memoryRequiredTotal,
-			GPUSizes:  []uint64{},
-		}
-	}
-
-	if layerCount == 0 {
-		slog.Debug("insufficient VRAM to load any model layers")
-		return MemoryEstimate{
-			Layers:    0,
-			Graph:     0,
-			VRAMSize:  0,
-			TotalSize: memoryRequiredTotal,
-			GPUSizes:  []uint64{},
-		}
-	}
-
-	return MemoryEstimate{
-		Layers:      layerCount,
-		Graph:       graphOffload,
-		VRAMSize:    memoryRequiredPartial,
-		TotalSize:   memoryRequiredTotal,
-		TensorSplit: tensorSplit,
-		GPUSizes:    gpuAllocations,
-	}
 }
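
The nesting of slog.Group calls above is what shapes the final record: with slog's default text handler, group names become dotted prefixes on the keys, so this log() emits keys like layers.requested, memory.required.full, and graph.partial. A short, runnable illustration of that qualification behavior, using made-up values rather than real estimates:

package main

import (
	"log/slog"
	"os"
)

func main() {
	logger := slog.New(slog.NewTextHandler(os.Stdout, nil))
	logger.Info(
		"offload to cuda", // illustrative values only
		slog.Group("memory",
			"available", []string{"11.2 GiB"},
			slog.Group("required",
				"full", "6.5 GiB",
				"partial", "6.2 GiB",
			),
		),
	)
	// The text handler dot-qualifies keys by group, producing output
	// along the lines of:
	//   msg="offload to cuda" memory.available="[11.2 GiB]"
	//   memory.required.full="6.5 GiB" memory.required.partial="6.2 GiB"
}
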
llm/server.go

@@ -116,6 +116,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 	}
 
+	estimate.log()
+
 	// Loop through potential servers
 	finalErr := errors.New("no suitable llama servers found")
 