Merge pull request #5106 from dhiltgen/clean_logs
Tighten up memory prediction logging
commit b55958a587
2 changed files with 66 additions and 44 deletions
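
The refactor follows one pattern throughout: values that EstimateGPULayers previously logged inline are captured as unexported fields on MemoryEstimate, and the slog.Info call moves into a new log() method that the caller (NewLlamaServer, in the final hunk) invokes when it actually wants the output. Below is a minimal, self-contained sketch of that pattern; the field set is abbreviated from the diff and the ollama-specific plumbing is omitted, so treat it as an illustration rather than the shipped code.

package main

import "log/slog"

// Trimmed-down memoryEstimate: the exported field is the part callers
// consume; the unexported fields exist only so log() can report how
// the estimate was derived.
type memoryEstimate struct {
	Layers int // layers we expect to offload

	inferenceLibrary string // e.g. "cuda", "rocm", or "cpu"
	layersRequested  int    // what the user asked for (-1 means auto)
	layersModel      int    // layers the model has, including output
}

func (m memoryEstimate) log() {
	slog.Info(
		"offload to "+m.inferenceLibrary,
		slog.Group(
			"layers",
			"requested", m.layersRequested,
			"model", m.layersModel,
			"offload", m.Layers,
		),
	)
}

func main() {
	e := memoryEstimate{
		Layers:           28,
		inferenceLibrary: "cuda",
		layersRequested:  -1,
		layersModel:      33,
	}
	e.log() // the caller now decides when the estimate is reported
}
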
llm/memory.go   108 changes (64 additions, 44 deletions)
llm/server.go   2 changes (2 additions, 0 deletions)

llm/memory.go
@@ -49,6 +49,18 @@ type MemoryEstimate struct {
 
 	// For multi-GPU scenarios, this is the size in bytes per GPU
 	GPUSizes []uint64
+
+	// internal fields for logging purposes
+	inferenceLibrary    string
+	layersRequested     int
+	layersModel         int
+	availableList       []string
+	kv                  uint64
+	allocationsList     []string
+	memoryWeights       uint64
+	memoryLayerOutput   uint64
+	graphFullOffload    uint64
+	graphPartialOffload uint64
 }
 
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
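
One side effect of making the new logging fields unexported (the diff does not say whether this was a goal, so this is an observation rather than the stated intent): reflection-based marshalers such as encoding/json skip unexported fields, so serializing a MemoryEstimate is unaffected by the extra bookkeeping. A tiny stand-in demonstrates this:

package main

import (
	"encoding/json"
	"fmt"
)

// Stand-in for MemoryEstimate: one exported field, one unexported
// logging-only field.
type memoryEstimate struct {
	TotalSize uint64

	kv uint64 // unexported: skipped by encoding/json
}

func main() {
	b, err := json.Marshal(memoryEstimate{TotalSize: 1 << 30, kv: 512 << 20})
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b)) // {"TotalSize":1073741824}; kv is omitted
}
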
@@ -252,78 +264,86 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		allocationsList = append(allocationsList, format.HumanBytes2(a))
 	}
 
+	estimate := MemoryEstimate{
+		TotalSize: memoryRequiredTotal,
+		Layers:    0,
+		Graph:     0,
+		VRAMSize:  0,
+		GPUSizes:  []uint64{},
+
+		inferenceLibrary:    gpus[0].Library,
+		layersRequested:     opts.NumGPU,
+		layersModel:         int(ggml.KV().BlockCount()) + 1,
+		availableList:       availableList,
+		kv:                  kv,
+		allocationsList:     allocationsList,
+		memoryWeights:       memoryWeights,
+		memoryLayerOutput:   memoryLayerOutput,
+		graphFullOffload:    graphFullOffload,
+		graphPartialOffload: graphPartialOffload,
+	}
+
+	if gpus[0].Library == "cpu" {
+		return estimate
+	}
+	if layerCount == 0 {
+		slog.Debug("insufficient VRAM to load any model layers")
+		return estimate
+	}
+	estimate.Layers = layerCount
+	estimate.Graph = graphOffload
+	estimate.VRAMSize = memoryRequiredPartial
+	estimate.TotalSize = memoryRequiredTotal
+	estimate.TensorSplit = tensorSplit
+	estimate.GPUSizes = gpuAllocations
+
+	return estimate
+}
+
+func (m MemoryEstimate) log() {
 	slog.Info(
-		"offload to gpu",
+		"offload to "+m.inferenceLibrary,
 		slog.Group(
 			"layers",
 			// requested number of layers to offload
-			"requested", opts.NumGPU,
+			"requested", m.layersRequested,
 			// The number of layers the model has (including output)
-			"model", int(ggml.KV().BlockCount())+1,
+			"model", m.layersModel,
 			// estimated number of layers that can be offloaded
-			"offload", layerCount,
+			"offload", m.Layers,
-			// multi-gpu split for tesnors
-			"split", tensorSplit,
+			// multi-gpu split for tensors
+			"split", m.TensorSplit,
 		),
 		slog.Group(
 			"memory",
 			// memory available by GPU for offloading
-			"available", availableList,
+			"available", m.availableList,
 			slog.Group(
 				"required",
 				// memory required for full offloading
-				"full", format.HumanBytes2(memoryRequiredTotal),
+				"full", format.HumanBytes2(m.TotalSize),
 				// memory required to offload layers.estimate layers
-				"partial", format.HumanBytes2(memoryRequiredPartial),
+				"partial", format.HumanBytes2(m.VRAMSize),
 				// memory of KV cache
-				"kv", format.HumanBytes2(kv),
+				"kv", format.HumanBytes2(m.kv),
 				// Allocations across the GPUs
-				"allocations", allocationsList,
+				"allocations", m.allocationsList,
 			),
 			slog.Group(
 				"weights",
 				// memory of the weights
-				"total", format.HumanBytes2(memoryWeights),
+				"total", format.HumanBytes2(m.memoryWeights),
 				// memory of repeating layers
-				"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
+				"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
 				// memory of non-repeating layers
-				"nonrepeating", format.HumanBytes2(memoryLayerOutput),
+				"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
 			),
 			slog.Group(
 				"graph",
 				// memory of graph when fully offloaded
-				"full", format.HumanBytes2(graphFullOffload),
+				"full", format.HumanBytes2(m.graphFullOffload),
 				// memory of graph when not fully offloaded
-				"partial", format.HumanBytes2(graphPartialOffload),
+				"partial", format.HumanBytes2(m.graphPartialOffload),
 			),
 		),
 	)
-
-	if gpus[0].Library == "cpu" {
-		return MemoryEstimate{
-			Layers:    0,
-			Graph:     0,
-			VRAMSize:  0,
-			TotalSize: memoryRequiredTotal,
-			GPUSizes:  []uint64{},
-		}
-	}
-
-	if layerCount == 0 {
-		slog.Debug("insufficient VRAM to load any model layers")
-		return MemoryEstimate{
-			Layers:    0,
-			Graph:     0,
-			VRAMSize:  0,
-			TotalSize: memoryRequiredTotal,
-			GPUSizes:  []uint64{},
-		}
-	}
-
-	return MemoryEstimate{
-		Layers:      layerCount,
-		Graph:       graphOffload,
-		VRAMSize:    memoryRequiredPartial,
-		TotalSize:   memoryRequiredTotal,
-		TensorSplit: tensorSplit,
-		GPUSizes:    gpuAllocations,
-	}
 }
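
The nesting of slog.Group calls above is what shapes the final record: with slog's default text handler, group names become dotted prefixes on the keys, so this log() emits keys like layers.requested, memory.required.full, and graph.partial. A short, runnable illustration of that qualification behavior, using made-up values rather than real estimates:

package main

import (
	"log/slog"
	"os"
)

func main() {
	logger := slog.New(slog.NewTextHandler(os.Stdout, nil))
	logger.Info(
		"offload to cuda", // illustrative values only
		slog.Group("memory",
			"available", []string{"11.2 GiB"},
			slog.Group("required",
				"full", "6.5 GiB",
				"partial", "6.2 GiB",
			),
		),
	)
	// The text handler dot-qualifies keys by group, producing output
	// along the lines of:
	//   msg="offload to cuda" memory.available="[11.2 GiB]"
	//   memory.required.full="6.5 GiB" memory.required.partial="6.2 GiB"
}
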
llm/server.go

@@ -116,6 +116,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 	}
 
+	estimate.log()
+
 	// Loop through potential servers
 	finalErr := errors.New("no suitable llama servers found")
 