From 08f1e18965c15648504fc5ec367134898e92ec6d Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Mon, 8 Jan 2024 16:42:00 -0500
Subject: [PATCH] Offload layers to GPU based on new model size estimates
 (#1850)

* select layers based on estimated model memory usage

* always account for scratch vram

* don't load +1 layers

* better estimation for graph alloc

* Update gpu/gpu_darwin.go

Co-authored-by: Bruce MacDonald

* Update llm/llm.go

Co-authored-by: Bruce MacDonald

* Update llm/llm.go

* add overhead for cuda memory

* Update llm/llm.go

Co-authored-by: Bruce MacDonald

* fix build error on linux

* address comments

---------

Co-authored-by: Bruce MacDonald
---
 gpu/gpu.go                | 35 +++---------
 gpu/gpu_darwin.go         | 34 +++++------
 llm/ext_server_common.go  | 13 +----
 llm/ext_server_default.go |  4 +-
 llm/ggml.go               |  6 +-
 llm/gguf.go               | 41 +++++++++++++-
 llm/llama.go              | 61 +-------------------
 llm/llm.go                | 115 ++++++++++++++++++++++++++++----------
 llm/shim_darwin.go        |  2 +-
 llm/shim_ext_server.go    |  4 +-
 10 files changed, 161 insertions(+), 154 deletions(-)

diff --git a/gpu/gpu.go b/gpu/gpu.go
index 45b55ffb..b9cb538a 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -16,8 +16,6 @@ import (
 	"runtime"
 	"sync"
 	"unsafe"
-
-	"github.com/jmorganca/ollama/api"
 )
 
 type handles struct {
@@ -133,31 +131,14 @@ func getCPUMem() (memInfo, error) {
 func CheckVRAM() (int64, error) {
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		return int64(gpuInfo.FreeMemory), nil
+		// allocate 384MiB for llama.cpp overhead (outside of model)
+		overhead := uint64(384 * 1024 * 1024)
+		if gpuInfo.FreeMemory <= overhead {
+			return 0, nil
+		}
+
+		return int64(gpuInfo.FreeMemory - overhead), nil
 	}
+
 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
 }
-
-func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
-	if opts.NumGPU != -1 {
-		return opts.NumGPU
-	}
-	info := GetGPUInfo()
-	if info.Library == "cpu" || info.Library == "default" {
-		return 0
-	}
-
-	/*
-		Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers.
-		We can store the model weights and the kv cache in vram,
-		to enable kv chache vram storage add two additional layers to the number of layers retrieved from the model file.
-	*/
-	bytesPerLayer := uint64(fileSizeBytes / numLayer)
-
-	// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
-	layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4
-
-	log.Printf("%d MB VRAM available, loading up to %d %s GPU layers out of %d", info.FreeMemory/(1024*1024), layers, info.Library, numLayer)
-
-	return layers
-}
diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go
index ed63f718..b3556f90 100644
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -6,18 +6,31 @@ import "C"
 import (
 	"runtime"
 
-	"github.com/jmorganca/ollama/api"
+	"github.com/pbnjay/memory"
 )
 
 // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
 func CheckVRAM() (int64, error) {
-	// TODO - assume metal, and return free memory?
- return 0, nil + if runtime.GOARCH == "amd64" { + // gpu not supported, this may not be metal + return 0, nil + } + // on macOS, there's already buffer for available vram (see below) so just return the total + systemMemory := int64(memory.TotalMemory()) + + // macOS limits how much memory is available to the GPU based on the amount of system memory + // TODO: handle case where iogpu.wired_limit_mb is set to a higher value + if systemMemory <= 36*1024*1024*1024 { + systemMemory = systemMemory * 2 / 3 + } else { + systemMemory = systemMemory * 3 / 4 + } + + return systemMemory, nil } func GetGPUInfo() GpuInfo { - // TODO - Metal vs. x86 macs... mem, _ := getCPUMem() return GpuInfo{ Library: "default", @@ -32,19 +45,6 @@ func getCPUMem() (memInfo, error) { }, nil } -func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int { - if opts.NumGPU != -1 { - return opts.NumGPU - } - - // metal only supported on arm64 - if runtime.GOARCH == "arm64" { - return 1 - } - - return 0 -} - func nativeInit() error { return nil } diff --git a/llm/ext_server_common.go b/llm/ext_server_common.go index 2df32548..9a331742 100644 --- a/llm/ext_server_common.go +++ b/llm/ext_server_common.go @@ -35,14 +35,12 @@ import ( "encoding/json" "fmt" "log" - "os" "strings" "sync" "time" "unsafe" "github.com/jmorganca/ollama/api" - "github.com/jmorganca/ollama/gpu" ) type extServer interface { @@ -82,25 +80,20 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error { return fmt.Errorf(C.GoString(resp.msg)) } -func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { +func newExtServer(server extServer, model string, adapters, projectors []string, opts api.Options) (extServer, error) { if !mutex.TryLock() { log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete") mutex.Lock() } - fileInfo, err := os.Stat(model) - if err != nil { - return nil, err - } + var sparams C.ext_server_params_t sparams.model = C.CString(model) defer C.free(unsafe.Pointer(sparams.model)) - numGPU := gpu.NumGPU(numLayers, fileInfo.Size(), opts) - sparams.embedding = true sparams.n_ctx = C.uint(opts.NumCtx) sparams.n_batch = C.uint(opts.NumBatch) - sparams.n_gpu_layers = C.int(numGPU) + sparams.n_gpu_layers = C.int(opts.NumGPU) sparams.main_gpu = C.int(opts.MainGPU) sparams.n_parallel = 1 // TODO - wire up concurrency diff --git a/llm/ext_server_default.go b/llm/ext_server_default.go index 80e00081..05287383 100644 --- a/llm/ext_server_default.go +++ b/llm/ext_server_default.go @@ -54,9 +54,9 @@ func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) { C.llama_server_release_json_resp(json_resp) } -func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { +func newDefaultExtServer(model string, adapters, projectors []string, opts api.Options) (extServer, error) { server := &llamaExtServer{opts} - return newExtServer(server, model, adapters, projectors, numLayers, opts) + return newExtServer(server, model, adapters, projectors, opts) } func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error { diff --git a/llm/ggml.go b/llm/ggml.go index f71328e1..7ba42962 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -78,7 +78,11 @@ type model interface { ModelFamily() string ModelType() string FileType() string - NumLayers() int64 + NumLayers() uint32 + NumGQA() uint32 + NumEmbed() uint32 + NumHead() uint32 
+ NumHeadKv() uint32 } type container interface { diff --git a/llm/gguf.go b/llm/gguf.go index d92932c5..96c3c7b9 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -272,14 +272,49 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error { return nil } -func (llm *ggufModel) NumLayers() int64 { +func (llm *ggufModel) NumLayers() uint32 { value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())] if !exists { return 0 } - v := value.(uint32) - return int64(v) + return value.(uint32) +} + +func (llm *ggufModel) NumHead() uint32 { + value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count", llm.ModelFamily())] + if !exists { + return 0 + } + + return value.(uint32) +} + +func (llm *ggufModel) NumEmbed() uint32 { + value, exists := llm.kv[fmt.Sprintf("%s.embedding_length", llm.ModelFamily())] + if !exists { + return 0 + } + + return value.(uint32) +} + +func (llm *ggufModel) NumHeadKv() uint32 { + value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count_kv", llm.ModelFamily())] + if !exists { + return 0 + } + + return value.(uint32) +} + +func (llm *ggufModel) NumGQA() uint32 { + numHeadKv := llm.NumHeadKv() + if numHeadKv == 0 { + return 0 + } + + return llm.NumHead() / numHeadKv } func (llm ggufModel) readU8(r io.Reader) uint8 { diff --git a/llm/llama.go b/llm/llama.go index 89616cf0..0856dca5 100644 --- a/llm/llama.go +++ b/llm/llama.go @@ -8,7 +8,6 @@ import ( "fmt" "os" "os/exec" - "sync" "time" "github.com/jmorganca/ollama/api" @@ -43,69 +42,11 @@ number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws ws ::= ([ \t\n] ws)? ` -type llamaModel struct { - hyperparameters llamaHyperparameters -} - -func (llm *llamaModel) ModelFamily() string { - return "llama" -} - -func llamaModelType(numLayer uint32) string { - switch numLayer { - case 26: - return "3B" - case 32: - return "7B" - case 40: - return "13B" - case 48: - return "34B" - case 60: - return "30B" - case 80: - return "65B" - default: - return "unknown" - } -} - -func (llm *llamaModel) ModelType() string { - return llamaModelType(llm.hyperparameters.NumLayer) -} - -func (llm *llamaModel) FileType() string { - return fileType(llm.hyperparameters.FileType) -} - -func (llm *llamaModel) NumLayers() int64 { - return int64(llm.hyperparameters.NumLayer) -} - -type llamaHyperparameters struct { - // NumVocab is the size of the model's vocabulary. - NumVocab uint32 - - // NumEmbd is the size of the model's embedding layer. - NumEmbd uint32 - NumMult uint32 - NumHead uint32 - - // NumLayer is the number of layers in the model. - NumLayer uint32 - NumRot uint32 - - // FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc. 
-	FileType uint32
-}
-
 type Running struct {
 	Port          int
 	Cmd           *exec.Cmd
 	Cancel        context.CancelFunc
-	exitOnce      sync.Once
-	exitCh        chan error // channel to receive the exit status of the subprocess
-	*StatusWriter // captures error messages from the llama runner process
+	*StatusWriter // captures error messages from the llama runner process
 }
 
 type ImageData struct {
diff --git a/llm/llm.go b/llm/llm.go
index 6179bb4e..3f1c0e2d 100644
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -7,10 +7,7 @@ import (
 	"os"
 	"runtime"
 
-	"github.com/pbnjay/memory"
-
 	"github.com/jmorganca/ollama/api"
-	"github.com/jmorganca/ollama/format"
 	"github.com/jmorganca/ollama/gpu"
 )
 
@@ -40,32 +37,89 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 		return nil, err
 	}
 
-	if runtime.GOOS == "darwin" {
-		var requiredMemory int64
-		var f16Multiplier int64 = 2
+	if opts.NumCtx < 4 {
+		opts.NumCtx = 4
+	}
 
-		switch ggml.ModelType() {
-		case "3B", "7B":
-			requiredMemory = 8 * format.GigaByte
-		case "13B":
-			requiredMemory = 16 * format.GigaByte
-		case "30B", "34B", "40B":
-			requiredMemory = 32 * format.GigaByte
-		case "47B":
-			requiredMemory = 48 * format.GigaByte
-		case "65B", "70B":
-			requiredMemory = 64 * format.GigaByte
-		case "180B":
-			requiredMemory = 128 * format.GigaByte
-			f16Multiplier = 4
-		}
+	fmt.Println("size", ggml.Size)
+	fmt.Println("filetype", ggml.FileType())
+	fmt.Println("architecture", ggml.ModelFamily())
+	fmt.Println("type", ggml.ModelType())
+	fmt.Println("name", ggml.Name())
+	fmt.Println("embd", ggml.NumEmbed())
+	fmt.Println("head", ggml.NumHead())
+	fmt.Println("head_kv", ggml.NumHeadKv())
+	fmt.Println("gqa", ggml.NumGQA())
 
-		systemMemory := int64(memory.TotalMemory())
+	available, _ := gpu.CheckVRAM()
 
-		if ggml.FileType() == "F16" && requiredMemory*f16Multiplier > systemMemory {
-			return nil, fmt.Errorf("F16 model requires at least %s of memory", format.HumanBytes(requiredMemory))
-		} else if requiredMemory > systemMemory {
-			return nil, fmt.Errorf("model requires at least %s of memory", format.HumanBytes(requiredMemory))
+	// For now assume filesize = model size
+	// TODO: use actual model size
+	requiredModel := ggml.Size
+
+	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
+	requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
+
+	// this amount is the overhead + tensors in memory
+	// TODO: get this from llama.cpp's graph calculations instead of
+	// guessing it's ~1/7th of the kv cache times gqa
+	requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 7
+
+	requiredTotal := requiredModel + requiredKv + requiredAlloc
+
+	log.Println("system memory bytes:", available)
+	log.Println("required model bytes:", requiredModel)
+	log.Println("required kv bytes:", requiredKv)
+	log.Println("required alloc bytes:", requiredAlloc)
+	log.Println("required total bytes:", requiredTotal)
+
+	info := gpu.GetGPUInfo()
+	library := info.Library
+
+	if opts.NumGPU == -1 {
+		// default to offloading all layers
+		opts.NumGPU = int(ggml.NumLayers()) + 1
+	}
+
+	// decide how many layers to put on the GPU
+	if opts.NumGPU > 0 {
+		switch runtime.GOOS {
+		case "darwin":
+			if requiredTotal > available {
+				log.Println("not enough vram available, falling back to CPU only")
+				opts.NumGPU = 0
+			}
+		default:
+			if library == "cpu" || library == "default" {
+				opts.NumGPU = 0
+				break
+			}
+
+			// no offloading required
+			if requiredTotal <= available {
+				break
+
} + + // This handles two cases: + // 1. overhead + tensors are always loaded into scratch memory even with num_gpu 0 + // 2. it seems llama.cpp always tries to allocate the entire kv cache (even if later split into layers) into vram or crashes + if requiredAlloc > available || requiredKv > available { + log.Printf("not enough vram available, falling back to CPU only") + library = "cpu" + opts.NumGPU = 0 + break + } + + available -= requiredAlloc + + // fill remaining vram with layers + log.Println("splitting", available, "of available memory bytes into layers") + bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers())) + log.Println("bytes per layer:", bytesPerLayer) + layers := available / bytesPerLayer + if layers < int64(opts.NumGPU) { + opts.NumGPU = int(layers) + } } } @@ -73,7 +127,7 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options) opts.RopeFrequencyBase = 0.0 opts.RopeFrequencyScale = 0.0 gpuInfo := gpu.GetGPUInfo() - return newLlmServer(gpuInfo.Library, model, adapters, projectors, ggml.NumLayers(), opts) + return newLlmServer(gpuInfo.Library, model, adapters, projectors, opts) } // Give any native cgo implementations an opportunity to initialize @@ -81,9 +135,9 @@ func Init(workdir string) error { return nativeInit(workdir) } -func newLlmServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { +func newLlmServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) { if _, libPresent := AvailableShims[library]; libPresent && library != "default" { - srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, numLayers, opts) + srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, opts) if err == nil { return srv, nil } @@ -91,6 +145,5 @@ func newLlmServer(library, model string, adapters, projectors []string, numLayer // TODO - update some state to indicate we were unable to load the GPU library for future "info" ux } - return newDefaultExtServer(model, adapters, projectors, numLayers, opts) - + return newDefaultExtServer(model, adapters, projectors, opts) } diff --git a/llm/shim_darwin.go b/llm/shim_darwin.go index 6e416b6d..3baafd1e 100644 --- a/llm/shim_darwin.go +++ b/llm/shim_darwin.go @@ -16,7 +16,7 @@ import ( //go:embed llama.cpp/ggml-metal.metal var libEmbed embed.FS -func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { +func newDynamicShimExtServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) { // should never happen... 
return nil, fmt.Errorf("Dynamic library loading not supported on Mac") } diff --git a/llm/shim_ext_server.go b/llm/shim_ext_server.go index 0282d5f7..dca7b38d 100644 --- a/llm/shim_ext_server.go +++ b/llm/shim_ext_server.go @@ -72,7 +72,7 @@ func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) { C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp) } -func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) { +func newDynamicShimExtServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) { shimMutex.Lock() defer shimMutex.Unlock() updatePath(filepath.Dir(library)) @@ -90,7 +90,7 @@ func newDynamicShimExtServer(library, model string, adapters, projectors []strin options: opts, } log.Printf("Loading Dynamic Shim llm server: %s", library) - return newExtServer(llm, model, adapters, projectors, numLayers, opts) + return newExtServer(llm, model, adapters, projectors, opts) } func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
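
Note on the new size math in llm/llm.go: the snippet below is an illustrative, self-contained sketch (not part of the patch) of how the estimates combine into a layer count, following the non-darwin (cuda/rocm) branch of the new logic. All values are hypothetical examples standing in for ggml.NumLayers(), ggml.NumEmbed(), ggml.NumHead(), ggml.NumHeadKv(), ggml.Size, opts.NumCtx and the free VRAM reported by gpu.CheckVRAM().

package main

import "fmt"

func main() {
	// Hypothetical example values standing in for the GGUF metadata and
	// hardware numbers the patch reads at load time.
	var (
		numCtx    int64 = 4096          // opts.NumCtx
		numLayers int64 = 32            // ggml.NumLayers()
		numEmbed  int64 = 4096          // ggml.NumEmbed()
		numHead   int64 = 32            // ggml.NumHead()
		numHeadKv int64 = 8             // ggml.NumHeadKv()
		modelSize int64 = 3_800_000_000 // ggml.Size (file size used as the model size estimate)
		available int64 = 3_000_000_000 // free VRAM reported by gpu.CheckVRAM()
	)

	numGQA := numHead / numHeadKv

	// fp16 K and V caches: 2 bytes per element, 2 tensors (K and V),
	// n_ctx * n_layer * n_embd * n_head_kv / n_head elements each.
	requiredKv := 2 * 2 * numCtx * numLayers * numEmbed * numHeadKv / numHead

	// Scratch/graph allocation guess: ~1/7th of the KV cache times the GQA factor.
	requiredAlloc := numGQA * requiredKv / 7

	requiredTotal := modelSize + requiredKv + requiredAlloc

	numGPU := numLayers + 1 // default: offload every layer plus the output layer
	switch {
	case requiredAlloc > available || requiredKv > available:
		// not even the scratch or KV allocations fit: run on the CPU
		numGPU = 0
	case requiredTotal > available:
		// scratch memory always lives in VRAM, so reserve it first,
		// then fill the remaining VRAM with whole layers
		remaining := available - requiredAlloc
		bytesPerLayer := (modelSize + requiredKv) / numLayers
		if layers := remaining / bytesPerLayer; layers < numGPU {
			numGPU = layers
		}
	}

	fmt.Println("required kv bytes:", requiredKv)
	fmt.Println("required alloc bytes:", requiredAlloc)
	fmt.Println("required total bytes:", requiredTotal)
	fmt.Println("layers to offload:", numGPU)
}

With these example values the KV cache comes to 512 MiB, the graph guess to roughly 293 MiB, and a 3 GB budget ends up offloading 19 of the 33 offloadable layers.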
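
A similar sketch of the two CheckVRAM policies the patch introduces: the cuda/rocm path holds back a fixed 384MiB of free VRAM for llama.cpp overhead, while the Apple Silicon path budgets a fraction of total system memory (2/3 up to 36GiB, 3/4 beyond that). The function below and its arguments are hypothetical and for illustration only; the real implementations live in gpu/gpu.go and gpu/gpu_darwin.go.

package main

import "fmt"

// usableVRAM mirrors the arithmetic of the two patched CheckVRAM variants.
// The inputs are plain arguments here; the real code gets free memory from the
// CUDA/ROCm libraries and total memory from github.com/pbnjay/memory on macOS.
func usableVRAM(freeMemory, totalMemory uint64, appleSilicon bool) int64 {
	if appleSilicon {
		// macOS caps GPU-visible memory at a fraction of system memory
		if totalMemory <= 36*1024*1024*1024 {
			return int64(totalMemory * 2 / 3)
		}
		return int64(totalMemory * 3 / 4)
	}

	// cuda/rocm: hold back 384MiB for llama.cpp overhead outside of the model
	const overhead = 384 * 1024 * 1024
	if freeMemory <= overhead {
		return 0
	}
	return int64(freeMemory - overhead)
}

func main() {
	fmt.Println(usableVRAM(8*1024*1024*1024, 0, false)) // 8GiB card -> 8GiB minus 384MiB
	fmt.Println(usableVRAM(0, 32*1024*1024*1024, true)) // 32GiB Mac -> about 21.3GiB
	fmt.Println(usableVRAM(0, 64*1024*1024*1024, true)) // 64GiB Mac -> 48GiB
}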