disable gpu for certain model architectures and fix divide-by-zero on memory estimation
parent ac64cd4ef9
commit f9cd55c70b
1 changed file with 12 additions and 4 deletions
llm/llm.go (16 changed lines)
@@ -6,6 +6,7 @@ import (
 	"log/slog"
 	"os"
 	"runtime"
+	"slices"

 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/gpu"
@@ -19,6 +20,10 @@ type LLM interface {
 	Close()
 }

+var cpuOnlyFamilies = []string{
+	"mamba",
+}
+
 func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
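The cpuOnlyFamilies list added here is consulted further down in New (see the next hunk) through the stdlib slices package that the first hunk imports. A minimal standalone sketch of that gate, with a made-up pair of family names (requires Go 1.21 for the slices package):

package main

import (
	"fmt"
	"slices"
)

// mirrors the commit's list; only "mamba" is marked CPU-only so far
var cpuOnlyFamilies = []string{"mamba"}

func main() {
	for _, family := range []string{"llama", "mamba"} {
		if slices.Contains(cpuOnlyFamilies, family) {
			fmt.Printf("%s: forcing num_gpu=0, no gpu inference yet\n", family)
		} else {
			fmt.Printf("%s: gpu offload allowed\n", family)
		}
	}
}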
@@ -48,13 +53,18 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
 	size := ggml.Size

 	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(max(ggml.NumHead(), 1))

 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calculations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
 	graph := int64(ggml.NumGQA()) * kv / 6

+	// certain model architectures don't support gpu inference yet
+	if slices.Contains(cpuOnlyFamilies, ggml.ModelFamily()) {
+		opts.NumGPU = 0
+	}
+
 	info := gpu.GetGPUInfo()
 	switch runtime.GOOS {
 	case "darwin":
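A rough sketch of the revised estimate with hypothetical 7B-class parameters (n_ctx=2048, n_layer=32, n_embd=4096, n_head=n_head_kv=32), assuming NumGQA() is n_head / n_head_kv; it also shows how the new max(..., 1) guard (Go 1.21 builtin) keeps a model whose metadata reports zero heads from triggering a divide-by-zero:

package main

import "fmt"

// kvGraphEstimate mirrors the commit's arithmetic: fp16 K and V caches
// (2 bytes each, 2 tensors) plus a rough graph overhead of 1/6 * num_gqa * kv.
func kvGraphEstimate(numCtx, numLayers, numEmbed, numHead, numHeadKv int64) (kv, graph int64) {
	kv = 2 * 2 * numCtx * numLayers * numEmbed * numHeadKv / max(numHead, 1)
	numGQA := numHead / max(numHeadKv, 1) // assumption: NumGQA() = n_head / n_head_kv
	graph = numGQA * kv / 6
	return kv, graph
}

func main() {
	// hypothetical 7B-class shape: 2048 ctx, 32 layers, 4096 embed, 32 heads, no GQA
	kv, graph := kvGraphEstimate(2048, 32, 4096, 32, 32)
	fmt.Printf("kv cache: %d MiB, graph overhead: %d MiB\n", kv>>20, graph>>20)
	// prints: kv cache: 1024 MiB, graph overhead: 170 MiB

	// malformed metadata (zero heads) no longer divides by zero, it just estimates 0
	kv, graph = kvGraphEstimate(2048, 32, 4096, 0, 0)
	fmt.Printf("degenerate case: kv=%d graph=%d\n", kv, graph)
}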
@@ -63,9 +73,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
 		}

 		if size+kv+graph > vram {
-			slog.Info("not enough vram available, falling back to CPU only")
-			info.Library = "cpu"
-			info.Variant = gpu.GetCPUVariant()
+			slog.Info("not enough vram available, setting num_gpu=0")
 			opts.NumGPU = 0
 			break
 		}
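For the fallback itself, a sketch with made-up numbers (a ~3.7 GiB model plus the estimates above against a 4 GiB GPU); size, kv, graph and vram stand in for what the GGUF header, the estimate and gpu.GetGPUInfo() would report. The behavioral change in this hunk is that the detected gpu runner stays selected and only num_gpu is forced to 0, rather than switching info.Library over to the cpu runner:

package main

import "log/slog"

func main() {
	size := int64(3800) << 20  // hypothetical model weights, ~3.7 GiB
	kv := int64(1024) << 20    // kv cache estimate from the sketch above
	graph := int64(170) << 20  // graph overhead estimate from the sketch above
	vram := int64(4096) << 20  // hypothetical 4 GiB of usable vram

	numGPU := -1 // placeholder for opts.NumGPU; -1 means auto in ollama's options
	if size+kv+graph > vram {
		// same log and behavior as the commit: offload zero layers,
		// but keep the gpu library/variant that was detected
		slog.Info("not enough vram available, setting num_gpu=0")
		numGPU = 0
	}
	_ = numGPU
}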