package llm

import (
	"context"
	"fmt"
	"log/slog"
	"os"
	"runtime"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/gpu"
)
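
// LLM is the interface implemented by a loaded model runner: streaming
// prediction, embedding, and token encode/decode round trips.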
type LLM interface {
	Predict(context.Context, PredictOpts, func(PredictResult)) error
	Embedding(context.Context, string) ([]float64, error)
	Encode(context.Context, string) ([]int, error)
	Decode(context.Context, []int) (string, error)
	Close()
}
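
// New loads the GGML model at the given path, clamps the requested context
// length to what the model supports, sizes GPU offload against available
// VRAM, and starts a runner. A minimal caller-side sketch (hypothetical
// path and options):
//
//	runner, err := llm.New(workDir, "/path/to/model.gguf", nil, nil, opts)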
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	ggml, err := DecodeGGML(f)
	if err != nil {
		return nil, err
	}

	if opts.NumCtx > int(ggml.NumCtx()) {
		slog.Warn(fmt.Sprintf("requested context length is greater than model's max context length (%d > %d), using %d instead", opts.NumCtx, ggml.NumCtx(), ggml.NumCtx()))
		opts.NumCtx = int(ggml.NumCtx())
	}
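
	// enforce a small lower bound so the runner always has a usable context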
	if opts.NumCtx < 4 {
		opts.NumCtx = 4
	}

	vram, _ := gpu.CheckVRAM()
	size := ggml.Size

	// the fp16 k,v cache requires n_ctx * n_layer * n_embd / n_head * n_head_kv
	// elements, at 2 bytes each, for each of the key and value matrices
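	// (e.g. a hypothetical 32-layer model with n_ctx=2048, n_embd=4096 and
	// n_head == n_head_kv works out to 2 * 2 * 2048 * 32 * 4096 = 1 GiB)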
	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())

	// this amount is the overhead + tensors in memory
	// TODO: get this from llama.cpp's graph calculations instead of
	// estimating it as 1/6 * kv_cache_size * num_gqa
	graph := int64(ggml.NumGQA()) * kv / 6

	info := gpu.GetGPUInfo()
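
	// macOS (Metal) either fits the entire model in unified memory or falls
	// back to CPU; other platforms can offload a subset of layers per device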
	switch runtime.GOOS {
	case "darwin":
		if opts.NumGPU == 0 {
			break
		}

		if size+kv+graph > vram {
			slog.Info("not enough vram available, falling back to CPU only")
			info.Library = "cpu"
			info.Variant = gpu.GetCPUVariant()
			opts.NumGPU = 0
			break
		}
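
		// a NumGPU of 999 effectively requests that every layer be offloaded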
		// TODO: implement layer splitting on macOS
		opts.NumGPU = 999

	default:
		if info.Library == "cpu" {
			slog.Info("GPU not available, falling back to CPU")
			opts.NumGPU = 0
			break
		}

		// don't use GPU at all if no layers are loaded
		if opts.NumGPU == 0 {
			info.Library = "cpu"
			info.Variant = gpu.GetCPUVariant()
			break
		}

		// user-defined GPU count
		if opts.NumGPU != -1 {
			break
		}

		// the "main" GPU needs the most memory and determines the limit
		// of how many layers can be loaded. It needs to fit:
		// 1. the full compute graph allocation for all devices (graph)
		// 2. the proportional kv cache for all devices (kv * % layers)
		// 3. the proportional model (size * % layers / # devices)
		// This estimates the number of layers
		maxlayers := int64(ggml.NumLayers()) + 1
		devices := int64(info.DeviceCount)
		avg := vram / devices
		layers := maxlayers * (avg - graph) / (kv + size/devices)
		if layers > maxlayers {
			layers = maxlayers
		}
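		// (a hypothetical example: with 8 GiB VRAM per device, a 0.5 GiB
		// graph, 1 GiB kv, a 26 GiB model over 2 devices and 33 max layers,
		// 33 * (8 - 0.5) / (1 + 26/2) ≈ 17 layers are offloaded)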

		// 1 + 2 must fit on the main gpu
		min := graph + kv*layers/maxlayers
		if layers <= 0 || min > avg {
			slog.Info("not enough vram available, falling back to CPU only")
			info.Library = "cpu"
			info.Variant = gpu.GetCPUVariant()
			opts.NumGPU = 0
			break
		}

		opts.NumGPU = int(layers)
	}
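
	// zero values leave the RoPE frequency base and scale at the model's own
	// defaults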
	opts.RopeFrequencyBase = 0.0
	opts.RopeFrequencyScale = 0.0

	return newLlmServer(info, model, adapters, projectors, opts)
}

// Init gives any native cgo implementations an opportunity to initialize
func Init(workdir string) error {
	return nativeInit(workdir)
}
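
// newLlmServer starts the model on a dynamic llm library compatible with the
// detected GPU, honoring an explicit OLLAMA_LLM_LIBRARY override if set.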
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
	dynLibs := getDynLibs(gpuInfo)

	// Check to see if the user has requested a specific library instead of auto-detecting
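	// (e.g. a hypothetical OLLAMA_LLM_LIBRARY=cpu_avx2; valid names are the
	// keys of availableDynLibs)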
	demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
	if demandLib != "" {
		libPath := availableDynLibs[demandLib]
		if libPath == "" {
			slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
		} else {
			slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
			dynLibs = []string{libPath}
		}
	}

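	// try each candidate library in order and return the first that loads,
	// keeping the last failure so the caller sees why loading failed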
	err2 := fmt.Errorf("unable to locate suitable llm library")
	for _, dynLib := range dynLibs {
		srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
		if err == nil {
			return srv, nil
		}

		slog.Warn(fmt.Sprintf("Failed to load dynamic library %s: %s", dynLib, err))
		err2 = err
	}

	return nil, err2
}