ollama/llm/llm.go

package llm

import (
	"context"
	"fmt"
	"log"
	"os"
	"runtime"

	"github.com/pbnjay/memory"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/format"
	"github.com/jmorganca/ollama/gpu"
)

// LLM is the interface returned by New for a loaded model. Every method
// except Close takes a context.Context as its first argument.
type LLM interface {
	// Predict takes prediction options and a callback that receives
	// PredictResult values, and returns an error.
	// NOTE(review): presumably fn is invoked as results are produced —
	// confirm against the implementations.
	Predict(ctx context.Context, opts PredictOpts, fn func(PredictResult)) error

	// Embedding turns a string into a []float64.
	Embedding(ctx context.Context, input string) ([]float64, error)

	// Encode turns a string into a slice of ints (presumably token IDs —
	// confirm against the implementations).
	Encode(ctx context.Context, text string) ([]int, error)

	// Decode turns a slice of ints into a string; by signature it is the
	// counterpart of Encode.
	Decode(ctx context.Context, tokens []int) (string, error)

	// Close takes no arguments and reports no error.
	// NOTE(review): presumably releases the resources held for the model —
	// confirm.
	Close()
}

// AvailableShims is keyed by library name; newLlmServer looks up the library
// reported for the GPU here and hands the associated value to
// newDynamicShimExtServer. It starts out empty.
// NOTE(review): the values are presumably paths to the dynamic shim
// libraries — confirm where the map is populated.
var AvailableShims = make(map[string]string)

// New opens the model file at path model, decodes it with DecodeGGML and
// hands it to newLlmServer together with the library reported by
// gpu.GetGPUInfo.
//
// On macOS (runtime.GOOS == "darwin") the total system memory is checked
// against a per-model-type requirement before anything is loaded. For "F16"
// files the requirement is multiplied by f16Multiplier. Model types that are
// not listed in the switch have a requirement of zero and always pass.
//
// workDir is not used by this function.
//
// An error is returned if the file cannot be stat'ed, opened or decoded, if
// the macOS memory check fails, or if newLlmServer fails.
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	ggml, err := DecodeGGML(f)
	if err != nil {
		return nil, err
	}

	if runtime.GOOS == "darwin" {
		var requiredMemory int64
		var f16Multiplier int64 = 2

		switch ggml.ModelType() {
		case "3B", "7B":
			requiredMemory = 8 * format.GigaByte
		case "13B":
			requiredMemory = 16 * format.GigaByte
		case "30B", "34B", "40B":
			requiredMemory = 32 * format.GigaByte
		case "47B":
			requiredMemory = 48 * format.GigaByte
		case "65B", "70B":
			requiredMemory = 64 * format.GigaByte
		case "180B":
			requiredMemory = 128 * format.GigaByte
			f16Multiplier = 4
		}

		systemMemory := int64(memory.TotalMemory())

		if ggml.FileType() == "F16" {
			// Report the figure that was actually compared (base requirement
			// times the F16 multiplier); printing the base requirement would
			// understate what the machine needs.
			if f16Required := requiredMemory * f16Multiplier; f16Required > systemMemory {
				return nil, fmt.Errorf("F16 model requires at least %s of memory", format.HumanBytes(f16Required))
			}
		} else if requiredMemory > systemMemory {
			return nil, fmt.Errorf("model requires at least %s of memory", format.HumanBytes(requiredMemory))
		}
	}

	// These options are cleared unconditionally before the server is created.
	// NOTE(review): presumably the values are now taken from the model file
	// itself — confirm.
	opts.NumGQA = 0
	opts.RopeFrequencyBase = 0.0
	opts.RopeFrequencyScale = 0.0

	gpuInfo := gpu.GetGPUInfo()
	return newLlmServer(gpuInfo.Library, model, adapters, projectors, ggml.NumLayers(), opts)
}

// Init gives any native cgo implementations an opportunity to initialize.
// It passes workdir straight through to nativeInit and returns its error
// unchanged.
func Init(workdir string) error {
	return nativeInit(workdir)
}

// newLlmServer picks the server implementation for the given library name.
//
// If library has an entry in AvailableShims and is not "default", the entry is
// passed to newDynamicShimExtServer. If that fails, the failure is logged and
// the default server from newDefaultExtServer is used instead. In every other
// case the default server is used directly.
func newLlmServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
	shim, registered := AvailableShims[library]
	if !registered || library == "default" {
		return newDefaultExtServer(model, adapters, projectors, numLayers, opts)
	}

	server, loadErr := newDynamicShimExtServer(shim, model, adapters, projectors, numLayers, opts)
	if loadErr != nil {
		log.Printf("Failed to load dynamic library %s - falling back to CPU mode %s", library, loadErr)
		// TODO - update some state to indicate we were unable to load the GPU library for future "info" ux
		return newDefaultExtServer(model, adapters, projectors, numLayers, opts)
	}

	return server, nil
}
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`package llm`

			`import (`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`"context"`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`"fmt"`
disable gpu for q5_0, q5_1, q8_0 quants 2023-08-03 22:40:16 +00:00			`"log"`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`"os"`
enable q8, q5, 5_1, and f32 for linux gpu (#699) 2023-10-05 16:53:47 +00:00			`"runtime"`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00
check memory requirements before loading 2023-08-03 22:47:36 +00:00			`"github.com/pbnjay/memory"`

partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`"github.com/jmorganca/ollama/api"`
fix memory check 2023-10-12 16:34:16 +00:00			`"github.com/jmorganca/ollama/format"`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`"github.com/jmorganca/ollama/gpu"`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`)`

			`type LLM interface {`
chat api endpoint (#1392) 2023-12-05 19:57:33 +00:00			`Predict(context.Context, PredictOpts, func(PredictResult)) error`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`Embedding(context.Context, string) ([]float64, error)`
			`Encode(context.Context, string) ([]int, error)`
			`Decode(context.Context, []int) (string, error)`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`Close()`
			`}`

Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`var AvailableShims = map[string]string{}`
Refine handling of shim presence This allows the CPU only builds to work on systems with Radeon cards 2023-12-15 22:27:27 +00:00
load projectors 2023-11-30 18:30:23 +00:00			`func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`if _, err := os.Stat(model); err != nil {`
			`return nil, err`
			`}`

			`f, err := os.Open(model)`
			`if err != nil {`
			`return nil, err`
			`}`
close open files 2023-08-14 23:08:02 +00:00			`defer f.Close()`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00
GGUF support (#441) 2023-09-07 17:55:37 +00:00			`ggml, err := DecodeGGML(f)`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`if err != nil {`
			`return nil, err`
			`}`

enable q8, q5, 5_1, and f32 for linux gpu (#699) 2023-10-05 16:53:47 +00:00			`if runtime.GOOS == "darwin" {`
only check system memory on macos 2023-10-13 21:41:51 +00:00			`var requiredMemory int64`
			`var f16Multiplier int64 = 2`
check total (system + video) memory 2023-10-12 17:36:23 +00:00
only check system memory on macos 2023-10-13 21:41:51 +00:00			`switch ggml.ModelType() {`
			`case "3B", "7B":`
			`requiredMemory = 8 * format.GigaByte`
			`case "13B":`
			`requiredMemory = 16 * format.GigaByte`
			`case "30B", "34B", "40B":`
			`requiredMemory = 32 * format.GigaByte`
add macOS memory check for 47B models 2024-01-04 00:46:16 +00:00			`case "47B":`
			`requiredMemory = 48 * format.GigaByte`
only check system memory on macos 2023-10-13 21:41:51 +00:00			`case "65B", "70B":`
			`requiredMemory = 64 * format.GigaByte`
			`case "180B":`
			`requiredMemory = 128 * format.GigaByte`
			`f16Multiplier = 4`
			`}`
check total (system + video) memory 2023-10-12 17:36:23 +00:00
only check system memory on macos 2023-10-13 21:41:51 +00:00			`systemMemory := int64(memory.TotalMemory())`
check total (system + video) memory 2023-10-12 17:36:23 +00:00
only check system memory on macos 2023-10-13 21:41:51 +00:00			`if ggml.FileType() == "F16" && requiredMemory*f16Multiplier > systemMemory {`
tweak memory requirements error text 2024-01-04 00:47:18 +00:00			`return nil, fmt.Errorf("F16 model requires at least %s of memory", format.HumanBytes(requiredMemory))`
only check system memory on macos 2023-10-13 21:41:51 +00:00			`} else if requiredMemory > systemMemory {`
tweak memory requirements error text 2024-01-04 00:47:18 +00:00			`return nil, fmt.Errorf("model requires at least %s of memory", format.HumanBytes(requiredMemory))`
only check system memory on macos 2023-10-13 21:41:51 +00:00			`}`
check memory requirements before loading 2023-08-03 22:47:36 +00:00			`}`

deprecate ggml - remove ggml runner - automatically pull gguf models when ggml detected - tell users to update to gguf in the case automatic pull fails Co-Authored-By: Jeffrey Morgan <jmorganca@gmail.com> 2023-11-24 18:58:09 +00:00			`opts.NumGQA = 0`
			`opts.RopeFrequencyBase = 0.0`
			`opts.RopeFrequencyScale = 0.0`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`gpuInfo := gpu.GetGPUInfo()`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`return newLlmServer(gpuInfo.Library, model, adapters, projectors, ggml.NumLayers(), opts)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`

			`// Give any native cgo implementations an opportunity to initialize`
			`func Init(workdir string) error {`
			`return nativeInit(workdir)`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`}`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00
			`func newLlmServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {`
			`if _, libPresent := AvailableShims[library]; libPresent && library != "default" {`
			`srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, numLayers, opts)`
			`if err == nil {`
			`return srv, nil`
			`}`
Load dynamic cpu lib on windows On linux, we link the CPU library in to the Go app and fall back to it when no GPU match is found. On windows we do not link in the CPU library so that we can better control our dependencies for the CLI. This fixes the logic so we correctly fallback to the dynamic CPU library on windows. 2024-01-04 16:41:41 +00:00			`log.Printf("Failed to load dynamic library %s - falling back to CPU mode %s", library, err)`
			`// TODO - update some state to indicate we were unable to load the GPU library for future "info" ux`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`}`

			`return newDefaultExtServer(model, adapters, projectors, numLayers, opts)`

			`}`