ollama/llm/llm.go

package llm

import (
	"context"
	"fmt"
	"log"
	"os"
	"runtime"

	"github.com/pbnjay/memory"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/format"
	"github.com/jmorganca/ollama/gpu"
)

type LLM interface {
	Predict(context.Context, PredictOpts, func(PredictResult)) error
	Embedding(context.Context, string) ([]float64, error)
	Encode(context.Context, string) ([]int, error)
	Decode(context.Context, []int) (string, error)
	Close()
}

var AvailableShims = map[string]string{}

func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
	if _, err := os.Stat(model); err != nil {
		return nil, err
	}

	f, err := os.Open(model)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	ggml, err := DecodeGGML(f)
	if err != nil {
		return nil, err
	}

	if runtime.GOOS == "darwin" {
		switch ggml.FileType() {
		case "F32", "Q5_0", "Q5_1", "Q8_0":
			if ggml.Name() != "gguf" && opts.NumGPU != 0 {
				// GGML Q8_0 do not support Metal API and will
				// cause the runner to segmentation fault so disable GPU
				log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")
				opts.NumGPU = 0
			}
		}

		var requiredMemory int64
		var f16Multiplier int64 = 2

		switch ggml.ModelType() {
		case "3B", "7B":
			requiredMemory = 8 * format.GigaByte
		case "13B":
			requiredMemory = 16 * format.GigaByte
		case "30B", "34B", "40B":
			requiredMemory = 32 * format.GigaByte
		case "65B", "70B":
			requiredMemory = 64 * format.GigaByte
		case "180B":
			requiredMemory = 128 * format.GigaByte
			f16Multiplier = 4
		}

		systemMemory := int64(memory.TotalMemory())

		if ggml.FileType() == "F16" && requiredMemory*f16Multiplier > systemMemory {
			return nil, fmt.Errorf("F16 model requires at least %s of total memory", format.HumanBytes(requiredMemory))
		} else if requiredMemory > systemMemory {
			return nil, fmt.Errorf("model requires at least %s of total memory", format.HumanBytes(requiredMemory))
		}
	}

	opts.NumGQA = 0
	opts.RopeFrequencyBase = 0.0
	opts.RopeFrequencyScale = 0.0
	gpuInfo := gpu.GetGPUInfo()
	return newLlmServer(gpuInfo.Library, model, adapters, projectors, ggml.NumLayers(), opts)
}

// Give any native cgo implementations an opportunity to initialize
func Init(workdir string) error {
	return nativeInit(workdir)
}

func newLlmServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
	if _, libPresent := AvailableShims[library]; libPresent && library != "default" {
		srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, numLayers, opts)
		if err == nil {
			return srv, nil
		}
		log.Printf("Failed to load dynamic library - falling back to CPU mode %s", err)
	}

	return newDefaultExtServer(model, adapters, projectors, numLayers, opts)

}
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`package llm`

			`import (`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`"context"`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`"fmt"`
disable gpu for q5_0, q5_1, q8_0 quants 2023-08-03 22:40:16 +00:00			`"log"`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`"os"`
enable q8, q5, 5_1, and f32 for linux gpu (#699) 2023-10-05 16:53:47 +00:00			`"runtime"`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00
check memory requirements before loading 2023-08-03 22:47:36 +00:00			`"github.com/pbnjay/memory"`

partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`"github.com/jmorganca/ollama/api"`
fix memory check 2023-10-12 16:34:16 +00:00			`"github.com/jmorganca/ollama/format"`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`"github.com/jmorganca/ollama/gpu"`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`)`

			`type LLM interface {`
chat api endpoint (#1392) 2023-12-05 19:57:33 +00:00			`Predict(context.Context, PredictOpts, func(PredictResult)) error`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`Embedding(context.Context, string) ([]float64, error)`
			`Encode(context.Context, string) ([]int, error)`
			`Decode(context.Context, []int) (string, error)`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`Close()`
			`}`

Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`var AvailableShims = map[string]string{}`
Refine handling of shim presence This allows the CPU only builds to work on systems with Radeon cards 2023-12-15 22:27:27 +00:00
load projectors 2023-11-30 18:30:23 +00:00			`func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`if _, err := os.Stat(model); err != nil {`
			`return nil, err`
			`}`

			`f, err := os.Open(model)`
			`if err != nil {`
			`return nil, err`
			`}`
close open files 2023-08-14 23:08:02 +00:00			`defer f.Close()`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00
GGUF support (#441) 2023-09-07 17:55:37 +00:00			`ggml, err := DecodeGGML(f)`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`if err != nil {`
			`return nil, err`
			`}`

enable q8, q5, 5_1, and f32 for linux gpu (#699) 2023-10-05 16:53:47 +00:00			`if runtime.GOOS == "darwin" {`
			`switch ggml.FileType() {`
recent llama.cpp update added kernels for fp32, q5_0, and q5_1 2023-11-20 21:44:12 +00:00			`case "F32", "Q5_0", "Q5_1", "Q8_0":`
enable q8, q5, 5_1, and f32 for linux gpu (#699) 2023-10-05 16:53:47 +00:00			`if ggml.Name() != "gguf" && opts.NumGPU != 0 {`
			`// GGML Q8_0 do not support Metal API and will`
			`// cause the runner to segmentation fault so disable GPU`
			`log.Printf("WARNING: GPU disabled for F32, Q5_0, Q5_1, and Q8_0")`
			`opts.NumGPU = 0`
			`}`
disable gpu for q5_0, q5_1, q8_0 quants 2023-08-03 22:40:16 +00:00			`}`
refactor memory check 2023-10-12 16:47:17 +00:00
only check system memory on macos 2023-10-13 21:41:51 +00:00			`var requiredMemory int64`
			`var f16Multiplier int64 = 2`
check total (system + video) memory 2023-10-12 17:36:23 +00:00
only check system memory on macos 2023-10-13 21:41:51 +00:00			`switch ggml.ModelType() {`
			`case "3B", "7B":`
			`requiredMemory = 8 * format.GigaByte`
			`case "13B":`
			`requiredMemory = 16 * format.GigaByte`
			`case "30B", "34B", "40B":`
			`requiredMemory = 32 * format.GigaByte`
			`case "65B", "70B":`
			`requiredMemory = 64 * format.GigaByte`
			`case "180B":`
			`requiredMemory = 128 * format.GigaByte`
			`f16Multiplier = 4`
			`}`
check total (system + video) memory 2023-10-12 17:36:23 +00:00
only check system memory on macos 2023-10-13 21:41:51 +00:00			`systemMemory := int64(memory.TotalMemory())`
check total (system + video) memory 2023-10-12 17:36:23 +00:00
only check system memory on macos 2023-10-13 21:41:51 +00:00			`if ggml.FileType() == "F16" && requiredMemory*f16Multiplier > systemMemory {`
			`return nil, fmt.Errorf("F16 model requires at least %s of total memory", format.HumanBytes(requiredMemory))`
			`} else if requiredMemory > systemMemory {`
			`return nil, fmt.Errorf("model requires at least %s of total memory", format.HumanBytes(requiredMemory))`
			`}`
check memory requirements before loading 2023-08-03 22:47:36 +00:00			`}`

deprecate ggml - remove ggml runner - automatically pull gguf models when ggml detected - tell users to update to gguf in the case automatic pull fails Co-Authored-By: Jeffrey Morgan <jmorganca@gmail.com> 2023-11-24 18:58:09 +00:00			`opts.NumGQA = 0`
			`opts.RopeFrequencyBase = 0.0`
			`opts.RopeFrequencyScale = 0.0`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`gpuInfo := gpu.GetGPUInfo()`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`return newLlmServer(gpuInfo.Library, model, adapters, projectors, ggml.NumLayers(), opts)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`

			`// Give any native cgo implementations an opportunity to initialize`
			`func Init(workdir string) error {`
			`return nativeInit(workdir)`
partial decode ggml bin for more info 2023-07-21 20:33:56 +00:00			`}`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00
			`func newLlmServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {`
			`if _, libPresent := AvailableShims[library]; libPresent && library != "default" {`
			`srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, numLayers, opts)`
			`if err == nil {`
			`return srv, nil`
			`}`
			`log.Printf("Failed to load dynamic library - falling back to CPU mode %s", err)`
			`}`

			`return newDefaultExtServer(model, adapters, projectors, numLayers, opts)`

			`}`