// ollama/llm/llama.go

package llm

import (
	"bytes"
	"context"
	_ "embed"
	"errors"
	"fmt"
	"os"
	"os/exec"
	"sync"
	"time"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/format"
)

// jsonGrammar is a grammar definition whose root rule must be a JSON object.
// Nested values may be objects, arrays, strings, numbers, or the literals
// true, false and null; the ws rule allows optional spaces, tabs and newlines
// after each token.
// NOTE(review): the notation looks like llama.cpp's GBNF grammar format and the
// constant is presumably handed to the runner to constrain output to JSON —
// confirm against the caller, which is outside this view.
const jsonGrammar = `
root   ::= object
value  ::= object | array | string | number | ("true" | "false" | "null") ws

object ::=
  "{" ws (
            string ":" ws value
    ("," ws string ":" ws value)*
  )? "}" ws

array  ::=
  "[" ws (
            value
    ("," ws value)*
  )? "]" ws

string ::=
  "\"" (
    [^"\\] |
    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
  )* "\"" ws

number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws

# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?
`

// llamaModel describes a llama-family model through its hyperparameters. Its
// methods derive the family name, size label, file type and layer count from
// those values.
type llamaModel struct {
	hyperparameters llamaHyperparameters
}

// ModelFamily reports the family this model belongs to, which is always
// "llama" for a llamaModel.
func (llm *llamaModel) ModelFamily() string {
	const family = "llama"
	return family
}

// llamaModelType maps a layer count to the parameter-size label of the llama
// model that uses it (for example 32 layers -> "7B"). Layer counts that are
// not in the table yield "unknown".
func llamaModelType(numLayer uint32) string {
	// Known layer counts and the size label each one identifies.
	sizeByLayers := map[uint32]string{
		26: "3B",
		32: "7B",
		40: "13B",
		48: "34B",
		60: "30B",
		80: "65B",
	}

	if label, known := sizeByLayers[numLayer]; known {
		return label
	}
	return "unknown"
}

// ModelType returns the parameter-size label for this model, derived from its
// layer count via llamaModelType.
func (llm *llamaModel) ModelType() string {
	layers := llm.hyperparameters.NumLayer
	return llamaModelType(layers)
}

// FileType returns the string form of the model's FileType hyperparameter, as
// produced by the package-level fileType helper.
func (llm *llamaModel) FileType() string {
	raw := llm.hyperparameters.FileType
	return fileType(raw)
}

// NumLayers returns the model's layer count widened from uint32 to int64.
func (llm *llamaModel) NumLayers() int64 {
	layers := llm.hyperparameters.NumLayer
	return int64(layers)
}

// llamaHyperparameters holds the numeric hyperparameters of a llama model.
// Every field is a uint32.
// NOTE(review): the field order presumably mirrors the on-disk model header
// this struct is decoded from — confirm against the decoder before reordering
// or adding fields.
type llamaHyperparameters struct {
	// NumVocab is the size of the model's vocabulary.
	NumVocab uint32

	// NumEmbd is the size of the model's embedding layer.
	NumEmbd uint32
	// NOTE(review): NumMult is presumably the feed-forward size rounding
	// multiple — TODO confirm.
	NumMult uint32
	// NOTE(review): NumHead is presumably the number of attention heads —
	// TODO confirm.
	NumHead uint32

	// NumLayer is the number of layers in the model.
	NumLayer uint32
	NumRot   uint32 // NOTE(review): presumably the rotary embedding dimension count — TODO confirm

	// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
	FileType uint32
}

// Running holds the state of a llama runner subprocess: a port number, the
// exec.Cmd handle, a context cancel function, and the channel on which the
// subprocess exit status is delivered. The embedded *StatusWriter promotes
// ErrCh and LastErrMsg, which capture error messages from the runner process.
//
// Running contains a sync.Once, so it must not be copied after first use.
//
// NOTE(review): Port is presumably the local port the runner listens on, and
// exitOnce presumably guards a one-time shutdown of Cmd — confirm against the
// code that starts and closes the runner, which is outside this view.
type Running struct {
	Port          int
	Cmd           *exec.Cmd
	Cancel        context.CancelFunc
	exitOnce      sync.Once
	exitCh        chan error // channel to receive the exit status of the subprocess
	*StatusWriter            // captures error messages from the llama runner process
}

// ImageData pairs a byte payload with an integer identifier. The fields
// serialize under the JSON keys "data" and "id"; encoding/json encodes the
// []byte as a base64 string.
// NOTE(review): ID is presumably how a prompt refers to a specific image —
// confirm against the caller.
type ImageData struct {
	Data []byte `json:"data"`
	ID   int    `json:"id"`
}

// Sentinel errors used by this package. They are plain error values with fixed
// messages, so callers can compare against them with errors.Is.
var (
	// errNvidiaSMI signals that the nvidia-smi command failed, which means
	// GPU support may not be enabled.
	errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")

	// errAvailableVRAM signals that there is not enough VRAM available and
	// that execution falls back to CPU only.
	errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")

	// payloadMissing signals that this build of ollama does not include the
	// expected dynamic library payloads. The message has no format verbs, so
	// errors.New is used like its siblings. The name is kept (rather than
	// errPayloadMissing) because code outside this block refers to it.
	payloadMissing = errors.New("expected dynamic library payloads not included in this build of ollama")
)

// StatusWriter is an io.Writer that captures error messages from the llama
// runner process while passing everything it receives through to os.Stderr.
type StatusWriter struct {
	// ErrCh receives an error for a captured message when it has room. Sends
	// never block, so a message is dropped from the channel (but still
	// recorded in LastErrMsg) if the channel is full or nil.
	ErrCh chan error
	// LastErrMsg is the most recently captured error message.
	LastErrMsg string
}

// NewStatusWriter returns a StatusWriter whose ErrCh can hold one pending
// error.
func NewStatusWriter() *StatusWriter {
	return &StatusWriter{
		ErrCh: make(chan error, 1),
	}
}

// Write scans b for an error message, records it, and then forwards b
// unchanged to os.Stderr, returning the result of that write.
//
// A message is whatever follows the first "error:" marker in b, trimmed of
// surrounding whitespace. If b has no "error:" marker, whatever follows the
// first "CUDA error" marker is used instead. A non-empty message is stored in
// LastErrMsg and offered to ErrCh wrapped as "llama runner: <message>".
func (w *StatusWriter) Write(b []byte) (int, error) {
	var errMsg string
	if _, after, ok := bytes.Cut(b, []byte("error:")); ok {
		errMsg = string(bytes.TrimSpace(after))
	} else if _, after, ok := bytes.Cut(b, []byte("CUDA error")); ok {
		errMsg = string(bytes.TrimSpace(after))
	}

	if errMsg != "" {
		w.LastErrMsg = errMsg

		// Never block the writer: ErrCh has a single slot, and a blocking
		// send here would stall the goroutine copying the subprocess's
		// stderr as soon as a second error line arrives before the first
		// one is consumed (or forever if ErrCh is nil).
		select {
		case w.ErrCh <- fmt.Errorf("llama runner: %s", errMsg):
		default:
			// The earlier pending error stays queued; LastErrMsg already
			// holds the newest message.
		}
	}

	return os.Stderr.Write(b)
}

// prediction is the JSON shape of a single prediction message: the Content
// text, the Model and Prompt strings, a Stop flag, and token timing figures.
// NOTE(review): presumably decoded from the llama runner's completion
// responses — confirm against the caller, which is outside this view.
type prediction struct {
	Content string `json:"content"`
	Model   string `json:"model"`
	Prompt  string `json:"prompt"`
	Stop    bool   `json:"stop"`

	// Timings carries counts (the *_n keys) and float figures (the *_ms keys,
	// presumably milliseconds per their names) for the prompt and predicted
	// tokens. The field has no json tag, so encoding/json uses the field name
	// "Timings" and matches keys case-insensitively when decoding.
	Timings struct {
		PredictedN  int     `json:"predicted_n"`
		PredictedMS float64 `json:"predicted_ms"`
		PromptN     int     `json:"prompt_n"`
		PromptMS    float64 `json:"prompt_ms"`
	}
}

// Tuning constants for talking to the llama runner.
// NOTE(review): their consumers are outside this view; the descriptions below
// follow the names and should be confirmed against the call sites.
const (
	// maxBufferSize is 512 KiloByte units of the format package; presumably
	// the upper bound for a buffer used to read runner output.
	maxBufferSize = 512 * format.KiloByte

	// maxRetries is presumably the number of attempts made before giving up.
	maxRetries = 3

	// retryDelay is presumably the pause between consecutive attempts.
	retryDelay = time.Second
)

// PredictOpts bundles the inputs for one prediction: the Prompt text, a Format
// string, any Images, and the api.Options to apply.
// NOTE(review): Format is presumably "json" when output should be constrained
// with jsonGrammar — confirm against the predict implementation, which is
// outside this view.
type PredictOpts struct {
	Prompt  string
	Format  string
	Images  []api.ImageData
	Options api.Options
}

// PredictResult reports prediction output: the Content text, a Done flag, and
// token counts with durations for prompt evaluation and for evaluation.
// NOTE(review): presumably one value is produced per streamed chunk, with the
// counts and durations filled in once Done is true — confirm against the
// producer, which is outside this view.
type PredictResult struct {
	Content            string
	Done               bool
	PromptEvalCount    int
	PromptEvalDuration time.Duration
	EvalCount          int
	EvalDuration       time.Duration
}

// TokenizeRequest is a JSON payload carrying text under the "content" key.
// NOTE(review): presumably the body sent to the runner's tokenize endpoint —
// confirm against the caller.
type TokenizeRequest struct {
	Content string `json:"content"`
}

// TokenizeResponse is a JSON payload carrying a list of integer tokens under
// the "tokens" key.
// NOTE(review): presumably the reply to a TokenizeRequest — confirm against
// the caller.
type TokenizeResponse struct {
	Tokens []int `json:"tokens"`
}

// DetokenizeRequest is a JSON payload carrying a list of integer tokens under
// the "tokens" key.
// NOTE(review): presumably the body sent to the runner's detokenize endpoint —
// confirm against the caller.
type DetokenizeRequest struct {
	Tokens []int `json:"tokens"`
}

// DetokenizeResponse is a JSON payload carrying text under the "content" key.
// NOTE(review): presumably the reply to a DetokenizeRequest — confirm against
// the caller.
type DetokenizeResponse struct {
	Content string `json:"content"`
}

// EmbeddingRequest is a JSON payload carrying text under the "content" key.
// NOTE(review): presumably the body sent to the runner's embedding endpoint —
// confirm against the caller.
type EmbeddingRequest struct {
	Content string `json:"content"`
}

// EmbeddingResponse is a JSON payload carrying a vector of float64 values
// under the "embedding" key.
// NOTE(review): presumably the reply to an EmbeddingRequest — confirm against
// the caller.
type EmbeddingResponse struct {
	Embedding []float64 `json:"embedding"`
}
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`package llm`

			`import (`
			`"bytes"`
			`"context"`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`_ "embed"`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`"errors"`
			`"fmt"`
			`"os"`
			`"os/exec"`
prevent waiting on exited command (#752) * prevent waiting on exited command * close llama runner once 2023-10-11 16:32:13 +00:00			`"sync"`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`"time"`

			`"github.com/jmorganca/ollama/api"`
update checkvram 2023-10-13 21:45:50 +00:00			`"github.com/jmorganca/ollama/format"`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`)`

JSON mode: add `"format" as an api parameter (#1051) * add `"format": "json"` as an API parameter --------- Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com> 2023-11-10 00:44:02 +00:00			const jsonGrammar = `
			`root ::= object`
			`value ::= object \| array \| string \| number \| ("true" \| "false" \| "null") ws`

			`object ::=`
			`"{" ws (`
			`string ":" ws value`
			`("," ws string ":" ws value)*`
			`)? "}" ws`

			`array ::=`
			`"[" ws (`
			`value`
			`("," ws value)*`
			`)? "]" ws`

			`string ::=`
			`"\"" (`
			`[^"\\] \|`
			`"\\" (["\\/bfnrt] \| "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes`
			`)* "\"" ws`

			`number ::= ("-"? ([0-9] \| [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws`

			`# Optional space: by convention, applied in this grammar after literal chars when allowed`
			`ws ::= ([ \t\n] ws)?`
			`

subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`type llamaModel struct {`
			`hyperparameters llamaHyperparameters`
			`}`

fix falcon decode get model and file type from bin file 2023-09-12 17:01:20 +00:00			`func (llm *llamaModel) ModelFamily() string {`
			`return "llama"`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`}`

fix falcon decode get model and file type from bin file 2023-09-12 17:01:20 +00:00			`func llamaModelType(numLayer uint32) string {`
			`switch numLayer {`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`case 26:`
fix falcon decode get model and file type from bin file 2023-09-12 17:01:20 +00:00			`return "3B"`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`case 32:`
fix falcon decode get model and file type from bin file 2023-09-12 17:01:20 +00:00			`return "7B"`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`case 40:`
fix falcon decode get model and file type from bin file 2023-09-12 17:01:20 +00:00			`return "13B"`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`case 48:`
fix falcon decode get model and file type from bin file 2023-09-12 17:01:20 +00:00			`return "34B"`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`case 60:`
fix falcon decode get model and file type from bin file 2023-09-12 17:01:20 +00:00			`return "30B"`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`case 80:`
fix falcon decode get model and file type from bin file 2023-09-12 17:01:20 +00:00			`return "65B"`
			`default:`
starcoder 2023-10-03 02:52:25 +00:00			`return "unknown"`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`}`
fix falcon decode get model and file type from bin file 2023-09-12 17:01:20 +00:00			`}`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00
fix falcon decode get model and file type from bin file 2023-09-12 17:01:20 +00:00			`func (llm *llamaModel) ModelType() string {`
			`return llamaModelType(llm.hyperparameters.NumLayer)`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`}`

fix falcon decode get model and file type from bin file 2023-09-12 17:01:20 +00:00			`func (llm *llamaModel) FileType() string {`
			`return fileType(llm.hyperparameters.FileType)`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`}`

unbound max num gpu layers (#591) --------- Co-authored-by: Michael Yang <mxyng@pm.me> 2023-09-25 22:36:46 +00:00			`func (llm *llamaModel) NumLayers() int64 {`
			`return int64(llm.hyperparameters.NumLayer)`
			`}`

subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`type llamaHyperparameters struct {`
			`// NumVocab is the size of the model's vocabulary.`
			`NumVocab uint32`

			`// NumEmbd is the size of the model's embedding layer.`
			`NumEmbd uint32`
			`NumMult uint32`
			`NumHead uint32`

			`// NumLayer is the number of layers in the model.`
			`NumLayer uint32`
			`NumRot uint32`

			`// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.`
fix falcon decode get model and file type from bin file 2023-09-12 17:01:20 +00:00			`FileType uint32`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`}`

			`type Running struct {`
relay CUDA errors to the client (#825) 2023-10-18 19:36:56 +00:00			`Port int`
			`Cmd *exec.Cmd`
			`Cancel context.CancelFunc`
			`exitOnce sync.Once`
			`exitCh chan error // channel to receive the exit status of the subprocess`
			`*StatusWriter // captures error messages from the llama runner process`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`}`

Multimodal support (#1216) --------- Co-authored-by: Matt Apperson <mattapperson@Matts-MacBook-Pro.local> 2023-12-11 21:56:22 +00:00			`type ImageData struct {`
			Data []byte `json:"data"`
			ID int `json:"id"`
			`}`

skip gpu if less than 2GB VRAM are available (#1059) 2023-11-09 21:16:16 +00:00			`var (`
Update llm/llama.go 2023-11-19 02:24:59 +00:00			`errNvidiaSMI = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")`
skip gpu if less than 2GB VRAM are available (#1059) 2023-11-09 21:16:16 +00:00			`errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`payloadMissing = fmt.Errorf("expected dynamic library payloads not included in this build of ollama")`
skip gpu if less than 2GB VRAM are available (#1059) 2023-11-09 21:16:16 +00:00			`)`
first pass at linux gpu support (#454) * linux gpu support * handle multiple gpus * add cuda docker image (#488) --------- Co-authored-by: Michael Yang <mxyng@pm.me> 2023-09-12 15:04:35 +00:00
relay model runner error message to client (#720) * give direction to user when runner fails * also relay errors from timeout * increase timeout to 3 minutes 2023-10-12 15:16:37 +00:00			`// StatusWriter is a writer that captures error messages from the llama runner process`
			`type StatusWriter struct {`
relay CUDA errors to the client (#825) 2023-10-18 19:36:56 +00:00			`ErrCh chan error`
			`LastErrMsg string`
relay model runner error message to client (#720) * give direction to user when runner fails * also relay errors from timeout * increase timeout to 3 minutes 2023-10-12 15:16:37 +00:00			`}`

			`func NewStatusWriter() *StatusWriter {`
			`return &StatusWriter{`
			`ErrCh: make(chan error, 1),`
			`}`
			`}`

			`func (w *StatusWriter) Write(b []byte) (int, error) {`
relay CUDA errors to the client (#825) 2023-10-18 19:36:56 +00:00			`var errMsg string`
relay model runner error message to client (#720) * give direction to user when runner fails * also relay errors from timeout * increase timeout to 3 minutes 2023-10-12 15:16:37 +00:00			`if _, after, ok := bytes.Cut(b, []byte("error:")); ok {`
relay CUDA errors to the client (#825) 2023-10-18 19:36:56 +00:00			`errMsg = string(bytes.TrimSpace(after))`
			`} else if _, after, ok := bytes.Cut(b, []byte("CUDA error")); ok {`
			`errMsg = string(bytes.TrimSpace(after))`
relay model runner error message to client (#720) * give direction to user when runner fails * also relay errors from timeout * increase timeout to 3 minutes 2023-10-12 15:16:37 +00:00			`}`
relay CUDA errors to the client (#825) 2023-10-18 19:36:56 +00:00
			`if errMsg != "" {`
			`w.LastErrMsg = errMsg`
			`w.ErrCh <- fmt.Errorf("llama runner: %s", errMsg)`
			`}`

relay model runner error message to client (#720) * give direction to user when runner fails * also relay errors from timeout * increase timeout to 3 minutes 2023-10-12 15:16:37 +00:00			`return os.Stderr.Write(b)`
			`}`

remove unused struct 2023-10-16 23:31:29 +00:00			`type prediction struct {`
fix not forwarding last token 2023-09-03 21:46:35 +00:00			Content string `json:"content"`
			Model string `json:"model"`
			Prompt string `json:"prompt"`
			Stop bool `json:"stop"`

remove unused struct 2023-10-16 23:31:29 +00:00			`Timings struct {`
			PredictedN int `json:"predicted_n"`
			PredictedMS float64 `json:"predicted_ms"`
			PromptN int `json:"prompt_n"`
			PromptMS float64 `json:"prompt_ms"`
			`}`
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`}`

fix memory check 2023-10-12 16:34:16 +00:00			`const maxBufferSize = 512 * format.KiloByte`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`const maxRetries = 3`
			`const retryDelay = 1 * time.Second`
increase streaming buffer size (#692) 2023-10-04 18:09:00 +00:00
chat api endpoint (#1392) 2023-12-05 19:57:33 +00:00			`type PredictOpts struct {`
fix: relay request opts to loaded llm prediction (#1761) 2024-01-03 17:01:42 +00:00			`Prompt string`
			`Format string`
			`Images []api.ImageData`
			`Options api.Options`
chat api endpoint (#1392) 2023-12-05 19:57:33 +00:00			`}`
Revert "chat api (#991)" while context variable is fixed This reverts commit 7a0899d62dee8a55810446dd7655b9e682ddf8ac. 2023-12-05 05:16:27 +00:00
chat api endpoint (#1392) 2023-12-05 19:57:33 +00:00			`type PredictResult struct {`
			`Content string`
			`Done bool`
			`PromptEvalCount int`
			`PromptEvalDuration time.Duration`
			`EvalCount int`
			`EvalDuration time.Duration`
			`}`
remove marshalPrompt which is no longer needed 2023-09-03 18:10:03 +00:00
subprocess llama.cpp server (#401) * remove c code * pack llama.cpp * use request context for llama_cpp * let llama_cpp decide the number of threads to use * stop llama runner when app stops * remove sample count and duration metrics * use go generate to get libraries * tmp dir for running llm 2023-08-30 20:35:03 +00:00			`type TokenizeRequest struct {`
			Content string `json:"content"`
			`}`

			`type TokenizeResponse struct {`
			Tokens []int `json:"tokens"`
			`}`

			`type DetokenizeRequest struct {`
			Tokens []int `json:"tokens"`
			`}`

			`type DetokenizeResponse struct {`
			Content string `json:"content"`
			`}`

			`type EmbeddingRequest struct {`
			Content string `json:"content"`
			`}`

			`type EmbeddingResponse struct {`
			Embedding []float64 `json:"embedding"`
			`}`