package llm

import (
	"bytes"
	"context"
	_ "embed"
	"errors"
	"fmt"
	"os"
	"os/exec"
	"sync"
	"time"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/format"
)

// jsonGrammar is a GBNF grammar that constrains the llama runner's sampled
// output to a single valid JSON object; it is sent to the runner when a
// caller requests JSON-formatted responses.
const jsonGrammar = `
root   ::= object
value  ::= object | array | string | number | ("true" | "false" | "null") ws

object ::=
  "{" ws (
            string ":" ws value
    ("," ws string ":" ws value)*
  )? "}" ws

array  ::=
  "[" ws (
            value
    ("," ws value)*
  )? "]" ws

string ::=
  "\"" (
    [^"\\] |
    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
  )* "\"" ws

number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws

# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?
`

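// Worked example (illustrative, not part of the original file): the grammar
// accepts output such as
//
//	{"name": "llama", "layers": 32, "tags": ["7B", "chat"]}
//
// but rejects a bare array, string, or number at the top level, since
// root ::= object.
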
type llamaModel struct {
	hyperparameters llamaHyperparameters
}

func (llm *llamaModel) ModelFamily() string {
	return "llama"
}

// llamaModelType maps a LLaMA-family layer count to its conventional
// parameter-size label.
func llamaModelType(numLayer uint32) string {
	switch numLayer {
	case 26:
		return "3B"
	case 32:
		return "7B"
	case 40:
		return "13B"
	case 48:
		return "34B"
	case 60:
		return "30B"
	case 80:
		return "65B"
	default:
		return "unknown"
	}
}

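// Illustrative note (not part of the original file): the table mirrors the
// published LLaMA/Code Llama configurations, e.g. llamaModelType(32) == "7B"
// and llamaModelType(48) == "34B"; unrecognized layer counts report "unknown".
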
func (llm *llamaModel) ModelType() string {
	return llamaModelType(llm.hyperparameters.NumLayer)
}

func (llm *llamaModel) FileType() string {
	return fileType(llm.hyperparameters.FileType)
}

func (llm *llamaModel) NumLayers() int64 {
	return int64(llm.hyperparameters.NumLayer)
}

type llamaHyperparameters struct {
	// NumVocab is the size of the model's vocabulary.
	NumVocab uint32

	// NumEmbd is the size of the model's embedding layer.
	NumEmbd uint32
	// NumMult is the legacy GGML n_mult value, used to derive the
	// feed-forward layer size.
	NumMult uint32
	// NumHead is the number of attention heads.
	NumHead uint32

	// NumLayer is the number of layers in the model.
	NumLayer uint32
	// NumRot is the number of rotary position embedding dimensions.
	NumRot uint32

	// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
	FileType uint32
}

type Running struct {
	Port     int                // port the llama runner listens on
	Cmd      *exec.Cmd          // handle to the llama runner subprocess
	Cancel   context.CancelFunc // cancels the subprocess's context
	exitOnce sync.Once
	exitCh   chan error // channel to receive the exit status of the subprocess

	*StatusWriter // captures error messages from the llama runner process
}

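// notifyExit is a hypothetical helper (not part of the original file)
// sketching the pattern exitOnce and exitCh support: whichever goroutine
// first observes the subprocess exiting reports the status exactly once,
// and later receivers see a closed channel instead of blocking.
func (r *Running) notifyExit(err error) {
	r.exitOnce.Do(func() {
		if err != nil {
			r.exitCh <- err // assumes a buffered channel or a waiting reader
		}
		close(r.exitCh)
	})
}
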
type ImageData struct {
	Data []byte `json:"data"`
	ID   int    `json:"id"`
}

var (
	errNvidiaSMI     = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")
	errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
)

// StatusWriter is a writer that captures error messages from the llama runner process
type StatusWriter struct {
	ErrCh      chan error
	LastErrMsg string
}

func NewStatusWriter() *StatusWriter {
	return &StatusWriter{
		ErrCh: make(chan error, 1),
	}
}

func (w *StatusWriter) Write(b []byte) (int, error) {
	var errMsg string
	if _, after, ok := bytes.Cut(b, []byte("error:")); ok {
		errMsg = string(bytes.TrimSpace(after))
	} else if _, after, ok := bytes.Cut(b, []byte("CUDA error")); ok {
		errMsg = string(bytes.TrimSpace(after))
	}

	if errMsg != "" {
		w.LastErrMsg = errMsg
		// ErrCh has capacity 1; send without blocking so a second error
		// line can't stall the subprocess's stderr pipe.
		select {
		case w.ErrCh <- fmt.Errorf("llama runner: %s", errMsg):
		default:
		}
	}

	return os.Stderr.Write(b)
}

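// exampleStatusWriterUsage is an illustrative sketch (not part of the
// original file) of how a StatusWriter is wired into a runner subprocess:
// stderr is teed through the writer so error lines surface on ErrCh while
// still reaching the terminal. The binary name and flags are placeholder
// assumptions, not the real runner invocation.
func exampleStatusWriterUsage(ctx context.Context) error {
	statusWriter := NewStatusWriter()

	cmd := exec.CommandContext(ctx, "llama-runner", "--port", "8080") // hypothetical binary and flags
	cmd.Stderr = statusWriter

	if err := cmd.Start(); err != nil {
		return err
	}

	select {
	case err := <-statusWriter.ErrCh:
		// The runner logged an error line; surface it to the caller.
		return err
	case <-time.After(retryDelay):
		// No early error; assume the runner came up.
		return nil
	}
}
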
type prediction struct {
	Content string `json:"content"`
	Model   string `json:"model"`
	Prompt  string `json:"prompt"`
	Stop    bool   `json:"stop"`

	Timings struct {
		PredictedN  int     `json:"predicted_n"`
		PredictedMS float64 `json:"predicted_ms"`
		PromptN     int     `json:"prompt_n"`
		PromptMS    float64 `json:"prompt_ms"`
	}
}

const maxBufferSize = 512 * format.KiloByte
const maxRetries = 3
const retryDelay = 1 * time.Second

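// retryWithDelay is an illustrative sketch (not part of the original file)
// of how maxRetries and retryDelay are meant to bound re-attempts; the
// doWork callback is a placeholder for the real request.
func retryWithDelay(ctx context.Context, doWork func() error) error {
	var err error
	for i := 0; i < maxRetries; i++ {
		if err = doWork(); err == nil {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(retryDelay):
			// Wait out the delay, then try again.
		}
	}
	return fmt.Errorf("still failing after %d attempts: %w", maxRetries, err)
}
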
type PredictOpts struct {
	Prompt string
	Format string
	Images []api.ImageData
}

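// Usage note (illustrative, not part of the original file): when a caller
// sets Format to "json", the runner request is expected to carry the
// jsonGrammar defined above, constraining sampled output to valid JSON.
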
type PredictResult struct {
	Content            string
	Done               bool
	PromptEvalCount    int
	PromptEvalDuration time.Duration
	EvalCount          int
	EvalDuration       time.Duration
}

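// predictionToResult is an illustrative sketch (not part of the original
// file) of how a streamed prediction maps onto a PredictResult: llama.cpp
// reports timings in milliseconds, which convert to time.Duration here.
func predictionToResult(p prediction) PredictResult {
	return PredictResult{
		Content:            p.Content,
		Done:               p.Stop,
		PromptEvalCount:    p.Timings.PromptN,
		PromptEvalDuration: time.Duration(p.Timings.PromptMS * float64(time.Millisecond)),
		EvalCount:          p.Timings.PredictedN,
		EvalDuration:       time.Duration(p.Timings.PredictedMS * float64(time.Millisecond)),
	}
}
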
type TokenizeRequest struct {
	Content string `json:"content"`
}

type TokenizeResponse struct {
	Tokens []int `json:"tokens"`
}

type DetokenizeRequest struct {
	Tokens []int `json:"tokens"`
}

type DetokenizeResponse struct {
	Content string `json:"content"`
}

type EmbeddingRequest struct {
	Content string `json:"content"`
}

type EmbeddingResponse struct {
	Embedding []float64 `json:"embedding"`
}

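// Wire-shape note (illustrative, not part of the original file): these
// request/response pairs mirror the llama runner's HTTP API as implied by
// their struct tags, e.g. a TokenizeRequest marshals to {"content":"..."}
// and the runner answers with {"tokens":[...]}; Detokenize inverts that
// mapping, and Embedding returns a float64 vector for the given content.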