From ebc529cbb3f0b27f6c154fa90e724db8243a7614 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 5 Jul 2024 17:31:23 -0700 Subject: [PATCH 01/46] autodetect stop parameters from template --- server/model.go | 21 ++++++++++++++++++--- server/routes_create_test.go | 3 ++- template/alfred.json | 8 ++++++++ template/alpaca.json | 6 ++++++ template/chatml.json | 6 ++++++ template/chatqa.json | 8 ++++++++ template/codellama-70b-instruct.json | 7 +++++++ template/falcon-instruct.json | 6 ++++++ template/gemma-instruct.json | 6 ++++++ template/granite-instruct.json | 7 +++++++ template/llama2-chat.json | 8 ++++++++ template/llama3-instruct.json | 7 +++++++ template/magicoder.json | 6 ++++++ template/mistral-instruct.json | 6 ++++++ template/openchat.json | 5 +++++ template/phi-3.json | 8 ++++++++ template/solar-instruct.json | 7 +++++++ template/starcoder2-instruct.json | 7 +++++++ template/template.go | 14 ++++++++++++++ template/vicuna.json | 6 ++++++ template/zephyr.json | 8 ++++++++ 21 files changed, 156 insertions(+), 4 deletions(-) create mode 100644 template/alfred.json create mode 100644 template/alpaca.json create mode 100644 template/chatml.json create mode 100644 template/chatqa.json create mode 100644 template/codellama-70b-instruct.json create mode 100644 template/falcon-instruct.json create mode 100644 template/gemma-instruct.json create mode 100644 template/granite-instruct.json create mode 100644 template/llama2-chat.json create mode 100644 template/llama3-instruct.json create mode 100644 template/magicoder.json create mode 100644 template/mistral-instruct.json create mode 100644 template/openchat.json create mode 100644 template/phi-3.json create mode 100644 template/solar-instruct.json create mode 100644 template/starcoder2-instruct.json create mode 100644 template/vicuna.json create mode 100644 template/zephyr.json diff --git a/server/model.go b/server/model.go index a79f549a..d33ffaec 100644 --- a/server/model.go +++ b/server/model.go @@ -4,6 +4,7 @@ import ( "archive/zip" "bytes" "context" + "encoding/json" "errors" "fmt" "io" @@ -259,13 +260,27 @@ func detectChatTemplate(layers []*layerGGML) ([]*layerGGML, error) { if t, err := template.Named(s); err != nil { slog.Debug("template detection", "error", err) } else { - tmpl, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template") + layer, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template") if err != nil { return nil, err } - tmpl.status = fmt.Sprintf("using autodetected template %s", t.Name) - layers = append(layers, &layerGGML{tmpl, nil}) + layer.status = fmt.Sprintf("using autodetected template %s", t.Name) + layers = append(layers, &layerGGML{layer, nil}) + + if t.Parameters != nil { + var b bytes.Buffer + if err := json.NewEncoder(&b).Encode(t.Parameters); err != nil { + return nil, err + } + + layer, err := NewLayer(&b, "application/vnd.ollama.image.params") + if err != nil { + return nil, err + } + + layers = append(layers, &layerGGML{layer, nil}) + } } } } diff --git a/server/routes_create_test.go b/server/routes_create_test.go index 04174b92..84672087 100644 --- a/server/routes_create_test.go +++ b/server/routes_create_test.go @@ -545,9 +545,10 @@ func TestCreateDetectTemplate(t *testing.T) { } checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{ + filepath.Join(p, "blobs", "sha256-0d79f567714c62c048378f2107fb332dabee0135d080c302d884317da9433cc5"), filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"), filepath.Join(p, "blobs", 
"sha256-c608dc615584cd20d9d830363dabf8a4783ae5d34245c3d8c115edb3bc7b28e4"), - filepath.Join(p, "blobs", "sha256-f836ee110db21567f826332e4cedd746c06d10664fd5a9ea3659e3683a944510"), + filepath.Join(p, "blobs", "sha256-ea34c57ba5b78b740aafe2aeb74dc6507fc3ad14170b64c26a04fb9e36c88d75"), }) }) diff --git a/template/alfred.json b/template/alfred.json new file mode 100644 index 00000000..edac21af --- /dev/null +++ b/template/alfred.json @@ -0,0 +1,8 @@ +{ + "stop": [ + "", + "", + "", + "" + ] +} diff --git a/template/alpaca.json b/template/alpaca.json new file mode 100644 index 00000000..eafe2b8a --- /dev/null +++ b/template/alpaca.json @@ -0,0 +1,6 @@ +{ + "stop": [ + "### Instruction:", + "### Response" + ] +} diff --git a/template/chatml.json b/template/chatml.json new file mode 100644 index 00000000..7afeb3de --- /dev/null +++ b/template/chatml.json @@ -0,0 +1,6 @@ +{ + "stop": [ + "<|im_start|>", + "<|im_end|>" + ] +} diff --git a/template/chatqa.json b/template/chatqa.json new file mode 100644 index 00000000..64dd0f33 --- /dev/null +++ b/template/chatqa.json @@ -0,0 +1,8 @@ +{ + "stop": [ + "System:", + "User:", + "Assistant:", + "<|begin_of_text|>" + ] +} diff --git a/template/codellama-70b-instruct.json b/template/codellama-70b-instruct.json new file mode 100644 index 00000000..a56a63f1 --- /dev/null +++ b/template/codellama-70b-instruct.json @@ -0,0 +1,7 @@ +{ + "stop": [ + "Source:", + "Destination:", + "" + ] +} diff --git a/template/falcon-instruct.json b/template/falcon-instruct.json new file mode 100644 index 00000000..a0da0e81 --- /dev/null +++ b/template/falcon-instruct.json @@ -0,0 +1,6 @@ +{ + "stop": [ + "User:", + "Assistant:" + ] +} diff --git a/template/gemma-instruct.json b/template/gemma-instruct.json new file mode 100644 index 00000000..f4ad415c --- /dev/null +++ b/template/gemma-instruct.json @@ -0,0 +1,6 @@ +{ + "stop": [ + "", + "" + ] +} diff --git a/template/granite-instruct.json b/template/granite-instruct.json new file mode 100644 index 00000000..0933e4b5 --- /dev/null +++ b/template/granite-instruct.json @@ -0,0 +1,7 @@ +{ + "stop": [ + "System:", + "Question:", + "Answer:" + ] +} diff --git a/template/llama2-chat.json b/template/llama2-chat.json new file mode 100644 index 00000000..17590ab4 --- /dev/null +++ b/template/llama2-chat.json @@ -0,0 +1,8 @@ +{ + "stop": [ + "[INST]", + "[/INST]", + "<>", + "<>" + ] +} diff --git a/template/llama3-instruct.json b/template/llama3-instruct.json new file mode 100644 index 00000000..c4e9d448 --- /dev/null +++ b/template/llama3-instruct.json @@ -0,0 +1,7 @@ +{ + "stop": [ + "<|start_header_id|>", + "<|end_header_id|>", + "<|eot_id|>" + ] +} diff --git a/template/magicoder.json b/template/magicoder.json new file mode 100644 index 00000000..6f67cab0 --- /dev/null +++ b/template/magicoder.json @@ -0,0 +1,6 @@ +{ + "stop": [ + "@@ Instruction", + "@@ Response" + ] +} diff --git a/template/mistral-instruct.json b/template/mistral-instruct.json new file mode 100644 index 00000000..7afeb3de --- /dev/null +++ b/template/mistral-instruct.json @@ -0,0 +1,6 @@ +{ + "stop": [ + "<|im_start|>", + "<|im_end|>" + ] +} diff --git a/template/openchat.json b/template/openchat.json new file mode 100644 index 00000000..0edc341f --- /dev/null +++ b/template/openchat.json @@ -0,0 +1,5 @@ +{ + "stop": [ + "<|end_of_turn|>" + ] +} diff --git a/template/phi-3.json b/template/phi-3.json new file mode 100644 index 00000000..27bf7664 --- /dev/null +++ b/template/phi-3.json @@ -0,0 +1,8 @@ +{ + "stop": [ + "<|end|>", + "<|system|>", + "<|user|>", + 
"<|assistant|>" + ] +} diff --git a/template/solar-instruct.json b/template/solar-instruct.json new file mode 100644 index 00000000..7b7a9050 --- /dev/null +++ b/template/solar-instruct.json @@ -0,0 +1,7 @@ +{ + "stop": [ + "### System:", + "### User:", + "### Assistant" + ] +} diff --git a/template/starcoder2-instruct.json b/template/starcoder2-instruct.json new file mode 100644 index 00000000..31348908 --- /dev/null +++ b/template/starcoder2-instruct.json @@ -0,0 +1,7 @@ +{ + "stop": [ + "### Instruction", + "### Response", + "<|endoftext|>" + ] +} diff --git a/template/template.go b/template/template.go index 9b351666..9bb6a399 100644 --- a/template/template.go +++ b/template/template.go @@ -23,6 +23,7 @@ import ( var indexBytes []byte //go:embed *.gotmpl +//go:embed *.json var templatesFS embed.FS var templatesOnce = sync.OnceValues(func() ([]*named, error) { @@ -39,6 +40,15 @@ var templatesOnce = sync.OnceValues(func() ([]*named, error) { // normalize line endings t.Bytes = bytes.ReplaceAll(bts, []byte("\r\n"), []byte("\n")) + + params, err := templatesFS.ReadFile(t.Name + ".json") + if err != nil { + continue + } + + if err := json.Unmarshal(params, &t.Parameters); err != nil { + return nil, err + } } return templates, nil @@ -48,6 +58,10 @@ type named struct { Name string `json:"name"` Template string `json:"template"` Bytes []byte + + Parameters *struct { + Stop []string `json:"stop"` + } } func (t named) Reader() io.Reader { diff --git a/template/vicuna.json b/template/vicuna.json new file mode 100644 index 00000000..ed7bfb0f --- /dev/null +++ b/template/vicuna.json @@ -0,0 +1,6 @@ +{ + "stop": [ + "USER:", + "ASSISTANT:" + ] +} diff --git a/template/zephyr.json b/template/zephyr.json new file mode 100644 index 00000000..f9c0115c --- /dev/null +++ b/template/zephyr.json @@ -0,0 +1,8 @@ +{ + "stop": [ + "<|system|>", + "", + "<|user|>", + "<|assistant|>" + ] +} From f02f83660c2e6f0741932bb31a28b82950144dfc Mon Sep 17 00:00:00 2001 From: lreed Date: Wed, 17 Jul 2024 21:44:19 +0000 Subject: [PATCH 02/46] bump go version to 1.22.5 to fix security vulnerabilities --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index ca393496..c8efdd8a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -ARG GOLANG_VERSION=1.22.1 +ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md ARG CUDA_VERSION=11.3.1 From a3c20e3f181607760ee86893baaf31b3c7fd3012 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 22 Jul 2024 08:52:16 -0700 Subject: [PATCH 03/46] Refine error reporting for subprocess crash On windows, the exit status winds up being the search term many users search for and end up piling in on issues that are unrelated. This refines the reporting so that if we have a more detailed message we'll suppress the exit status portion of the message. 
--- llm/server.go | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/llm/server.go b/llm/server.go index ba7eab03..08463ef0 100644 --- a/llm/server.go +++ b/llm/server.go @@ -417,7 +417,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr // reap subprocess when it exits go func() { - s.done <- s.cmd.Wait() + err := s.cmd.Wait() + // Favor a more detailed message over the process exit status + if err != nil && s.status != nil && s.status.LastErrMsg != "" { + slog.Debug("llama runner terminated", "error", err) + if strings.Contains(s.status.LastErrMsg, "unknown model") { + s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade" + } + s.done <- fmt.Errorf(s.status.LastErrMsg) + } else { + s.done <- err + } }() return s, nil @@ -580,14 +590,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error { slog.Warn("client connection closed before server finished loading, aborting load") return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err()) case err := <-s.done: - msg := "" - if s.status != nil && s.status.LastErrMsg != "" { - msg = s.status.LastErrMsg - } - if strings.Contains(msg, "unknown model") { - return fmt.Errorf("this model is not supported by your version of Ollama. You may need to upgrade") - } - return fmt.Errorf("llama runner process has terminated: %v %s", err, msg) + return fmt.Errorf("llama runner process has terminated: %w", err) default: } if time.Now().After(stallTimer) { From cc269ba0943ee1fa0bddcce8027d0a6d1b86fec5 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 22 Jul 2024 09:08:11 -0700 Subject: [PATCH 04/46] Remove no longer supported max vram var The OLLAMA_MAX_VRAM env var was a temporary workaround for OOM scenarios. With Concurrency this was no longer wired up, and the simplistic value doesn't map to multi-GPU setups. Users can still set `num_gpu` to limit memory usage to avoid OOM if we get our predictions wrong. 
--- cmd/cmd.go | 1 - envconfig/config.go | 13 ------------- integration/concurrency_test.go | 4 ++-- 3 files changed, 2 insertions(+), 16 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index 2252a905..b761d018 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -1344,7 +1344,6 @@ func NewCLI() *cobra.Command { envVars["OLLAMA_TMPDIR"], envVars["OLLAMA_FLASH_ATTENTION"], envVars["OLLAMA_LLM_LIBRARY"], - envVars["OLLAMA_MAX_VRAM"], }) default: appendEnvDocs(cmd, envs) diff --git a/envconfig/config.go b/envconfig/config.go index 62d661eb..0abc6968 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -43,8 +43,6 @@ var ( MaxRunners int // Set via OLLAMA_MAX_QUEUE in the environment MaxQueuedRequests int - // Set via OLLAMA_MAX_VRAM in the environment - MaxVRAM uint64 // Set via OLLAMA_MODELS in the environment ModelsDir string // Set via OLLAMA_NOHISTORY in the environment @@ -89,7 +87,6 @@ func AsMap() map[string]EnvVar { "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"}, "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"}, "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"}, - "OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"}, "OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"}, "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"}, "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"}, @@ -194,16 +191,6 @@ func LoadConfig() { TmpDir = clean("OLLAMA_TMPDIR") - userLimit := clean("OLLAMA_MAX_VRAM") - if userLimit != "" { - avail, err := strconv.ParseUint(userLimit, 10, 64) - if err != nil { - slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err) - } else { - MaxVRAM = avail - } - } - LLMLibrary = clean("OLLAMA_LLM_LIBRARY") if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" { diff --git a/integration/concurrency_test.go b/integration/concurrency_test.go index d66ba9f0..8593285b 100644 --- a/integration/concurrency_test.go +++ b/integration/concurrency_test.go @@ -69,7 +69,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { reqLimit := len(req) iterLimit := 5 - vram := os.Getenv("OLLAMA_MAX_VRAM") + vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM if vram != "" { max, err := strconv.ParseUint(vram, 10, 64) require.NoError(t, err) @@ -106,7 +106,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) { // Stress the system if we know how much VRAM it has, and attempt to load more models than will fit func TestMultiModelStress(t *testing.T) { - vram := os.Getenv("OLLAMA_MAX_VRAM") + vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM if vram == "" { t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test") } From b3e5491e41811294de9d81649a96581af6522d08 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 22 Jul 2024 12:38:03 -0400 Subject: [PATCH 05/46] server: collect nested tool call objects when parsing (#5824) --- server/model.go | 43 +++++++++++++++++++++-------- server/model_test.go | 1 + server/routes.go | 4 +-- server/testdata/tools/xlam.gotmpl | 45 +++++++++++++++++++++++++++++++ server/testdata/tools/xlam.out | 40 +++++++++++++++++++++++++++ 5 files changed, 120 insertions(+), 13 deletions(-) create mode 100644 server/testdata/tools/xlam.gotmpl create mode 100644 server/testdata/tools/xlam.out diff 
--git a/server/model.go b/server/model.go index a084dd8c..bf38c415 100644 --- a/server/model.go +++ b/server/model.go @@ -344,6 +344,10 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) { } } + if name == "" || arguments == "" { + return nil, false + } + var objs []map[string]any for offset := 0; offset < len(s); { var obj map[string]any @@ -361,23 +365,40 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) { return nil, false } else { offset += int(decoder.InputOffset()) - objs = append(objs, obj) + + // collect all nested objects + var collect func(any) []map[string]any + collect = func(obj any) (all []map[string]any) { + switch o := obj.(type) { + case map[string]any: + all = append(all, o) + for _, v := range o { + all = append(all, collect(v)...) + } + case []any: + for _, v := range o { + all = append(all, collect(v)...) + } + } + + return all + } + objs = append(objs, collect(obj)...) } } var toolCalls []api.ToolCall for _, kv := range objs { - var call api.ToolCall - for k, v := range kv { - switch k { - case name: - call.Function.Name = v.(string) - case arguments: - call.Function.Arguments = v.(map[string]any) - } + n, nok := kv[name].(string) + a, aok := kv[arguments].(map[string]any) + if nok && aok { + toolCalls = append(toolCalls, api.ToolCall{ + Function: api.ToolCallFunction{ + Name: n, + Arguments: a, + }, + }) } - - toolCalls = append(toolCalls, call) } return toolCalls, len(toolCalls) > 0 diff --git a/server/model_test.go b/server/model_test.go index 7c826b06..5829adfc 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -166,6 +166,7 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`, {"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}} {"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}} `, true}, + {"xlam", `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`, true}, } var tools []api.Tool diff --git a/server/routes.go b/server/routes.go index 85db7924..0d7ca003 100644 --- a/server/routes.go +++ b/server/routes.go @@ -611,10 +611,10 @@ func (s *Server) CreateModelHandler(c *gin.Context) { quantization := cmp.Or(r.Quantize, r.Quantization) if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); err != nil { if errors.Is(err, errBadTemplate) { - ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest} + ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest} } ch <- gin.H{"error": err.Error()} - } + } }() if r.Stream != nil && !*r.Stream { diff --git a/server/testdata/tools/xlam.gotmpl b/server/testdata/tools/xlam.gotmpl new file mode 100644 index 00000000..51513d69 --- /dev/null +++ b/server/testdata/tools/xlam.gotmpl @@ -0,0 +1,45 @@ +{{- if .System }}{{ .System }} +{{ end }} +{{- range $i, $_ := .Messages }} +{{- if eq .Role "user" }}### Instruction: +{{- if and $.Tools (le (len (slice $.Messages $i)) 2) }} +[BEGIN OF TASK INSTRUCTION] +You are an expert in composing functions. You are given a question and a set of possible functions. +Based on the question, you will need to make one or more function/tool calls to achieve the purpose. +If none of the functions can be used, point it out and refuse to answer. 
+If the given question lacks the parameters required by the function, also point it out. +[END OF TASK INSTRUCTION] + +[BEGIN OF AVAILABLE TOOLS] +{{ $.Tools }} +[END OF AVAILABLE TOOLS] + +[BEGIN OF FORMAT INSTRUCTION] +The output MUST strictly adhere to the following JSON format, and NO other text MUST be included. +The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'. +``` +{ + "tool_calls": [ + {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}}, + ... (more tool calls as required) + ] +} +``` +[END OF FORMAT INSTRUCTION] + +[BEGIN OF QUERY] +{{ .Content }} +[END OF QUERY] + + +{{ else }} +{{ .Content }} +{{ end }} +{{- else if .ToolCalls }}### Response: +{"tool_calls": [{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}{{ end }}]} +<|EOT|> +{{ else if eq .Role "assistant" }}### Response: +{{ .Content }} +<|EOT|> +{{ end }} +{{- end }}### Response: \ No newline at end of file diff --git a/server/testdata/tools/xlam.out b/server/testdata/tools/xlam.out new file mode 100644 index 00000000..a4a9952f --- /dev/null +++ b/server/testdata/tools/xlam.out @@ -0,0 +1,40 @@ +You are a knowledgable assistant. You can answer questions and perform tasks. +### Instruction: +What's the weather like today in Paris? +### Response: +{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}]} +<|EOT|> +### Response: +The current temperature in Paris, France is 22 degrees Celsius. +<|EOT|> +### Instruction: +[BEGIN OF TASK INSTRUCTION] +You are an expert in composing functions. You are given a question and a set of possible functions. +Based on the question, you will need to make one or more function/tool calls to achieve the purpose. +If none of the functions can be used, point it out and refuse to answer. +If the given question lacks the parameters required by the function, also point it out. +[END OF TASK INSTRUCTION] + +[BEGIN OF AVAILABLE TOOLS] +[{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the users location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}] +[END OF AVAILABLE TOOLS] + +[BEGIN OF FORMAT INSTRUCTION] +The output MUST strictly adhere to the following JSON format, and NO other text MUST be included. +The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'. +``` +{ + "tool_calls": [ + {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}}, + ... (more tool calls as required) + ] +} +``` +[END OF FORMAT INSTRUCTION] + +[BEGIN OF QUERY] +What's the weather like today in San Francisco and Toronto? 
+[END OF QUERY] + + +### Response: \ No newline at end of file From f8fedbda20b1b2531499ba64758642b0568b6f01 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 22 Jul 2024 12:42:00 -0400 Subject: [PATCH 06/46] Update llama.cpp submodule commit to `d94c6e0c` (#5805) --- llm/llama.cpp | 2 +- llm/patches/05-default-pretokenizer.diff | 10 +- ...{07-embeddings.diff => 06-embeddings.diff} | 0 llm/patches/06-qwen2.diff | 13 - ...clip-unicode.diff => 07-clip-unicode.diff} | 0 .../{09-pooling.diff => 08-pooling.diff} | 0 llm/patches/09-lora.diff | 360 ++++++++++++++++++ llm/patches/10-tekken.diff | 43 --- llm/patches/11-embd_kv.diff | 19 - 9 files changed, 366 insertions(+), 81 deletions(-) rename llm/patches/{07-embeddings.diff => 06-embeddings.diff} (100%) delete mode 100644 llm/patches/06-qwen2.diff rename llm/patches/{08-clip-unicode.diff => 07-clip-unicode.diff} (100%) rename llm/patches/{09-pooling.diff => 08-pooling.diff} (100%) create mode 100644 llm/patches/09-lora.diff delete mode 100644 llm/patches/10-tekken.diff delete mode 100644 llm/patches/11-embd_kv.diff diff --git a/llm/llama.cpp b/llm/llama.cpp index a8db2a9c..d94c6e0c 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit a8db2a9ce64cd4417f6a312ab61858f17f0f8584 +Subproject commit d94c6e0ccbd29ee1ba4f44e9caa8682ad94df9fa diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff index 341a6f59..646bc49c 100644 --- a/llm/patches/05-default-pretokenizer.diff +++ b/llm/patches/05-default-pretokenizer.diff @@ -1,8 +1,8 @@ diff --git a/src/llama.cpp b/src/llama.cpp -index 2b9ace28..172640e2 100644 +index 8fe51971..7113ba64 100644 --- a/src/llama.cpp +++ b/src/llama.cpp -@@ -5357,16 +5357,7 @@ static void llm_load_vocab( +@@ -5433,16 +5433,7 @@ static void llm_load_vocab( if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { vocab.tokenizer_add_space_prefix = false; vocab.tokenizer_clean_spaces = true; @@ -20,9 +20,9 @@ index 2b9ace28..172640e2 100644 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || -@@ -5439,7 +5430,8 @@ static void llm_load_vocab( - tokenizer_pre == "jais") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; +@@ -5526,7 +5517,8 @@ static void llm_load_vocab( + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM; + vocab.tokenizer_clean_spaces = false; } else { - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); + LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); diff --git a/llm/patches/07-embeddings.diff b/llm/patches/06-embeddings.diff similarity index 100% rename from llm/patches/07-embeddings.diff rename to llm/patches/06-embeddings.diff diff --git a/llm/patches/06-qwen2.diff b/llm/patches/06-qwen2.diff deleted file mode 100644 index 1c7109f6..00000000 --- a/llm/patches/06-qwen2.diff +++ /dev/null @@ -1,13 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index 40d2ec2c..f34eb79a 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv( - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); - cb(kq, "kq", il); - -- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) { -+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) { - // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs - // ref: 
https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); diff --git a/llm/patches/08-clip-unicode.diff b/llm/patches/07-clip-unicode.diff similarity index 100% rename from llm/patches/08-clip-unicode.diff rename to llm/patches/07-clip-unicode.diff diff --git a/llm/patches/09-pooling.diff b/llm/patches/08-pooling.diff similarity index 100% rename from llm/patches/09-pooling.diff rename to llm/patches/08-pooling.diff diff --git a/llm/patches/09-lora.diff b/llm/patches/09-lora.diff new file mode 100644 index 00000000..fc1017a6 --- /dev/null +++ b/llm/patches/09-lora.diff @@ -0,0 +1,360 @@ +diff --git a/common/common.cpp b/common/common.cpp +index dbb724fb..c26fe6ee 100644 +--- a/common/common.cpp ++++ b/common/common.cpp +@@ -2087,14 +2087,29 @@ std::tuple llama_init_from_gpt_par + for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { + const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); + float lora_scale = std::get<1>(params.lora_adapter[i]); ++ ++ // try to load as gguf + auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str()); + if (adapter == nullptr) { +- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); +- llama_free(lctx); +- llama_free_model(model); +- return std::make_tuple(nullptr, nullptr); ++ fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__); ++ ++ // if that fails, try loading as ggla for compatibility ++ int err = llama_model_apply_lora_from_file(model, ++ lora_adapter.c_str(), ++ lora_scale, ++ ((i > 0) || params.lora_base.empty()) ++ ? NULL ++ : params.lora_base.c_str(), ++ params.n_threads); ++ if (err != 0) { ++ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); ++ llama_free(lctx); ++ llama_free_model(model); ++ return std::make_tuple(nullptr, nullptr); ++ } ++ } else { ++ llama_lora_adapter_set(lctx, adapter, lora_scale); + } +- llama_lora_adapter_set(lctx, adapter, lora_scale); + } + + if (params.ignore_eos) { +diff --git a/include/llama.h b/include/llama.h +index 93fd77ca..b0fb37a6 100644 +--- a/include/llama.h ++++ b/include/llama.h +@@ -1160,6 +1160,20 @@ extern "C" { + + LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); + ++ // Apply a LoRA adapter to a loaded model ++ // path_base_model is the path to a higher quality model to use as a base for ++ // the layers modified by the adapter. Can be NULL to use the current loaded model. 
++ // The model needs to be reloaded before applying a new adapter, otherwise the adapter ++ // will be applied on top of the previous one ++ // Returns 0 on success ++ LLAMA_API int32_t llama_model_apply_lora_from_file( ++ const struct llama_model * model, ++ const char * path_lora, ++ float scale, ++ const char * path_base_model, ++ int32_t n_threads); ++ ++ + #ifdef __cplusplus + } + #endif +diff --git a/src/llama.cpp b/src/llama.cpp +index 80a0dd0f..9d7b0e17 100644 +--- a/src/llama.cpp ++++ b/src/llama.cpp +@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text, + fputs(text, stderr); + fflush(stderr); + } ++ ++static int llama_apply_lora_from_file_internal( ++ const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads ++) { ++ LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); ++ ++ const int64_t t_start_lora_us = ggml_time_us(); ++ ++ llama_file fin(path_lora, "rb"); ++ ++ // verify magic and version ++ { ++ uint32_t magic = fin.read_u32(); ++ if (magic != LLAMA_FILE_MAGIC_GGLA) { ++ LLAMA_LOG_ERROR("%s: bad file magic\n", __func__); ++ return 1; ++ } ++ ++ uint32_t format_version = fin.read_u32(); ++ if (format_version != 1) { ++ LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ ); ++ return 1; ++ } ++ } ++ ++ int32_t lora_r = fin.read_u32(); ++ int32_t lora_alpha = fin.read_u32(); ++ float scaling = scale * (float)lora_alpha / (float)lora_r; ++ ++ LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); ++ ++ // load base model ++ std::unique_ptr ml; ++ if (path_base_model) { ++ LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); ++ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr)); ++ ml->init_mappings(/*prefetch*/ false); // no prefetching ++ } ++ ++ struct tensor_meta { ++ std::string name; ++ ggml_type type; ++ int32_t ne[2]; ++ size_t offset; ++ }; ++ std::map tensor_meta_map; ++ ++ // load all tensor meta ++ while (true) { ++ if (fin.tell() == fin.size) { ++ // eof ++ break; ++ } ++ ++ int32_t n_dims; ++ int32_t name_len; ++ int32_t ftype; ++ ++ fin.read_raw(&n_dims, sizeof(n_dims)); ++ fin.read_raw(&name_len, sizeof(name_len)); ++ fin.read_raw(&ftype, sizeof(ftype)); ++ ++ if (n_dims != 1 && n_dims != 2) { ++ LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); ++ return 1; ++ } ++ ++ int32_t ne[2] = { 1, 1 }; ++ for (int i = 0; i < n_dims; ++i) { ++ fin.read_raw(&ne[i], sizeof(ne[i])); ++ } ++ ++ std::string name; ++ { ++ GGML_ASSERT(name_len < GGML_MAX_NAME); ++ char buf[GGML_MAX_NAME]; ++ fin.read_raw(buf, name_len); ++ name = std::string(buf, name_len); ++ } ++ ++ // check for lora suffix ++ std::string lora_suffix; ++ if (name.length() > 6) { ++ lora_suffix = name.substr(name.length() - 6); ++ } ++ if (lora_suffix != ".loraA" && lora_suffix != ".loraB") { ++ LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); ++ return 1; ++ } ++ ++ // tensor type ++ ggml_type wtype; ++ switch (ftype) { ++ case 0: wtype = GGML_TYPE_F32; break; ++ case 1: wtype = GGML_TYPE_F16; break; ++ default: ++ { ++ LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n", ++ __func__, ftype); ++ return 1; ++ } ++ } ++ ++ // data offset ++ size_t offset = fin.tell(); ++ offset = (offset + 31) & -32; ++ ++ // skip tensor data ++ fin.seek(offset + 
ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET); ++ ++ tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset }); ++ } ++ ++ bool warned = false; ++ int n_tensors = 0; ++ ++ // apply ++ ggml_backend_t backend_cpu = ggml_backend_cpu_init(); ++ if (backend_cpu == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__); ++ return 1; ++ } ++ ggml_backend_cpu_set_n_threads(backend_cpu, n_threads); ++ ++ std::vector> read_buf; ++ for (const auto & it : model.tensors_by_name) { ++ const std::string & base_name = it.first; ++ ggml_tensor * model_t = it.second; ++ ++ if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() || ++ tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) { ++ continue; ++ } ++ ++ tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA"); ++ tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB"); ++ ++ ggml_init_params lora_init_params = { ++ /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), ++ /* .mem_buffer */ nullptr, ++ /* .no_alloc */ true, ++ }; ++ ggml_context * lora_ctx = ggml_init(lora_init_params); ++ if (lora_ctx == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__); ++ ggml_backend_free(backend_cpu); ++ return 1; ++ } ++ ++ // create tensors ++ ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]); ++ ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]); ++ ggml_set_name(loraA, metaA.name.c_str()); ++ ggml_set_name(loraB, metaB.name.c_str()); ++ ++ ggml_tensor * base_t; ++ if (ml) { ++ if (!ml->get_tensor_meta(base_name.c_str())) { ++ LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); ++ return 1; ++ } ++ base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str())); ++ } else { ++ base_t = ggml_dup_tensor(lora_ctx, model_t); ++ } ++ ggml_set_name(base_t, base_name.c_str()); ++ ++ // allocate in backend buffer ++ ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); ++ if (lora_buf == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__); ++ return 1; ++ } ++ ++ // load tensor data ++ auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) { ++ read_buf.resize(ggml_nbytes(tensor)); ++ fin.seek(tensor_meta.offset, SEEK_SET); ++ fin.read_raw(read_buf.data(), ggml_nbytes(tensor)); ++ ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size()); ++ }; ++ load_tensor(metaA, loraA); ++ load_tensor(metaB, loraB); ++ ++ // load base model tensor data ++ if (ml) { ++ ml->load_data_for(base_t); ++ } else { ++ ggml_backend_tensor_copy(model_t, base_t); ++ } ++ ++ if (ggml_is_quantized(base_t->type) && !warned) { ++ LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " ++ "use a f16 or f32 base model with --lora-base\n", __func__); ++ warned = true; ++ } ++ ++ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { ++ LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" ++ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); ++ ggml_free(lora_ctx); ++ ggml_backend_buffer_free(lora_buf); ++ ggml_backend_free(backend_cpu); ++ return 1; ++ } ++ ++ auto build_lora_graph = [&]() { ++ // w = w + BA*s ++ ggml_tensor * BA = 
ggml_mul_mat(lora_ctx, loraA, loraB); ++ ggml_set_name(BA, "BA"); ++ ++ if (scaling != 1.0f) { ++ BA = ggml_scale(lora_ctx, BA, scaling); ++ ggml_set_name(BA, "BA_scaled"); ++ } ++ ++ ggml_tensor * r; ++ r = ggml_add_inplace(lora_ctx, base_t, BA); ++ ggml_set_name(r, "r_add"); ++ ++ if (base_t->type != model_t->type) { ++ // convert the result to the model type ++ r = ggml_cast(lora_ctx, r, model_t->type); ++ ggml_set_name(r, "r_cast"); ++ } ++ ++ return r; ++ }; ++ ++ ggml_cgraph * gf = ggml_new_graph(lora_ctx); ++ ggml_tensor * r = build_lora_graph(); ++ ggml_build_forward_expand(gf, r); ++ ++ ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); ++ if (graph_buf == nullptr) { ++ LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__); ++ ggml_free(lora_ctx); ++ ggml_backend_buffer_free(lora_buf); ++ ggml_backend_free(backend_cpu); ++ return 1; ++ } ++ ++ ggml_backend_graph_compute(backend_cpu, gf); ++ ++ ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); ++ ++#if 0 ++ // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU ++ //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE); ++ ++ // sched compute ++ ggml_build_forward_expand(gf, build_graph()); ++ ggml_backend_sched_init_measure(sched, gf); ++ ++ // create the graph again, since the previous one was destroyed by the measure ++ ggml_graph_clear(gf); ++ ggml_build_forward_expand(gf, build_graph()); ++ ggml_backend_sched_graph_compute(sched, gf); ++ ggml_backend_sched_free(sched); ++#endif ++ ++ ggml_backend_buffer_free(lora_buf); ++ ggml_backend_buffer_free(graph_buf); ++ ggml_free(lora_ctx); ++ ++ n_tensors++; ++ if (n_tensors % 4 == 0) { ++ LLAMA_LOG_INFO("."); ++ } ++ } ++ ++ ggml_backend_free(backend_cpu); ++ ++ const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; ++ LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); ++ ++ return 0; ++} ++ ++int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) { ++ try { ++ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads); ++ } catch (const std::exception & err) { ++ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what()); ++ return 1; ++ } ++} +\ No newline at end of file diff --git a/llm/patches/10-tekken.diff b/llm/patches/10-tekken.diff deleted file mode 100644 index 56a583e0..00000000 --- a/llm/patches/10-tekken.diff +++ /dev/null @@ -1,43 +0,0 @@ -diff --git a/include/llama.h b/include/llama.h -index bb4b05ba..a92174e0 100644 ---- a/include/llama.h -+++ b/include/llama.h -@@ -92,6 +92,7 @@ extern "C" { - LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17, - LLAMA_VOCAB_PRE_TYPE_VIKING = 18, - LLAMA_VOCAB_PRE_TYPE_JAIS = 19, -+ LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20, - }; - - // note: these values should be synchronized with ggml_rope -diff --git a/src/llama.cpp b/src/llama.cpp -index 18364976..435b6fe5 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -5429,6 +5429,12 @@ static void llm_load_vocab( - } else if ( - tokenizer_pre == "jais") { - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; -+ } else if ( -+ tokenizer_pre == "tekken") { -+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN; -+ vocab.tokenizer_clean_spaces = false; -+ vocab.tokenizer_ignore_merges = true; -+ vocab.tokenizer_add_bos = true; - } else { - LLAMA_LOG_WARN("%s: missing or 
unrecognized pre-tokenizer type, using: 'default'\n", __func__); - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; -@@ -15448,6 +15454,13 @@ struct llm_tokenizer_bpe { - " ?[^(\\s|.,!?…。,、।۔،)]+", - }; - break; -+ case LLAMA_VOCAB_PRE_TYPE_TEKKEN: -+ // original regex from tokenizer.json -+ // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" -+ regex_exprs = { -+ "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", -+ }; -+ break; - default: - // default regex for BPE tokenization pre-processing - regex_exprs = { diff --git a/llm/patches/11-embd_kv.diff b/llm/patches/11-embd_kv.diff deleted file mode 100644 index ad17a700..00000000 --- a/llm/patches/11-embd_kv.diff +++ /dev/null @@ -1,19 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index 2b9ace28..e60d3d8d 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -6052,10 +6052,10 @@ static bool llm_load_tensors( - - layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - -- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); -- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); -- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); -- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); -+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}); -+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}); -+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}); -+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}); - - // optional bias tensors - layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); From d835368eb8599b4f4c2f8a766bad5b57498a988d Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 22 Jul 2024 16:16:22 -0400 Subject: [PATCH 07/46] convert: capture `head_dim` for mistral (#5818) --- convert/mistral.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/convert/mistral.go b/convert/mistral.go index da6874cf..8fe066d6 100644 --- a/convert/mistral.go +++ b/convert/mistral.go @@ -71,6 +71,11 @@ func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error { "tokenizer.ggml.unknown_token_id": uint32(0), } + if m.Params.HeadDimension > 0 { + kv["llama.attention.key_length"] = uint32(m.Params.HeadDimension) + kv["llama.attention.value_length"] = uint32(m.Params.HeadDimension) + } + return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors) } From c0648233f2236f82f6830d2aaed552ae0f72379b Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Mon, 22 Jul 2024 13:37:08 -0700 Subject: [PATCH 08/46] api embed docs (#5282) --- docs/api.md | 84 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 76 insertions(+), 8 deletions(-) diff --git a/docs/api.md b/docs/api.md index c577bb1a..4381c376 100644 --- a/docs/api.md +++ b/docs/api.md @@ 
-1026,7 +1026,7 @@ If `stream` is set to `false`, then the response is a single JSON object: ## Generate Embeddings ```shell -POST /api/embeddings +POST /api/embed ``` Generate embeddings from a model @@ -1034,10 +1034,11 @@ Generate embeddings from a model ### Parameters - `model`: name of model to generate embeddings from -- `prompt`: text to generate embeddings for +- `input`: text or list of text to generate embeddings for Advanced parameters: +- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true` - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) @@ -1046,9 +1047,9 @@ Advanced parameters: #### Request ```shell -curl http://localhost:11434/api/embeddings -d '{ +curl http://localhost:11434/api/embed -d '{ "model": "all-minilm", - "prompt": "Here is an article about llamas..." + "input": "Why is the sky blue?" }' ``` @@ -1056,10 +1057,35 @@ curl http://localhost:11434/api/embeddings -d '{ ```json { - "embedding": [ - 0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313, - 0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281 - ] + "model": "all-minilm", + "embeddings": [[ + 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, + 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 + ]] +} +``` + +#### Request (Multiple input) + +```shell +curl http://localhost:11434/api/embed -d '{ + "model": "all-minilm", + "input": ["Why is the sky blue?", "Why is the grass green?"] +}' +``` + +#### Response + +```json +{ + "model": "all-minilm", + "embeddings": [[ + 0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814, + 0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348 + ],[ + -0.0098027075, 0.06042469, 0.025257962, -0.006364387, 0.07272725, + 0.017194884, 0.09032035, -0.051705178, 0.09951512, 0.09072481 + ]] } ``` @@ -1106,3 +1132,45 @@ A single JSON object will be returned. ] } ``` + +## Generate Embedding + +> Note: this endpoint has been superseded by `/api/embed` + +```shell +POST /api/embeddings +``` + +Generate embeddings from a model + +### Parameters + +- `model`: name of model to generate embeddings from +- `prompt`: text to generate embeddings for + +Advanced parameters: + +- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature` +- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`) + +### Examples + +#### Request + +```shell +curl http://localhost:11434/api/embeddings -d '{ + "model": "all-minilm", + "prompt": "Here is an article about llamas..." 
+}' +``` + +#### Response + +```json +{ + "embedding": [ + 0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313, + 0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281 + ] +} +``` \ No newline at end of file From 83a0cb8d88561b4302baa8b6ea0623c426483e5d Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Tue, 2 Jul 2024 14:52:18 -0700 Subject: [PATCH 09/46] docs --- docs/template.md | 173 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 docs/template.md diff --git a/docs/template.md b/docs/template.md new file mode 100644 index 00000000..8f41e8fb --- /dev/null +++ b/docs/template.md @@ -0,0 +1,173 @@ +# Template + +Ollama provides a powerful templating engine backed by Go's built-in templating engine to construct prompts for your large language model. This feature is a valuable tool to get the most out of your models. + +## Basic Template Structure + +A basic Go template consists of three main parts: + +* **Layout**: The overall structure of the template. +* **Variables**: Placeholders for dynamic data that will be replaced with actual values when the template is rendered. +* **Functions**: Custom functions or logic that can be used to manipulate the template's content. + +Here's an example of a simple chat template: + +```gotmpl +{{- range .Messages }} +{{ .Role }}: {{ .Content }} +{{- end }} +``` + +In this example, we have: + +* A basic messages structure (layout) +* Three variables: `Messages`, `Role`, and `Content` (variables) +* A custom function (action) that iterates over an array of items (`range .Messages`) and displays each item + +## Adding Templates to Your Model + +By default, models imported into Ollama have a default template of `{{ .Prompt }}`, i.e. user inputs are sent verbatim to the LLM. This is appropriate for text or code completion models but lacks essential markers for chat or instruction models. + +Omitting a template in these models puts the responsibility of correctly templating input onto the user. Adding a template allows users to easily get the best results from the model. + +To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3. + +```dockerfile +FROM llama3 + +TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|> + +{{ .System }}<|eot_id|> +{{- end }} +{{- range .Messages }}<|start_header_id|>{{ .Role }}<|end_header_id|> + +{{ .Content }}<|eot_id|> +{{- end }}<|start_header_id|>assistant<|end_header_id|> + +""" +``` + +## Variables + +`System` (string): system prompt + +`Prompt` (string): user prompt + +`Response` (string): assistant response + +`Suffix` (string): text inserted after the assistant's response + +`Messages` (list): list of messages + +`Messages[].Role` (string): role which can be one of `system`, `user`, `assistant`, or `tool` + +`Messages[].Content` (string): message content + +`Messages[].ToolCalls` (list): list of tools the model wants to call + +`Messages[].ToolCalls[].Function` (object): function to call + +`Messages[].ToolCalls[].Function.Name` (string): function name + +`Messages[].ToolCalls[].Function.Arguments` (map): mapping of argument name to argument value + +`Tools` (list): list of tools the model can access + +`Tools[].Type` (string): schema type. 
`type` is always `function` + +`Tools[].Function` (object): function definition + +`Tools[].Function.Name` (string): function name + +`Tools[].Function.Description` (string): function description + +`Tools[].Function.Parameters` (object): function parameters + +`Tools[].Function.Parameters.Type` (string): schema type. `type` is always `object` + +`Tools[].Function.Parameters.Required` (list): list of required properties + +`Tools[].Function.Parameters.Properties` (map): mapping of property name to property definition + +`Tools[].Function.Parameters.Properties[].Type` (string): property type + +`Tools[].Function.Parameters.Properties[].Description` (string): property description + +`Tools[].Function.Parameters.Properties[].Enum` (list): list of valid values + +## Tips and Best Practices + +Keep the following tips and best practices in mind when working with Go templates: + +* **Be mindful of dot**: Control flow structures like `range` and `with` changes the value `.` +* **Out-of-scope variables**: Use `$.` to reference variables not currently in scope, starting from the root +* **Whitespace control**: Use `-` to trim leading (`{{-`) and trailing (`-}}`) whitespace + +## Examples + +### Example Messages + +#### ChatML + +ChatML is a popular template format. It can be used for models such as Databrick's DBRX, Intel's Neural Chat, and Microsoft's Orca 2. + +```gotmpl +{{- if .System }}<|im_start|>system +{{ .System }}<|im_end|> +{{ end }} +{{- range .Messages }}<|im_start|>{{ .Role }} +{{ .Content }}<|im_end|> +{{ end }}<|im_start|>assistant +{{ else }} +{{ if .System }}<|im_start|>system +{{ .System }}<|im_end|> +``` + +### Example Tools + +Tools support can be added to a model by adding a `{{ .Tools }}` node to the template. This feature is useful for models trained to call external tools and can a powerful tool for retrieving real-time data or performing complex tasks. + +#### Mistral + +Mistral v0.3 and Mixtral 8x22B supports tool calling. + +```gotmpl +{{- range $index, $_ := .Messages }} +{{- if eq .Role "user" }} +{{- if and (le (len (slice $.Messages $index)) 2) $.Tools }}[AVAILABLE_TOOLS] {{ json $.Tools }}[/AVAILABLE_TOOLS] +{{- end }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }} + +{{ end }}{{ .Content }}[/INST] +{{- else if eq .Role "assistant" }} +{{- if .Content }} {{ .Content }} +{{- else if .ToolCalls }}[TOOL_CALLS] [ +{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ json .Function.Arguments }}} +{{- end }}] +{{- end }} +{{- else if eq .Role "tool" }}[TOOL_RESULTS] {"content": {{ .Content }}}[/TOOL_RESULTS] +{{- end }} +{{- end }} +``` + +### Example Fill-in-Middle + +Fill-in-middle support can be added to a model by adding a `{{ .Suffix }}` node to the template. This feature is useful for models that are trained to generate text in the middle of user input, such as code completion models. + +#### CodeLlama + +CodeLlama [7B](https://ollama.com/library/codellama:7b-code) and [13B](https://ollama.com/library/codellama:13b-code) code completion models support fill-in-middle. + +```gotmpl +
<PRE> {{ .Prompt }} <SUF>{{ .Suffix }} <MID>
+```
+
+> [!NOTE]
+> CodeLlama 34B and 70B code completion and all instruct and Python fine-tuned models do not support fill-in-middle.
+
+#### Codestral
+
+Codestral [22B](https://ollama.com/library/codestral:22b) supports fill-in-middle.
+
+```gotmpl
+[SUFFIX]{{ .Suffix }}[PREFIX] {{ .Prompt }}
+```

From 9b60a038e5169c4a69bc513ae6e7ea1816f9fc11 Mon Sep 17 00:00:00 2001
From: Michael Yang 
Date: Mon, 22 Jul 2024 13:34:56 -0700
Subject: [PATCH 10/46] update api.md

---
 README.md         |   3 +-
 docs/api.md       | 117 +++++++++++++++++++++++++++++++++++++++++++++-
 docs/modelfile.md |   3 +-
 3 files changed, 119 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index b96f4c16..02ab7051 100644
--- a/README.md
+++ b/README.md
@@ -64,7 +64,8 @@ Here are some example models that can be downloaded:
 | LLaVA              | 7B         | 4.5GB | `ollama run llava`             |
 | Solar              | 10.7B      | 6.1GB | `ollama run solar`             |
 
-> Note: You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
+> [!NOTE]
+> You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
 
 ## Customize a model
 
diff --git a/docs/api.md b/docs/api.md
index c577bb1a..bf4c8ce8 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -40,6 +40,7 @@ Generate a response for a given prompt with a provided model. This is a streamin
 
 - `model`: (required) the [model name](#model-names)
 - `prompt`: the prompt to generate a response for
+- `suffix`: the text after the model response
 - `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)
 
 Advanced parameters (optional):
@@ -57,7 +58,8 @@ Advanced parameters (optional):
 
 Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below.
 
-> Note: it's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts whitespace.
+> [!IMPORTANT]
+> It's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts of whitespace.
 
 ### Examples
 
@@ -148,8 +150,44 @@ If `stream` is set to `false`, the response will be a single JSON object:
 }
 ```
 
+#### Request (with suffix)
+
+##### Request
+
+```shell
+curl http://localhost:11434/api/generate -d '{
+  "model": "codellama:code",
+  "prompt": "def compute_gcd(a, b):",
+  "suffix": "    return result",
+  "options": {
+    "temperature": 0
+  },
+  "stream": false
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "codellama:code",
+  "created_at": "2024-07-22T20:47:51.147561Z",
+  "response": "\n  if a == 0:\n    return b\n  else:\n    return compute_gcd(b % a, a)\n\ndef compute_lcm(a, b):\n  result = (a * b) / compute_gcd(a, b)\n",
+  "done": true,
+  "done_reason": "stop",
+  "context": [...],
+  "total_duration": 1162761250,
+  "load_duration": 6683708,
+  "prompt_eval_count": 17,
+  "prompt_eval_duration": 201222000,
+  "eval_count": 63,
+  "eval_duration": 953997000
+}
+```
+
 #### Request (JSON mode)
 
+> [!IMPORTANT]
 > When `format` is set to `json`, the output will always be a well-formed JSON object. It's important to also instruct the model to respond in JSON.
 
 ##### Request
@@ -383,9 +421,10 @@ Generate the next message in a chat with a provided model. This is a streaming e
 
 The `message` object has the following fields:
 
-- `role`: the role of the message, either `system`, `user` or `assistant`
+- `role`: the role of the message, either `system`, `user`, `assistant`, or `tool`
 - `content`: the content of the message
 - `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
+- `tool_calls` (optional): a list of tools the model wants to use
 
 Advanced parameters (optional):
 
@@ -393,6 +432,7 @@ Advanced parameters (optional):
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
+- `tools`: external tools the model can use. Not all models support this feature.
 
 ### Examples
 
@@ -622,6 +662,79 @@ curl http://localhost:11434/api/chat -d '{
 }
 ```
 
+#### Chat request (with tools)
+
+##### Request
+
+```
+curl http://localhost:11434/api/chat -d '{
+  "model": "mistral",
+  "messages": [
+    {
+      "role": "user",
+      "content": "What is the weather today in Paris?"
+    }
+  ],
+  "stream": false,
+  "tools": [
+    {
+      "type": "function",
+      "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather for a location",
+        "parameters": {
+          "type": "object",
+          "properties": {
+            "location": {
+              "type": "string",
+              "description": "The location to get the weather for, e.g. San Francisco, CA"
+            },
+            "format": {
+              "type": "string",
+              "description": "The format to return the weather in, e.g. 'celsius' or 'fahrenheit'",
+              "enum": ["celsius", "fahrenheit"]
+            }
+          },
+          "required": ["location", "format"]
+        }
+      }
+    }
+  ]
+}'
+```
+
+##### Response
+
+```json
+{
+  "model": "mistral:7b-instruct-v0.3-q4_K_M",
+  "created_at": "2024-07-22T20:33:28.123648Z",
+  "message": {
+    "role": "assistant",
+    "content": "",
+    "tool_calls": [
+      {
+        "function": {
+          "name": "get_current_weather",
+          "arguments": {
+            "format": "celsius",
+            "location": "Paris, FR"
+          }
+        }
+      }
+    ]
+  },
+  "done_reason": "stop",
+  "done": true,
+  "total_duration": 885095291,
+  "load_duration": 3753500,
+  "prompt_eval_count": 122,
+  "prompt_eval_duration": 328493000,
+  "eval_count": 33,
+  "eval_duration": 552222000
+}
+```
+
 ## Create a Model
 
 ```shell
diff --git a/docs/modelfile.md b/docs/modelfile.md
index 21ee1826..c3645b06 100644
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -1,6 +1,7 @@
 # Ollama Model File
 
-> Note: `Modelfile` syntax is in development
+> [!NOTE]
+> `Modelfile` syntax is in development
 
 A model file is the blueprint to create and share models with Ollama.
 

From e12fff8810e37bfabe4416f7f41902387ff3aae1 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen 
Date: Mon, 15 Jul 2024 09:25:56 -0700
Subject: [PATCH 11/46] Enable windows error dialog for subprocess startup

Make sure that if something goes wrong spawning the process, the user gets
enough info to be able to try to self-correct, or at least file a bug
with details so we can fix it.  Once the process starts, we immediately
change back to the recommended setting to prevent the blocking dialog.
This ensures if the model fails to load (OOM, unsupported model type,
etc.) the process will exit quickly and we can scan the stdout/stderr
of the subprocess for the reason to report via API.
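
As a rough, standalone sketch of the approach (the runner path and flags below are hypothetical): the parent starts the subprocess with `CREATE_DEFAULT_ERROR_MODE` so a missing DLL surfaces as a Windows dialog, and the subprocess switches back to `SEM_FAILCRITICALERRORS` once it is up, as the server.cpp hunk below shows.

```go
//go:build windows

package main

import (
	"os/exec"
	"syscall"
)

// CREATE_DEFAULT_ERROR_MODE keeps the child from inheriting the parent's
// error mode, so a missing DLL pops the standard Windows error dialog
// instead of a silent exit with a generic status code.
const createDefaultErrorMode = 0x04000000

func main() {
	// Hypothetical runner path and flags, for illustration only.
	cmd := exec.Command(`C:\ollama\runners\ollama_llama_server.exe`, "--port", "0")
	cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: createDefaultErrorMode}
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	// Once started, the subprocess calls SetErrorMode(SEM_FAILCRITICALERRORS)
	// itself (see the server.cpp hunk below), so later failures such as OOM
	// exit quickly and can be reported via the API.
	_ = cmd.Wait()
}
```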
---
 llm/ext_server/server.cpp |  4 ++++
 llm/llm_darwin_amd64.go   |  3 +++
 llm/llm_darwin_arm64.go   |  3 +++
 llm/llm_linux.go          |  7 ++++++-
 llm/llm_windows.go        | 16 +++++++++++++++-
 llm/server.go             |  1 +
 6 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index e8a076c4..14d921c0 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -41,6 +41,7 @@
 
 #if defined(_WIN32)
 #include 
+#include 
 #endif
 
 #include 
@@ -2737,6 +2738,9 @@ int wmain(int argc, wchar_t **wargv) {
     for (int i = 0; i < argc; ++i) {
         argv[i] = wchar_to_char(wargv[i]);
     }
+
+    // Adjust error mode to avoid error dialog after we start.
+    SetErrorMode(SEM_FAILCRITICALERRORS);
 #else
 int main(int argc, char **argv) {
 #endif
diff --git a/llm/llm_darwin_amd64.go b/llm/llm_darwin_amd64.go
index 3093e1ad..60eed719 100644
--- a/llm/llm_darwin_amd64.go
+++ b/llm/llm_darwin_amd64.go
@@ -2,7 +2,10 @@ package llm
 
 import (
 	"embed"
+	"syscall"
 )
 
 //go:embed build/darwin/x86_64/*/bin/*
 var libEmbed embed.FS
+
+var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
diff --git a/llm/llm_darwin_arm64.go b/llm/llm_darwin_arm64.go
index 928f0b82..20ce8552 100644
--- a/llm/llm_darwin_arm64.go
+++ b/llm/llm_darwin_arm64.go
@@ -2,7 +2,10 @@ package llm
 
 import (
 	"embed"
+	"syscall"
 )
 
 //go:embed build/darwin/arm64/*/bin/*
 var libEmbed embed.FS
+
+var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
diff --git a/llm/llm_linux.go b/llm/llm_linux.go
index c2c5c4cb..928b4e79 100644
--- a/llm/llm_linux.go
+++ b/llm/llm_linux.go
@@ -1,6 +1,11 @@
 package llm
 
-import "embed"
+import (
+	"embed"
+	"syscall"
+)
 
 //go:embed build/linux/*/*/bin/*
 var libEmbed embed.FS
+
+var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
diff --git a/llm/llm_windows.go b/llm/llm_windows.go
index e44f4b95..763cccf9 100644
--- a/llm/llm_windows.go
+++ b/llm/llm_windows.go
@@ -1,6 +1,20 @@
 package llm
 
-import "embed"
+import (
+	"embed"
+	"syscall"
+)
 
 // unused on windows
 var libEmbed embed.FS
+
+const CREATE_DEFAULT_ERROR_MODE = 0x04000000
+
+var LlamaServerSysProcAttr = &syscall.SysProcAttr{
+	// Wire up the default error handling logic. If for some reason a DLL is
+	// missing in the path, this will pop up a GUI dialog explaining the fault so
+	// the user can either fix their PATH, or report a bug. Without this
+	// setting, the process exits immediately with a generic exit status but no
+	// way to (easily) figure out what the actual missing DLL was.
+	CreationFlags: CREATE_DEFAULT_ERROR_MODE,
+}
diff --git a/llm/server.go b/llm/server.go
index 08463ef0..55732773 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -346,6 +346,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		s.cmd.Env = os.Environ()
 		s.cmd.Stdout = os.Stdout
 		s.cmd.Stderr = s.status
+		s.cmd.SysProcAttr = LlamaServerSysProcAttr
 
 		envWorkarounds := [][2]string{}
 		for _, gpu := range gpus {

From db0968f30c895b9f2059da48800018739ef9bca7 Mon Sep 17 00:00:00 2001
From: Josh <76125168+joshyan1@users.noreply.github.com>
Date: Mon, 22 Jul 2024 15:48:15 -0700
Subject: [PATCH 12/46] fix dupe err message (#5857)

---
 server/routes.go | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/server/routes.go b/server/routes.go
index 0d7ca003..e6ffe526 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -609,10 +609,9 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
 		defer cancel()
 
 		quantization := cmp.Or(r.Quantize, r.Quantization)
-		if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); err != nil {
-			if errors.Is(err, errBadTemplate) {
-				ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest}
-			}
+		if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); errors.Is(err, errBadTemplate) {
+			ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest}
+		} else if err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()

From 5d604eec5bbaba840fcee8cac8574807f3656ea8 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen 
Date: Mon, 22 Jul 2024 16:16:28 -0700
Subject: [PATCH 13/46] Bump Go patch version

---
 .github/workflows/release.yaml | 10 +++++-----
 .github/workflows/test.yaml    | 10 +++++-----
 Dockerfile                     |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 5ae630c3..f0c6db5d 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -31,7 +31,7 @@ jobs:
           security set-keychain-settings -lut 3600 build.keychain
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - name: Build Darwin
         env:
@@ -87,7 +87,7 @@ jobs:
           write-host "plugin installed"
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - run: go get ./...
       - run: |
@@ -141,7 +141,7 @@ jobs:
           write-host "plugin installed"
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - name: 'Install ROCm'
         run: |
@@ -218,7 +218,7 @@ jobs:
           write-host "plugin installed"
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - name: 'Install CUDA'
         run: |
@@ -306,7 +306,7 @@ jobs:
           write-host "plugin installed"
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - run: go get
       - uses: actions/download-artifact@v4
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 90fef6e5..5e002a22 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -63,7 +63,7 @@ jobs:
       - uses: actions/checkout@v4
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - run: go get ./...
       - run: |
@@ -163,7 +163,7 @@ jobs:
       - uses: actions/checkout@v4
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - name: 'Install ROCm'
         run: |
@@ -200,7 +200,7 @@ jobs:
       - uses: actions/checkout@v4
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - name: 'Install CUDA'
         run: |
@@ -255,7 +255,7 @@ jobs:
           submodules: recursive
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: false
       - run: |
           case ${{ matrix.arch }} in
@@ -297,7 +297,7 @@ jobs:
           submodules: recursive
       - uses: actions/setup-go@v5
         with:
-          go-version-file: go.mod
+          go-version: "stable"
           cache: true
       - run: |
           case ${{ matrix.arch }} in
diff --git a/Dockerfile b/Dockerfile
index ca393496..c8efdd8a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-ARG GOLANG_VERSION=1.22.1
+ARG GOLANG_VERSION=1.22.5
 ARG CMAKE_VERSION=3.22.1
 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md
 ARG CUDA_VERSION=11.3.1

From a6cd8f6169c029c92105962017562274bd90626b Mon Sep 17 00:00:00 2001
From: Ajay Chintala 
Date: Tue, 23 Jul 2024 11:40:23 -0700
Subject: [PATCH 14/46] Update README.md to add LLMStack integration (#5799)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index b96f4c16..6a06b819 100644
--- a/README.md
+++ b/README.md
@@ -296,6 +296,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
 - [AI Studio](https://github.com/MindWorkAI/AI-Studio)
 - [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)
+- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
 
 ### Terminal
 

From 830fdd271536ee257db72c29c2be5b5629e58389 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen 
Date: Tue, 23 Jul 2024 15:14:28 -0700
Subject: [PATCH 15/46] Better explain multi-gpu behavior

---
 cmd/cmd.go  | 1 +
 docs/faq.md | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/cmd/cmd.go b/cmd/cmd.go
index b761d018..610fddcb 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1341,6 +1341,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_NUM_PARALLEL"],
 				envVars["OLLAMA_NOPRUNE"],
 				envVars["OLLAMA_ORIGINS"],
+				envVars["OLLAMA_SCHED_SPREAD"],
 				envVars["OLLAMA_TMPDIR"],
 				envVars["OLLAMA_FLASH_ATTENTION"],
 				envVars["OLLAMA_LLM_LIBRARY"],
diff --git a/docs/faq.md b/docs/faq.md
index da1848f7..16c80549 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -272,4 +272,8 @@ The following server settings may be used to adjust how Ollama handles concurren
 - `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time.  The default will auto-select either 4 or 1 based on available memory.
 - `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512
 
-Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting.  Once ROCm v6.2 is available, Windows Radeon will follow the defaults above.  You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM.
\ No newline at end of file
+Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting.  Once ROCm v6.2 is available, Windows Radeon will follow the defaults above.  You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs' VRAM.
+
+## How does Ollama load models on multiple GPUs?
+
+Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models.  When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available.  If the model will entirely fit on any single GPU, Ollama will load the model on that GPU.  This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference.  If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
\ No newline at end of file

From ac33aa7d3782887878e6e24fb4a6238356a489a6 Mon Sep 17 00:00:00 2001
From: royjhan <65097070+royjhan@users.noreply.github.com>
Date: Wed, 24 Jul 2024 11:15:46 -0700
Subject: [PATCH 16/46] Fix Embed Test Flakes (#5893)

* float cmp

* increase tolerance
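
For context, a minimal standalone sketch (the values are made up for illustration) of why exact equality on embedding floats flakes and how the tolerant comparison added below behaves:

```go
package main

import (
	"fmt"
	"math"
)

// floatsEqual mirrors the helpers added below: two values are considered
// equal when they differ by no more than 1e-4.
func floatsEqual(a, b float64) bool {
	return math.Abs(a-b) <= 1e-4
}

func main() {
	// Hypothetical embedding values from two runs of the same prompt.
	got := 0.0100710314
	want := 0.010071031

	fmt.Println(got == want)            // false: exact comparison flakes
	fmt.Println(floatsEqual(got, want)) // true: within tolerance
}
```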
---
 integration/embed_test.go | 59 +++++++++++++++++++++++++++++++++++----
 1 file changed, 54 insertions(+), 5 deletions(-)

diff --git a/integration/embed_test.go b/integration/embed_test.go
index aeafa57b..61b36fa2 100644
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -4,12 +4,45 @@ package integration
 
 import (
 	"context"
+	"math"
 	"testing"
 	"time"
 
 	"github.com/ollama/ollama/api"
 )
 
+func floatsEqual32(a, b float32) bool {
+	return math.Abs(float64(a-b)) <= 1e-4
+}
+
+func floatsEqual64(a, b float64) bool {
+	return math.Abs(a-b) <= 1e-4
+}
+
+func TestAllMiniLMEmbeddings(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
+	defer cancel()
+
+	req := api.EmbeddingRequest{
+		Model:  "all-minilm",
+		Prompt: "why is the sky blue?",
+	}
+
+	res, err := embeddingTestHelper(ctx, t, req)
+
+	if err != nil {
+		t.Fatalf("error: %v", err)
+	}
+
+	if len(res.Embedding) != 384 {
+		t.Fatalf("expected 384 floats, got %d", len(res.Embedding))
+	}
+
+	if !floatsEqual64(res.Embedding[0], 0.06642947345972061) {
+		t.Fatalf("expected 0.06642947345972061, got %.16f", res.Embedding[0])
+	}
+}
+
 func TestAllMiniLMEmbed(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()
@@ -33,8 +66,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
 		t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
 	}
 
-	if res.Embeddings[0][0] != 0.010071031 {
-		t.Fatalf("expected 0.010071031, got %f", res.Embeddings[0][0])
+	if !floatsEqual32(res.Embeddings[0][0], 0.010071031) {
+		t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
 	}
 }
 
@@ -61,12 +94,12 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 		t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
 	}
 
-	if res.Embeddings[0][0] != 0.010071031 || res.Embeddings[1][0] != -0.009802706 {
-		t.Fatalf("expected 0.010071031 and -0.009802706, got %f and %f", res.Embeddings[0][0], res.Embeddings[1][0])
+	if !floatsEqual32(res.Embeddings[0][0], 0.010071031) || !floatsEqual32(res.Embeddings[1][0], -0.009802706) {
+		t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
 	}
 }
 
-func TestAllMiniLmEmbedTruncate(t *testing.T) {
+func TestAllMiniLMEmbedTruncate(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
 	defer cancel()
 
@@ -135,6 +168,22 @@ func TestAllMiniLmEmbedTruncate(t *testing.T) {
 	}
 }
 
+func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
+	client, _, cleanup := InitServerConnection(ctx, t)
+	defer cleanup()
+	if err := PullIfMissing(ctx, client, req.Model); err != nil {
+		t.Fatalf("failed to pull model %s: %v", req.Model, err)
+	}
+
+	response, err := client.Embeddings(ctx, &req)
+
+	if err != nil {
+		return nil, err
+	}
+
+	return response, nil
+}
+
 func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
 	client, _, cleanup := InitServerConnection(ctx, t)
 	defer cleanup()

From bb46bbcf5e90e5efab5ff946a6c798131907ba2d Mon Sep 17 00:00:00 2001
From: Michael Yang 
Date: Wed, 24 Jul 2024 13:05:59 -0700
Subject: [PATCH 17/46] llm(llama): pass rope factors (#5924)

---
 llm/patches/0001-llama-3.1-rope-scaling.diff | 71 ++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 llm/patches/0001-llama-3.1-rope-scaling.diff

diff --git a/llm/patches/0001-llama-3.1-rope-scaling.diff b/llm/patches/0001-llama-3.1-rope-scaling.diff
new file mode 100644
index 00000000..45dcb4f5
--- /dev/null
+++ b/llm/patches/0001-llama-3.1-rope-scaling.diff
@@ -0,0 +1,71 @@
+From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001
+From: Michael Yang 
+Date: Tue, 23 Jul 2024 14:33:29 -0700
+Subject: [PATCH] llama 3.1 rope scaling
+
+---
+ src/llama.cpp | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 8fe51971..a9969df8 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -2472,6 +2472,7 @@ struct llama_layer {
+     // long rope factors
+     struct ggml_tensor * rope_long  = nullptr;
+     struct ggml_tensor * rope_short = nullptr;
++    struct ggml_tensor * rope_freqs = nullptr;
+ 
+     // bitnet scale
+     struct ggml_tensor * wq_scale;
+@@ -6143,6 +6144,8 @@ static bool llm_load_tensors(
+ 
+                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ 
++                        layer.rope_freqs  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS,  "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
++
+                         if (n_expert == 0) {
+                             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                             layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+@@ -8620,6 +8623,10 @@ struct llm_build_context {
+         // choose long/short freq factors based on the context size
+         const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
+ 
++        if (model.layers[il].rope_freqs != nullptr) {
++            return model.layers[il].rope_freqs;
++        }
++
+         if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
+             return model.layers[il].rope_long;
+         }
+@@ -8814,6 +8821,9 @@ struct llm_build_context {
+ 
+             // self-attention
+             {
++                // rope freq factors for llama3; may return nullptr for llama2 and other models
++                struct ggml_tensor * rope_factors = build_rope_factors(il);
++
+                 // compute Q and K and RoPE them
+                 struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                 cb(Qcur, "Qcur", il);
+@@ -8837,14 +8847,14 @@ struct llm_build_context {
+                 }
+ 
+                 Qcur = ggml_rope_ext(
+-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
++                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+                 cb(Qcur, "Qcur", il);
+ 
+                 Kcur = ggml_rope_ext(
+-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
++                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+-- 
+2.45.2
+

From 7c2a157ca4a9188c9d0e0c0a03a6bd9d163ba464 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen 
Date: Wed, 24 Jul 2024 13:43:26 -0700
Subject: [PATCH 18/46] Ensure amd gpu nodes are numerically sorted

For systems that enumerate over 10 CPUs, the default lexicographical
sort order interleaves CPUs and GPUs.
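
A standalone sketch of the failure mode and the fix (paths shortened, purely illustrative): `sort.Strings` orders node `10` between `1` and `2`, while parsing the node directory name as an integer restores the expected order.

```go
package main

import (
	"fmt"
	"path/filepath"
	"sort"
	"strconv"
)

func main() {
	// Shortened stand-ins for the kfd topology node property files.
	nodes := []string{"nodes/1/properties", "nodes/10/properties", "nodes/2/properties"}

	sort.Strings(nodes)
	fmt.Println(nodes) // lexicographic: node 10 lands between 1 and 2

	sort.Slice(nodes, func(i, j int) bool {
		a, _ := strconv.ParseInt(filepath.Base(filepath.Dir(nodes[i])), 10, 64)
		b, _ := strconv.ParseInt(filepath.Base(filepath.Dir(nodes[j])), 10, 64)
		return a < b
	})
	fmt.Println(nodes) // numeric: 1, 2, 10
}
```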
---
 gpu/amd_linux.go | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go
index 15b6fc61..6493af9e 100644
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -10,6 +10,7 @@ import (
 	"path/filepath"
 	"regexp"
 	"slices"
+	"sort"
 	"strconv"
 	"strings"
 
@@ -82,6 +83,20 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 	// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
 	// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
 	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
+	sort.Slice(matches, func(i, j int) bool {
+		// /sys/class/kfd/kfd/topology/nodes//properties
+		a, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[i])), 10, 64)
+		if err != nil {
+			slog.Debug("parse err", "error", err, "match", matches[i])
+			return false
+		}
+		b, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[j])), 10, 64)
+		if err != nil {
+			slog.Debug("parse err", "error", err, "match", matches[i])
+			return false
+		}
+		return a < b
+	})
 	cpuCount := 0
 	for _, match := range matches {
 		slog.Debug("evaluating amdgpu node " + match)

From 6c2129d5d0692f18e677c48d5ea7e015ecae5015 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen 
Date: Wed, 24 Jul 2024 15:22:00 -0700
Subject: [PATCH 19/46] Explain font problems on windows 10

---
 docs/windows.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/windows.md b/docs/windows.md
index 69c2aa6d..dbfc1440 100644
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -23,6 +23,8 @@ Logs will often be helpful in diagnosing the problem (see
 * NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
 * AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
 
+Ollama uses Unicode characters for progress indication, which may render as unknown squares in some older terminal fonts in Windows 10. If you see this, try changing your terminal font settings.
+
 ## API Access
 
 Here's a quick example showing API access from `powershell`

From ce3c93b08f0b90496e86b9e0a5753334c2d21419 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen 
Date: Wed, 24 Jul 2024 17:09:20 -0700
Subject: [PATCH 20/46] Report better error on cuda unsupported os/arch

If we detect an NVIDIA GPU, but nvidia doesn't support the os/arch,
this will report a better error for the user and point them to docs
to self-install the drivers if possible.
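
For illustration only, a Go rendering of the probe the shell change below performs (the repo URL is hypothetical and the real script builds it from the detected distro and architecture): check that NVIDIA publishes a repo for this OS/arch before configuring it, and surface a clear error otherwise.

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"net/http"
	"time"
)

// repoExists mirrors the `curl -I --silent --fail --location` probe: a HEAD
// request that follows redirects and succeeds only for a 2xx status.
func repoExists(ctx context.Context, url string) bool {
	req, err := http.NewRequestWithContext(ctx, http.MethodHead, url, nil)
	if err != nil {
		return false
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return false
	}
	defer resp.Body.Close()
	return resp.StatusCode >= 200 && resp.StatusCode < 300
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Hypothetical repo URL, for illustration only.
	repoURL := "https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo"
	if !repoExists(ctx, repoURL) {
		err := errors.New("NVIDIA GPU detected, but your OS and architecture are not supported by NVIDIA; install the CUDA driver manually")
		fmt.Println(err)
		return
	}
	fmt.Println("repo found; proceeding with driver install")
}
```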
---
 scripts/install.sh | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/scripts/install.sh b/scripts/install.sh
index 2a06c350..aa8b3e5e 100644
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -198,19 +198,29 @@ if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
     exit 0
 fi
 
+CUDA_REPO_ERR_MSG="NVIDIA GPU detected, but your OS and architecture are not supported by NVIDIA.  Please install the CUDA driver manually https://docs.nvidia.com/cuda/cuda-installation-guide-linux/"
 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-7-centos-7
 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-8-rocky-8
 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-9-rocky-9
 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#fedora
 install_cuda_driver_yum() {
     status 'Installing NVIDIA repository...'
+    
     case $PACKAGE_MANAGER in
         yum)
             $SUDO $PACKAGE_MANAGER -y install yum-utils
-            $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
+            if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then
+                $SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
+            else
+                error $CUDA_REPO_ERR_MSG
+            fi
             ;;
         dnf)
-            $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
+            if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then
+                $SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
+            else
+                error $CUDA_REPO_ERR_MSG
+            fi
             ;;
     esac
 
@@ -235,7 +245,11 @@ install_cuda_driver_yum() {
 # ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian
 install_cuda_driver_apt() {
     status 'Installing NVIDIA repository...'
-    curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb
+    if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb" >/dev/null ; then
+        curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb
+    else
+        error $CUDA_REPO_ERR_MSG
+    fi
 
     case $1 in
         debian)

From bbf8f102ee06bd6b149e4999571c0844aa47b12f Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan 
Date: Thu, 25 Jul 2024 18:24:55 -0400
Subject: [PATCH 21/46] Revert "llm(llama): pass rope factors (#5924)" (#5963)

This reverts commit bb46bbcf5e90e5efab5ff946a6c798131907ba2d.
---
 llm/patches/0001-llama-3.1-rope-scaling.diff | 71 --------------------
 1 file changed, 71 deletions(-)
 delete mode 100644 llm/patches/0001-llama-3.1-rope-scaling.diff

diff --git a/llm/patches/0001-llama-3.1-rope-scaling.diff b/llm/patches/0001-llama-3.1-rope-scaling.diff
deleted file mode 100644
index 45dcb4f5..00000000
--- a/llm/patches/0001-llama-3.1-rope-scaling.diff
+++ /dev/null
@@ -1,71 +0,0 @@
-From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001
-From: Michael Yang 
-Date: Tue, 23 Jul 2024 14:33:29 -0700
-Subject: [PATCH] llama 3.1 rope scaling
-
----
- src/llama.cpp | 14 ++++++++++++--
- 1 file changed, 12 insertions(+), 2 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 8fe51971..a9969df8 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -2472,6 +2472,7 @@ struct llama_layer {
-     // long rope factors
-     struct ggml_tensor * rope_long  = nullptr;
-     struct ggml_tensor * rope_short = nullptr;
-+    struct ggml_tensor * rope_freqs = nullptr;
- 
-     // bitnet scale
-     struct ggml_tensor * wq_scale;
-@@ -6143,6 +6144,8 @@ static bool llm_load_tensors(
- 
-                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- 
-+                        layer.rope_freqs  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS,  "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-+
-                         if (n_expert == 0) {
-                             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
-                             layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
-@@ -8620,6 +8623,10 @@ struct llm_build_context {
-         // choose long/short freq factors based on the context size
-         const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
- 
-+        if (model.layers[il].rope_freqs != nullptr) {
-+            return model.layers[il].rope_freqs;
-+        }
-+
-         if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
-             return model.layers[il].rope_long;
-         }
-@@ -8814,6 +8821,9 @@ struct llm_build_context {
- 
-             // self-attention
-             {
-+                // rope freq factors for llama3; may return nullptr for llama2 and other models
-+                struct ggml_tensor * rope_factors = build_rope_factors(il);
-+
-                 // compute Q and K and RoPE them
-                 struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
-                 cb(Qcur, "Qcur", il);
-@@ -8837,14 +8847,14 @@ struct llm_build_context {
-                 }
- 
-                 Qcur = ggml_rope_ext(
--                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
-                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                     ext_factor, attn_factor, beta_fast, beta_slow
-                 );
-                 cb(Qcur, "Qcur", il);
- 
-                 Kcur = ggml_rope_ext(
--                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
-                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                     ext_factor, attn_factor, beta_fast, beta_slow
-                 );
--- 
-2.45.2
-

From 4de1370a9dcc88b79ddc2d4af2e8c954bdfa67a1 Mon Sep 17 00:00:00 2001
From: royjhan <65097070+royjhan@users.noreply.github.com>
Date: Thu, 25 Jul 2024 15:34:06 -0700
Subject: [PATCH 22/46] openai tools doc (#5617)

---
 docs/openai.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/openai.md b/docs/openai.md
index 248ba74a..e51d3194 100644
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -79,7 +79,7 @@ curl http://localhost:11434/v1/chat/completions \
 - [x] JSON mode
 - [x] Reproducible outputs
 - [ ] Vision
-- [ ] Function calling
+- [x] Tools
 - [ ] Logprobs
 
 #### Supported request fields
@@ -97,9 +97,9 @@ curl http://localhost:11434/v1/chat/completions \
 - [x] `temperature`
 - [x] `top_p`
 - [x] `max_tokens`
-- [ ] `logit_bias`
-- [ ] `tools`
+- [x] `tools`
 - [ ] `tool_choice`
+- [ ] `logit_bias`
 - [ ] `user`
 - [ ] `n`
 

From 455e61170d12d2b29ac2dfe5fa6444ae40a9ef7f Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan 
Date: Thu, 25 Jul 2024 18:34:47 -0400
Subject: [PATCH 23/46] Update openai.md

---
 docs/openai.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/openai.md b/docs/openai.md
index e51d3194..04d56bd6 100644
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -78,8 +78,8 @@ curl http://localhost:11434/v1/chat/completions \
 - [x] Streaming
 - [x] JSON mode
 - [x] Reproducible outputs
-- [ ] Vision
 - [x] Tools
+- [ ] Vision
 - [ ] Logprobs
 
 #### Supported request fields

From c8af3c2d969a99618eecf169bd75aa112573ac27 Mon Sep 17 00:00:00 2001
From: Blake Mizerany 
Date: Thu, 25 Jul 2024 15:58:30 -0700
Subject: [PATCH 24/46] server: reuse original download URL for images (#5962)

This changes the registry client to reuse the original download URL
it gets on the first redirect response for all subsequent requests,
preventing thundering herd issues when hot new LLMs are released.
---
 server/download.go | 75 +++++++++++++++++++++++++++++++++++++++++++++-
 server/images.go   |  6 +++-
 2 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/server/download.go b/server/download.go
index d93cd3b4..8b5b577f 100644
--- a/server/download.go
+++ b/server/download.go
@@ -8,6 +8,7 @@ import (
 	"io"
 	"log/slog"
 	"math"
+	"math/rand/v2"
 	"net/http"
 	"net/url"
 	"os"
@@ -141,6 +142,32 @@ func (b *blobDownload) Run(ctx context.Context, requestURL *url.URL, opts *regis
 	b.err = b.run(ctx, requestURL, opts)
 }
 
+func newBackoff(maxBackoff time.Duration) func(ctx context.Context) error {
+	var n int
+	return func(ctx context.Context) error {
+		if ctx.Err() != nil {
+			return ctx.Err()
+		}
+
+		n++
+
+		// n^2 backoff timer is a little smoother than the
+		// common choice of 2^n.
+		d := min(time.Duration(n*n)*10*time.Millisecond, maxBackoff)
+		// Randomize the delay to between 0.5x and 1.5x the computed value, in
+		// order to prevent accidental "thundering herd" problems.
+		d = time.Duration(float64(d) * (rand.Float64() + 0.5))
+		t := time.NewTimer(d)
+		defer t.Stop()
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-t.C:
+			return nil
+		}
+	}
+}
+
 func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *registryOptions) error {
 	defer blobDownloadManager.Delete(b.Digest)
 	ctx, b.CancelFunc = context.WithCancel(ctx)
@@ -153,6 +180,52 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 
 	_ = file.Truncate(b.Total)
 
+	directURL, err := func() (*url.URL, error) {
+		ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
+		defer cancel()
+
+		backoff := newBackoff(10 * time.Second)
+		for {
+			// shallow clone opts to be used in the closure
+			// without affecting the outer opts.
+			newOpts := new(registryOptions)
+			*newOpts = *opts
+
+			newOpts.CheckRedirect = func(req *http.Request, via []*http.Request) error {
+				if len(via) > 10 {
+					return errors.New("maxium redirects exceeded (10) for directURL")
+				}
+
+				// if the hostname is the same, allow the redirect
+				if req.URL.Hostname() == requestURL.Hostname() {
+					return nil
+				}
+
+				// stop at the first redirect that is not
+				// the same hostname as the original
+				// request.
+				return http.ErrUseLastResponse
+			}
+
+			resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, nil, nil, newOpts)
+			if err != nil {
+				slog.Warn("failed to get direct URL; backing off and retrying", "err", err)
+				if err := backoff(ctx); err != nil {
+					return nil, err
+				}
+				continue
+			}
+			defer resp.Body.Close()
+			if resp.StatusCode != http.StatusTemporaryRedirect {
+				return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
+			}
+			return resp.Location()
+		}
+	}()
+	if err != nil {
+		return err
+	}
+
 	g, inner := errgroup.WithContext(ctx)
 	g.SetLimit(numDownloadParts)
 	for i := range b.Parts {
@@ -165,7 +238,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 			var err error
 			for try := 0; try < maxRetries; try++ {
 				w := io.NewOffsetWriter(file, part.StartsAt())
-				err = b.downloadChunk(inner, requestURL, w, part, opts)
+				err = b.downloadChunk(inner, directURL, w, part, opts)
 				switch {
 				case errors.Is(err, context.Canceled), errors.Is(err, syscall.ENOSPC):
 					// return immediately if the context is canceled or the device is out of space
diff --git a/server/images.go b/server/images.go
index 574dec19..836dbcc2 100644
--- a/server/images.go
+++ b/server/images.go
@@ -54,6 +54,8 @@ type registryOptions struct {
 	Username string
 	Password string
 	Token    string
+
+	CheckRedirect func(req *http.Request, via []*http.Request) error
 }
 
 type Model struct {
@@ -1131,7 +1133,9 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
 		req.ContentLength = contentLength
 	}
 
-	resp, err := http.DefaultClient.Do(req)
+	resp, err := (&http.Client{
+		CheckRedirect: regOpts.CheckRedirect,
+	}).Do(req)
 	if err != nil {
 		return nil, err
 	}

From 997c903884b08aef53d0f92634f74bdb64f05c0a Mon Sep 17 00:00:00 2001
From: Michael Yang 
Date: Thu, 25 Jul 2024 16:23:40 -0700
Subject: [PATCH 25/46] Update docs/template.md

Co-authored-by: Jeffrey Morgan 
---
 docs/template.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/template.md b/docs/template.md
index 8f41e8fb..f6ce06ba 100644
--- a/docs/template.md
+++ b/docs/template.md
@@ -24,7 +24,7 @@ In this example, we have:
 * Three variables: `Messages`, `Role`, and `Content` (variables)
 * A custom function (action) that iterates over an array of items (`range .Messages`) and displays each item
 
-## Adding Templates to Your Model
+## Adding templates to your model
 
 By default, models imported into Ollama have a default template of `{{ .Prompt }}`, i.e. user inputs are sent verbatim to the LLM. This is appropriate for text or code completion models but lacks essential markers for chat or instruction models.
 

From ae27d9dcfd32b7fbaa0d5a1fb0126106873332bf Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan 
Date: Thu, 25 Jul 2024 20:27:33 -0400
Subject: [PATCH 26/46] Update openai.md

---
 docs/openai.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/openai.md b/docs/openai.md
index 04d56bd6..fee30f71 100644
--- a/docs/openai.md
+++ b/docs/openai.md
@@ -78,7 +78,7 @@ curl http://localhost:11434/v1/chat/completions \
 - [x] Streaming
 - [x] JSON mode
 - [x] Reproducible outputs
-- [x] Tools
+- [x] Tools (streaming support coming soon)
 - [ ] Vision
 - [ ] Logprobs
 

From f5e3939220e9cd3d7a636708bc9df031ebfd4854 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan 
Date: Thu, 25 Jul 2024 23:10:18 -0400
Subject: [PATCH 27/46] Update api.md (#5968)

---
 docs/api.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/api.md b/docs/api.md
index 0ab70383..2d4fe28f 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -418,6 +418,7 @@ Generate the next message in a chat with a provided model. This is a streaming e
 
 - `model`: (required) the [model name](#model-names)
 - `messages`: the messages of the chat, this can be used to keep a chat memory
+- `tools`: tools for the model to use if supported. Requires `stream` to be set to `false`
 
 The `message` object has the following fields:
 
@@ -432,7 +433,6 @@ Advanced parameters (optional):
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
-- `tools`: external tools the model can use. Not all models support this feature.
 
 ### Examples
 
@@ -1286,4 +1286,4 @@ curl http://localhost:11434/api/embeddings -d '{
     0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281
   ]
 }
-```
\ No newline at end of file
+```

From 3d9de805b777ca43746a6ae951b34689aa16e8e9 Mon Sep 17 00:00:00 2001
From: Michael Yang 
Date: Fri, 26 Jul 2024 13:19:01 -0700
Subject: [PATCH 28/46] fix: model save

stop parameter is saved as a slice which is incompatible with modelfile
parsing
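
A standalone sketch of the intended behavior (plain Go, not the internal parser package): slice-valued options such as `stop` should expand into one `PARAMETER` line per entry, rather than a single `PARAMETER stop [hi there]` line, so the generated Modelfile round-trips through the parser.

```go
package main

import (
	"fmt"
	"sort"
	"strings"
)

// renderParameters writes one PARAMETER line per value, expanding slice
// options such as "stop" into repeated lines.
func renderParameters(opts map[string]any) string {
	keys := make([]string, 0, len(opts))
	for k := range opts {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	var sb strings.Builder
	for _, k := range keys {
		switch v := opts[k].(type) {
		case []string:
			for _, s := range v {
				fmt.Fprintf(&sb, "PARAMETER %s %s\n", k, s)
			}
		default:
			fmt.Fprintf(&sb, "PARAMETER %s %v\n", k, v)
		}
	}
	return sb.String()
}

func main() {
	fmt.Print(renderParameters(map[string]any{
		"temperature": 0.9,
		"stop":        []string{"hi", "there"},
	}))
	// PARAMETER stop hi
	// PARAMETER stop there
	// PARAMETER temperature 0.9
}
```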
---
 cmd/interactive.go      | 46 ++++++++++++++-----------
 cmd/interactive_test.go | 75 +++++++++++++++++++----------------------
 2 files changed, 60 insertions(+), 61 deletions(-)

diff --git a/cmd/interactive.go b/cmd/interactive.go
index adbc3e9f..2f83269e 100644
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -1,6 +1,7 @@
 package cmd
 
 import (
+	"cmp"
 	"errors"
 	"fmt"
 	"io"
@@ -9,13 +10,14 @@ import (
 	"path/filepath"
 	"regexp"
 	"slices"
-	"sort"
 	"strings"
 
 	"github.com/spf13/cobra"
+	"golang.org/x/exp/maps"
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/readline"
 	"github.com/ollama/ollama/types/errtypes"
@@ -375,9 +377,9 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 					return err
 				}
 				req := &api.ShowRequest{
-					Name:     opts.Model,
-					System:   opts.System,
-					Options:  opts.Options,
+					Name:    opts.Model,
+					System:  opts.System,
+					Options: opts.Options,
 				}
 				resp, err := client.Show(cmd.Context(), req)
 				if err != nil {
@@ -506,31 +508,35 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 }
 
 func buildModelfile(opts runOptions) string {
-	var mf strings.Builder
-	model := opts.ParentModel
-	if model == "" {
-		model = opts.Model
-	}
-	fmt.Fprintf(&mf, "FROM %s\n", model)
+	var f parser.File
+	f.Commands = append(f.Commands, parser.Command{Name: "model", Args: cmp.Or(opts.ParentModel, opts.Model)})
+
 	if opts.System != "" {
-		fmt.Fprintf(&mf, "SYSTEM \"\"\"%s\"\"\"\n", opts.System)
+		f.Commands = append(f.Commands, parser.Command{Name: "system", Args: opts.System})
 	}
 
-	keys := make([]string, 0)
-	for k := range opts.Options {
-		keys = append(keys, k)
-	}
-	sort.Strings(keys)
+	keys := maps.Keys(opts.Options)
+	slices.Sort(keys)
 	for _, k := range keys {
-		fmt.Fprintf(&mf, "PARAMETER %s %v\n", k, opts.Options[k])
+		v := opts.Options[k]
+		var cmds []parser.Command
+		switch t := v.(type) {
+		case []string:
+			for _, s := range t {
+				cmds = append(cmds, parser.Command{Name: k, Args: s})
+			}
+		default:
+			cmds = append(cmds, parser.Command{Name: k, Args: fmt.Sprintf("%v", t)})
+		}
+
+		f.Commands = append(f.Commands, cmds...)
 	}
-	fmt.Fprintln(&mf)
 
 	for _, msg := range opts.Messages {
-		fmt.Fprintf(&mf, "MESSAGE %s \"\"\"%s\"\"\"\n", msg.Role, msg.Content)
+		f.Commands = append(f.Commands, parser.Command{Name: "message", Args: fmt.Sprintf("%s: %s", msg.Role, msg.Content)})
 	}
 
-	return mf.String()
+	return f.String()
 }
 
 func normalizeFilePath(fp string) string {
diff --git a/cmd/interactive_test.go b/cmd/interactive_test.go
index 711f3860..bb7e0aba 100644
--- a/cmd/interactive_test.go
+++ b/cmd/interactive_test.go
@@ -1,12 +1,10 @@
 package cmd
 
 import (
-	"bytes"
 	"testing"
-	"text/template"
 
+	"github.com/google/go-cmp/cmp"
 	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
 
 	"github.com/ollama/ollama/api"
 )
@@ -57,58 +55,53 @@ d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png inbetween8
 
 func TestModelfileBuilder(t *testing.T) {
 	opts := runOptions{
-		Model:    "hork",
-		System:   "You are part horse and part shark, but all hork. Do horklike things",
+		Model:  "hork",
+		System: "You are part horse and part shark, but all hork. Do horklike things",
 		Messages: []api.Message{
 			{Role: "user", Content: "Hey there hork!"},
 			{Role: "assistant", Content: "Yes it is true, I am half horse, half shark."},
 		},
-		Options: map[string]interface{}{},
+		Options: map[string]any{
+			"temperature":      0.9,
+			"seed":             42,
+			"penalize_newline": false,
+			"stop":             []string{"hi", "there"},
+		},
 	}
 
-	opts.Options["temperature"] = 0.9
-	opts.Options["seed"] = 42
-	opts.Options["penalize_newline"] = false
-	opts.Options["stop"] = []string{"hi", "there"}
-
-	mf := buildModelfile(opts)
-	expectedModelfile := `FROM {{.Model}}
-SYSTEM """{{.System}}"""
+	t.Run("model", func(t *testing.T) {
+		expect := `FROM hork
+SYSTEM You are part horse and part shark, but all hork. Do horklike things
 PARAMETER penalize_newline false
 PARAMETER seed 42
-PARAMETER stop [hi there]
+PARAMETER stop hi
+PARAMETER stop there
 PARAMETER temperature 0.9
-
-MESSAGE user """Hey there hork!"""
-MESSAGE assistant """Yes it is true, I am half horse, half shark."""
+MESSAGE user Hey there hork!
+MESSAGE assistant Yes it is true, I am half horse, half shark.
 `
 
-	tmpl, err := template.New("").Parse(expectedModelfile)
-	require.NoError(t, err)
+		actual := buildModelfile(opts)
+		if diff := cmp.Diff(expect, actual); diff != "" {
+			t.Errorf("mismatch (-want +got):\n%s", diff)
+		}
+	})
 
-	var buf bytes.Buffer
-	err = tmpl.Execute(&buf, opts)
-	require.NoError(t, err)
-	assert.Equal(t, buf.String(), mf)
-
-	opts.ParentModel = "horseshark"
-	mf = buildModelfile(opts)
-	expectedModelfile = `FROM {{.ParentModel}}
-SYSTEM """{{.System}}"""
+	t.Run("parent model", func(t *testing.T) {
+		opts.ParentModel = "horseshark"
+		expect := `FROM horseshark
+SYSTEM You are part horse and part shark, but all hork. Do horklike things
 PARAMETER penalize_newline false
 PARAMETER seed 42
-PARAMETER stop [hi there]
+PARAMETER stop hi
+PARAMETER stop there
 PARAMETER temperature 0.9
-
-MESSAGE user """Hey there hork!"""
-MESSAGE assistant """Yes it is true, I am half horse, half shark."""
+MESSAGE user Hey there hork!
+MESSAGE assistant Yes it is true, I am half horse, half shark.
 `
-
-	tmpl, err = template.New("").Parse(expectedModelfile)
-	require.NoError(t, err)
-
-	var parentBuf bytes.Buffer
-	err = tmpl.Execute(&parentBuf, opts)
-	require.NoError(t, err)
-	assert.Equal(t, parentBuf.String(), mf)
+		actual := buildModelfile(opts)
+		if diff := cmp.Diff(expect, actual); diff != "" {
+			t.Errorf("mismatch (-want +got):\n%s", diff)
+		}
+	})
 }

From a622c47bd32e4c7d8d6cd12ba8c7556fcc492524 Mon Sep 17 00:00:00 2001
From: Michael Yang 
Date: Fri, 26 Jul 2024 14:10:18 -0700
Subject: [PATCH 29/46] fix nil deref in auth.go

---
 server/auth.go   | 2 +-
 server/upload.go | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/auth.go b/server/auth.go
index e92a5b65..dcef5bf9 100644
--- a/server/auth.go
+++ b/server/auth.go
@@ -67,7 +67,7 @@ func getAuthorizationToken(ctx context.Context, challenge registryChallenge) (st
 
 	headers.Add("Authorization", signature)
 
-	response, err := makeRequest(ctx, http.MethodGet, redirectURL, headers, nil, nil)
+	response, err := makeRequest(ctx, http.MethodGet, redirectURL, headers, nil, &registryOptions{})
 	if err != nil {
 		return "", err
 	}
diff --git a/server/upload.go b/server/upload.go
index 73ce78ce..c4078c22 100644
--- a/server/upload.go
+++ b/server/upload.go
@@ -254,7 +254,7 @@ func (b *blobUpload) uploadPart(ctx context.Context, method string, requestURL *
 
 		// retry uploading to the redirect URL
 		for try := range maxRetries {
-			err = b.uploadPart(ctx, http.MethodPut, redirectURL, part, nil)
+			err = b.uploadPart(ctx, http.MethodPut, redirectURL, part, &registryOptions{})
 			switch {
 			case errors.Is(err, context.Canceled):
 				return err

From 750c1c55f7ea65219e4e24d6107a4a3ad519b53f Mon Sep 17 00:00:00 2001
From: Blake Mizerany 
Date: Fri, 26 Jul 2024 14:24:24 -0700
Subject: [PATCH 30/46] server: fix race conditions during download (#5994)

This fixes various data races scattered throughout the download/pull
client where the client was accessing the download state concurrently.

This commit is mostly a hot-fix and will be replaced by a new client one
day soon.

Also, remove the unnecessary opts argument from downloadChunk.
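
A stripped-down sketch of the pattern the fix adopts (types and numbers here are illustrative): progress lives in an atomic counter and completion is signaled by closing a channel, so the waiter never reads fields a worker goroutine may still be writing.

```go
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// download is a stand-in for blobDownload: progress is an atomic counter and
// completion is signaled by closing a channel, so readers never race with
// the goroutine that is still writing.
type download struct {
	completed atomic.Int64
	done      chan struct{}
	err       error
}

func (d *download) run(total int64) {
	defer close(d.done) // the only "done" signal readers ever observe
	for d.completed.Load() < total {
		d.completed.Add(1)
		time.Sleep(time.Millisecond)
	}
}

func (d *download) wait() error {
	ticker := time.NewTicker(10 * time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-d.done:
			return d.err // safe: err is only ever written before done is closed
		case <-ticker.C:
			fmt.Println("completed:", d.completed.Load())
		}
	}
}

func main() {
	d := &download{done: make(chan struct{})}
	go d.run(50)
	if err := d.wait(); err != nil {
		fmt.Println("download failed:", err)
	}
}
```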
---
 server/download.go | 59 ++++++++++++++++++++++++++++------------------
 1 file changed, 36 insertions(+), 23 deletions(-)

diff --git a/server/download.go b/server/download.go
index 8b5b577f..45483ba6 100644
--- a/server/download.go
+++ b/server/download.go
@@ -44,17 +44,19 @@ type blobDownload struct {
 
 	context.CancelFunc
 
-	done       bool
+	done       chan struct{}
 	err        error
 	references atomic.Int32
 }
 
 type blobDownloadPart struct {
-	N           int
-	Offset      int64
-	Size        int64
-	Completed   int64
-	lastUpdated time.Time
+	N         int
+	Offset    int64
+	Size      int64
+	Completed atomic.Int64
+
+	lastUpdatedMu sync.Mutex
+	lastUpdated   time.Time
 
 	*blobDownload `json:"-"`
 }
@@ -72,7 +74,7 @@ func (p *blobDownloadPart) Name() string {
 }
 
 func (p *blobDownloadPart) StartsAt() int64 {
-	return p.Offset + p.Completed
+	return p.Offset + p.Completed.Load()
 }
 
 func (p *blobDownloadPart) StopsAt() int64 {
@@ -82,7 +84,9 @@ func (p *blobDownloadPart) StopsAt() int64 {
 func (p *blobDownloadPart) Write(b []byte) (n int, err error) {
 	n = len(b)
 	p.blobDownload.Completed.Add(int64(n))
+	p.lastUpdatedMu.Lock()
 	p.lastUpdated = time.Now()
+	p.lastUpdatedMu.Unlock()
 	return n, nil
 }
 
@@ -92,6 +96,8 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r
 		return err
 	}
 
+	b.done = make(chan struct{})
+
 	for _, partFilePath := range partFilePaths {
 		part, err := b.readPart(partFilePath)
 		if err != nil {
@@ -99,7 +105,7 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r
 		}
 
 		b.Total += part.Size
-		b.Completed.Add(part.Completed)
+		b.Completed.Add(part.Completed.Load())
 		b.Parts = append(b.Parts, part)
 	}
 
@@ -139,6 +145,7 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r
 }
 
 func (b *blobDownload) Run(ctx context.Context, requestURL *url.URL, opts *registryOptions) {
+	defer close(b.done)
 	b.err = b.run(ctx, requestURL, opts)
 }
 
@@ -230,7 +237,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 	g.SetLimit(numDownloadParts)
 	for i := range b.Parts {
 		part := b.Parts[i]
-		if part.Completed == part.Size {
+		if part.Completed.Load() == part.Size {
 			continue
 		}
 
@@ -238,7 +245,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 			var err error
 			for try := 0; try < maxRetries; try++ {
 				w := io.NewOffsetWriter(file, part.StartsAt())
-				err = b.downloadChunk(inner, directURL, w, part, opts)
+				err = b.downloadChunk(inner, directURL, w, part)
 				switch {
 				case errors.Is(err, context.Canceled), errors.Is(err, syscall.ENOSPC):
 					// return immediately if the context is canceled or the device is out of space
@@ -279,29 +286,31 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
 		return err
 	}
 
-	b.done = true
 	return nil
 }
 
-func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w io.Writer, part *blobDownloadPart, opts *registryOptions) error {
+func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w io.Writer, part *blobDownloadPart) error {
 	g, ctx := errgroup.WithContext(ctx)
 	g.Go(func() error {
-		headers := make(http.Header)
-		headers.Set("Range", fmt.Sprintf("bytes=%d-%d", part.StartsAt(), part.StopsAt()-1))
-		resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, headers, nil, opts)
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, requestURL.String(), nil)
+		if err != nil {
+			return err
+		}
+		req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", part.StartsAt(), part.StopsAt()-1))
+		resp, err := http.DefaultClient.Do(req)
 		if err != nil {
 			return err
 		}
 		defer resp.Body.Close()
 
-		n, err := io.CopyN(w, io.TeeReader(resp.Body, part), part.Size-part.Completed)
+		n, err := io.CopyN(w, io.TeeReader(resp.Body, part), part.Size-part.Completed.Load())
 		if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.ErrUnexpectedEOF) {
 			// rollback progress
 			b.Completed.Add(-n)
 			return err
 		}
 
-		part.Completed += n
+		part.Completed.Add(n)
 		if err := b.writePart(part.Name(), part); err != nil {
 			return err
 		}
@@ -315,15 +324,21 @@ func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w
 		for {
 			select {
 			case <-ticker.C:
-				if part.Completed >= part.Size {
+				if part.Completed.Load() >= part.Size {
 					return nil
 				}
 
-				if !part.lastUpdated.IsZero() && time.Since(part.lastUpdated) > 5*time.Second {
+				part.lastUpdatedMu.Lock()
+				lastUpdated := part.lastUpdated
+				part.lastUpdatedMu.Unlock()
+
+				if !lastUpdated.IsZero() && time.Since(lastUpdated) > 5*time.Second {
 					const msg = "%s part %d stalled; retrying. If this persists, press ctrl-c to exit, then 'ollama pull' to find a faster connection."
 					slog.Info(fmt.Sprintf(msg, b.Digest[7:19], part.N))
 					// reset last updated
+					part.lastUpdatedMu.Lock()
 					part.lastUpdated = time.Time{}
+					part.lastUpdatedMu.Unlock()
 					return errPartStalled
 				}
 			case <-ctx.Done():
@@ -388,6 +403,8 @@ func (b *blobDownload) Wait(ctx context.Context, fn func(api.ProgressResponse))
 	ticker := time.NewTicker(60 * time.Millisecond)
 	for {
 		select {
+		case <-b.done:
+			return b.err
 		case <-ticker.C:
 			fn(api.ProgressResponse{
 				Status:    fmt.Sprintf("pulling %s", b.Digest[7:19]),
@@ -395,10 +412,6 @@ func (b *blobDownload) Wait(ctx context.Context, fn func(api.ProgressResponse))
 				Total:     b.Total,
 				Completed: b.Completed.Load(),
 			})
-
-			if b.done || b.err != nil {
-				return b.err
-			}
 		case <-ctx.Done():
 			return ctx.Err()
 		}

From f2a96c7d778249a7f911471b6a1532339e42fcf5 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan 
Date: Fri, 26 Jul 2024 18:20:52 -0400
Subject: [PATCH 31/46] llm: keep patch for llama 3 rope factors (#5987)

---
 llm/patches/10-llama3-rope.diff | 70 +++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 llm/patches/10-llama3-rope.diff

diff --git a/llm/patches/10-llama3-rope.diff b/llm/patches/10-llama3-rope.diff
new file mode 100644
index 00000000..39f38fea
--- /dev/null
+++ b/llm/patches/10-llama3-rope.diff
@@ -0,0 +1,70 @@
+From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001
+From: Michael Yang 
+Date: Tue, 23 Jul 2024 14:33:29 -0700
+Subject: [PATCH] llama 3.1 rope scaling
+
+---
+ src/llama.cpp | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 8fe51971..a9969df8 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -2472,6 +2472,7 @@ struct llama_layer {
+     // long rope factors
+     struct ggml_tensor * rope_long  = nullptr;
+     struct ggml_tensor * rope_short = nullptr;
++    struct ggml_tensor * rope_freqs = nullptr;
+ 
+     // bitnet scale
+     struct ggml_tensor * wq_scale;
+@@ -6143,6 +6144,8 @@ static bool llm_load_tensors(
+ 
+                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ 
++                        layer.rope_freqs  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS,  "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
++
+                         if (n_expert == 0) {
+                             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                             layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+@@ -8620,6 +8623,10 @@ struct llm_build_context {
+         // choose long/short freq factors based on the context size
+         const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
+ 
++        if (model.layers[il].rope_freqs != nullptr) {
++            return model.layers[il].rope_freqs;
++        }
++
+         if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
+             return model.layers[il].rope_long;
+         }
+@@ -8814,6 +8821,9 @@ struct llm_build_context {
+ 
+             // self-attention
+             {
++                // rope freq factors for llama3; may return nullptr for llama2 and other models
++                struct ggml_tensor * rope_factors = build_rope_factors(il);
++
+                 // compute Q and K and RoPE them
+                 struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                 cb(Qcur, "Qcur", il);
+@@ -8837,14 +8847,14 @@ struct llm_build_context {
+                 }
+ 
+                 Qcur = ggml_rope_ext(
+-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
++                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+                 cb(Qcur, "Qcur", il);
+ 
+                 Kcur = ggml_rope_ext(
+-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
++                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+-- 
+2.45.2
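
The hunk above turns the frequency-factor choice into a three-way fallback: prefer the per-layer `rope_freqs` tensor when the model ships one (llama 3.1 rope scaling), otherwise keep the existing long/short selection based on context length, and feed the result into `ggml_rope_ext` via `build_rope_factors(il)`. The Go sketch below only restates that selection order for readers skimming the patch; the `tensor`/`layer` types and the `ropeFactors` helper are illustrative assumptions, not code from Ollama or llama.cpp (which implement this in C++ as shown above).

```go
// Illustrative sketch only: mirrors the selection logic in the C++ hunk above.
// Type and field names are hypothetical, not Ollama's or llama.cpp's API.
package main

import "fmt"

type tensor struct{ name string }

type layer struct {
	ropeFreqs *tensor // per-layer llama 3.1 rope scaling factors (may be nil)
	ropeLong  *tensor // long-context frequency factors
	ropeShort *tensor // short-context frequency factors
}

// ropeFactors picks the frequency-factor tensor for one layer: use the
// dedicated rope_freqs tensor when present, otherwise fall back to the
// long/short factors chosen by context size.
func ropeFactors(l layer, nCtxPerSeq, nCtxOrigYarn int) *tensor {
	if l.ropeFreqs != nil {
		return l.ropeFreqs
	}
	if nCtxPerSeq > nCtxOrigYarn {
		return l.ropeLong
	}
	return l.ropeShort
}

func main() {
	l := layer{ropeFreqs: &tensor{name: "rope_freqs.weight"}}
	fmt.Println(ropeFactors(l, 8192, 8192).name) // rope_freqs.weight
}
```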

From f3d7a481b75e0af89ae946d3923a239a3d835643 Mon Sep 17 00:00:00 2001
From: Tibor Schmidt 
Date: Sat, 27 Jul 2024 23:37:40 +0200
Subject: [PATCH 32/46] feat: add support for min_p (resolve #1142) (#1825)

---
 api/types.go          | 1 +
 cmd/interactive.go    | 1 +
 docs/api.md           | 1 +
 docs/modelfile.md     | 1 +
 llm/server.go         | 1 +
 parser/parser_test.go | 1 +
 6 files changed, 6 insertions(+)

diff --git a/api/types.go b/api/types.go
index 65a99c76..35121813 100644
--- a/api/types.go
+++ b/api/types.go
@@ -209,6 +209,7 @@ type Options struct {
 	NumPredict       int      `json:"num_predict,omitempty"`
 	TopK             int      `json:"top_k,omitempty"`
 	TopP             float32  `json:"top_p,omitempty"`
+	MinP             float32  `json:"min_p,omitempty"`
 	TFSZ             float32  `json:"tfs_z,omitempty"`
 	TypicalP         float32  `json:"typical_p,omitempty"`
 	RepeatLastN      int      `json:"repeat_last_n,omitempty"`
diff --git a/cmd/interactive.go b/cmd/interactive.go
index adbc3e9f..c3cdf629 100644
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@@ -138,6 +138,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		fmt.Fprintln(os.Stderr, "  /set parameter num_predict       Max number of tokens to predict")
 		fmt.Fprintln(os.Stderr, "  /set parameter top_k             Pick from top k num of tokens")
 		fmt.Fprintln(os.Stderr, "  /set parameter top_p           Pick token based on sum of probabilities")
+		fmt.Fprintln(os.Stderr, "  /set parameter min_p           Pick token based on top token probability * min_p")
 		fmt.Fprintln(os.Stderr, "  /set parameter num_ctx           Set the context size")
 		fmt.Fprintln(os.Stderr, "  /set parameter temperature     Set creativity level")
 		fmt.Fprintln(os.Stderr, "  /set parameter repeat_penalty  How strongly to penalize repetitions")
diff --git a/docs/api.md b/docs/api.md
index 2d4fe28f..90b41f3e 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -336,6 +336,7 @@ curl http://localhost:11434/api/generate -d '{
     "num_predict": 100,
     "top_k": 20,
     "top_p": 0.9,
+    "min_p": 0.0,
     "tfs_z": 0.5,
     "typical_p": 0.7,
     "repeat_last_n": 33,
diff --git a/docs/modelfile.md b/docs/modelfile.md
index c3645b06..852bf96c 100644
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -141,6 +141,7 @@ PARAMETER  
 | num_predict    | Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context)                                                                                                                                   | int        | num_predict 42       |
 | top_k          | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)                                                                        | int        | top_k 40             |
 | top_p          | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)                                                                 | float      | top_p 0.9            |
+| min_p          | Alternative to top_p, aiming to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. (Default: 0.0) | float      | min_p 0.05            |
 
 ### TEMPLATE
 
diff --git a/llm/server.go b/llm/server.go
index 55732773..8127960f 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -727,6 +727,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 		"temperature":       req.Options.Temperature,
 		"top_k":             req.Options.TopK,
 		"top_p":             req.Options.TopP,
+		"min_p":             req.Options.MinP,
 		"tfs_z":             req.Options.TFSZ,
 		"typical_p":         req.Options.TypicalP,
 		"repeat_last_n":     req.Options.RepeatLastN,
diff --git a/parser/parser_test.go b/parser/parser_test.go
index 2b5c4c88..48044bc0 100644
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@@ -451,6 +451,7 @@ func TestParseFileParameters(t *testing.T) {
 		"num_predict 1":                {"num_predict", "1"},
 		"top_k 1":                      {"top_k", "1"},
 		"top_p 1.0":                    {"top_p", "1.0"},
+		"min_p 0.05":                   {"min_p", "0.05"},
 		"tfs_z 1.0":                    {"tfs_z", "1.0"},
 		"typical_p 1.0":                {"typical_p", "1.0"},
 		"repeat_last_n 1":              {"repeat_last_n", "1"},

From 2c01610616074ef631ba5248f226099547ee7f57 Mon Sep 17 00:00:00 2001
From: Michael 
Date: Sun, 28 Jul 2024 17:21:38 -0400
Subject: [PATCH 33/46] update readme to llama3.1 (#5933)

---
 README.md | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index e7b12943..65c3a013 100644
--- a/README.md
+++ b/README.md
@@ -35,10 +35,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
 
 ## Quickstart
 
-To run and chat with [Llama 3](https://ollama.com/library/llama3):
+To run and chat with [Llama 3.1](https://ollama.com/library/llama3.1):
 
 ```
-ollama run llama3
+ollama run llama3.1
 ```
 
 ## Model library
@@ -49,8 +49,9 @@ Here are some example models that can be downloaded:
 
 | Model              | Parameters | Size  | Download                       |
 | ------------------ | ---------- | ----- | ------------------------------ |
-| Llama 3            | 8B         | 4.7GB | `ollama run llama3`            |
-| Llama 3            | 70B        | 40GB  | `ollama run llama3:70b`        |
+| Llama 3.1          | 8B         | 4.7GB | `ollama run llama3.1`          |
+| Llama 3.1          | 70B        | 40GB  | `ollama run llama3.1:70b`      |
+| Llama 3.1          | 405B       | 231GB | `ollama run llama3.1:405b`     |
 | Phi 3 Mini         | 3.8B       | 2.3GB | `ollama run phi3`              |
 | Phi 3 Medium       | 14B        | 7.9GB | `ollama run phi3:medium`       |
 | Gemma 2            | 9B         | 5.5GB | `ollama run gemma2`            |
@@ -97,16 +98,16 @@ See the [guide](docs/import.md) on importing models for more information.
 
 ### Customize a prompt
 
-Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3` model:
+Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3.1` model:
 
 ```
-ollama pull llama3
+ollama pull llama3.1
 ```
 
 Create a `Modelfile`:
 
 ```
-FROM llama3
+FROM llama3.1
 
 # set the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1
@@ -141,7 +142,7 @@ ollama create mymodel -f ./Modelfile
 ### Pull a model
 
 ```
-ollama pull llama3
+ollama pull llama3.1
 ```
 
 > This command can also be used to update a local model. Only the diff will be pulled.
@@ -149,13 +150,13 @@ ollama pull llama3
 ### Remove a model
 
 ```
-ollama rm llama3
+ollama rm llama3.1
 ```
 
 ### Copy a model
 
 ```
-ollama cp llama3 my-model
+ollama cp llama3.1 my-model
 ```
 
 ### Multiline input
@@ -179,14 +180,14 @@ The image features a yellow smiley face, which is likely the central focus of th
 ### Pass the prompt as an argument
 
 ```
-$ ollama run llama3 "Summarize this file: $(cat README.md)"
+$ ollama run llama3.1 "Summarize this file: $(cat README.md)"
  Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
 ```
 
 ### Show model information
 
 ```
-ollama show llama3
+ollama show llama3.1
 ```
 
 ### List models on your computer
@@ -214,7 +215,7 @@ Next, start the server:
 Finally, in a separate shell, run a model:
 
 ```
-./ollama run llama3
+./ollama run llama3.1
 ```
 
 ## REST API
@@ -225,7 +226,7 @@ Ollama has a REST API for running and managing models.
 
 ```
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3",
+  "model": "llama3.1",
   "prompt":"Why is the sky blue?"
 }'
 ```
@@ -234,7 +235,7 @@ curl http://localhost:11434/api/generate -d '{
 
 ```
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3",
+  "model": "llama3.1",
   "messages": [
     { "role": "user", "content": "why is the sky blue?" }
   ]

From 0e4d653687f81db40622e287a846245b319f1fbe Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan 
Date: Sun, 28 Jul 2024 19:56:02 -0700
Subject: [PATCH 34/46] update to `llama3.1` elsewhere in repo (#6032)

---
 app/ollama.iss                | 2 +-
 app/ollama_welcome.ps1        | 2 +-
 docs/docker.md                | 2 +-
 docs/faq.md                   | 2 +-
 docs/tutorials/langchainjs.md | 4 ++--
 macapp/src/app.tsx            | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/app/ollama.iss b/app/ollama.iss
index 6bedb9ff..dc6178f7 100644
--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -138,7 +138,7 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi
 
 
 ;FinishedHeadingLabel=Run your first model
-;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n    ollama run llama3
+;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n    ollama run llama3.1
 ;ClickFinish=%n
 
 [Registry]
diff --git a/app/ollama_welcome.ps1 b/app/ollama_welcome.ps1
index 9af37a46..46777a3a 100644
--- a/app/ollama_welcome.ps1
+++ b/app/ollama_welcome.ps1
@@ -4,5 +4,5 @@ write-host "Welcome to Ollama!"
 write-host ""
 write-host "Run your first model:"
 write-host ""
-write-host "`tollama run llama3"
+write-host "`tollama run llama3.1"
 write-host ""
\ No newline at end of file
diff --git a/docs/docker.md b/docs/docker.md
index 0b58562b..a34c3291 100644
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -63,7 +63,7 @@ docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 114
 Now you can run a model:
 
 ```
-docker exec -it ollama ollama run llama3
+docker exec -it ollama ollama run llama3.1
 ```
 
 ### Try different models
diff --git a/docs/faq.md b/docs/faq.md
index da1848f7..f2f32af4 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -227,7 +227,7 @@ curl http://localhost:11434/api/chat -d '{"model": "mistral"}'
 
 To preload a model using the CLI, use the command:
 ```shell
-ollama run llama3 ""
+ollama run llama3.1 ""
 ```
 
 ## How do I keep a model loaded in memory or make it unload immediately?
diff --git a/docs/tutorials/langchainjs.md b/docs/tutorials/langchainjs.md
index 4d60afb6..f925869b 100644
--- a/docs/tutorials/langchainjs.md
+++ b/docs/tutorials/langchainjs.md
@@ -15,7 +15,7 @@ import { Ollama } from "@langchain/community/llms/ollama";
 
 const ollama = new Ollama({
   baseUrl: "http://localhost:11434",
-  model: "llama3",
+  model: "llama3.1",
 });
 
 const answer = await ollama.invoke(`why is the sky blue?`);
@@ -23,7 +23,7 @@ const answer = await ollama.invoke(`why is the sky blue?`);
 console.log(answer);
 ```
 
-That will get us the same thing as if we ran `ollama run llama3 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
+That will get us the same thing as if we ran `ollama run llama3.1 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
 
 ```bash
 npm install cheerio
diff --git a/macapp/src/app.tsx b/macapp/src/app.tsx
index ab17df60..a627e63d 100644
--- a/macapp/src/app.tsx
+++ b/macapp/src/app.tsx
@@ -19,7 +19,7 @@ export default function () {
   const [step, setStep] = useState(Step.WELCOME)
   const [commandCopied, setCommandCopied] = useState(false)
 
-  const command = 'ollama run llama3'
+  const command = 'ollama run llama3.1'
 
   return (
     
From 6f26e9322fd4639b4e414f8890b0213783e74d7c Mon Sep 17 00:00:00 2001 From: Veit Heller Date: Mon, 29 Jul 2024 17:50:53 +0200 Subject: [PATCH 35/46] Fix typo in image docs (#6041) --- docs/api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api.md b/docs/api.md index 90b41f3e..c0202ef3 100644 --- a/docs/api.md +++ b/docs/api.md @@ -587,7 +587,7 @@ Final response: ##### Request -Send a chat message with a conversation history. +Send a chat message with images. The images should be provided as an array, with the individual images encoded in Base64. ```shell curl http://localhost:11434/api/chat -d '{ From f26aef9a8bfdd3e0f0d13cafe8bd371f29d9d877 Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Tue, 30 Jul 2024 02:53:30 +0900 Subject: [PATCH 36/46] docs: update README.md (#6059) HuggingFace -> Hugging Face --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 65c3a013..824b3761 100644 --- a/README.md +++ b/README.md @@ -390,7 +390,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Llama Coder](https://github.com/ex3ndr/llama-coder) (Copilot alternative using Ollama) - [Ollama Copilot](https://github.com/bernardo-bruning/ollama-copilot) (Proxy that allows you to use ollama as a copilot like Github copilot) - [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama) -- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and HuggingFace) +- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face) - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension) - [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend) - [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support) From 68ee42f995a04bd163eb1c714f53d4c25ab25474 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 29 Jul 2024 13:20:26 -0700 Subject: [PATCH 37/46] update llama.cpp submodule to `6eeaeba1` (#6039) --- llm/ext_server/server.cpp | 9 --- llm/llama.cpp | 2 +- llm/patches/05-default-pretokenizer.diff | 10 ++-- llm/patches/09-lora.diff | 6 +- llm/patches/10-llama3-rope.diff | 70 ------------------------ 5 files changed, 8 insertions(+), 89 deletions(-) delete mode 100644 llm/patches/10-llama3-rope.diff diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 14d921c0..0d51460c 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -2438,15 +2438,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); params.use_mmap = false; } - else if (arg == "--lora-base") - { - if (++i >= argc) - { - invalid_param = true; - break; - } - params.lora_base = argv[i]; - } else if (arg == "-v" || arg == "--verbose") { server_verbose = true; diff --git a/llm/llama.cpp b/llm/llama.cpp index d94c6e0c..6eeaeba1 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit d94c6e0ccbd29ee1ba4f44e9caa8682ad94df9fa +Subproject commit 6eeaeba126ff701f3e8f79f246805b7023709972 diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff index 646bc49c..0d40fc3c 100644 --- a/llm/patches/05-default-pretokenizer.diff +++ b/llm/patches/05-default-pretokenizer.diff @@ -1,8 
+1,8 @@ diff --git a/src/llama.cpp b/src/llama.cpp -index 8fe51971..7113ba64 100644 +index a207451f..2ddf431d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp -@@ -5433,16 +5433,7 @@ static void llm_load_vocab( +@@ -5347,16 +5347,7 @@ static void llm_load_vocab( if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { vocab.tokenizer_add_space_prefix = false; vocab.tokenizer_clean_spaces = true; @@ -20,9 +20,9 @@ index 8fe51971..7113ba64 100644 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || -@@ -5526,7 +5517,8 @@ static void llm_load_vocab( - vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM; - vocab.tokenizer_clean_spaces = false; +@@ -5443,7 +5434,8 @@ static void llm_load_vocab( + tokenizer_pre == "codeshell") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL; } else { - throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); + LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__); diff --git a/llm/patches/09-lora.diff b/llm/patches/09-lora.diff index fc1017a6..10c66d1d 100644 --- a/llm/patches/09-lora.diff +++ b/llm/patches/09-lora.diff @@ -2,7 +2,7 @@ diff --git a/common/common.cpp b/common/common.cpp index dbb724fb..c26fe6ee 100644 --- a/common/common.cpp +++ b/common/common.cpp -@@ -2087,14 +2087,29 @@ std::tuple llama_init_from_gpt_par +@@ -2087,14 +2087,27 @@ std::tuple llama_init_from_gpt_par for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); @@ -20,9 +20,7 @@ index dbb724fb..c26fe6ee 100644 + int err = llama_model_apply_lora_from_file(model, + lora_adapter.c_str(), + lora_scale, -+ ((i > 0) || params.lora_base.empty()) -+ ? NULL -+ : params.lora_base.c_str(), ++ nullptr, + params.n_threads); + if (err != 0) { + fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); diff --git a/llm/patches/10-llama3-rope.diff b/llm/patches/10-llama3-rope.diff deleted file mode 100644 index 39f38fea..00000000 --- a/llm/patches/10-llama3-rope.diff +++ /dev/null @@ -1,70 +0,0 @@ -From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001 -From: Michael Yang -Date: Tue, 23 Jul 2024 14:33:29 -0700 -Subject: [PATCH] llama 3.1 rope scaling - ---- - src/llama.cpp | 14 ++++++++++++-- - 1 file changed, 12 insertions(+), 2 deletions(-) - -diff --git a/src/llama.cpp b/src/llama.cpp -index 8fe51971..a9969df8 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -2472,6 +2472,7 @@ struct llama_layer { - // long rope factors - struct ggml_tensor * rope_long = nullptr; - struct ggml_tensor * rope_short = nullptr; -+ struct ggml_tensor * rope_freqs = nullptr; - - // bitnet scale - struct ggml_tensor * wq_scale; -@@ -6143,6 +6144,8 @@ static bool llm_load_tensors( - - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - -+ layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? 
llama_model_loader::TENSOR_DUPLICATED : 0)); -+ - if (n_expert == 0) { - layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); - layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); -@@ -8620,6 +8623,10 @@ struct llm_build_context { - // choose long/short freq factors based on the context size - const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; - -+ if (model.layers[il].rope_freqs != nullptr) { -+ return model.layers[il].rope_freqs; -+ } -+ - if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { - return model.layers[il].rope_long; - } -@@ -8814,6 +8821,9 @@ struct llm_build_context { - - // self-attention - { -+ // rope freq factors for llama3; may return nullptr for llama2 and other models -+ struct ggml_tensor * rope_factors = build_rope_factors(il); -+ - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); -@@ -8837,14 +8847,14 @@ struct llm_build_context { - } - - Qcur = ggml_rope_ext( -- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, -+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( -- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, -+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); --- -2.45.2 From 46e6327e0f85b046f5f92995d7f59146d347cd70 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 29 Jul 2024 13:35:16 -0700 Subject: [PATCH 38/46] api: add stringifier for `Tool` (#5891) --- api/types.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/api/types.go b/api/types.go index 35121813..ea5161ff 100644 --- a/api/types.go +++ b/api/types.go @@ -114,6 +114,11 @@ func (t Tools) String() string { return string(bts) } +func (t Tool) String() string { + bts, _ := json.Marshal(t) + return string(bts) +} + // Message is a single message in a chat sequence. The message contains the // role ("system", "user", or "assistant"), the content and an optional list // of images. 
From 365431d40617b85d0308fec8d0bd9c0cdb1ab3a4 Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Mon, 29 Jul 2024 13:56:57 -0700 Subject: [PATCH 39/46] return tool calls finish reason for openai (#5995) * hot fix * backend stream support * clean up * finish reason * move to openai --- openai/openai.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/openai/openai.go b/openai/openai.go index de6f4bd5..5bd80660 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -218,6 +218,9 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion { Index: 0, Message: Message{Role: r.Message.Role, Content: r.Message.Content, ToolCalls: toolCalls}, FinishReason: func(reason string) *string { + if len(toolCalls) > 0 { + reason = "tool_calls" + } if len(reason) > 0 { return &reason } From 0be8baad2b684cda667fa5d48bf334382913a09c Mon Sep 17 00:00:00 2001 From: Kim Hallberg Date: Tue, 30 Jul 2024 08:56:37 +0200 Subject: [PATCH 40/46] Update and Fix example models (#6065) * Update example models * Remove unused README.md --- examples/go-chat/main.go | 2 +- examples/go-generate-streaming/main.go | 2 +- examples/go-generate/main.go | 2 +- examples/go-http-generate/README.md | 0 examples/langchain-python-rag-document/README.md | 8 ++++++++ examples/langchain-python-rag-document/main.py | 2 +- examples/langchain-python-rag-websummary/README.md | 4 ++-- examples/langchain-python-rag-websummary/main.py | 4 ++-- examples/langchain-python-simple/README.md | 4 ++-- examples/langchain-python-simple/main.py | 2 +- examples/modelfile-mario/Modelfile | 2 +- examples/modelfile-mario/readme.md | 6 +++--- examples/python-dockerit/dockerit.py | 2 +- examples/python-json-datagenerator/predefinedschema.py | 2 +- examples/python-json-datagenerator/randomaddresses.py | 2 +- examples/python-json-datagenerator/readme.md | 4 ++-- examples/python-simplechat/client.py | 2 +- examples/python-simplechat/readme.md | 4 ++-- examples/typescript-simplechat/client.ts | 2 +- 19 files changed, 32 insertions(+), 24 deletions(-) delete mode 100644 examples/go-http-generate/README.md diff --git a/examples/go-chat/main.go b/examples/go-chat/main.go index 5266f03e..7663fb8f 100644 --- a/examples/go-chat/main.go +++ b/examples/go-chat/main.go @@ -35,7 +35,7 @@ func main() { ctx := context.Background() req := &api.ChatRequest{ - Model: "llama3", + Model: "llama3.1", Messages: messages, } diff --git a/examples/go-generate-streaming/main.go b/examples/go-generate-streaming/main.go index 49403351..3acfb22a 100644 --- a/examples/go-generate-streaming/main.go +++ b/examples/go-generate-streaming/main.go @@ -16,7 +16,7 @@ func main() { // By default, GenerateRequest is streaming. 
req := &api.GenerateRequest{ - Model: "gemma", + Model: "gemma2", Prompt: "how many planets are there?", } diff --git a/examples/go-generate/main.go b/examples/go-generate/main.go index 50fbf64b..2fe28742 100644 --- a/examples/go-generate/main.go +++ b/examples/go-generate/main.go @@ -15,7 +15,7 @@ func main() { } req := &api.GenerateRequest{ - Model: "gemma", + Model: "gemma2", Prompt: "how many planets are there?", // set streaming to false diff --git a/examples/go-http-generate/README.md b/examples/go-http-generate/README.md deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/langchain-python-rag-document/README.md b/examples/langchain-python-rag-document/README.md index 20a73a88..e2f3bc02 100644 --- a/examples/langchain-python-rag-document/README.md +++ b/examples/langchain-python-rag-document/README.md @@ -4,6 +4,14 @@ This example provides an interface for asking questions to a PDF document. ## Setup +1. Ensure you have the `llama3.1` model installed: + +``` +ollama pull llama3.1 +``` + +2. Install the Python Requirements. + ``` pip install -r requirements.txt ``` diff --git a/examples/langchain-python-rag-document/main.py b/examples/langchain-python-rag-document/main.py index 3ed9499f..6f7cec9b 100644 --- a/examples/langchain-python-rag-document/main.py +++ b/examples/langchain-python-rag-document/main.py @@ -51,7 +51,7 @@ while True: template=template, ) - llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])) + llm = Ollama(model="llama3.1", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])) qa_chain = RetrievalQA.from_chain_type( llm, retriever=vectorstore.as_retriever(), diff --git a/examples/langchain-python-rag-websummary/README.md b/examples/langchain-python-rag-websummary/README.md index 3f3b9873..29c706a3 100644 --- a/examples/langchain-python-rag-websummary/README.md +++ b/examples/langchain-python-rag-websummary/README.md @@ -4,10 +4,10 @@ This example summarizes the website, [https://ollama.com/blog/run-llama2-uncenso ## Running the Example -1. Ensure you have the `llama2` model installed: +1. Ensure you have the `llama3.1` model installed: ```bash - ollama pull llama2 + ollama pull llama3.1 ``` 2. Install the Python Requirements. diff --git a/examples/langchain-python-rag-websummary/main.py b/examples/langchain-python-rag-websummary/main.py index d1b05ba8..77b09fbb 100644 --- a/examples/langchain-python-rag-websummary/main.py +++ b/examples/langchain-python-rag-websummary/main.py @@ -5,8 +5,8 @@ from langchain.chains.summarize import load_summarize_chain loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally") docs = loader.load() -llm = Ollama(model="llama3") +llm = Ollama(model="llama3.1") chain = load_summarize_chain(llm, chain_type="stuff") -result = chain.invoke(docs) +result = chain.invoke(docs) print(result) diff --git a/examples/langchain-python-simple/README.md b/examples/langchain-python-simple/README.md index d4102dec..60db2c8c 100644 --- a/examples/langchain-python-simple/README.md +++ b/examples/langchain-python-simple/README.md @@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama. ## Running the Example -1. Ensure you have the `llama3` model installed: +1. Ensure you have the `llama3.1` model installed: ```bash - ollama pull llama3 + ollama pull llama3.1 ``` 2. Install the Python Requirements. 
diff --git a/examples/langchain-python-simple/main.py b/examples/langchain-python-simple/main.py index 7cb65286..a7ed81d6 100644 --- a/examples/langchain-python-simple/main.py +++ b/examples/langchain-python-simple/main.py @@ -1,6 +1,6 @@ from langchain.llms import Ollama input = input("What is your question?") -llm = Ollama(model="llama3") +llm = Ollama(model="llama3.1") res = llm.predict(input) print (res) diff --git a/examples/modelfile-mario/Modelfile b/examples/modelfile-mario/Modelfile index 33d5952b..a3747086 100644 --- a/examples/modelfile-mario/Modelfile +++ b/examples/modelfile-mario/Modelfile @@ -1,4 +1,4 @@ -FROM llama3 +FROM llama3.1 PARAMETER temperature 1 SYSTEM """ You are Mario from super mario bros, acting as an assistant. diff --git a/examples/modelfile-mario/readme.md b/examples/modelfile-mario/readme.md index e4f0d417..c3f34197 100644 --- a/examples/modelfile-mario/readme.md +++ b/examples/modelfile-mario/readme.md @@ -2,12 +2,12 @@ # Example character: Mario -This example shows how to create a basic character using Llama3 as the base model. +This example shows how to create a basic character using Llama3.1 as the base model. To run this example: 1. Download the Modelfile -2. `ollama pull llama3` to get the base model used in the model file. +2. `ollama pull llama3.1` to get the base model used in the model file. 3. `ollama create NAME -f ./Modelfile` 4. `ollama run NAME` @@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?" What the model file looks like: ``` -FROM llama3 +FROM llama3.1 PARAMETER temperature 1 SYSTEM """ You are Mario from Super Mario Bros, acting as an assistant. diff --git a/examples/python-dockerit/dockerit.py b/examples/python-dockerit/dockerit.py index b013102f..6a288d90 100644 --- a/examples/python-dockerit/dockerit.py +++ b/examples/python-dockerit/dockerit.py @@ -4,7 +4,7 @@ imageName = input("Enter the name of the image: ") client = docker.from_env() s = requests.Session() output="" -with s.post('http://localhost:11434/api/generate', json={'model': 'dockerit', 'prompt': inputDescription}, stream=True) as r: +with s.post('http://localhost:11434/api/generate', json={'model': 'mattw/dockerit', 'prompt': inputDescription}, stream=True) as r: for line in r.iter_lines(): if line: j = json.loads(line) diff --git a/examples/python-json-datagenerator/predefinedschema.py b/examples/python-json-datagenerator/predefinedschema.py index 1fd54892..68090ad7 100644 --- a/examples/python-json-datagenerator/predefinedschema.py +++ b/examples/python-json-datagenerator/predefinedschema.py @@ -2,7 +2,7 @@ import requests import json import random -model = "llama3" +model = "llama3.1" template = { "firstName": "", "lastName": "", diff --git a/examples/python-json-datagenerator/randomaddresses.py b/examples/python-json-datagenerator/randomaddresses.py index 72b1fefb..878c9803 100644 --- a/examples/python-json-datagenerator/randomaddresses.py +++ b/examples/python-json-datagenerator/randomaddresses.py @@ -12,7 +12,7 @@ countries = [ "France", ] country = random.choice(countries) -model = "llama3" +model = "llama3.1" prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters." 
diff --git a/examples/python-json-datagenerator/readme.md b/examples/python-json-datagenerator/readme.md index 88357044..5b444dff 100644 --- a/examples/python-json-datagenerator/readme.md +++ b/examples/python-json-datagenerator/readme.md @@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran ## Running the Example -1. Ensure you have the `llama3` model installed: +1. Ensure you have the `llama3.1` model installed: ```bash - ollama pull llama3 + ollama pull llama3.1 ``` 2. Install the Python Requirements. diff --git a/examples/python-simplechat/client.py b/examples/python-simplechat/client.py index f82a16b3..85043d5f 100644 --- a/examples/python-simplechat/client.py +++ b/examples/python-simplechat/client.py @@ -2,7 +2,7 @@ import json import requests # NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve` -model = "llama3" # TODO: update this for whatever model you wish to use +model = "llama3.1" # TODO: update this for whatever model you wish to use def chat(messages): diff --git a/examples/python-simplechat/readme.md b/examples/python-simplechat/readme.md index dd2576bc..4c2ded4d 100644 --- a/examples/python-simplechat/readme.md +++ b/examples/python-simplechat/readme.md @@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam ## Running the Example -1. Ensure you have the `llama3` model installed: +1. Ensure you have the `llama3.1` model installed: ```bash - ollama pull llama3 + ollama pull llama3.1 ``` 2. Install the Python Requirements. diff --git a/examples/typescript-simplechat/client.ts b/examples/typescript-simplechat/client.ts index a1e0eea3..8ad113b1 100644 --- a/examples/typescript-simplechat/client.ts +++ b/examples/typescript-simplechat/client.ts @@ -1,6 +1,6 @@ import * as readline from "readline"; -const model = "llama3"; +const model = "llama3.1"; type Message = { role: "assistant" | "user" | "system"; content: string; From 345420998e90090d2d6fba38ad5c2f3f5512adf4 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 22 Jul 2024 11:57:26 -0700 Subject: [PATCH 41/46] Prevent partial loading on mixed GPU brands In mult-brand GPU setups, if we couldn't fully load the model we would fall through the scheduler and mistakenly try to load across a mix of brands. This makes sure we find the set of GPU(s) that best fit for the partial load. --- server/sched.go | 31 +++++++++++++++++++++++++++---- server/sched_test.go | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/server/sched.go b/server/sched.go index 2daed3ab..92b8d508 100644 --- a/server/sched.go +++ b/server/sched.go @@ -212,9 +212,12 @@ func (s *Scheduler) processPending(ctx context.Context) { } else if loadedCount == 0 { // No models loaded. Load the model but prefer the best fit. 
slog.Debug("loading first model", "model", pending.model.ModelPath) - g := pickBestFitGPUs(pending, ggml, gpus, &numParallel) + g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel) if g != nil { gpus = g + } else { + // Only allow partial loads when this is the first model + gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel) } s.loadFn(pending, ggml, gpus, numParallel) break @@ -231,7 +234,7 @@ func (s *Scheduler) processPending(ctx context.Context) { // Update free memory from currently loaded models s.updateFreeSpace(availGpus) - fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel) + fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel) if fitGpus != nil { slog.Debug("new model fits with existing models, loading") s.loadFn(pending, ggml, fitGpus, numParallel) @@ -668,11 +671,12 @@ func (a ByDuration) Less(i, j int) bool { // func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] } // func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM } -// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits +// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits +// The list of GPUs returned will always be the same brand (library) // If the model can not be fit fully within the available GPU(s) nil is returned // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust // opts.NumCtx accordingly -func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { +func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { var estimatedVRAM uint64 var numParallelToTry []int @@ -723,6 +727,25 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP return nil } +// If multiple Libraries are detected, pick the Library which loads the most layers for the model +func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { + *numParallel = 1 + byLibrary := gpus.ByLibrary() + if len(byLibrary) <= 1 { + return gpus + } + var bestEstimate uint64 + var bestFit int + for i, gl := range byLibrary { + _, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts) + if estimatedVRAM > bestEstimate { + bestEstimate = estimatedVRAM + bestFit = i + } + } + return byLibrary[bestFit] +} + // findRunnerToUnload finds a runner to unload to make room for a new model func (s *Scheduler) findRunnerToUnload() *runnerRef { s.loadedMu.Lock() diff --git a/server/sched_test.go b/server/sched_test.go index 9ddd1fab..a186ce0e 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -666,6 +666,45 @@ func TestAlreadyCanceled(t *testing.T) { require.Empty(t, scenario1a.req.successCh) } +func TestHomogeneousGPUs(t *testing.T) { + ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + defer done() + s := InitScheduler(ctx) + + s.getGpuFn = func() gpu.GpuInfoList { + // Set memory values to require the model to be spread + gpus := []gpu.GpuInfo{ + {Library: "cuda"}, + {Library: "rocm"}, + } + gpus[0].TotalMemory = 1 * format.GibiByte + gpus[0].FreeMemory = 256 * format.MebiByte + gpus[1].TotalMemory = 1 * format.GibiByte + gpus[1].FreeMemory = 256 * format.MebiByte + return gpus + } + 
s.getCpuFn = getCpuFn + a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}) + s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + require.Len(t, gpus, 1) + return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel) + } + slog.Info("a") + s.pendingReqCh <- a.req + require.Len(t, s.pendingReqCh, 1) + s.Run(ctx) + select { + case resp := <-a.req.successCh: + require.Equal(t, resp.llama, a.srv) + require.Empty(t, s.pendingReqCh) + require.Empty(t, a.req.errCh) + case err := <-a.req.errCh: + t.Fatal(err.Error()) + case <-ctx.Done(): + t.Fatal("timeout") + } +} + type mockLlm struct { pingResp error waitResp error From 1b44d873e74f62de4f53f154da386919c1426f8b Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Tue, 30 Jul 2024 13:12:21 -0700 Subject: [PATCH 42/46] Add Metrics to `api\embed` response (#5709) * add prompt tokens to embed response * rm slog * metrics * types * prompt n * clean up * reset submodule * update tests * test name * list metrics --- api/types.go | 4 ++++ integration/embed_test.go | 8 ++++++++ llm/ext_server/server.cpp | 7 ++++++- llm/server.go | 13 +++++++------ server/routes.go | 18 ++++++++++++------ server/sched_test.go | 4 ++-- 6 files changed, 39 insertions(+), 15 deletions(-) diff --git a/api/types.go b/api/types.go index ea5161ff..c2529652 100644 --- a/api/types.go +++ b/api/types.go @@ -267,6 +267,10 @@ type EmbedRequest struct { type EmbedResponse struct { Model string `json:"model"` Embeddings [][]float32 `json:"embeddings"` + + TotalDuration time.Duration `json:"total_duration,omitempty"` + LoadDuration time.Duration `json:"load_duration,omitempty"` + PromptEvalCount int `json:"prompt_eval_count,omitempty"` } // EmbeddingRequest is the request passed to [Client.Embeddings]. 
diff --git a/integration/embed_test.go b/integration/embed_test.go index 61b36fa2..10333d5d 100644 --- a/integration/embed_test.go +++ b/integration/embed_test.go @@ -69,6 +69,10 @@ func TestAllMiniLMEmbed(t *testing.T) { if !floatsEqual32(res.Embeddings[0][0], 0.010071031) { t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0]) } + + if res.PromptEvalCount != 8 { + t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount) + } } func TestAllMiniLMBatchEmbed(t *testing.T) { @@ -97,6 +101,10 @@ func TestAllMiniLMBatchEmbed(t *testing.T) { if !floatsEqual32(res.Embeddings[0][0], 0.010071031) || !floatsEqual32(res.Embeddings[1][0], -0.009802706) { t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0]) } + + if res.PromptEvalCount != 16 { + t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount) + } } func TestAllMiniLMEmbedTruncate(t *testing.T) { diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 0d51460c..d72bb1b1 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -1221,6 +1221,7 @@ struct llama_server_context res.result_json = json { {"embedding", std::vector(embd, embd + n_embd)}, + {"timings", slot.get_formated_timings()}, }; } } @@ -3203,11 +3204,15 @@ int main(int argc, char **argv) { responses = result.result_json.value("results", std::vector{result.result_json}); json embeddings = json::array(); + + int prompt_n = 0; for (auto & elem : responses) { embeddings.push_back(elem.at("embedding")); + prompt_n += elem.at("timings").at("prompt_n").get(); } + // send the result - json embedding_res = json{{"embedding", embeddings}}; + json embedding_res = json{{"embedding", embeddings}, {"prompt_n", prompt_n}}; return res.set_content(embedding_res.dump(), "application/json; charset=utf-8"); } }); diff --git a/llm/server.go b/llm/server.go index 8127960f..afde077e 100644 --- a/llm/server.go +++ b/llm/server.go @@ -33,7 +33,7 @@ type LlamaServer interface { Ping(ctx context.Context) error WaitUntilRunning(ctx context.Context) error Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error - Embed(ctx context.Context, input []string) ([][]float32, error) + Embed(ctx context.Context, input []string) (*EmbedResponse, error) Tokenize(ctx context.Context, content string) ([]int, error) Detokenize(ctx context.Context, tokens []int) (string, error) Close() error @@ -879,10 +879,11 @@ type EmbedRequest struct { } type EmbedResponse struct { - Embedding [][]float32 `json:"embedding"` + Embedding [][]float32 `json:"embedding"` + PromptEvalCount int `json:"prompt_n"` } -func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, error) { +func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) { if err := s.sem.Acquire(ctx, 1); err != nil { slog.Error("Failed to acquire semaphore", "error", err) return nil, err @@ -924,12 +925,12 @@ func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, err return nil, fmt.Errorf("%s", body) } - var embedding EmbedResponse - if err := json.Unmarshal(body, &embedding); err != nil { + var e EmbedResponse + if err := json.Unmarshal(body, &e); err != nil { return nil, fmt.Errorf("unmarshal tokenize response: %w", err) } - return embedding.Embedding, nil + return &e, nil } type TokenizeRequest struct { diff --git a/server/routes.go b/server/routes.go index e6ffe526..a560f369 100644 --- a/server/routes.go +++ b/server/routes.go @@ -284,6 +284,7 @@ 
func (s *Server) GenerateHandler(c *gin.Context) { } func (s *Server) EmbedHandler(c *gin.Context) { + checkpointStart := time.Now() var req api.EmbedRequest err := c.ShouldBindJSON(&req) switch { @@ -332,6 +333,8 @@ func (s *Server) EmbedHandler(c *gin.Context) { return } + checkpointLoaded := time.Now() + kvData, err := getKVData(m.ModelPath, false) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) @@ -370,13 +373,16 @@ func (s *Server) EmbedHandler(c *gin.Context) { return } - for i, e := range embeddings { - embeddings[i] = normalize(e) + for i, e := range embeddings.Embedding { + embeddings.Embedding[i] = normalize(e) } resp := api.EmbedResponse{ - Model: req.Model, - Embeddings: embeddings, + Model: req.Model, + Embeddings: embeddings.Embedding, + TotalDuration: time.Since(checkpointStart), + LoadDuration: checkpointLoaded.Sub(checkpointStart), + PromptEvalCount: embeddings.PromptEvalCount, } c.JSON(http.StatusOK, resp) } @@ -428,9 +434,9 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) { return } - embedding := make([]float64, len(embeddings[0])) + embedding := make([]float64, len(embeddings.Embedding[0])) - for i, v := range embeddings[0] { + for i, v := range embeddings.Embedding[0] { embedding[i] = float64(v) } diff --git a/server/sched_test.go b/server/sched_test.go index a186ce0e..4f8789fa 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -709,7 +709,7 @@ type mockLlm struct { pingResp error waitResp error completionResp error - embedResp [][]float32 + embedResp *llm.EmbedResponse embedRespErr error tokenizeResp []int tokenizeRespErr error @@ -727,7 +727,7 @@ func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitRes func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error { return s.completionResp } -func (s *mockLlm) Embed(ctx context.Context, input []string) ([][]float32, error) { +func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) { return s.embedResp, s.embedRespErr } func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) { From afa8d6e9d56da834a03df7817d065f6c8b46e102 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Tue, 30 Jul 2024 18:06:26 -0700 Subject: [PATCH 43/46] patch gemma support --- llm/patches/10-params.diff | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 llm/patches/10-params.diff diff --git a/llm/patches/10-params.diff b/llm/patches/10-params.diff new file mode 100644 index 00000000..56699b8e --- /dev/null +++ b/llm/patches/10-params.diff @@ -0,0 +1,20 @@ +diff --git a/src/llama.cpp b/src/llama.cpp +index a207451f..fba6b175 100644 +--- a/src/llama.cpp ++++ b/src/llama.cpp +@@ -4969,6 +4969,7 @@ static void llm_load_hparams( + hparams.attn_soft_cap = true; + + switch (hparams.n_layer) { ++ case 26: model.type = e_model::MODEL_2B; break; + case 42: model.type = e_model::MODEL_9B; break; + case 46: model.type = e_model::MODEL_27B; break; + default: model.type = e_model::MODEL_UNKNOWN; +@@ -11736,6 +11737,7 @@ struct llm_build_context { + + // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e + switch (model.type) { ++ case e_model::MODEL_2B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; + case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break; + case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / 
n_head))); break; + default: GGML_ABORT("fatal error"); From 5d6657835669064fa9658e6712b01887a072c606 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Tue, 30 Jul 2024 18:08:34 -0700 Subject: [PATCH 44/46] Update README.md Better example for multi-modal input --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 824b3761..0593a785 100644 --- a/README.md +++ b/README.md @@ -173,7 +173,7 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol ### Multimodal models ``` ->>> What's in this image? /Users/jmorgan/Desktop/smile.png +ollama run llava "What's in this image? /Users/jmorgan/Desktop/smile.png" The image features a yellow smiley face, which is likely the central focus of the picture. ``` From 3579b4966a9b21e048db4f7610e3f9f4a5c4dc64 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 30 Jul 2024 18:40:09 -0700 Subject: [PATCH 45/46] Update README to include Firebase Genkit (#6083) Firebase Genkit --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0593a785..941a4f99 100644 --- a/README.md +++ b/README.md @@ -337,6 +337,7 @@ See the [API documentation](./docs/api.md) for all endpoints. ### Libraries - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa) +- [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama) - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example) - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java) - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs) From 463a8aa2731a9fe5258c6c7e1466f3dae27f0c6a Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Tue, 30 Jul 2024 21:01:12 -0700 Subject: [PATCH 46/46] Create SECURITY.md --- SECURITY.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..d38bb7c4 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,25 @@ +# Security + +The Ollama maintainer team takes security seriously and will actively work to resolve security issues. + +## Reporting a vulnerability + +If you discover a security vulnerability, please do not open a public issue. Instead, please report it by emailing hello@ollama.com. We ask that you give us sufficient time to investigate and address the vulnerability before disclosing it publicly. + +Please include the following details in your report: +- A description of the vulnerability +- Steps to reproduce the issue +- Your assessment of the potential impact +- Any possible mitigations + +## Security best practices + +While the maintainer team does their best to secure Ollama, users are encouraged to implement their own security best practices, such as: + +- Regularly updating to the latest version of Ollama +- Securing access to hosted instances of Ollama +- Monitoring systems for unusual activity + +## Contact + +For any other questions or concerns related to security, please contact us at hello@ollama.com