From 784bf88b0d0005b771e1bab5adfd6094a3693494 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Tue, 18 Jun 2024 16:22:47 -0700
Subject: [PATCH 01/48] Wire up windows AMD driver reporting

This seems to be the ROCm version, not the actual driver version, but it
may be useful for toggling logic for VRAM reporting in the future.
---
 gpu/amd_hip_windows.go |  5 ++---
 gpu/amd_windows.go     | 17 +++++++----------
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/gpu/amd_hip_windows.go b/gpu/amd_hip_windows.go
index 8572a24c..2586278c 100644
--- a/gpu/amd_hip_windows.go
+++ b/gpu/amd_hip_windows.go
@@ -84,9 +84,8 @@ func (hl *HipLib) AMDDriverVersion() (driverMajor, driverMinor int, err error) {
 	}
 
 	slog.Debug("hipDriverGetVersion", "version", version)
-	// TODO - this isn't actually right, but the docs claim hipDriverGetVersion isn't accurate anyway...
-	driverMajor = version / 1000
-	driverMinor = (version - (driverMajor * 1000)) / 10
+	driverMajor = version / 10000000
+	driverMinor = (version - (driverMajor * 10000000)) / 100000
 	return driverMajor, driverMinor, nil
 }
 
diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go
index 21585277..0c76f6b9 100644
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@@ -35,12 +35,11 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 	}
 	defer hl.Release()
 
-	// TODO - this reports incorrect version information, so omitting for now
-	// driverMajor, driverMinor, err := hl.AMDDriverVersion()
-	// if err != nil {
-	// 	// For now this is benign, but we may eventually need to fail compatibility checks
-	// 	slog.Debug("error looking up amd driver version", "error", err)
-	// }
+	driverMajor, driverMinor, err := hl.AMDDriverVersion()
+	if err != nil {
+		// For now this is benign, but we may eventually need to fail compatibility checks
+		slog.Debug("error looking up amd driver version", "error", err)
+	}
 
 	// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
 	count := hl.HipGetDeviceCount()
@@ -131,10 +130,8 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			MinimumMemory: rocmMinimumMemory,
 			Name:          name,
 			Compute:       gfx,
-
-			// TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
-			// DriverMajor: driverMajor,
-			// DriverMinor: driverMinor,
+			DriverMajor:   driverMajor,
+			DriverMinor:   driverMinor,
 		},
 		index: i,
 	}

From 269ed6e6a2cea822ab137d40d5c70c8bf09470f8 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Mon, 17 Jun 2024 10:38:55 -0700
Subject: [PATCH 02/48] update message processing

---
 server/images.go          |  17 +-
 server/prompt.go          | 241 ++++--------------
 server/prompt_test.go     | 317 ++++++++++++------------
 server/routes.go          | 508 ++++++++++++--------------------------
 template/template.go      | 169 ++++++++++++-
 template/template_test.go | 153 +++++++++++-
 6 files changed, 685 insertions(+), 720 deletions(-)

diff --git a/server/images.go b/server/images.go
index a62991f1..688d5dca 100644
--- a/server/images.go
+++ b/server/images.go
@@ -34,6 +34,8 @@ import (
 	"github.com/ollama/ollama/version"
 )
 
+var errCapabilityCompletion = errors.New("completion")
+
 type Capability string
 
 const CapabilityCompletion = Capability("completion")
@@ -62,7 +64,10 @@ type Model struct {
 	Template *template.Template
 }
 
-func (m *Model) Has(caps ...Capability) bool {
+// CheckCapabilities checks if the model has the specified capabilities, returning an error describing
+// any missing or unknown capabilities.
+func (m *Model) CheckCapabilities(caps ...Capability) error {
+	var errs []error
 	for _, cap := range caps {
 		switch cap {
 		case CapabilityCompletion:
@@ -81,15 +86,19 @@ func (m *Model) Has(caps ...Capability) bool {
 			}
 
 			if _, ok := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]; ok {
-				return false
+				errs = append(errs, errCapabilityCompletion)
 			}
 		default:
 			slog.Error("unknown capability", "capability", cap)
-			return false
+			return fmt.Errorf("unknown capability: %s", cap)
 		}
 	}
 
-	return true
+	if err := errors.Join(errs...); err != nil {
+		return fmt.Errorf("missing capabilities: %w", errors.Join(errs...))
+	}
+
+	return nil
 }
 
 func (m *Model) String() string {
diff --git a/server/prompt.go b/server/prompt.go
index bfc319a5..5016fbe1 100644
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -1,217 +1,74 @@
 package server
 
 import (
-	"fmt"
+	"bytes"
+	"context"
 	"log/slog"
-	"strings"
-
-	"text/template/parse"
+	"slices"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/template"
 )
 
-// isResponseNode checks if the node contains .Response
-func isResponseNode(node *parse.ActionNode) bool {
-	for _, cmd := range node.Pipe.Cmds {
-		for _, arg := range cmd.Args {
-			if fieldNode, ok := arg.(*parse.FieldNode); ok && len(fieldNode.Ident) > 0 {
-				if fieldNode.Ident[0] == "Response" {
-					return true
-				}
-			}
+func chatPrompt(ctx context.Context, r *runnerRef, msgs []api.Message) (prompt string, images []llm.ImageData, _ error) {
+	// extract system messages which should always be included
+	var system []api.Message
+	msgs = slices.DeleteFunc(msgs, func(m api.Message) bool {
+		if m.Role == "system" {
+			system = append(system, m)
+			return true
 		}
-	}
-	return false
-}
 
-// formatTemplateForResponse formats the template AST to:
-// 1. remove all nodes after the first .Response (if generate=true)
-// 2. add a .Response node to the end if it doesn't exist
-// TODO(jmorganca): this should recursively cut the template before the first .Response
-func formatTemplateForResponse(tmpl *template.Template, generate bool) {
-	var found bool
-	for i, node := range tmpl.Tree.Root.Nodes {
-		if actionNode, ok := node.(*parse.ActionNode); ok {
-			if isResponseNode(actionNode) {
-				found = true
-				if generate {
-					tmpl.Tree.Root.Nodes = tmpl.Tree.Root.Nodes[:i+1]
-					break
-				}
-			}
+		return false
+	})
+
+	if len(system) == 0 && r.model.System != "" {
+		// add model system prompt since it wasn't provided
+		system = append(system, api.Message{Role: "system", Content: r.model.System})
+	}
+
+	n := len(msgs) - 1
+	for i := n - 1; i >= 0; i-- {
+		var b bytes.Buffer
+		if err := r.model.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...)}); err != nil {
+			return "", nil, err
 		}
-	}
-
-	if !found {
-		// add the response node if it doesn't exist
-		responseFieldNode := &parse.FieldNode{NodeType: parse.NodeField, Ident: []string{"Response"}}
-		responsePipeNode := &parse.PipeNode{NodeType: parse.NodePipe, Cmds: []*parse.CommandNode{{NodeType: parse.NodeCommand, Args: []parse.Node{responseFieldNode}}}}
-		responseActionNode := &parse.ActionNode{NodeType: parse.NodeAction, Pipe: responsePipeNode}
-		tmpl.Tree.Root.Nodes = append(tmpl.Tree.Root.Nodes, responseActionNode)
-	}
-}
-
-// Prompt renders a prompt from a template. If generate is set to true,
-// the response and parts of the template following it are not rendered
-func Prompt(tmpl *template.Template, system, prompt, response string, generate bool) (string, error) {
-	formatTemplateForResponse(tmpl, generate)
-
-	vars := map[string]any{
-		"System":   system,
-		"Prompt":   prompt,
-		"Response": response,
-	}
-
-	var sb strings.Builder
-	if err := tmpl.Execute(&sb, vars); err != nil {
-		return "", err
-	}
-
-	return sb.String(), nil
-}
-
-func countTokens(tmpl *template.Template, system string, prompt string, response string, encode func(string) ([]int, error)) (int, error) {
-	rendered, err := Prompt(tmpl, system, prompt, response, false)
-	if err != nil {
-		return 0, err
-	}
-
-	tokens, err := encode(rendered)
-	if err != nil {
-		slog.Error("failed to encode prompt", "err", err)
-		return 0, err
-	}
-
-	return len(tokens), err
-}
-
-// ChatPrompt builds up a prompt from a series of messages, truncating based on context window size
-func ChatPrompt(tmpl *template.Template, messages []api.Message, window int, encode func(string) ([]int, error)) (string, error) {
-	type prompt struct {
-		System   string
-		Prompt   string
-		Response string
-
-		images []int
-		tokens int
-	}
-
-	var p prompt
-
-	// iterate through messages to build up {system,user,response} prompts
-	var imgId int
-	var prompts []prompt
-	for _, msg := range messages {
-		switch strings.ToLower(msg.Role) {
-		case "system":
-			if p.System != "" || p.Prompt != "" || p.Response != "" {
-				prompts = append(prompts, p)
-				p = prompt{}
-			}
-
-			p.System = msg.Content
-		case "user":
-			if p.Prompt != "" || p.Response != "" {
-				prompts = append(prompts, p)
-				p = prompt{}
-			}
-
-			var sb strings.Builder
-			for range msg.Images {
-				fmt.Fprintf(&sb, "[img-%d] ", imgId)
-				p.images = append(p.images, imgId)
-				imgId += 1
-			}
-
-			sb.WriteString(msg.Content)
-			p.Prompt = sb.String()
-		case "assistant":
-			if p.Response != "" {
-				prompts = append(prompts, p)
-				p = prompt{}
-			}
-
-			p.Response = msg.Content
-		default:
-			return "", fmt.Errorf("invalid role: %s, role must be one of [system, user, assistant]", msg.Role)
-		}
-	}
-
-	// add final prompt
-	if p.System != "" || p.Prompt != "" || p.Response != "" {
-		prompts = append(prompts, p)
-	}
-
-	// calculate token lengths for each prompt, estimating 768 tokens per images
-	for i, p := range prompts {
-		tokens, err := countTokens(tmpl, p.System, p.Prompt, p.Response, encode)
+		s, err := r.llama.Tokenize(ctx, b.String())
 		if err != nil {
-			return "", err
+			return "", nil, err
 		}
 
-		prompts[i].tokens = tokens + len(prompts[i].images)*768
-	}
-
-	// truncate images and prompts starting from the beginning of the list
-	// until either one prompt remains or the total tokens fits the context window
-	// TODO (jmorganca): this doesn't account for the context window room required for the response
-	for {
-		var required int
-		for _, p := range prompts {
-			required += p.tokens
+		c := len(s)
+		if r.model.ProjectorPaths != nil {
+			for _, m := range msgs[i:] {
+				// TODO: get image embedding length from project metadata
+				c += 768 * len(m.Images)
+			}
 		}
 
-		required += 1 // for bos token
-
-		if required <= window {
-			slog.Debug("prompt now fits in context window", "required", required, "window", window)
+		if c > r.NumCtx {
+			slog.Debug("truncating input messages which exceed context length", "truncated", len(msgs[i:]))
 			break
+		} else {
+			n = i
 		}
-
-		prompt := &prompts[0]
-
-		if len(prompt.images) > 1 {
-			img := prompt.images[0]
"id", img, "required", required, "window", window) - prompt.images = prompt.images[1:] - prompt.Prompt = strings.Replace(prompt.Prompt, fmt.Sprintf(" [img-%d]", img), "", 1) - prompt.tokens -= 768 - continue - } - - if len(prompts) > 1 { - slog.Debug("required tokens longer than context window, removing first prompt", "prompt", prompts[0].tokens, "required", required, "window", window) - system := prompt.System - prompts = prompts[1:] - - if system != "" && prompts[0].System == "" { - prompts[0].System = system - - tokens, err := countTokens(tmpl, prompts[0].System, prompts[0].Prompt, prompts[0].Response, encode) - if err != nil { - return "", err - } - - prompts[0].tokens = tokens + len(prompts[0].images)*768 - } - - continue - } - - // stop truncating if there's only one prompt left - break } - var sb strings.Builder - for i, p := range prompts { - // last prompt should leave the response unrendered (for completion) - rendered, err := Prompt(tmpl, p.System, p.Prompt, p.Response, i == len(prompts)-1) - if err != nil { - return "", err - } - sb.WriteString(rendered) + var b bytes.Buffer + if err := r.model.Template.Execute(&b, template.Values{Messages: append(system, msgs[n:]...)}); err != nil { + return "", nil, err } - return sb.String(), nil + for _, m := range msgs[n:] { + for _, i := range m.Images { + images = append(images, llm.ImageData{ + ID: len(images), + Data: i, + }) + } + } + + return b.String(), images, nil } diff --git a/server/prompt_test.go b/server/prompt_test.go index 7df58d0b..59288b46 100644 --- a/server/prompt_test.go +++ b/server/prompt_test.go @@ -1,215 +1,214 @@ package server import ( + "bytes" + "context" "strings" "testing" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/llm" "github.com/ollama/ollama/template" ) -func TestPrompt(t *testing.T) { - tests := []struct { - name string - template string - system string - prompt string - response string - generate bool - want string - }{ - { - name: "simple prompt", - template: "[INST] {{ .System }} {{ .Prompt }} [/INST]", - system: "You are a Wizard.", - prompt: "What are the potion ingredients?", - want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]", - }, - { - name: "implicit response", - template: "[INST] {{ .System }} {{ .Prompt }} [/INST]", - system: "You are a Wizard.", - prompt: "What are the potion ingredients?", - response: "I don't know.", - want: "[INST] You are a Wizard. What are the potion ingredients? [/INST]I don't know.", - }, - { - name: "response", - template: "[INST] {{ .System }} {{ .Prompt }} [/INST] {{ .Response }}", - system: "You are a Wizard.", - prompt: "What are the potion ingredients?", - response: "I don't know.", - want: "[INST] You are a Wizard. What are the potion ingredients? 
[/INST] I don't know.", - }, - { - name: "cut", - template: "{{ .System }}{{ .Prompt }}{{ .Response }}", - system: "You are a Wizard.", - prompt: "What are the potion ingredients?", - response: "I don't know.", - generate: true, - want: "You are a Wizard.What are the potion ingredients?I don't know.", - }, - { - name: "nocut", - template: "{{ .System }}{{ .Prompt }}{{ .Response }}", - system: "You are a Wizard.", - prompt: "What are the potion ingredients?", - response: "I don't know.", - want: "You are a Wizard.What are the potion ingredients?I don't know.", - }, +type mock struct { + llm.LlamaServer +} + +func (m mock) Tokenize(_ context.Context, s string) (tokens []int, err error) { + for range strings.Fields(s) { + tokens = append(tokens, len(tokens)) } - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - tmpl, err := template.Parse(tc.template) - if err != nil { - t.Fatal(err) - } - - got, err := Prompt(tmpl, tc.system, tc.prompt, tc.response, tc.generate) - if err != nil { - t.Errorf("error = %v", err) - } - - if got != tc.want { - t.Errorf("got = %v, want %v", got, tc.want) - } - }) - } + return } func TestChatPrompt(t *testing.T) { - tests := []struct { - name string - template string - messages []api.Message - window int - want string + type expect struct { + prompt string + images [][]byte + } + + cases := []struct { + name string + limit int + msgs []api.Message + expect }{ { - name: "simple prompt", - template: "[INST] {{ .Prompt }} [/INST]", - messages: []api.Message{ - {Role: "user", Content: "Hello"}, + name: "messages", + limit: 64, + msgs: []api.Message{ + {Role: "user", Content: "You're a test, Harry!"}, + {Role: "assistant", Content: "I-I'm a what?"}, + {Role: "user", Content: "A test. And a thumping good one at that, I'd wager."}, + }, + expect: expect{ + prompt: "You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ", }, - window: 1024, - want: "[INST] Hello [/INST]", }, { - name: "with system message", - template: "[INST] {{ if .System }}<>{{ .System }}<> {{ end }}{{ .Prompt }} [/INST]", - messages: []api.Message{ - {Role: "system", Content: "You are a Wizard."}, - {Role: "user", Content: "Hello"}, + name: "truncate messages", + limit: 1, + msgs: []api.Message{ + {Role: "user", Content: "You're a test, Harry!"}, + {Role: "assistant", Content: "I-I'm a what?"}, + {Role: "user", Content: "A test. And a thumping good one at that, I'd wager."}, + }, + expect: expect{ + prompt: "A test. And a thumping good one at that, I'd wager. ", }, - window: 1024, - want: "[INST] <>You are a Wizard.<> Hello [/INST]", }, { - name: "with response", - template: "[INST] {{ if .System }}<>{{ .System }}<> {{ end }}{{ .Prompt }} [/INST] {{ .Response }}", - messages: []api.Message{ - {Role: "system", Content: "You are a Wizard."}, - {Role: "user", Content: "Hello"}, - {Role: "assistant", Content: "I am?"}, + name: "truncate messages with image", + limit: 64, + msgs: []api.Message{ + {Role: "user", Content: "You're a test, Harry!"}, + {Role: "assistant", Content: "I-I'm a what?"}, + {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{[]byte("something")}}, + }, + expect: expect{ + prompt: "[img-0] A test. And a thumping good one at that, I'd wager. 
", + images: [][]byte{ + []byte("something"), + }, }, - window: 1024, - want: "[INST] <>You are a Wizard.<> Hello [/INST] I am?", }, { - name: "with implicit response", - template: "[INST] {{ if .System }}<>{{ .System }}<> {{ end }}{{ .Prompt }} [/INST]", - messages: []api.Message{ - {Role: "system", Content: "You are a Wizard."}, - {Role: "user", Content: "Hello"}, - {Role: "assistant", Content: "I am?"}, + name: "truncate messages with images", + limit: 64, + msgs: []api.Message{ + {Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{[]byte("something")}}, + {Role: "assistant", Content: "I-I'm a what?"}, + {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{[]byte("somethingelse")}}, + }, + expect: expect{ + prompt: "[img-0] A test. And a thumping good one at that, I'd wager. ", + images: [][]byte{ + []byte("somethingelse"), + }, }, - window: 1024, - want: "[INST] <>You are a Wizard.<> Hello [/INST]I am?", }, { - name: "with conversation", - template: "[INST] {{ if .System }}<>{{ .System }}<> {{ end }}{{ .Prompt }} [/INST] {{ .Response }} ", - messages: []api.Message{ - {Role: "system", Content: "You are a Wizard."}, - {Role: "user", Content: "What are the potion ingredients?"}, - {Role: "assistant", Content: "sugar"}, - {Role: "user", Content: "Anything else?"}, + name: "messages with images", + limit: 2048, + msgs: []api.Message{ + {Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{[]byte("something")}}, + {Role: "assistant", Content: "I-I'm a what?"}, + {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{[]byte("somethingelse")}}, + }, + expect: expect{ + prompt: "[img-0] You're a test, Harry! I-I'm a what? [img-1] A test. And a thumping good one at that, I'd wager. ", + images: [][]byte{ + []byte("something"), + []byte("somethingelse"), + }, }, - window: 1024, - want: "[INST] <>You are a Wizard.<> What are the potion ingredients? [/INST] sugar [INST] Anything else? [/INST] ", }, { - name: "with truncation", - template: "{{ .System }} {{ .Prompt }} {{ .Response }} ", - messages: []api.Message{ - {Role: "system", Content: "You are a Wizard."}, - {Role: "user", Content: "Hello"}, - {Role: "assistant", Content: "I am?"}, - {Role: "user", Content: "Why is the sky blue?"}, - {Role: "assistant", Content: "The sky is blue from rayleigh scattering"}, + name: "message with image tag", + limit: 2048, + msgs: []api.Message{ + {Role: "user", Content: "You're a test, Harry! [img]", Images: []api.ImageData{[]byte("something")}}, + {Role: "assistant", Content: "I-I'm a what?"}, + {Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{[]byte("somethingelse")}}, + }, + expect: expect{ + prompt: "You're a test, Harry! [img-0] I-I'm a what? [img-1] A test. And a thumping good one at that, I'd wager. ", + images: [][]byte{ + []byte("something"), + []byte("somethingelse"), + }, }, - window: 10, - want: "You are a Wizard. Why is the sky blue? 
-			want:   "You are a Wizard. Why is the sky blue? The sky is blue from rayleigh scattering",
 		},
 		{
-			name:     "images",
-			template: "{{ .System }} {{ .Prompt }}",
-			messages: []api.Message{
-				{Role: "system", Content: "You are a Wizard."},
-				{Role: "user", Content: "Hello", Images: []api.ImageData{[]byte("base64")}},
+			name:  "messages with interleaved images",
+			limit: 2048,
+			msgs: []api.Message{
+				{Role: "user", Content: "You're a test, Harry!"},
+				{Role: "user", Images: []api.ImageData{[]byte("something")}},
+				{Role: "user", Images: []api.ImageData{[]byte("somethingelse")}},
+				{Role: "assistant", Content: "I-I'm a what?"},
+				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
+			},
+			expect: expect{
+				prompt: "You're a test, Harry!\n\n[img-0]\n\n[img-1] I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
+				images: [][]byte{
+					[]byte("something"),
+					[]byte("somethingelse"),
+				},
 			},
-			window: 1024,
-			want:   "You are a Wizard. [img-0] Hello",
 		},
 		{
-			name:     "images truncated",
-			template: "{{ .System }} {{ .Prompt }}",
-			messages: []api.Message{
-				{Role: "system", Content: "You are a Wizard."},
-				{Role: "user", Content: "Hello", Images: []api.ImageData{[]byte("img1"), []byte("img2")}},
+			name:  "truncate message with interleaved images",
+			limit: 1024,
+			msgs: []api.Message{
+				{Role: "user", Content: "You're a test, Harry!"},
+				{Role: "user", Images: []api.ImageData{[]byte("something")}},
+				{Role: "user", Images: []api.ImageData{[]byte("somethingelse")}},
+				{Role: "assistant", Content: "I-I'm a what?"},
+				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
+			},
+			expect: expect{
+				prompt: "[img-0] I-I'm a what? A test. And a thumping good one at that, I'd wager. ",
+				images: [][]byte{
+					[]byte("somethingelse"),
+				},
 			},
-			window: 1024,
-			want:   "You are a Wizard. [img-0] [img-1] Hello",
 		},
 		{
-			name:     "empty list",
-			template: "{{ .System }} {{ .Prompt }}",
-			messages: []api.Message{},
-			window:   1024,
-			want:     "",
-		},
-		{
-			name:     "empty prompt",
-			template: "[INST] {{ if .System }}<<SYS>>{{ .System }}<</SYS>> {{ end }}{{ .Prompt }} [/INST] {{ .Response }} ",
-			messages: []api.Message{
-				{Role: "user", Content: ""},
+			name:  "message with system prompt",
+			limit: 2048,
+			msgs: []api.Message{
+				{Role: "system", Content: "You are the Test Who Lived."},
+				{Role: "user", Content: "You're a test, Harry!"},
+				{Role: "assistant", Content: "I-I'm a what?"},
+				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
+			},
+			expect: expect{
", }, - window: 1024, - want: "", }, } - encode := func(s string) ([]int, error) { - words := strings.Fields(s) - return make([]int, len(words)), nil + tmpl, err := template.Parse(` +{{- if .System }}{{ .System }} {{ end }} +{{- if .Prompt }}{{ .Prompt }} {{ end }} +{{- if .Response }}{{ .Response }} {{ end }}`) + if err != nil { + t.Fatal(err) } - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - tmpl, err := template.Parse(tc.template) + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + r := runnerRef{ + llama: mock{}, + model: &Model{Template: tmpl, ProjectorPaths: []string{"vision"}}, + Options: &api.Options{}, + } + + r.NumCtx = tt.limit + prompt, images, err := chatPrompt(context.TODO(), &r, tt.msgs) if err != nil { t.Fatal(err) } - got, err := ChatPrompt(tmpl, tc.messages, tc.window, encode) - if err != nil { - t.Errorf("error = %v", err) + if tt.prompt != prompt { + t.Errorf("expected %q, got %q", tt.prompt, prompt) } - if got != tc.want { - t.Errorf("got: %q, want: %q", got, tc.want) + if len(images) != len(tt.images) { + t.Fatalf("expected %d images, got %d", len(tt.images), len(images)) + } + + for i := range images { + if images[i].ID != i { + t.Errorf("expected ID %d, got %d", i, images[i].ID) + } + + if !bytes.Equal(images[i].Data, tt.images[i]) { + t.Errorf("expected %q, got %q", tt.images[i], images[i]) + } } }) } diff --git a/server/routes.go b/server/routes.go index ac6b713a..35e64511 100644 --- a/server/routes.go +++ b/server/routes.go @@ -1,13 +1,13 @@ package server import ( + "bytes" "cmp" "context" "encoding/json" "errors" "fmt" "io" - "io/fs" "log/slog" "net" "net/http" @@ -67,163 +67,140 @@ func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options return opts, nil } -func isSupportedImageType(image []byte) bool { - contentType := http.DetectContentType(image) - allowedTypes := []string{"image/jpeg", "image/jpg", "image/png"} - return slices.Contains(allowedTypes, contentType) +func (s *Server) scheduleRunner(ctx context.Context, name string, caps []Capability, requestOpts map[string]any, keepAlive *api.Duration) (*runnerRef, error) { + if name == "" { + return nil, errors.New("model is required") + } + + model, err := GetModel(name) + if err != nil { + return nil, err + } + + if err := model.CheckCapabilities(caps...); err != nil { + return nil, fmt.Errorf("%s %w", name, err) + } + + opts, err := modelOptions(model, requestOpts) + if err != nil { + return nil, err + } + + runnerCh, errCh := s.sched.GetRunner(ctx, model, opts, keepAlive) + var runner *runnerRef + select { + case runner = <-runnerCh: + case err = <-errCh: + return nil, err + } + + return runner, nil } func (s *Server) GenerateHandler(c *gin.Context) { - checkpointStart := time.Now() var req api.GenerateRequest - err := c.ShouldBindJSON(&req) - - switch { - case errors.Is(err, io.EOF): + if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) { c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"}) return - case err != nil: + } else if err != nil { c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()}) return } - // validate the request - switch { - case req.Model == "": - c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"}) + if req.Format != "" && req.Format != "json" { + c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be empty or \"json\""}) return - case len(req.Format) > 0 && req.Format != "json": - 
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be json"})
-		return
-	case req.Raw && (req.Template != "" || req.System != "" || len(req.Context) > 0):
+	} else if req.Raw && (req.Template != "" || req.System != "" || len(req.Context) > 0) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "raw mode does not support template, system, or context"})
 		return
 	}
 
-	for _, img := range req.Images {
-		if !isSupportedImageType(img) {
-			c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "unsupported image format"})
-			return
-		}
-	}
-
-	model, err := GetModel(req.Model)
-	if err != nil {
-		var pErr *fs.PathError
-		if errors.As(err, &pErr) {
-			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found, try pulling it first", req.Model)})
-			return
-		}
-		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+	caps := []Capability{CapabilityCompletion}
+	r, err := s.scheduleRunner(c.Request.Context(), req.Model, caps, req.Options, req.KeepAlive)
+	if errors.Is(err, errCapabilityCompletion) {
+		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support generate", req.Model)})
+		return
+	} else if err != nil {
+		handleScheduleError(c, err)
 		return
 	}
 
-	if !model.Has(CapabilityCompletion) {
-		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%s does not support generate", req.Model)})
-		return
+	images := make([]llm.ImageData, len(req.Images))
+	for i := range req.Images {
+		images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
 	}
 
-	opts, err := modelOptions(model, req.Options)
-	if err != nil {
-		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-		return
-	}
-
-	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, req.KeepAlive)
-	var runner *runnerRef
-	select {
-	case runner = <-rCh:
-	case err = <-eCh:
-		handleErrorResponse(c, err)
-		return
-	}
-
-	// an empty request loads the model
-	// note: for a short while template was used in lieu
-	// of `raw` mode so we need to check for it too
-	if req.Prompt == "" && req.Template == "" && req.System == "" {
-		c.JSON(http.StatusOK, api.GenerateResponse{
-			CreatedAt:  time.Now().UTC(),
-			Model:      req.Model,
-			Done:       true,
-			DoneReason: "load",
-		})
-		return
-	}
-
-	tmpl, err := template.Parse(req.Template)
-	if err != nil {
-		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-		return
-	}
-
-	checkpointLoaded := time.Now()
-
-	var prompt string
-	switch {
-	case req.Raw:
-		prompt = req.Prompt
-	case req.Prompt != "":
-		if req.Template == "" {
-			tmpl = model.Template
+	prompt := req.Prompt
+	if !req.Raw {
+		var msgs []api.Message
+		if req.System != "" {
+			msgs = append(msgs, api.Message{Role: "system", Content: req.System})
+		} else if r.model.System != "" {
+			msgs = append(msgs, api.Message{Role: "system", Content: r.model.System})
 		}
 
-		if req.System == "" {
-			req.System = model.System
+		if req.Prompt != "" {
+			for _, i := range images {
+				msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)})
+			}
+
+			msgs = append(msgs, api.Message{Role: "user", Content: req.Prompt})
 		}
 
-		slog.Debug("generate handler", "prompt", req.Prompt)
-		slog.Debug("generate handler", "template", req.Template)
-		slog.Debug("generate handler", "system", req.System)
-
-		var sb strings.Builder
-		for i := range req.Images {
-			fmt.Fprintf(&sb, "[img-%d] ", i)
-		}
-
-		sb.WriteString(req.Prompt)
-
-		p, err := Prompt(tmpl, req.System, sb.String(), "", true)
-		if err != nil {
gin.H{"error": err.Error()}) + if len(msgs) == 0 { + c.JSON(http.StatusOK, api.GenerateResponse{ + Model: req.Model, + CreatedAt: time.Now().UTC(), + Done: true, + DoneReason: "load", + }) return } - sb.Reset() + tmpl := r.model.Template + if req.Template != "" { + tmpl, err = template.Parse(req.Template) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + } + + var b bytes.Buffer if req.Context != nil { - prev, err := runner.llama.Detokenize(c.Request.Context(), req.Context) + s, err := r.llama.Detokenize(c.Request.Context(), req.Context) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } - sb.WriteString(prev) + b.WriteString(s) } - sb.WriteString(p) + if err := tmpl.Execute(&b, template.Values{Messages: msgs}); err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } - prompt = sb.String() + prompt = b.String() } - slog.Debug("generate handler", "prompt", prompt) + slog.Debug("generate request", "prompt", prompt, "images", images) ch := make(chan any) - var generated strings.Builder go func() { defer close(ch) - - fn := func(r llm.CompletionResponse) { - // Build up the full response - if _, err := generated.WriteString(r.Content); err != nil { - ch <- gin.H{"error": err.Error()} - return - } - - resp := api.GenerateResponse{ + if err := r.llama.Completion(c.Request.Context(), llm.CompletionRequest{ + Prompt: prompt, + Images: images, + Format: req.Format, + Options: *r.Options, + }, func(r llm.CompletionResponse) { + ch <- api.GenerateResponse{ Model: req.Model, CreatedAt: time.Now().UTC(), - Done: r.Done, Response: r.Content, + Done: r.Done, DoneReason: r.DoneReason, Metrics: api.Metrics{ PromptEvalCount: r.PromptEvalCount, @@ -232,77 +209,35 @@ func (s *Server) GenerateHandler(c *gin.Context) { EvalDuration: r.EvalDuration, }, } - - if r.Done { - resp.TotalDuration = time.Since(checkpointStart) - resp.LoadDuration = checkpointLoaded.Sub(checkpointStart) - - if !req.Raw { - p, err := Prompt(tmpl, req.System, req.Prompt, generated.String(), false) - if err != nil { - c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) - return - } - - // TODO (jmorganca): encode() should not strip special tokens - tokens, err := runner.llama.Tokenize(c.Request.Context(), p) - if err != nil { - ch <- gin.H{"error": err.Error()} - return - } - - resp.Context = append(req.Context, tokens...) 
-				}
-			}
-
-			ch <- resp
-		}
-
-		var images []llm.ImageData
-		for i := range req.Images {
-			images = append(images, llm.ImageData{
-				ID:   i,
-				Data: req.Images[i],
-			})
-		}
-
-		// Start prediction
-		req := llm.CompletionRequest{
-			Prompt:  prompt,
-			Format:  req.Format,
-			Images:  images,
-			Options: opts,
-		}
-		if err := runner.llama.Completion(c.Request.Context(), req, fn); err != nil {
+		}); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
 
 	if req.Stream != nil && !*req.Stream {
-		// Accumulate responses into the final response
-		var final api.GenerateResponse
+		var r api.GenerateResponse
 		var sb strings.Builder
-		for resp := range ch {
-			switch r := resp.(type) {
+		for rr := range ch {
+			switch t := rr.(type) {
 			case api.GenerateResponse:
-				sb.WriteString(r.Response)
-				final = r
+				sb.WriteString(t.Response)
+				r = t
 			case gin.H:
-				if errorMsg, ok := r["error"].(string); ok {
-					c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
-					return
-				} else {
-					c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
-					return
+				msg, ok := t["error"].(string)
+				if !ok {
+					msg = "unexpected error format in response"
 				}
+
+				c.JSON(http.StatusInternalServerError, gin.H{"error": msg})
+				return
 			default:
-				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
+				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected response"})
 				return
 			}
 		}
 
-		final.Response = sb.String()
-		c.JSON(http.StatusOK, final)
+		r.Response = sb.String()
+		c.JSON(http.StatusOK, r)
 		return
 	}
 
@@ -311,44 +246,17 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 
 func (s *Server) EmbeddingsHandler(c *gin.Context) {
 	var req api.EmbeddingRequest
-	err := c.ShouldBindJSON(&req)
-	switch {
-	case errors.Is(err, io.EOF):
+	if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
 		return
-	case err != nil:
+	} else if err != nil {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
 
-	if req.Model == "" {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
-		return
-	}
-
-	model, err := GetModel(req.Model)
+	r, err := s.scheduleRunner(c.Request.Context(), req.Model, []Capability{}, req.Options, req.KeepAlive)
 	if err != nil {
-		var pErr *fs.PathError
-		if errors.As(err, &pErr) {
-			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found, try pulling it first", req.Model)})
-			return
-		}
-		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-		return
-	}
-
-	opts, err := modelOptions(model, req.Options)
-	if err != nil {
-		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-		return
-	}
-
-	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, req.KeepAlive)
-	var runner *runnerRef
-	select {
-	case runner = <-rCh:
-	case err = <-eCh:
-		handleErrorResponse(c, err)
+		handleScheduleError(c, err)
 		return
 	}
 
@@ -358,17 +266,14 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
 		return
 	}
 
-	embedding, err := runner.llama.Embedding(c.Request.Context(), req.Prompt)
+	embedding, err := r.llama.Embedding(c.Request.Context(), req.Prompt)
 	if err != nil {
 		slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
 		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
 		return
 	}
 
-	resp := api.EmbeddingResponse{
-		Embedding: embedding,
-	}
-	c.JSON(http.StatusOK, resp)
+	c.JSON(http.StatusOK, api.EmbeddingResponse{Embedding: embedding})
 }
 
 func (s *Server) PullModelHandler(c *gin.Context) {
@@ -649,9 +554,9 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 		}
 	}
 
-	msgs := make([]api.Message, 0)
-	for _, msg := range m.Messages {
-		msgs = append(msgs, api.Message{Role: msg.Role, Content: msg.Content})
+	msgs := make([]api.Message, len(m.Messages))
+	for i, msg := range m.Messages {
+		msgs[i] = api.Message{Role: msg.Role, Content: msg.Content}
 	}
 
 	n := model.ParseName(req.Model)
@@ -1214,132 +1119,55 @@ func (s *Server) ProcessHandler(c *gin.Context) {
 	c.JSON(http.StatusOK, api.ProcessResponse{Models: models})
 }
 
-// ChatPrompt builds up a prompt from a series of messages for the currently `loaded` model
-func chatPrompt(ctx context.Context, runner *runnerRef, template *template.Template, messages []api.Message, numCtx int) (string, error) {
-	encode := func(s string) ([]int, error) {
-		return runner.llama.Tokenize(ctx, s)
-	}
-
-	prompt, err := ChatPrompt(template, messages, numCtx, encode)
-	if err != nil {
-		return "", err
-	}
-
-	return prompt, nil
-}
-
 func (s *Server) ChatHandler(c *gin.Context) {
-	checkpointStart := time.Now()
-
 	var req api.ChatRequest
-	err := c.ShouldBindJSON(&req)
-	switch {
-	case errors.Is(err, io.EOF):
+	if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"})
 		return
-	case err != nil:
+	} else if err != nil {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 		return
 	}
 
-	// validate the request
-	switch {
-	case req.Model == "":
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
+	caps := []Capability{CapabilityCompletion}
+	r, err := s.scheduleRunner(c.Request.Context(), req.Model, caps, req.Options, req.KeepAlive)
+	if errors.Is(err, errCapabilityCompletion) {
+		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support chat", req.Model)})
 		return
-	case len(req.Format) > 0 && req.Format != "json":
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be json"})
+	} else if err != nil {
+		handleScheduleError(c, err)
 		return
 	}
 
-	model, err := GetModel(req.Model)
-	if err != nil {
-		var pErr *fs.PathError
-		if errors.As(err, &pErr) {
-			c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found, try pulling it first", req.Model)})
-			return
-		}
-		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-		return
-	}
-
-	if !model.Has(CapabilityCompletion) {
-		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%s does not support chat", req.Model)})
-		return
-	}
-
-	opts, err := modelOptions(model, req.Options)
-	if err != nil {
-		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
-		return
-	}
-
-	rCh, eCh := s.sched.GetRunner(c.Request.Context(), model, opts, req.KeepAlive)
-	var runner *runnerRef
-	select {
-	case runner = <-rCh:
-	case err = <-eCh:
-		handleErrorResponse(c, err)
-		return
-	}
-
-	checkpointLoaded := time.Now()
-
-	// if the first message is not a system message, then add the model's default system message
-	if len(req.Messages) > 0 && req.Messages[0].Role != "system" {
-		req.Messages = append([]api.Message{
-			{
-				Role:    "system",
-				Content: model.System,
-			},
-		}, req.Messages...)
-	}
-
-	prompt, err := chatPrompt(c.Request.Context(), runner, model.Template, req.Messages, opts.NumCtx)
-	if err != nil {
-		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
-		return
-	}
-
-	// an empty request loads the model
-	if len(req.Messages) == 0 || prompt == "" {
-		resp := api.ChatResponse{
-			CreatedAt:  time.Now().UTC(),
+	if len(req.Messages) == 0 {
+		c.JSON(http.StatusOK, api.ChatResponse{
 			Model:      req.Model,
+			CreatedAt:  time.Now().UTC(),
+			Message:    api.Message{Role: "assistant"},
 			Done:       true,
 			DoneReason: "load",
-			Message:    api.Message{Role: "assistant"},
-		}
-		c.JSON(http.StatusOK, resp)
+		})
 		return
 	}
 
-	// only send images that are in the prompt
-	var i int
-	var images []llm.ImageData
-	for _, m := range req.Messages {
-		for _, img := range m.Images {
-			if !isSupportedImageType(img) {
-				c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "unsupported image format"})
-				return
-			}
-
-			if strings.Contains(prompt, fmt.Sprintf("[img-%d]", i)) {
-				images = append(images, llm.ImageData{Data: img, ID: i})
-			}
-			i += 1
-		}
+	prompt, images, err := chatPrompt(c.Request.Context(), r, req.Messages)
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+		return
 	}
 
-	slog.Debug("chat handler", "prompt", prompt, "images", len(images))
+	slog.Debug("chat request", "images", len(images), "prompt", prompt)
 
 	ch := make(chan any)
-
 	go func() {
 		defer close(ch)
-
-		fn := func(r llm.CompletionResponse) {
-			resp := api.ChatResponse{
+		if err := r.llama.Completion(c.Request.Context(), llm.CompletionRequest{
+			Prompt:  prompt,
+			Images:  images,
+			Format:  req.Format,
+			Options: *r.Options,
+		}, func(r llm.CompletionResponse) {
+			ch <- api.ChatResponse{
 				Model:      req.Model,
 				CreatedAt:  time.Now().UTC(),
 				Message:    api.Message{Role: "assistant", Content: r.Content},
@@ -1352,64 +1180,48 @@ func (s *Server) ChatHandler(c *gin.Context) {
 					EvalDuration:    r.EvalDuration,
 				},
 			}
-
-			if r.Done {
-				resp.TotalDuration = time.Since(checkpointStart)
-				resp.LoadDuration = checkpointLoaded.Sub(checkpointStart)
-			}
-
-			ch <- resp
-		}
-
-		if err := runner.llama.Completion(c.Request.Context(), llm.CompletionRequest{
-			Prompt:  prompt,
-			Format:  req.Format,
-			Images:  images,
-			Options: opts,
-		}, fn); err != nil {
+		}); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
 
 	if req.Stream != nil && !*req.Stream {
-		// Accumulate responses into the final response
-		var final api.ChatResponse
+		var r api.ChatResponse
 		var sb strings.Builder
-		for resp := range ch {
-			switch r := resp.(type) {
+		for rr := range ch {
+			switch t := rr.(type) {
 			case api.ChatResponse:
-				sb.WriteString(r.Message.Content)
-				final = r
+				sb.WriteString(t.Message.Content)
+				r = t
 			case gin.H:
-				if errorMsg, ok := r["error"].(string); ok {
-					c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
-					return
-				} else {
-					c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in response"})
-					return
+				msg, ok := t["error"].(string)
+				if !ok {
+					msg = "unexpected error format in response"
 				}
+
+				c.JSON(http.StatusInternalServerError, gin.H{"error": msg})
+				return
 			default:
-				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error"})
+				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected response"})
 				return
 			}
 		}
 
-		final.Message = api.Message{Role: "assistant", Content: sb.String()}
-		c.JSON(http.StatusOK, final)
+		r.Message.Content = sb.String()
+		c.JSON(http.StatusOK, r)
 		return
 	}
 
 	streamResponse(c, ch)
 }
 
-func handleErrorResponse(c *gin.Context, err error) {
-	if errors.Is(err, context.Canceled) {
+func handleScheduleError(c *gin.Context, err error) {
+	switch {
+	case errors.Is(err, context.Canceled):
 		c.JSON(499, gin.H{"error": "request canceled"})
-		return
-	}
-	if errors.Is(err, ErrMaxQueue) {
+	case errors.Is(err, ErrMaxQueue):
 		c.JSON(http.StatusServiceUnavailable, gin.H{"error": err.Error()})
-		return
+	default:
+		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 	}
-	c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 }
diff --git a/template/template.go b/template/template.go
index d15f7156..cfba5a23 100644
--- a/template/template.go
+++ b/template/template.go
@@ -5,6 +5,7 @@ import (
 	"embed"
 	"encoding/json"
 	"errors"
+	"fmt"
 	"io"
 	"math"
 	"slices"
@@ -14,6 +15,7 @@ import (
 	"text/template/parse"
 
 	"github.com/agnivade/levenshtein"
+	"github.com/ollama/ollama/api"
 	"golang.org/x/exp/maps"
 )
 
@@ -74,30 +76,78 @@ func Named(s string) (*named, error) {
 	return nil, errors.New("no matching template found")
 }
 
+var DefaultTemplate, _ = Parse("{{ .Prompt }}")
+
 type Template struct {
 	*template.Template
 	raw string
 }
 
+var response = parse.ActionNode{
+	NodeType: parse.NodeAction,
+	Pipe: &parse.PipeNode{
+		NodeType: parse.NodePipe,
+		Cmds: []*parse.CommandNode{
+			{
+				NodeType: parse.NodeCommand,
+				Args: []parse.Node{
+					&parse.FieldNode{
+						NodeType: parse.NodeField,
+						Ident:    []string{"Response"},
+					},
+				},
+			},
+		},
+	},
+}
+
+func Parse(s string) (*Template, error) {
+	tmpl := template.New("").Option("missingkey=zero").Funcs(template.FuncMap{
+		"toJson": func(v any) string {
+			b, err := json.Marshal(v)
+			if err != nil {
+				return ""
+			}
+
+			return string(b)
+		},
+		"isLastMessage": func(s []*api.Message, m *api.Message) bool {
+			for i := len(s) - 1; i >= 0; i-- {
+				if m.Role != s[i].Role {
+					continue
+				}
+
+				return m == s[i]
+			}
+
+			return false
+		},
+	})
+
+	tmpl, err := tmpl.Parse(s)
+	if err != nil {
+		return nil, err
+	}
+
+	t := Template{Template: tmpl, raw: s}
+	if vars := t.Vars(); !slices.Contains(vars, "messages") && !slices.Contains(vars, "response") {
+		// touch up the template and append {{ .Response }}
+		tmpl.Tree.Root.Nodes = append(tmpl.Tree.Root.Nodes, &response)
+	}
+
+	return &t, nil
+}
+
 func (t *Template) String() string {
 	return t.raw
 }
 
-var DefaultTemplate, _ = Parse("{{ .Prompt }}")
-
-func Parse(s string) (*Template, error) {
-	t, err := template.New("").Option("missingkey=zero").Parse(s)
-	if err != nil {
-		return nil, err
-	}
-
-	return &Template{Template: t, raw: s}, nil
-}
-
 func (t *Template) Vars() []string {
 	var vars []string
-	for _, n := range t.Tree.Root.Nodes {
-		vars = append(vars, parseNode(n)...)
+	for _, tt := range t.Templates() {
+		for _, n := range tt.Root.Nodes {
+			vars = append(vars, parseNode(n)...)
+		}
 	}
 
 	set := make(map[string]struct{})
@@ -110,6 +160,97 @@ func (t *Template) Vars() []string {
 	return vars
 }
 
+type Values struct {
+	Messages []api.Message
+}
+
+func (t *Template) Execute(w io.Writer, v Values) error {
+	system, collated := collate(v.Messages)
+	if slices.Contains(t.Vars(), "messages") {
+		return t.Template.Execute(w, map[string]any{
+			"System":   system,
+			"Messages": collated,
+		})
+	}
+
+	var b bytes.Buffer
+	var prompt, response string
+	for i, m := range collated {
+		if m.Role == "user" {
+			prompt = m.Content
+		} else {
+			response = m.Content
+		}
+
+		if i != len(collated)-1 && prompt != "" && response != "" {
+			if err := t.Template.Execute(&b, map[string]any{
+				"System":   "",
+				"Prompt":   prompt,
+				"Response": response,
+			}); err != nil {
+				return err
+			}
+
+			prompt = ""
+			response = ""
+		}
+	}
+
+	var cut bool
+	tree := t.Template.Copy()
+	// for the last message, cut everything after "{{ .Response }}"
+	tree.Root.Nodes = slices.DeleteFunc(tree.Root.Nodes, func(n parse.Node) bool {
+		if slices.Contains(parseNode(n), "Response") {
+			cut = true
+		}
+
+		return cut
+	})
+
+	if err := template.Must(template.New("").AddParseTree("", tree)).Execute(&b, map[string]any{
+		"System": system,
+		"Prompt": prompt,
+	}); err != nil {
+		return err
+	}
+
+	_, err := io.Copy(w, &b)
+	return err
+}
+
+func collate(msgs []api.Message) (system string, collated []*api.Message) {
+	var n int
+	for i := range msgs {
+		msg := msgs[i]
+		if msg.Role == "system" {
+			if system != "" {
+				system += "\n\n"
+			}
+
+			system += msg.Content
+			continue
+		}
+
+		for range msg.Images {
+			imageTag := fmt.Sprintf("[img-%d]", n)
+			if !strings.Contains(msg.Content, "[img]") {
+				msg.Content = strings.TrimSpace("[img] " + msg.Content)
+			}
+
+			msg.Content = strings.Replace(msg.Content, "[img]", imageTag, 1)
+			n++
+		}
+
+		if len(collated) > 0 && collated[len(collated)-1].Role == msg.Role {
+			collated[len(collated)-1].Content += "\n\n" + msg.Content
+		} else {
+			collated = append(collated, &msg)
+		}
+	}
+
+	return
+}
+
 func parseNode(n parse.Node) []string {
 	switch n := n.(type) {
 	case *parse.ActionNode:
@@ -152,6 +293,8 @@ func parseNode(n parse.Node) []string {
 		return names
 	case *parse.FieldNode:
 		return n.Ident
+	case *parse.TemplateNode:
+		return parseNode(n.Pipe)
 	}
 
 	return nil
diff --git a/template/template_test.go b/template/template_test.go
index eda4634f..5d5dad4b 100644
--- a/template/template_test.go
+++ b/template/template_test.go
@@ -11,6 +11,7 @@ import (
 	"testing"
 	"text/template"
 
+	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/llm"
 )
 
@@ -64,13 +65,12 @@ func TestParse(t *testing.T) {
 		template string
 		vars     []string
 	}{
-		{"{{ .Prompt }}", []string{"prompt"}},
-		{"{{ .System }} {{ .Prompt }}", []string{"prompt", "system"}},
+		{"{{ .Prompt }}", []string{"prompt", "response"}},
+		{"{{ .System }} {{ .Prompt }}", []string{"prompt", "response", "system"}},
 		{"{{ .System }} {{ .Prompt }} {{ .Response }}", []string{"prompt", "response", "system"}},
-		{"{{ with .Tools }}{{ . }}{{ end }} {{ .System }} {{ .Prompt }}", []string{"prompt", "system", "tools"}},
+		{"{{ with .Tools }}{{ . }}{{ end }} {{ .System }} {{ .Prompt }}", []string{"prompt", "response", "system", "tools"}},
 		{"{{ range .Messages }}{{ .Role }} {{ .Content }}{{ end }}", []string{"content", "messages", "role"}},
 		{"{{ range .Messages }}{{ if eq .Role \"system\" }}SYSTEM: {{ .Content }}{{ else if eq .Role \"user\" }}USER: {{ .Content }}{{ else if eq .Role \"assistant\" }}ASSISTANT: {{ .Content }}{{ end }}{{ end }}", []string{"content", "messages", "role"}},
-		{"{{ .Prompt }} {{ .Suffix }}", []string{"prompt", "suffix"}},
 	}
 
 	for _, tt := range cases {
@@ -87,3 +87,148 @@ func TestParse(t *testing.T) {
 		})
 	}
 }
+
+func TestExecuteWithMessages(t *testing.T) {
+	cases := []struct {
+		templates []string
+		values    Values
+		expected  string
+	}{
+		{
+			[]string{
+				`[INST] {{ if .System }}{{ .System }}{{ print "\n\n" }}{{ end }}{{ .Prompt }}[/INST] `,
+				`[INST] {{ if .System }}{{ .System }}{{ print "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`,
+				`{{- range .Messages }}
+{{- if eq .Role "user" }}[INST] {{ if and (isLastMessage $.Messages .) $.System }}{{ $.System }}{{ print "\n\n" }}
+{{- end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }}
+{{- end }}
+{{- end }}`,
+			},
+			Values{
+				Messages: []api.Message{
+					{Role: "user", Content: "Hello friend!"},
+					{Role: "assistant", Content: "Hello human!"},
+					{Role: "user", Content: "Yay!"},
+				},
+			},
+			`[INST] Hello friend![/INST] Hello human![INST] Yay![/INST] `,
+		},
+		{
+			[]string{
+				`[INST] {{ if .System }}{{ .System }}{{ print "\n\n" }}{{ end }}{{ .Prompt }}[/INST] `,
+				`[INST] {{ if .System }}{{ .System }}{{ print "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`,
+				`
+{{- range .Messages }}
+{{- if eq .Role "user" }}[INST] {{ if and (isLastMessage $.Messages .) $.System }}{{ $.System }}{{ print "\n\n" }}
+{{- end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }}
+{{- end }}
+{{- end }}`,
+			},
+			Values{
+				Messages: []api.Message{
+					{Role: "system", Content: "You are a helpful assistant!"},
+					{Role: "user", Content: "Hello friend!"},
+					{Role: "assistant", Content: "Hello human!"},
+					{Role: "user", Content: "Yay!"},
+				},
+			},
+			`[INST] Hello friend![/INST] Hello human![INST] You are a helpful assistant!
+
+Yay![/INST] `,
+		},
+		{
+			[]string{
+				`{{ if .System }}<|im_start|>system
+{{ .System }}<|im_end|>
+{{ end }}{{ if .Prompt }}<|im_start|>user
+{{ .Prompt }}<|im_end|>
+{{ end }}<|im_start|>assistant
+{{ .Response }}<|im_end|>
+`,
+				`
+{{- range .Messages }}
+{{- if and (eq .Role "user") (isLastMessage $.Messages .) $.System }}<|im_start|>system
+{{ $.System }}<|im_end|>{{ print "\n" }}
+{{- end }}<|im_start|>{{ .Role }}
+{{ .Content }}<|im_end|>{{ print "\n" }}
+{{- end }}<|im_start|>assistant
+`,
+			},
+			Values{
+				Messages: []api.Message{
+					{Role: "system", Content: "You are a helpful assistant!"},
+					{Role: "user", Content: "Hello friend!"},
+					{Role: "assistant", Content: "Hello human!"},
+					{Role: "user", Content: "Yay!"},
+				},
+			},
+			`<|im_start|>user
+Hello friend!<|im_end|>
+<|im_start|>assistant
+Hello human!<|im_end|>
+<|im_start|>system
+You are a helpful assistant!<|im_end|>
+<|im_start|>user
+Yay!<|im_end|>
+<|im_start|>assistant
+`,
+		},
+		{
+			[]string{
+				`{{ if .Prompt }}Question: {{ .Prompt }}
+
+{{ end }}Answer: {{ .Response }}
+
+`,
+				`
+{{- range .Messages }}
+{{- if eq .Role "user" }}Question: {{ .Content }}{{ print "\n\n" }}
+{{- else if eq .Role "assistant" }}Answer: {{ .Content }}{{ print "\n\n" }}
+{{- end }}
+{{- end }}Answer: `,
+			},
+			Values{
+				Messages: []api.Message{
+					{Role: "user", Content: "What's in this image?", Images: []api.ImageData{[]byte("")}},
+					{Role: "assistant", Content: "It's a hot dog."},
+					{Role: "user", Content: "What's in _this_ image?"},
+					{Role: "user", Images: []api.ImageData{[]byte("")}},
+					{Role: "user", Content: "Is it a hot dog?"},
+				},
+			},
+			`Question: [img-0] What's in this image?
+
+Answer: It's a hot dog.
+
+Question: What's in _this_ image?
+
+[img-1]
+
+Is it a hot dog?
+
+Answer: `,
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run("", func(t *testing.T) {
+			for _, tmpl := range tt.templates {
+				t.Run("", func(t *testing.T) {
+					tmpl, err := Parse(tmpl)
+					if err != nil {
+						t.Fatal(err)
+					}
+
+					var b bytes.Buffer
+					if err := tmpl.Execute(&b, tt.values); err != nil {
+						t.Fatal(err)
+					}
+
+					if b.String() != tt.expected {
+						t.Errorf("expected\n%s,\ngot\n%s", tt.expected, b.String())
+					}
+				})
+			}
+		})
+	}
+}

From 2c3fe1fd972b7810091120f844afc35bc98accbd Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Thu, 20 Jun 2024 11:00:08 -0700
Subject: [PATCH 03/48] comments

---
 server/prompt.go          |  29 +++---
 server/prompt_test.go     |  34 +++----
 server/routes.go          |  46 +++++-----
 template/template.go      |  48 +++++-----
 template/template_test.go | 180 ++++++++++++++++++++++++++++++--------
 5 files changed, 224 insertions(+), 113 deletions(-)

diff --git a/server/prompt.go b/server/prompt.go
index 5016fbe1..51d691a9 100644
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -11,8 +11,13 @@ import (
 	"github.com/ollama/ollama/template"
 )
 
-func chatPrompt(ctx context.Context, r *runnerRef, msgs []api.Message) (prompt string, images []llm.ImageData, _ error) {
-	// extract system messages which should always be included
+type tokenizeFunc func(context.Context, string) ([]int, error)
+
+// chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn.
+// chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
+// latest message and 2) system messages
+func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message) (prompt string, images []llm.ImageData, _ error) {
+	// pull out any system messages which should always be included in the prompt
 	var system []api.Message
 	msgs = slices.DeleteFunc(msgs, func(m api.Message) bool {
 		if m.Role == "system" {
@@ -23,32 +28,35 @@ func chatPrompt(ctx context.Context, r *runnerRef, msgs []api.Message) (prompt s
 		return false
 	})
 
-	if len(system) == 0 && r.model.System != "" {
+	if len(system) == 0 && m.System != "" {
 		// add model system prompt since it wasn't provided
-		system = append(system, api.Message{Role: "system", Content: r.model.System})
+		system = append(system, api.Message{Role: "system", Content: m.System})
 	}
 
+	// always include the last message
 	n := len(msgs) - 1
+	// in reverse, find all messages that fit into context window
 	for i := n - 1; i >= 0; i-- {
 		var b bytes.Buffer
-		if err := r.model.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...)}); err != nil {
+		if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...)}); err != nil {
 			return "", nil, err
 		}
 
-		s, err := r.llama.Tokenize(ctx, b.String())
+		s, err := tokenize(ctx, b.String())
 		if err != nil {
 			return "", nil, err
 		}
 
 		c := len(s)
-		if r.model.ProjectorPaths != nil {
+		if m.ProjectorPaths != nil {
 			for _, m := range msgs[i:] {
-				// TODO: get image embedding length from project metadata
+				// images are represented as 768 sized embeddings
+				// TODO: get embedding length from project metadata
 				c += 768 * len(m.Images)
 			}
 		}
 
-		if c > r.NumCtx {
+		if c > opts.NumCtx {
 			slog.Debug("truncating input messages which exceed context length", "truncated", len(msgs[i:]))
 			break
 		} else {
@@ -56,8 +64,9 @@ func chatPrompt(ctx context.Context, r *runnerRef, msgs []api.Message) (prompt s
 		}
 	}
 
+	// truncate any messages that do not fit into the context window
 	var b bytes.Buffer
-	if err := r.model.Template.Execute(&b, template.Values{Messages: append(system, msgs[n:]...)}); err != nil {
+	if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[n:]...)}); err != nil {
 		return "", nil, err
 	}
 
diff --git a/server/prompt_test.go b/server/prompt_test.go
index 59288b46..d4cee98c 100644
--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@@ -7,15 +7,10 @@ import (
 	"testing"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/template"
 )
 
-type mock struct {
-	llm.LlamaServer
-}
-
-func (m mock) Tokenize(_ context.Context, s string) (tokens []int, err error) {
+func tokenize(_ context.Context, s string) (tokens []int, err error) {
 	for range strings.Fields(s) {
 		tokens = append(tokens, len(tokens))
 	}
 
@@ -48,7 +43,7 @@ func TestChatPrompt(t *testing.T) {
 			},
 		},
 		{
-			name: "truncate messages",
+			name:  "truncate messages",
 			limit: 1,
 			msgs: []api.Message{
 				{Role: "user", Content: "You're a test, Harry!"},
 				{Role: "assistant", Content: "I-I'm a what?"},
 				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager."},
@@ -60,7 +55,7 @@ func TestChatPrompt(t *testing.T) {
 			},
 		},
 		{
-			name: "truncate messages with image",
+			name:  "truncate messages with image",
 			limit: 64,
 			msgs: []api.Message{
 				{Role: "user", Content: "You're a test, Harry!"},
 				{Role: "assistant", Content: "I-I'm a what?"},
 				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{[]byte("something")}},
@@ -75,7 +70,7 @@ func TestChatPrompt(t *testing.T) {
 			},
 		},
 		{
-			name: "truncate messages with images",
+			name:  "truncate messages with images",
 			limit: 64,
 			msgs: []api.Message{
 				{Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{[]byte("something")}},
@@ -90,7 +85,7 @@ func TestChatPrompt(t *testing.T) {
 			},
 		},
 		{
-			name: "messages with images",
+			name:  "messages with images",
 			limit: 2048,
 			msgs: []api.Message{
 				{Role: "user", Content: "You're a test, Harry!", Images: []api.ImageData{[]byte("something")}},
@@ -106,7 +101,7 @@ func TestChatPrompt(t *testing.T) {
 			},
 		},
 		{
-			name: "message with image tag",
+			name:  "message with image tag",
 			limit: 2048,
 			msgs: []api.Message{
 				{Role: "user", Content: "You're a test, Harry! [img]", Images: []api.ImageData{[]byte("something")}},
@@ -122,7 +117,7 @@ func TestChatPrompt(t *testing.T) {
 			},
 		},
 		{
-			name: "messages with interleaved images",
+			name:  "messages with interleaved images",
 			limit: 2048,
 			msgs: []api.Message{
 				{Role: "user", Content: "You're a test, Harry!"},
@@ -140,7 +135,7 @@ func TestChatPrompt(t *testing.T) {
 			},
 		},
 		{
-			name: "truncate message with interleaved images",
+			name:  "truncate message with interleaved images",
 			limit: 1024,
 			msgs: []api.Message{
 				{Role: "user", Content: "You're a test, Harry!"},
@@ -157,7 +152,7 @@ func TestChatPrompt(t *testing.T) {
 			},
 		},
 		{
-			name: "message with system prompt",
+			name:  "message with system prompt",
 			limit: 2048,
 			msgs: []api.Message{
 				{Role: "system", Content: "You are the Test Who Lived."},
@@ -181,14 +176,9 @@ func TestChatPrompt(t *testing.T) {
 
 	for _, tt := range cases {
 		t.Run(tt.name, func(t *testing.T) {
-			r := runnerRef{
-				llama:   mock{},
-				model:   &Model{Template: tmpl, ProjectorPaths: []string{"vision"}},
-				Options: &api.Options{},
-			}
-
-			r.NumCtx = tt.limit
-			prompt, images, err := chatPrompt(context.TODO(), &r, tt.msgs)
+			model := Model{Template: tmpl, ProjectorPaths: []string{"vision"}}
+			opts := api.Options{Runner: api.Runner{NumCtx: tt.limit}}
+			prompt, images, err := chatPrompt(context.TODO(), &model, tokenize, &opts, tt.msgs)
 			if err != nil {
 				t.Fatal(err)
 			}
 
diff --git a/server/routes.go b/server/routes.go
index 35e64511..1a93e977 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -54,6 +54,8 @@ func init() {
 	gin.SetMode(mode)
 }
 
+var errRequired = errors.New("is required")
+
 func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options, error) {
 	opts := api.DefaultOptions()
 	if err := opts.FromMap(model.Options); err != nil {
@@ -69,7 +71,7 @@ func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options, error) {
 
 func (s *Server) scheduleRunner(ctx context.Context, name string, caps []Capability, requestOpts map[string]any, keepAlive *api.Duration) (*runnerRef, error) {
 	if name == "" {
-		return nil, errors.New("model is required")
+		return nil, fmt.Errorf("model %w", errRequired)
 	}
 
 	model, err := GetModel(name)
@@ -121,7 +123,17 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support generate", req.Model)})
 		return
 	} else if err != nil {
-		handleScheduleError(c, err)
+		handleScheduleError(c, req.Model, err)
+		return
+	}
+
+	if req.Prompt == "" {
+		c.JSON(http.StatusOK, api.GenerateResponse{
+			Model:      req.Model,
+			CreatedAt:  time.Now().UTC(),
+			Done:       true,
+			DoneReason: "load",
+		})
 		return
 	}
 
@@ -139,23 +151,11 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			msgs = append(msgs, api.Message{Role: "system", Content: r.model.System})
 		}
 
-		if req.Prompt != "" {
-			for _, i := range images {
-				msgs = append(msgs, api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)})
-			}
-
-			msgs = append(msgs, api.Message{Role: "user", Content: req.Prompt})
+		for _, i := range images {
api.Message{Role: "user", Content: fmt.Sprintf("[img-%d]", i.ID)}) } - if len(msgs) == 0 { - c.JSON(http.StatusOK, api.GenerateResponse{ - Model: req.Model, - CreatedAt: time.Now().UTC(), - Done: true, - DoneReason: "load", - }) - return - } + msgs = append(msgs, api.Message{Role: "user", Content: req.Prompt}) tmpl := r.model.Template if req.Template != "" { @@ -256,7 +256,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) { r, err := s.scheduleRunner(c.Request.Context(), req.Model, []Capability{}, req.Options, req.KeepAlive) if err != nil { - handleScheduleError(c, err) + handleScheduleError(c, req.Model, err) return } @@ -1135,7 +1135,7 @@ func (s *Server) ChatHandler(c *gin.Context) { c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support chat", req.Model)}) return } else if err != nil { - handleScheduleError(c, err) + handleScheduleError(c, req.Model, err) return } @@ -1150,7 +1150,7 @@ func (s *Server) ChatHandler(c *gin.Context) { return } - prompt, images, err := chatPrompt(c.Request.Context(), r, req.Messages) + prompt, images, err := chatPrompt(c.Request.Context(), r.model, r.llama.Tokenize, r.Options, req.Messages) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return @@ -1215,12 +1215,16 @@ func (s *Server) ChatHandler(c *gin.Context) { streamResponse(c, ch) } -func handleScheduleError(c *gin.Context, err error) { +func handleScheduleError(c *gin.Context, name string, err error) { switch { + case errors.Is(err, errRequired): + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) case errors.Is(err, context.Canceled): c.JSON(499, gin.H{"error": "request canceled"}) case errors.Is(err, ErrMaxQueue): c.JSON(http.StatusServiceUnavailable, gin.H{"error": err.Error()}) + case errors.Is(err, os.ErrNotExist): + c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model %q not found, try pulling it first", name)}) default: c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) } diff --git a/template/template.go b/template/template.go index cfba5a23..c8f8f6d0 100644 --- a/template/template.go +++ b/template/template.go @@ -83,6 +83,7 @@ type Template struct { raw string } +// response is a template node that can be added to templates that don't already have one var response = parse.ActionNode{ NodeType: parse.NodeAction, Pipe: &parse.PipeNode{ @@ -101,28 +102,25 @@ var response = parse.ActionNode{ }, } +var funcs = template.FuncMap{ + "toJson": func(v any) string { + b, err := json.Marshal(v) + if err != nil { + return "" + } + + return string(b) + }, + "add": func(a, b int) int { + return a + b + }, + "sub": func(a, b int) int { + return a - b + }, +} + func Parse(s string) (*Template, error) { - tmpl := template.New("").Option("missingkey=zero").Funcs(template.FuncMap{ - "toJson": func(v any) string { - b, err := json.Marshal(v) - if err != nil { - return "" - } - - return string(b) - }, - "isLastMessage": func(s []*api.Message, m *api.Message) bool { - for i := len(s) - 1; i >= 0; i-- { - if m.Role != s[i].Role { - continue - } - - return m == s[i] - } - - return false - }, - }) + tmpl := template.New("").Option("missingkey=zero").Funcs(funcs) tmpl, err := tmpl.Parse(s) if err != nil { @@ -218,7 +216,13 @@ func (t *Template) Execute(w io.Writer, v Values) error { return err } -func collate(msgs []api.Message) (system string, collated []*api.Message) { +type messages []*api.Message + +// collate messages based on role. consecutive messages of the same role are merged +// into a single message. 
collate also pulls out and merges messages with Role == "system" +// which are templated separately. As a side effect, it mangles message content adding image +// tags ([img-%d]) as needed +func collate(msgs []api.Message) (system string, collated messages) { var n int for i := range msgs { msg := msgs[i] diff --git a/template/template_test.go b/template/template_test.go index 5d5dad4b..ac92bf48 100644 --- a/template/template_test.go +++ b/template/template_test.go @@ -8,6 +8,7 @@ import ( "os" "path/filepath" "slices" + "strconv" "testing" "text/template" @@ -15,6 +16,98 @@ import ( "github.com/ollama/ollama/llm" ) +func TestFuncs(t *testing.T) { + t.Run("toJson", func(t *testing.T) { + cases := []struct { + input any + expected string + }{ + {nil, "null"}, + {true, "true"}, + {false, "false"}, + {0, "0"}, + {1, "1"}, + {1.0, "1"}, + {1.1, "1.1"}, + {"", `""`}, + {"hello", `"hello"`}, + {[]int{1, 2, 3}, "[1,2,3]"}, + {[]string{"a", "b", "c"}, `["a","b","c"]`}, + {map[string]int{"a": 1, "b": 2}, `{"a":1,"b":2}`}, + {map[string]string{"a": "b", "c": "d"}, `{"a":"b","c":"d"}`}, + } + + for _, tt := range cases { + t.Run(tt.expected, func(t *testing.T) { + toJson, ok := funcs["toJson"].(func(any) string) + if !ok { + t.Fatal("toJson is not a function") + } + + if s := toJson(tt.input); s != tt.expected { + t.Errorf("expected %q, got %q", tt.expected, s) + } + }) + } + }) + + t.Run("add", func(t *testing.T) { + cases := []struct { + a, b int + expected int + }{ + {0, 0, 0}, + {0, 1, 1}, + {1, 0, 1}, + {1, 1, 2}, + {1, -1, 0}, + {-1, 1, 0}, + {-1, -1, -2}, + } + + for _, tt := range cases { + t.Run(strconv.Itoa(tt.expected), func(t *testing.T) { + add, ok := funcs["add"].(func(int, int) int) + if !ok { + t.Fatal("add is not a function") + } + + if n := add(tt.a, tt.b); n != tt.expected { + t.Errorf("expected %d, got %d", tt.expected, n) + } + }) + } + }) + + t.Run("sub", func(t *testing.T) { + cases := []struct { + a, b int + expected int + }{ + {0, 0, 0}, + {0, 1, -1}, + {1, 0, 1}, + {1, 1, 0}, + {1, -1, 2}, + {-1, 1, -2}, + {-1, -1, 0}, + } + + for _, tt := range cases { + t.Run(strconv.Itoa(tt.expected), func(t *testing.T) { + sub, ok := funcs["sub"].(func(int, int) int) + if !ok { + t.Fatal("sub is not a function") + } + + if n := sub(tt.a, tt.b); n != tt.expected { + t.Errorf("expected %d, got %d", tt.expected, n) + } + }) + } + }) +} + func TestNamed(t *testing.T) { f, err := os.Open(filepath.Join("testdata", "templates.jsonl")) if err != nil { @@ -89,77 +182,86 @@ func TestParse(t *testing.T) { } func TestExecuteWithMessages(t *testing.T) { + type template struct { + name string + template string + } cases := []struct { - templates []string + name string + templates []template values Values expected string }{ { - []string{ - `[INST] {{ if .System }}{{ .System }}{{ print "\n\n" }}{{ end }}{{ .Prompt }}[/INST] `, - `[INST] {{ if .System }}{{ .System }}{{ print "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`, - `{{- range .Messages }} -{{- if eq .Role "user" }}[INST] {{ if and (isLastMessage $.Messages .) $.System }}{{ $.System }}{{ print "\n\n" }} + "mistral", + []template{ + {"no response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] `}, + {"response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`}, + {"messages", `{{- range .Messages }} +{{- if eq .Role "user" }}[INST] {{ if and (eq (index $.Messages (sub (len $.Messages) 1)) .) 
$.System }}{{ $.System }}{{ "\n\n" }} {{- end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }} {{- end }} -{{- end }}`, +{{- end }}`}, }, Values{ Messages: []api.Message{ {Role: "user", Content: "Hello friend!"}, {Role: "assistant", Content: "Hello human!"}, - {Role: "user", Content: "Yay!"}, + {Role: "user", Content: "What is your name?"}, }, }, - `[INST] Hello friend![/INST] Hello human![INST] Yay![/INST] `, + `[INST] Hello friend![/INST] Hello human![INST] What is your name?[/INST] `, }, { - []string{ - `[INST] {{ if .System }}{{ .System }}{{ print "\n\n" }}{{ end }}{{ .Prompt }}[/INST] `, - `[INST] {{ if .System }}{{ .System }}{{ print "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`, - ` + "mistral system", + []template{ + {"no response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] `}, + {"response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`}, + {"messages", ` {{- range .Messages }} -{{- if eq .Role "user" }}[INST] {{ if and (isLastMessage $.Messages .) $.System }}{{ $.System }}{{ print "\n\n" }} +{{- if eq .Role "user" }}[INST] {{ if and (eq (index $.Messages (sub (len $.Messages) 1)) .) $.System }}{{ $.System }}{{ "\n\n" }} {{- end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }} {{- end }} -{{- end }}`, +{{- end }}`}, }, Values{ Messages: []api.Message{ {Role: "system", Content: "You are a helpful assistant!"}, {Role: "user", Content: "Hello friend!"}, {Role: "assistant", Content: "Hello human!"}, - {Role: "user", Content: "Yay!"}, + {Role: "user", Content: "What is your name?"}, }, }, `[INST] Hello friend![/INST] Hello human![INST] You are a helpful assistant! -Yay![/INST] `, +What is your name?[/INST] `, }, { - []string{ - `{{ if .System }}<|im_start|>system + "chatml", + []template{ + // this does not have a "no response" test because it's impossible to render the same output + {"response", `{{ if .System }}<|im_start|>system {{ .System }}<|im_end|> {{ end }}{{ if .Prompt }}<|im_start|>user {{ .Prompt }}<|im_end|> {{ end }}<|im_start|>assistant {{ .Response }}<|im_end|> -`, - ` +`}, + {"messages", ` {{- range .Messages }} -{{- if and (eq .Role "user") (isLastMessage $.Messages .) $.System }}<|im_start|>system -{{ $.System }}<|im_end|>{{ print "\n" }} +{{- if and (eq .Role "user") (eq (index $.Messages (sub (len $.Messages) 1)) .) 
$.System }}<|im_start|>system +{{ $.System }}<|im_end|>{{ "\n" }} {{- end }}<|im_start|>{{ .Role }} -{{ .Content }}<|im_end|>{{ print "\n" }} +{{ .Content }}<|im_end|>{{ "\n" }} {{- end }}<|im_start|>assistant -`, +`}, }, Values{ Messages: []api.Message{ {Role: "system", Content: "You are a helpful assistant!"}, {Role: "user", Content: "Hello friend!"}, {Role: "assistant", Content: "Hello human!"}, - {Role: "user", Content: "Yay!"}, + {Role: "user", Content: "What is your name?"}, }, }, `<|im_start|>user @@ -169,23 +271,25 @@ Hello human!<|im_end|> <|im_start|>system You are a helpful assistant!<|im_end|> <|im_start|>user -Yay!<|im_end|> +What is your name?<|im_end|> <|im_start|>assistant `, }, { - []string{ - `{{ if .Prompt }}Question: {{ .Prompt }} + "moondream", + []template{ + // this does not have a "no response" test because it's impossible to render the same output + {"response", `{{ if .Prompt }}Question: {{ .Prompt }} {{ end }}Answer: {{ .Response }} -`, - ` +`}, + {"messages", ` {{- range .Messages }} -{{- if eq .Role "user" }}Question: {{ .Content }}{{ print "\n\n" }} -{{- else if eq .Role "assistant" }}Answer: {{ .Content }}{{ print "\n\n" }} +{{- if eq .Role "user" }}Question: {{ .Content }}{{ "\n\n" }} +{{- else if eq .Role "assistant" }}Answer: {{ .Content }}{{ "\n\n" }} {{- end }} -{{- end }}Answer: `, +{{- end }}Answer: `}, }, Values{ Messages: []api.Message{ @@ -211,10 +315,10 @@ Answer: `, } for _, tt := range cases { - t.Run("", func(t *testing.T) { - for _, tmpl := range tt.templates { - t.Run("", func(t *testing.T) { - tmpl, err := Parse(tmpl) + t.Run(tt.name, func(t *testing.T) { + for _, ttt := range tt.templates { + t.Run(ttt.name, func(t *testing.T) { + tmpl, err := Parse(ttt.template) if err != nil { t.Fatal(err) } From ac7a842e550721fbc00e36e416e7cf6606993149 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 3 Jul 2024 09:00:07 -0700 Subject: [PATCH 04/48] fix model reloading ensure runtime model changes (template, system prompt, messages, options) are captured on model updates without needing to reload the server --- llm/server.go | 2 +- server/routes.go | 42 ++++++++++++++++++++++-------------------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/llm/server.go b/llm/server.go index 206f9e39..229d61e4 100644 --- a/llm/server.go +++ b/llm/server.go @@ -679,7 +679,7 @@ type CompletionRequest struct { Prompt string Format string Images []ImageData - Options api.Options + Options *api.Options } type CompletionResponse struct { diff --git a/server/routes.go b/server/routes.go index 1a93e977..4059c7c5 100644 --- a/server/routes.go +++ b/server/routes.go @@ -69,23 +69,25 @@ func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options return opts, nil } -func (s *Server) scheduleRunner(ctx context.Context, name string, caps []Capability, requestOpts map[string]any, keepAlive *api.Duration) (*runnerRef, error) { +// scheduleRunner schedules a runner after validating inputs such as capabilities and model options. +// It returns the allocated runner, model instance, and consolidated options if successful and error otherwise. 
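+//
+// A hypothetical call site, for illustration only (the handler changes in this
+// patch follow the same pattern):
+//
+//	llama, model, opts, err := s.scheduleRunner(ctx, req.Model, []Capability{CapabilityCompletion}, req.Options, req.KeepAlive)
+//	if err != nil {
+//		handleScheduleError(c, req.Model, err)
+//		return
+//	}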
+func (s *Server) scheduleRunner(ctx context.Context, name string, caps []Capability, requestOpts map[string]any, keepAlive *api.Duration) (llm.LlamaServer, *Model, *api.Options, error) { if name == "" { - return nil, fmt.Errorf("model %w", errRequired) + return nil, nil, nil, fmt.Errorf("model %w", errRequired) } model, err := GetModel(name) if err != nil { - return nil, err + return nil, nil, nil, err } if err := model.CheckCapabilities(caps...); err != nil { - return nil, fmt.Errorf("%s %w", name, err) + return nil, nil, nil, fmt.Errorf("%s %w", name, err) } opts, err := modelOptions(model, requestOpts) if err != nil { - return nil, err + return nil, nil, nil, err } runnerCh, errCh := s.sched.GetRunner(ctx, model, opts, keepAlive) @@ -93,10 +95,10 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []Capabil select { case runner = <-runnerCh: case err = <-errCh: - return nil, err + return nil, nil, nil, err } - return runner, nil + return runner.llama, model, &opts, nil } func (s *Server) GenerateHandler(c *gin.Context) { @@ -118,7 +120,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { } caps := []Capability{CapabilityCompletion} - r, err := s.scheduleRunner(c.Request.Context(), req.Model, caps, req.Options, req.KeepAlive) + r, m, opts, err := s.scheduleRunner(c.Request.Context(), req.Model, caps, req.Options, req.KeepAlive) if errors.Is(err, errCapabilityCompletion) { c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support generate", req.Model)}) return @@ -147,8 +149,8 @@ func (s *Server) GenerateHandler(c *gin.Context) { var msgs []api.Message if req.System != "" { msgs = append(msgs, api.Message{Role: "system", Content: req.System}) - } else if r.model.System != "" { - msgs = append(msgs, api.Message{Role: "system", Content: r.model.System}) + } else if m.System != "" { + msgs = append(msgs, api.Message{Role: "system", Content: m.System}) } for _, i := range images { @@ -157,7 +159,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { msgs = append(msgs, api.Message{Role: "user", Content: req.Prompt}) - tmpl := r.model.Template + tmpl := m.Template if req.Template != "" { tmpl, err = template.Parse(req.Template) if err != nil { @@ -168,7 +170,7 @@ func (s *Server) GenerateHandler(c *gin.Context) { var b bytes.Buffer if req.Context != nil { - s, err := r.llama.Detokenize(c.Request.Context(), req.Context) + s, err := r.Detokenize(c.Request.Context(), req.Context) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return @@ -190,11 +192,11 @@ func (s *Server) GenerateHandler(c *gin.Context) { ch := make(chan any) go func() { defer close(ch) - if err := r.llama.Completion(c.Request.Context(), llm.CompletionRequest{ + if err := r.Completion(c.Request.Context(), llm.CompletionRequest{ Prompt: prompt, Images: images, Format: req.Format, - Options: *r.Options, + Options: opts, }, func(r llm.CompletionResponse) { ch <- api.GenerateResponse{ Model: req.Model, @@ -254,7 +256,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) { return } - r, err := s.scheduleRunner(c.Request.Context(), req.Model, []Capability{}, req.Options, req.KeepAlive) + r, _, _, err := s.scheduleRunner(c.Request.Context(), req.Model, []Capability{}, req.Options, req.KeepAlive) if err != nil { handleScheduleError(c, req.Model, err) return @@ -266,7 +268,7 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) { return } - embedding, err := r.llama.Embedding(c.Request.Context(), req.Prompt) + embedding, err := 
r.Embedding(c.Request.Context(), req.Prompt) if err != nil { slog.Info(fmt.Sprintf("embedding generation failed: %v", err)) c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"}) @@ -1130,7 +1132,7 @@ func (s *Server) ChatHandler(c *gin.Context) { } caps := []Capability{CapabilityCompletion} - r, err := s.scheduleRunner(c.Request.Context(), req.Model, caps, req.Options, req.KeepAlive) + r, m, opts, err := s.scheduleRunner(c.Request.Context(), req.Model, caps, req.Options, req.KeepAlive) if errors.Is(err, errCapabilityCompletion) { c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("%q does not support chat", req.Model)}) return @@ -1150,7 +1152,7 @@ func (s *Server) ChatHandler(c *gin.Context) { return } - prompt, images, err := chatPrompt(c.Request.Context(), r.model, r.llama.Tokenize, r.Options, req.Messages) + prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, req.Messages) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return @@ -1161,11 +1163,11 @@ func (s *Server) ChatHandler(c *gin.Context) { ch := make(chan any) go func() { defer close(ch) - if err := r.llama.Completion(c.Request.Context(), llm.CompletionRequest{ + if err := r.Completion(c.Request.Context(), llm.CompletionRequest{ Prompt: prompt, Images: images, Format: req.Format, - Options: *r.Options, + Options: opts, }, func(r llm.CompletionResponse) { ch <- api.ChatResponse{ Model: req.Model, From 326363b3a72d9e2972a019dfc4c6147ea901f501 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 3 Jul 2024 13:49:14 -0700 Subject: [PATCH 05/48] no funcs --- template/template.go | 19 +------ template/template_test.go | 105 +++----------------------------------- 2 files changed, 7 insertions(+), 117 deletions(-) diff --git a/template/template.go b/template/template.go index c8f8f6d0..b133b97e 100644 --- a/template/template.go +++ b/template/template.go @@ -102,25 +102,8 @@ var response = parse.ActionNode{ }, } -var funcs = template.FuncMap{ - "toJson": func(v any) string { - b, err := json.Marshal(v) - if err != nil { - return "" - } - - return string(b) - }, - "add": func(a, b int) int { - return a + b - }, - "sub": func(a, b int) int { - return a - b - }, -} - func Parse(s string) (*Template, error) { - tmpl := template.New("").Option("missingkey=zero").Funcs(funcs) + tmpl := template.New("").Option("missingkey=zero") tmpl, err := tmpl.Parse(s) if err != nil { diff --git a/template/template_test.go b/template/template_test.go index ac92bf48..ac16bd60 100644 --- a/template/template_test.go +++ b/template/template_test.go @@ -8,7 +8,6 @@ import ( "os" "path/filepath" "slices" - "strconv" "testing" "text/template" @@ -16,98 +15,6 @@ import ( "github.com/ollama/ollama/llm" ) -func TestFuncs(t *testing.T) { - t.Run("toJson", func(t *testing.T) { - cases := []struct { - input any - expected string - }{ - {nil, "null"}, - {true, "true"}, - {false, "false"}, - {0, "0"}, - {1, "1"}, - {1.0, "1"}, - {1.1, "1.1"}, - {"", `""`}, - {"hello", `"hello"`}, - {[]int{1, 2, 3}, "[1,2,3]"}, - {[]string{"a", "b", "c"}, `["a","b","c"]`}, - {map[string]int{"a": 1, "b": 2}, `{"a":1,"b":2}`}, - {map[string]string{"a": "b", "c": "d"}, `{"a":"b","c":"d"}`}, - } - - for _, tt := range cases { - t.Run(tt.expected, func(t *testing.T) { - toJson, ok := funcs["toJson"].(func(any) string) - if !ok { - t.Fatal("toJson is not a function") - } - - if s := toJson(tt.input); s != tt.expected { - t.Errorf("expected %q, got %q", tt.expected, s) - } - }) - } - }) - - 
t.Run("add", func(t *testing.T) { - cases := []struct { - a, b int - expected int - }{ - {0, 0, 0}, - {0, 1, 1}, - {1, 0, 1}, - {1, 1, 2}, - {1, -1, 0}, - {-1, 1, 0}, - {-1, -1, -2}, - } - - for _, tt := range cases { - t.Run(strconv.Itoa(tt.expected), func(t *testing.T) { - add, ok := funcs["add"].(func(int, int) int) - if !ok { - t.Fatal("add is not a function") - } - - if n := add(tt.a, tt.b); n != tt.expected { - t.Errorf("expected %d, got %d", tt.expected, n) - } - }) - } - }) - - t.Run("sub", func(t *testing.T) { - cases := []struct { - a, b int - expected int - }{ - {0, 0, 0}, - {0, 1, -1}, - {1, 0, 1}, - {1, 1, 0}, - {1, -1, 2}, - {-1, 1, -2}, - {-1, -1, 0}, - } - - for _, tt := range cases { - t.Run(strconv.Itoa(tt.expected), func(t *testing.T) { - sub, ok := funcs["sub"].(func(int, int) int) - if !ok { - t.Fatal("sub is not a function") - } - - if n := sub(tt.a, tt.b); n != tt.expected { - t.Errorf("expected %d, got %d", tt.expected, n) - } - }) - } - }) -} - func TestNamed(t *testing.T) { f, err := os.Open(filepath.Join("testdata", "templates.jsonl")) if err != nil { @@ -197,8 +104,8 @@ func TestExecuteWithMessages(t *testing.T) { []template{ {"no response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] `}, {"response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`}, - {"messages", `{{- range .Messages }} -{{- if eq .Role "user" }}[INST] {{ if and (eq (index $.Messages (sub (len $.Messages) 1)) .) $.System }}{{ $.System }}{{ "\n\n" }} + {"messages", `{{- range $index, $_ := .Messages }} +{{- if eq .Role "user" }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }}{{ "\n\n" }} {{- end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }} {{- end }} {{- end }}`}, @@ -218,8 +125,8 @@ func TestExecuteWithMessages(t *testing.T) { {"no response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] `}, {"response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`}, {"messages", ` -{{- range .Messages }} -{{- if eq .Role "user" }}[INST] {{ if and (eq (index $.Messages (sub (len $.Messages) 1)) .) $.System }}{{ $.System }}{{ "\n\n" }} +{{- range $index, $_ := .Messages }} +{{- if eq .Role "user" }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }}{{ "\n\n" }} {{- end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }} {{- end }} {{- end }}`}, @@ -248,8 +155,8 @@ What is your name?[/INST] `, {{ .Response }}<|im_end|> `}, {"messages", ` -{{- range .Messages }} -{{- if and (eq .Role "user") (eq (index $.Messages (sub (len $.Messages) 1)) .) 
$.System }}<|im_start|>system +{{- range $index, $_ := .Messages }} +{{- if and (eq .Role "user") (eq (len (slice $.Messages $index)) 1) $.System }}<|im_start|>system {{ $.System }}<|im_end|>{{ "\n" }} {{- end }}<|im_start|>{{ .Role }} {{ .Content }}<|im_end|>{{ "\n" }} From fb6cbc02fbe0ff8d791413a81558a1fe9725b778 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 27 Jun 2024 14:15:17 -0700 Subject: [PATCH 06/48] update named templates --- go.mod | 3 +- server/routes_create_test.go | 4 +- template/alfred.gotmpl | 9 ++- template/alpaca.gotmpl | 14 +++- template/chatml.gotmpl | 11 ++- template/chatqa.gotmpl | 14 +++- template/codellama-70b-instruct.gotmpl | 13 +++- template/falcon-instruct.gotmpl | 12 +++- template/gemma-instruct.gotmpl | 14 +++- template/granite-instruct.gotmpl | 16 ++++- template/llama2-chat.gotmpl | 15 +++- template/llama3-instruct.gotmpl | 14 +++- template/magicoder.gotmpl | 15 +++- template/mistral-instruct.gotmpl | 15 ++-- template/openchat.gotmpl | 12 +++- template/phi-3.gotmpl | 11 ++- template/solar-instruct.gotmpl | 16 ++++- template/starcoder2-instruct.gotmpl | 15 ++++ template/template_test.go | 69 ++++++++++++++++++- .../alfred.gotmpl/system-user-assistant-user | 1 + template/testdata/alfred.gotmpl/user | 1 + .../alfred.gotmpl/user-assistant-user | 1 + .../alpaca.gotmpl/system-user-assistant-user | 10 +++ template/testdata/alpaca.gotmpl/user | 4 ++ .../alpaca.gotmpl/user-assistant-user | 10 +++ .../chatml.gotmpl/system-user-assistant-user | 9 +++ template/testdata/chatml.gotmpl/user | 3 + .../chatml.gotmpl/user-assistant-user | 7 ++ .../chatqa.gotmpl/system-user-assistant-user | 9 +++ template/testdata/chatqa.gotmpl/user | 3 + .../chatqa.gotmpl/user-assistant-user | 7 ++ .../system-user-assistant-user | 11 +++ .../codellama-70b-instruct.gotmpl/user | 5 ++ .../user-assistant-user | 9 +++ .../system-user-assistant-user | 8 +++ template/testdata/falcon-instruct.gotmpl/user | 3 + .../user-assistant-user | 7 ++ .../system-user-assistant-user | 8 +++ template/testdata/gemma-instruct.gotmpl/user | 3 + .../gemma-instruct.gotmpl/user-assistant-user | 7 ++ .../system-user-assistant-user | 13 ++++ .../testdata/granite-instruct.gotmpl/user | 4 ++ .../user-assistant-user | 10 +++ .../system-user-assistant-user | 5 ++ template/testdata/llama2-chat.gotmpl/user | 3 + .../llama2-chat.gotmpl/user-assistant-user | 3 + .../system-user-assistant-user | 10 +++ template/testdata/llama3-instruct.gotmpl/user | 4 ++ .../user-assistant-user | 8 +++ .../system-user-assistant-user | 12 ++++ template/testdata/magicoder.gotmpl/user | 4 ++ .../magicoder.gotmpl/user-assistant-user | 10 +++ .../system-user-assistant-user | 2 + .../testdata/mistral-instruct.gotmpl/user | 1 + .../user-assistant-user | 1 + .../system-user-assistant-user | 1 + template/testdata/openchat.gotmpl/user | 1 + .../openchat.gotmpl/user-assistant-user | 1 + .../phi-3.gotmpl/system-user-assistant-user | 9 +++ template/testdata/phi-3.gotmpl/user | 3 + .../testdata/phi-3.gotmpl/user-assistant-user | 7 ++ .../system-user-assistant-user | 13 ++++ template/testdata/solar-instruct.gotmpl/user | 4 ++ .../solar-instruct.gotmpl/user-assistant-user | 10 +++ .../system-user-assistant-user | 12 ++++ .../testdata/starcoder2-instruct.gotmpl/user | 4 ++ .../user-assistant-user | 10 +++ .../vicuna.gotmpl/system-user-assistant-user | 6 ++ template/testdata/vicuna.gotmpl/user | 2 + .../vicuna.gotmpl/user-assistant-user | 4 ++ .../zephyr.gotmpl/system-user-assistant-user | 9 +++ template/testdata/zephyr.gotmpl/user | 3 + 
.../zephyr.gotmpl/user-assistant-user | 7 ++ template/vicuna.gotmpl | 13 +++- template/zephyr.gotmpl | 11 ++- 75 files changed, 611 insertions(+), 27 deletions(-) create mode 100644 template/testdata/alfred.gotmpl/system-user-assistant-user create mode 100644 template/testdata/alfred.gotmpl/user create mode 100644 template/testdata/alfred.gotmpl/user-assistant-user create mode 100644 template/testdata/alpaca.gotmpl/system-user-assistant-user create mode 100644 template/testdata/alpaca.gotmpl/user create mode 100644 template/testdata/alpaca.gotmpl/user-assistant-user create mode 100644 template/testdata/chatml.gotmpl/system-user-assistant-user create mode 100644 template/testdata/chatml.gotmpl/user create mode 100644 template/testdata/chatml.gotmpl/user-assistant-user create mode 100644 template/testdata/chatqa.gotmpl/system-user-assistant-user create mode 100644 template/testdata/chatqa.gotmpl/user create mode 100644 template/testdata/chatqa.gotmpl/user-assistant-user create mode 100644 template/testdata/codellama-70b-instruct.gotmpl/system-user-assistant-user create mode 100644 template/testdata/codellama-70b-instruct.gotmpl/user create mode 100644 template/testdata/codellama-70b-instruct.gotmpl/user-assistant-user create mode 100644 template/testdata/falcon-instruct.gotmpl/system-user-assistant-user create mode 100644 template/testdata/falcon-instruct.gotmpl/user create mode 100644 template/testdata/falcon-instruct.gotmpl/user-assistant-user create mode 100644 template/testdata/gemma-instruct.gotmpl/system-user-assistant-user create mode 100644 template/testdata/gemma-instruct.gotmpl/user create mode 100644 template/testdata/gemma-instruct.gotmpl/user-assistant-user create mode 100644 template/testdata/granite-instruct.gotmpl/system-user-assistant-user create mode 100644 template/testdata/granite-instruct.gotmpl/user create mode 100644 template/testdata/granite-instruct.gotmpl/user-assistant-user create mode 100644 template/testdata/llama2-chat.gotmpl/system-user-assistant-user create mode 100644 template/testdata/llama2-chat.gotmpl/user create mode 100644 template/testdata/llama2-chat.gotmpl/user-assistant-user create mode 100644 template/testdata/llama3-instruct.gotmpl/system-user-assistant-user create mode 100644 template/testdata/llama3-instruct.gotmpl/user create mode 100644 template/testdata/llama3-instruct.gotmpl/user-assistant-user create mode 100644 template/testdata/magicoder.gotmpl/system-user-assistant-user create mode 100644 template/testdata/magicoder.gotmpl/user create mode 100644 template/testdata/magicoder.gotmpl/user-assistant-user create mode 100644 template/testdata/mistral-instruct.gotmpl/system-user-assistant-user create mode 100644 template/testdata/mistral-instruct.gotmpl/user create mode 100644 template/testdata/mistral-instruct.gotmpl/user-assistant-user create mode 100644 template/testdata/openchat.gotmpl/system-user-assistant-user create mode 100644 template/testdata/openchat.gotmpl/user create mode 100644 template/testdata/openchat.gotmpl/user-assistant-user create mode 100644 template/testdata/phi-3.gotmpl/system-user-assistant-user create mode 100644 template/testdata/phi-3.gotmpl/user create mode 100644 template/testdata/phi-3.gotmpl/user-assistant-user create mode 100644 template/testdata/solar-instruct.gotmpl/system-user-assistant-user create mode 100644 template/testdata/solar-instruct.gotmpl/user create mode 100644 template/testdata/solar-instruct.gotmpl/user-assistant-user create mode 100644 
template/testdata/starcoder2-instruct.gotmpl/system-user-assistant-user create mode 100644 template/testdata/starcoder2-instruct.gotmpl/user create mode 100644 template/testdata/starcoder2-instruct.gotmpl/user-assistant-user create mode 100644 template/testdata/vicuna.gotmpl/system-user-assistant-user create mode 100644 template/testdata/vicuna.gotmpl/user create mode 100644 template/testdata/vicuna.gotmpl/user-assistant-user create mode 100644 template/testdata/zephyr.gotmpl/system-user-assistant-user create mode 100644 template/testdata/zephyr.gotmpl/user create mode 100644 template/testdata/zephyr.gotmpl/user-assistant-user diff --git a/go.mod b/go.mod index 6807b9b4..2e0c6614 100644 --- a/go.mod +++ b/go.mod @@ -18,6 +18,7 @@ require ( require ( github.com/agnivade/levenshtein v1.1.1 github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 + github.com/google/go-cmp v0.6.0 github.com/mattn/go-runewidth v0.0.14 github.com/nlpodyssey/gopickle v0.3.0 github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c @@ -71,7 +72,7 @@ require ( golang.org/x/net v0.25.0 // indirect golang.org/x/sys v0.20.0 golang.org/x/term v0.20.0 - golang.org/x/text v0.15.0 // indirect + golang.org/x/text v0.15.0 google.golang.org/protobuf v1.34.1 gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/server/routes_create_test.go b/server/routes_create_test.go index 34061282..269a0ba1 100644 --- a/server/routes_create_test.go +++ b/server/routes_create_test.go @@ -545,9 +545,9 @@ func TestCreateDetectTemplate(t *testing.T) { } checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{ - filepath.Join(p, "blobs", "sha256-2f8e594e6f34b1b4d36a246628eeb3365ce442303d656f1fcc69e821722acea0"), - filepath.Join(p, "blobs", "sha256-542b217f179c7825eeb5bca3c77d2b75ed05bafbd3451d9188891a60a85337c6"), filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"), + filepath.Join(p, "blobs", "sha256-9512c372dfc7d84d6065b8dd2b601aeed8cc1a78e7a7aa784a42fff37f5524b7"), + filepath.Join(p, "blobs", "sha256-b8b78cb8c6eefd14c06f1af042e6161255bf87bbf2dd14fce57cdac893db8139"), }) }) diff --git a/template/alfred.gotmpl b/template/alfred.gotmpl index cecb9d2c..44284f04 100644 --- a/template/alfred.gotmpl +++ b/template/alfred.gotmpl @@ -1 +1,8 @@ -{{ if .System }}{{ .System }}{{ end }}{{ if .Prompt }}{{ .Prompt }}{{ end }}{{ .Response }} \ No newline at end of file +{{- if .Messages }} +{{- if .System }}{{ .System }} +{{- end }} +{{- range .Messages }}{{ .Content }} +{{- end }} +{{- else }} +{{ if .System }}{{ .System }}{{ end }}{{ if .Prompt }}{{ .Prompt }}{{ end }}{{ .Response }} +{{- end }} \ No newline at end of file diff --git a/template/alpaca.gotmpl b/template/alpaca.gotmpl index 440d0662..c1f69dc9 100644 --- a/template/alpaca.gotmpl +++ b/template/alpaca.gotmpl @@ -1,7 +1,19 @@ +{{- if .Messages }} +{{- if .System }}{{ .System }} +{{- end }} +{{- range .Messages }} +{{- if eq .Role "user" }}### Instruction: +{{- else if eq .Role "assistant" }}### Response: +{{- end }} +{{ .Content }} + +{{ end }}### Response: +{{ else }} {{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}### Instruction: {{ .Prompt }} {{ end }}### Response: -{{ .Response }} \ No newline at end of file +{{ .Response }} +{{- end }} \ No newline at end of file diff --git a/template/chatml.gotmpl b/template/chatml.gotmpl index dcf17285..d945547c 100644 --- a/template/chatml.gotmpl +++ b/template/chatml.gotmpl @@ -1,6 +1,15 @@ +{{- if .Messages }} +{{- if .System }}<|im_start|>system +{{ .System }}<|im_end|> +{{ end }} 
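+{{- /* illustrative comment (renders nothing): each message becomes an <|im_start|>ROLE ... <|im_end|> block */ -}}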
+{{- range .Messages }}<|im_start|>{{ .Role }} +{{ .Content }}<|im_end|> +{{ end }}<|im_start|>assistant +{{ else }} {{ if .System }}<|im_start|>system {{ .System }}<|im_end|> {{ end }}{{ if .Prompt }}<|im_start|>user {{ .Prompt }}<|im_end|> {{ end }}<|im_start|>assistant -{{ .Response }}<|im_end|> \ No newline at end of file +{{ .Response }}<|im_end|> +{{- end }} \ No newline at end of file diff --git a/template/chatqa.gotmpl b/template/chatqa.gotmpl index 1ede6227..7022c479 100644 --- a/template/chatqa.gotmpl +++ b/template/chatqa.gotmpl @@ -1,5 +1,17 @@ +{{- if .Messages }} +{{- if .System }}System: {{ .System }} + +{{ end }} +{{- range .Messages }} +{{- if eq .Role "user" }}User: +{{- else if eq .Role "assistant" }}Assistant: +{{- end }} {{ .Content }} + +{{ end }}Assistant: +{{- else }} {{ if .System }}System: {{ .System }} {{ end }}{{ if .Prompt }}User: {{ .Prompt }} -{{ end }}Assistant: <|begin_of_text|>{{ .Response }} \ No newline at end of file +{{ end }}Assistant: <|begin_of_text|>{{ .Response }} +{{- end }} \ No newline at end of file diff --git a/template/codellama-70b-instruct.gotmpl b/template/codellama-70b-instruct.gotmpl index 3196bd6f..392d839e 100644 --- a/template/codellama-70b-instruct.gotmpl +++ b/template/codellama-70b-instruct.gotmpl @@ -1,3 +1,13 @@ +{{- if .Messages }} +{{- if .System }}Source: system + + {{ .System }} {{ end }} +{{- range .Messages }}Source: {{ .Role }} + + {{ .Content }} {{ end }}Source: assistant +Destination: user + +{{ else }} {{ if .System }} Source: system {{ .System }} {{ end }} Source: user @@ -5,4 +15,5 @@ {{ .Prompt }} Source: assistant Destination: user - {{ .Response }} \ No newline at end of file + {{ .Response }} +{{- end }} \ No newline at end of file diff --git a/template/falcon-instruct.gotmpl b/template/falcon-instruct.gotmpl index 2309a1c5..99d67f93 100644 --- a/template/falcon-instruct.gotmpl +++ b/template/falcon-instruct.gotmpl @@ -1,3 +1,13 @@ +{{- if .Messages }} +{{- if .System }}System: {{ .System }} +{{ end }} +{{- range .Messages }} +{{- if eq .Role "user" }}User: +{{ else if eq .Role "assistant" }}Falcon: +{{ end }}{{ .Content }} +{{ end }}Falcon: +{{ else }} {{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}User: {{ .Prompt }} -{{ end }}Assistant: {{ .Response }} \ No newline at end of file +{{ end }}Assistant: {{ .Response }} +{{- end }} \ No newline at end of file diff --git a/template/gemma-instruct.gotmpl b/template/gemma-instruct.gotmpl index 91b9883a..870a8f2e 100644 --- a/template/gemma-instruct.gotmpl +++ b/template/gemma-instruct.gotmpl @@ -1,4 +1,16 @@ +{{- if .Messages }} +{{- range $index, $_ := .Messages }} +{{- if eq .Role "user" }}user +{{- if and $.System (eq $index 0) }} +{{ $.System }} +{{- end }} +{{- else if eq .Role "assistant" }}model +{{- end }} +{{ .Content }} +{{ end }}model +{{ else }} user {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} model -{{ .Response }} \ No newline at end of file +{{ .Response }} +{{- end }} \ No newline at end of file diff --git a/template/granite-instruct.gotmpl b/template/granite-instruct.gotmpl index 2ede647f..327ff3ee 100644 --- a/template/granite-instruct.gotmpl +++ b/template/granite-instruct.gotmpl @@ -1,3 +1,16 @@ +{{- if .Messages }} +{{- if .System }}System: +{{ .System }} + +{{ end }} +{{- range .Messages }} +{{- if eq .Role "user" }}Question: +{{- else if eq .Role "assistant" }}Answer: +{{- end }} +{{ .Content }} + +{{ end }}Answer: +{{ else }} {{ if .System }} System: {{ .System }} @@ -6,4 +19,5 @@ System: {{ .Prompt }} {{ end }}Answer: -{{ 
.Response }}
\ No newline at end of file
+{{ .Response }}
+{{- end }}
\ No newline at end of file
diff --git a/template/llama2-chat.gotmpl b/template/llama2-chat.gotmpl
index a739f690..6327d581 100644
--- a/template/llama2-chat.gotmpl
+++ b/template/llama2-chat.gotmpl
@@ -1,3 +1,16 @@
+{{- if .Messages }}
+{{- range $index, $_ := .Messages }}
+{{- if eq .Role "user" }}[INST] {{ if eq $index 0 }}<<SYS>>
+{{- if $.System }}
+{{ $.System }}
+{{ end }}<</SYS>>
+
+{{ end }}{{ .Content }}
+{{- else }} [/INST] {{ .Content }}
+{{- end }}
+{{- end }} [/INST]
+{{- else }}
 [INST] <<SYS>>{{ .System }}<</SYS>>
 
-{{ .Prompt }} [/INST] {{ .Response }}
\ No newline at end of file
+{{ .Prompt }} [/INST] {{ .Response }}
+{{- end }}
\ No newline at end of file
diff --git a/template/llama3-instruct.gotmpl b/template/llama3-instruct.gotmpl
index 36d0218b..9c81a953 100644
--- a/template/llama3-instruct.gotmpl
+++ b/template/llama3-instruct.gotmpl
@@ -1,7 +1,19 @@
+{{- if .Messages }}
+{{- if .System }}<|start_header_id|>system<|end_header_id|>
+
+{{ .System }}<|eot_id|>
+{{- end }}
+{{- range .Messages }}<|start_header_id|>{{ .Role }}<|end_header_id|>
+
+{{ .Content }}<|eot_id|>
+{{- end }}<|start_header_id|>assistant<|end_header_id|>
+
+{{ else }}
 {{ if .System }}<|start_header_id|>system<|end_header_id|>
 
 {{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>
 
 {{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>
 
-{{ .Response }}<|eot_id|>
\ No newline at end of file
+{{ .Response }}<|eot_id|>
+{{- end }}
\ No newline at end of file
diff --git a/template/magicoder.gotmpl b/template/magicoder.gotmpl
index 306972ec..73a58127 100644
--- a/template/magicoder.gotmpl
+++ b/template/magicoder.gotmpl
@@ -1,7 +1,20 @@
+{{- if .Messages }}
+{{- if .System }}{{ .System }}
+
+{{ end }}
+{{- range .Messages }}
+{{- if eq .Role "user" }}@@ Instruction
+{{- else if eq .Role "assistant" }}@@ Response
+{{- end }}
+{{ .Content }}
+
+{{ end }}@@ Response
+{{ else }}
 {{ if .System }}{{ .System }}
 
 {{ end }}{{ if .Prompt }}@@ Instruction
 {{ .Prompt }}
 
 {{ end }}@@ Response
-{{ .Response }}
\ No newline at end of file
+{{ .Response }}
+{{- end }}
\ No newline at end of file
diff --git a/template/mistral-instruct.gotmpl b/template/mistral-instruct.gotmpl
index dcf17285..eb3d5ced 100644
--- a/template/mistral-instruct.gotmpl
+++ b/template/mistral-instruct.gotmpl
@@ -1,6 +1,9 @@
-{{ if .System }}<|im_start|>system
-{{ .System }}<|im_end|>
-{{ end }}{{ if .Prompt }}<|im_start|>user
-{{ .Prompt }}<|im_end|>
-{{ end }}<|im_start|>assistant
-{{ .Response }}<|im_end|>
\ No newline at end of file
+{{- if .Messages }}
+{{- range $index, $_ := .Messages }}
+{{- if eq .Role "user" }}[INST] {{ if and $.System (eq (len (slice $.Messages $index)) 1) }}{{ $.System }}
+{{ end }}{{ .Content }}
+{{- else if eq .Role "assistant" }}[/INST] {{ .Content }}
+{{- end }}
+{{- end }}[/INST]
+{{- else }}[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} [/INST] {{ .Response }}
+{{- end }}
\ No newline at end of file
diff --git a/template/openchat.gotmpl b/template/openchat.gotmpl
index d2ca3868..d5e1cbb0 100644
--- a/template/openchat.gotmpl
+++ b/template/openchat.gotmpl
@@ -1 +1,11 @@
-{{ .System }}<|end_of_turn|>GPT4 Correct User: {{ .Prompt }}<|end_of_turn|>GPT4 Correct Assistant: {{ .Response }}<|end_of_turn|>
\ No newline at end of file
+{{- if .Messages }}
+{{- if .System }}GPT Correct System: {{ .System }}<|end_of_turn|>
+{{- end }}
+{{- range .Messages }}GPT Correct
+{{- if eq .Role "user" }} User:
+{{- else 
if eq .Role "assistant" }} Assistant: +{{- end }} {{ .Content }}<|end_of_turn|> +{{- end }}GPT Correct Assistant: +{{- else }} +{{ .System }}<|end_of_turn|>GPT4 Correct User: {{ .Prompt }}<|end_of_turn|>GPT4 Correct Assistant: {{ .Response }}<|end_of_turn|> +{{- end }} \ No newline at end of file diff --git a/template/phi-3.gotmpl b/template/phi-3.gotmpl index bf26dcee..a3558d2b 100644 --- a/template/phi-3.gotmpl +++ b/template/phi-3.gotmpl @@ -1,6 +1,15 @@ +{{- if .Messages }} +{{- if .System }}<|system|> +{{ .System }}<|end|> +{{ end }} +{{- range .Messages }}<|{{ .Role }}|> +{{ .Content }}<|end|> +{{ end }}<|assistant|> +{{ else }} {{ if .System }}<|system|> {{ .System }}<|end|> {{ end }}{{ if .Prompt }}<|user|> {{ .Prompt }}<|end|> {{ end }}<|assistant|> -{{ .Response }}<|end|> \ No newline at end of file +{{ .Response }}<|end|> +{{- end }} \ No newline at end of file diff --git a/template/solar-instruct.gotmpl b/template/solar-instruct.gotmpl index c275a26a..caa6e8e7 100644 --- a/template/solar-instruct.gotmpl +++ b/template/solar-instruct.gotmpl @@ -1,3 +1,16 @@ +{{- if .Messages }} +{{- if .System }}### System: +{{ .System }} + +{{ end }} +{{- range .Messages }} +{{- if eq .Role "user" }}### User: +{{ .Content }} +{{ else if eq .Role "assistant" }}### Assistant: +{{ .Content }} +{{ end }} +{{ end }}### Assistant: +{{ else }} {{ if .System }}### System: {{ .System }} @@ -5,4 +18,5 @@ {{ .Prompt }} {{ end }}### Assistant: -{{ .Response }} \ No newline at end of file +{{ .Response }} +{{- end }} \ No newline at end of file diff --git a/template/starcoder2-instruct.gotmpl b/template/starcoder2-instruct.gotmpl index 33357e54..7d7ff932 100644 --- a/template/starcoder2-instruct.gotmpl +++ b/template/starcoder2-instruct.gotmpl @@ -1,3 +1,17 @@ +{{- if .Messages }} +{{- if .System }}{{ .System }} + +{{ end }} +{{- range .Messages }} +{{- if eq .Role "user" }}### Instruction +{{ .Content }} + +{{ else if eq .Role "assistant" }}### Response +{{ .Content }}<|endoftext|> + +{{ end }} +{{- end }}### Response +{{ else }} {{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}### Instruction @@ -7,3 +21,4 @@ {{ end }}### Response {{ .Response }}<|endoftext|> +{{- end }} \ No newline at end of file diff --git a/template/template_test.go b/template/template_test.go index ac16bd60..428cdc77 100644 --- a/template/template_test.go +++ b/template/template_test.go @@ -8,9 +8,10 @@ import ( "os" "path/filepath" "slices" + "strings" "testing" - "text/template" + "github.com/google/go-cmp/cmp" "github.com/ollama/ollama/api" "github.com/ollama/ollama/llm" ) @@ -47,7 +48,7 @@ func TestNamed(t *testing.T) { t.Fatal(err) } - tmpl, err := template.New(s).Parse(b.String()) + tmpl, err := Parse(b.String()) if err != nil { t.Fatal(err) } @@ -60,6 +61,70 @@ func TestNamed(t *testing.T) { } } +func TestTemplate(t *testing.T) { + cases := make(map[string][]api.Message) + for _, mm := range [][]api.Message{ + { + {Role: "user", Content: "Hello, how are you?"}, + }, + { + {Role: "user", Content: "Hello, how are you?"}, + {Role: "assistant", Content: "I'm doing great. How can I help you today?"}, + {Role: "user", Content: "I'd like to show off how chat templating works!"}, + }, + { + {Role: "system", Content: "You are a helpful assistant."}, + {Role: "user", Content: "Hello, how are you?"}, + {Role: "assistant", Content: "I'm doing great. 
How can I help you today?"}, + {Role: "user", Content: "I'd like to show off how chat templating works!"}, + }, + } { + var roles []string + for _, m := range mm { + roles = append(roles, m.Role) + } + + cases[strings.Join(roles, "-")] = mm + } + + matches, err := filepath.Glob("*.gotmpl") + if err != nil { + t.Fatal(err) + } + + for _, match := range matches { + t.Run(match, func(t *testing.T) { + bts, err := os.ReadFile(match) + if err != nil { + t.Fatal(err) + } + + tmpl, err := Parse(string(bts)) + if err != nil { + t.Fatal(err) + } + + for n, tt := range cases { + t.Run(n, func(t *testing.T) { + var actual bytes.Buffer + if err := tmpl.Execute(&actual, Values{Messages: tt}); err != nil { + t.Fatal(err) + } + + expect, err := os.ReadFile(filepath.Join("testdata", match, n)) + if err != nil { + t.Fatal(err) + } + + if diff := cmp.Diff(actual.Bytes(), expect); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + }) + } + }) + } +} + func TestParse(t *testing.T) { cases := []struct { template string diff --git a/template/testdata/alfred.gotmpl/system-user-assistant-user b/template/testdata/alfred.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..03e23ea9 --- /dev/null +++ b/template/testdata/alfred.gotmpl/system-user-assistant-user @@ -0,0 +1 @@ +You are a helpful assistant.Hello, how are you?I'm doing great. How can I help you today?I'd like to show off how chat templating works! \ No newline at end of file diff --git a/template/testdata/alfred.gotmpl/user b/template/testdata/alfred.gotmpl/user new file mode 100644 index 00000000..7c884a6f --- /dev/null +++ b/template/testdata/alfred.gotmpl/user @@ -0,0 +1 @@ +Hello, how are you? \ No newline at end of file diff --git a/template/testdata/alfred.gotmpl/user-assistant-user b/template/testdata/alfred.gotmpl/user-assistant-user new file mode 100644 index 00000000..a60701ed --- /dev/null +++ b/template/testdata/alfred.gotmpl/user-assistant-user @@ -0,0 +1 @@ +Hello, how are you?I'm doing great. How can I help you today?I'd like to show off how chat templating works! \ No newline at end of file diff --git a/template/testdata/alpaca.gotmpl/system-user-assistant-user b/template/testdata/alpaca.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..20182d82 --- /dev/null +++ b/template/testdata/alpaca.gotmpl/system-user-assistant-user @@ -0,0 +1,10 @@ +You are a helpful assistant.### Instruction: +Hello, how are you? + +### Response: +I'm doing great. How can I help you today? + +### Instruction: +I'd like to show off how chat templating works! + +### Response: diff --git a/template/testdata/alpaca.gotmpl/user b/template/testdata/alpaca.gotmpl/user new file mode 100644 index 00000000..a0ce5dec --- /dev/null +++ b/template/testdata/alpaca.gotmpl/user @@ -0,0 +1,4 @@ +### Instruction: +Hello, how are you? + +### Response: diff --git a/template/testdata/alpaca.gotmpl/user-assistant-user b/template/testdata/alpaca.gotmpl/user-assistant-user new file mode 100644 index 00000000..6c5e23ff --- /dev/null +++ b/template/testdata/alpaca.gotmpl/user-assistant-user @@ -0,0 +1,10 @@ +### Instruction: +Hello, how are you? + +### Response: +I'm doing great. How can I help you today? + +### Instruction: +I'd like to show off how chat templating works! 
+ +### Response: diff --git a/template/testdata/chatml.gotmpl/system-user-assistant-user b/template/testdata/chatml.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..8b013fcf --- /dev/null +++ b/template/testdata/chatml.gotmpl/system-user-assistant-user @@ -0,0 +1,9 @@ +<|im_start|>system +You are a helpful assistant.<|im_end|> +<|im_start|>user +Hello, how are you?<|im_end|> +<|im_start|>assistant +I'm doing great. How can I help you today?<|im_end|> +<|im_start|>user +I'd like to show off how chat templating works!<|im_end|> +<|im_start|>assistant diff --git a/template/testdata/chatml.gotmpl/user b/template/testdata/chatml.gotmpl/user new file mode 100644 index 00000000..aa9e597a --- /dev/null +++ b/template/testdata/chatml.gotmpl/user @@ -0,0 +1,3 @@ +<|im_start|>user +Hello, how are you?<|im_end|> +<|im_start|>assistant diff --git a/template/testdata/chatml.gotmpl/user-assistant-user b/template/testdata/chatml.gotmpl/user-assistant-user new file mode 100644 index 00000000..a7cba4de --- /dev/null +++ b/template/testdata/chatml.gotmpl/user-assistant-user @@ -0,0 +1,7 @@ +<|im_start|>user +Hello, how are you?<|im_end|> +<|im_start|>assistant +I'm doing great. How can I help you today?<|im_end|> +<|im_start|>user +I'd like to show off how chat templating works!<|im_end|> +<|im_start|>assistant diff --git a/template/testdata/chatqa.gotmpl/system-user-assistant-user b/template/testdata/chatqa.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..98fd59bf --- /dev/null +++ b/template/testdata/chatqa.gotmpl/system-user-assistant-user @@ -0,0 +1,9 @@ +System: You are a helpful assistant. + +User: Hello, how are you? + +Assistant: I'm doing great. How can I help you today? + +User: I'd like to show off how chat templating works! + +Assistant: \ No newline at end of file diff --git a/template/testdata/chatqa.gotmpl/user b/template/testdata/chatqa.gotmpl/user new file mode 100644 index 00000000..9e7cf702 --- /dev/null +++ b/template/testdata/chatqa.gotmpl/user @@ -0,0 +1,3 @@ +User: Hello, how are you? + +Assistant: \ No newline at end of file diff --git a/template/testdata/chatqa.gotmpl/user-assistant-user b/template/testdata/chatqa.gotmpl/user-assistant-user new file mode 100644 index 00000000..405bbe12 --- /dev/null +++ b/template/testdata/chatqa.gotmpl/user-assistant-user @@ -0,0 +1,7 @@ +User: Hello, how are you? + +Assistant: I'm doing great. How can I help you today? + +User: I'd like to show off how chat templating works! + +Assistant: \ No newline at end of file diff --git a/template/testdata/codellama-70b-instruct.gotmpl/system-user-assistant-user b/template/testdata/codellama-70b-instruct.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..fdd0fc8b --- /dev/null +++ b/template/testdata/codellama-70b-instruct.gotmpl/system-user-assistant-user @@ -0,0 +1,11 @@ +Source: system + + You are a helpful assistant. Source: user + + Hello, how are you? Source: assistant + + I'm doing great. How can I help you today? Source: user + + I'd like to show off how chat templating works! Source: assistant +Destination: user + diff --git a/template/testdata/codellama-70b-instruct.gotmpl/user b/template/testdata/codellama-70b-instruct.gotmpl/user new file mode 100644 index 00000000..9e7174a8 --- /dev/null +++ b/template/testdata/codellama-70b-instruct.gotmpl/user @@ -0,0 +1,5 @@ +Source: user + + Hello, how are you? 
Source: assistant +Destination: user + diff --git a/template/testdata/codellama-70b-instruct.gotmpl/user-assistant-user b/template/testdata/codellama-70b-instruct.gotmpl/user-assistant-user new file mode 100644 index 00000000..b4ba1736 --- /dev/null +++ b/template/testdata/codellama-70b-instruct.gotmpl/user-assistant-user @@ -0,0 +1,9 @@ +Source: user + + Hello, how are you? Source: assistant + + I'm doing great. How can I help you today? Source: user + + I'd like to show off how chat templating works! Source: assistant +Destination: user + diff --git a/template/testdata/falcon-instruct.gotmpl/system-user-assistant-user b/template/testdata/falcon-instruct.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..16e45e5b --- /dev/null +++ b/template/testdata/falcon-instruct.gotmpl/system-user-assistant-user @@ -0,0 +1,8 @@ +System: You are a helpful assistant. +User: +Hello, how are you? +Falcon: +I'm doing great. How can I help you today? +User: +I'd like to show off how chat templating works! +Falcon: diff --git a/template/testdata/falcon-instruct.gotmpl/user b/template/testdata/falcon-instruct.gotmpl/user new file mode 100644 index 00000000..110831a2 --- /dev/null +++ b/template/testdata/falcon-instruct.gotmpl/user @@ -0,0 +1,3 @@ +User: +Hello, how are you? +Falcon: diff --git a/template/testdata/falcon-instruct.gotmpl/user-assistant-user b/template/testdata/falcon-instruct.gotmpl/user-assistant-user new file mode 100644 index 00000000..b49639ea --- /dev/null +++ b/template/testdata/falcon-instruct.gotmpl/user-assistant-user @@ -0,0 +1,7 @@ +User: +Hello, how are you? +Falcon: +I'm doing great. How can I help you today? +User: +I'd like to show off how chat templating works! +Falcon: diff --git a/template/testdata/gemma-instruct.gotmpl/system-user-assistant-user b/template/testdata/gemma-instruct.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..5f6c3732 --- /dev/null +++ b/template/testdata/gemma-instruct.gotmpl/system-user-assistant-user @@ -0,0 +1,8 @@ +user +You are a helpful assistant. +Hello, how are you? +model +I'm doing great. How can I help you today? +user +I'd like to show off how chat templating works! +model diff --git a/template/testdata/gemma-instruct.gotmpl/user b/template/testdata/gemma-instruct.gotmpl/user new file mode 100644 index 00000000..dc8b30b6 --- /dev/null +++ b/template/testdata/gemma-instruct.gotmpl/user @@ -0,0 +1,3 @@ +user +Hello, how are you? +model diff --git a/template/testdata/gemma-instruct.gotmpl/user-assistant-user b/template/testdata/gemma-instruct.gotmpl/user-assistant-user new file mode 100644 index 00000000..1185924b --- /dev/null +++ b/template/testdata/gemma-instruct.gotmpl/user-assistant-user @@ -0,0 +1,7 @@ +user +Hello, how are you? +model +I'm doing great. How can I help you today? +user +I'd like to show off how chat templating works! +model diff --git a/template/testdata/granite-instruct.gotmpl/system-user-assistant-user b/template/testdata/granite-instruct.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..a732a77f --- /dev/null +++ b/template/testdata/granite-instruct.gotmpl/system-user-assistant-user @@ -0,0 +1,13 @@ +System: +You are a helpful assistant. + +Question: +Hello, how are you? + +Answer: +I'm doing great. How can I help you today? + +Question: +I'd like to show off how chat templating works! 
+
+Answer:
diff --git a/template/testdata/granite-instruct.gotmpl/user b/template/testdata/granite-instruct.gotmpl/user
new file mode 100644
index 00000000..7abd2ea3
--- /dev/null
+++ b/template/testdata/granite-instruct.gotmpl/user
@@ -0,0 +1,4 @@
+Question:
+Hello, how are you?
+
+Answer:
diff --git a/template/testdata/granite-instruct.gotmpl/user-assistant-user b/template/testdata/granite-instruct.gotmpl/user-assistant-user
new file mode 100644
index 00000000..da5e43ea
--- /dev/null
+++ b/template/testdata/granite-instruct.gotmpl/user-assistant-user
@@ -0,0 +1,10 @@
+Question:
+Hello, how are you?
+
+Answer:
+I'm doing great. How can I help you today?
+
+Question:
+I'd like to show off how chat templating works!
+
+Answer:
diff --git a/template/testdata/llama2-chat.gotmpl/system-user-assistant-user b/template/testdata/llama2-chat.gotmpl/system-user-assistant-user
new file mode 100644
index 00000000..fc2679bf
--- /dev/null
+++ b/template/testdata/llama2-chat.gotmpl/system-user-assistant-user
@@ -0,0 +1,5 @@
+[INST] <<SYS>>
+You are a helpful assistant.
+<</SYS>>
+
+Hello, how are you? [/INST] I'm doing great. How can I help you today?[INST] I'd like to show off how chat templating works! [/INST]
\ No newline at end of file
diff --git a/template/testdata/llama2-chat.gotmpl/user b/template/testdata/llama2-chat.gotmpl/user
new file mode 100644
index 00000000..ceef9bdb
--- /dev/null
+++ b/template/testdata/llama2-chat.gotmpl/user
@@ -0,0 +1,3 @@
+[INST] <<SYS>><</SYS>>
+
+Hello, how are you? [/INST]
\ No newline at end of file
diff --git a/template/testdata/llama2-chat.gotmpl/user-assistant-user b/template/testdata/llama2-chat.gotmpl/user-assistant-user
new file mode 100644
index 00000000..42b4c529
--- /dev/null
+++ b/template/testdata/llama2-chat.gotmpl/user-assistant-user
@@ -0,0 +1,3 @@
+[INST] <<SYS>><</SYS>>
+
+Hello, how are you? [/INST] I'm doing great. How can I help you today?[INST] I'd like to show off how chat templating works! [/INST]
\ No newline at end of file
diff --git a/template/testdata/llama3-instruct.gotmpl/system-user-assistant-user b/template/testdata/llama3-instruct.gotmpl/system-user-assistant-user
new file mode 100644
index 00000000..6740bcb4
--- /dev/null
+++ b/template/testdata/llama3-instruct.gotmpl/system-user-assistant-user
@@ -0,0 +1,10 @@
+<|start_header_id|>system<|end_header_id|>
+
+You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+Hello, how are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+I'm doing great. How can I help you today?<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+I'd like to show off how chat templating works!<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
diff --git a/template/testdata/llama3-instruct.gotmpl/user b/template/testdata/llama3-instruct.gotmpl/user
new file mode 100644
index 00000000..470aa028
--- /dev/null
+++ b/template/testdata/llama3-instruct.gotmpl/user
@@ -0,0 +1,4 @@
+<|start_header_id|>user<|end_header_id|>
+
+Hello, how are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
diff --git a/template/testdata/llama3-instruct.gotmpl/user-assistant-user b/template/testdata/llama3-instruct.gotmpl/user-assistant-user
new file mode 100644
index 00000000..6dd768af
--- /dev/null
+++ b/template/testdata/llama3-instruct.gotmpl/user-assistant-user
@@ -0,0 +1,8 @@
+<|start_header_id|>user<|end_header_id|>
+
+Hello, how are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+I'm doing great. 
How can I help you today?<|eot_id|><|start_header_id|>user<|end_header_id|> + +I'd like to show off how chat templating works!<|eot_id|><|start_header_id|>assistant<|end_header_id|> + diff --git a/template/testdata/magicoder.gotmpl/system-user-assistant-user b/template/testdata/magicoder.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..c966a861 --- /dev/null +++ b/template/testdata/magicoder.gotmpl/system-user-assistant-user @@ -0,0 +1,12 @@ +You are a helpful assistant. + +@@ Instruction +Hello, how are you? + +@@ Response +I'm doing great. How can I help you today? + +@@ Instruction +I'd like to show off how chat templating works! + +@@ Response diff --git a/template/testdata/magicoder.gotmpl/user b/template/testdata/magicoder.gotmpl/user new file mode 100644 index 00000000..ccfb02bd --- /dev/null +++ b/template/testdata/magicoder.gotmpl/user @@ -0,0 +1,4 @@ +@@ Instruction +Hello, how are you? + +@@ Response diff --git a/template/testdata/magicoder.gotmpl/user-assistant-user b/template/testdata/magicoder.gotmpl/user-assistant-user new file mode 100644 index 00000000..3aea6dab --- /dev/null +++ b/template/testdata/magicoder.gotmpl/user-assistant-user @@ -0,0 +1,10 @@ +@@ Instruction +Hello, how are you? + +@@ Response +I'm doing great. How can I help you today? + +@@ Instruction +I'd like to show off how chat templating works! + +@@ Response diff --git a/template/testdata/mistral-instruct.gotmpl/system-user-assistant-user b/template/testdata/mistral-instruct.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..b6b4bf93 --- /dev/null +++ b/template/testdata/mistral-instruct.gotmpl/system-user-assistant-user @@ -0,0 +1,2 @@ +[INST] Hello, how are you?[/INST] I'm doing great. How can I help you today?[INST] You are a helpful assistant. +I'd like to show off how chat templating works![/INST] \ No newline at end of file diff --git a/template/testdata/mistral-instruct.gotmpl/user b/template/testdata/mistral-instruct.gotmpl/user new file mode 100644 index 00000000..b04871e5 --- /dev/null +++ b/template/testdata/mistral-instruct.gotmpl/user @@ -0,0 +1 @@ +[INST] Hello, how are you?[/INST] \ No newline at end of file diff --git a/template/testdata/mistral-instruct.gotmpl/user-assistant-user b/template/testdata/mistral-instruct.gotmpl/user-assistant-user new file mode 100644 index 00000000..b473e0df --- /dev/null +++ b/template/testdata/mistral-instruct.gotmpl/user-assistant-user @@ -0,0 +1 @@ +[INST] Hello, how are you?[/INST] I'm doing great. How can I help you today?[INST] I'd like to show off how chat templating works![/INST] \ No newline at end of file diff --git a/template/testdata/openchat.gotmpl/system-user-assistant-user b/template/testdata/openchat.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..1214c126 --- /dev/null +++ b/template/testdata/openchat.gotmpl/system-user-assistant-user @@ -0,0 +1 @@ +GPT Correct System: You are a helpful assistant.<|end_of_turn|>GPT Correct User: Hello, how are you?<|end_of_turn|>GPT Correct Assistant: I'm doing great. 
How can I help you today?<|end_of_turn|>GPT Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT Correct Assistant: \ No newline at end of file diff --git a/template/testdata/openchat.gotmpl/user b/template/testdata/openchat.gotmpl/user new file mode 100644 index 00000000..611daa83 --- /dev/null +++ b/template/testdata/openchat.gotmpl/user @@ -0,0 +1 @@ +GPT Correct User: Hello, how are you?<|end_of_turn|>GPT Correct Assistant: \ No newline at end of file diff --git a/template/testdata/openchat.gotmpl/user-assistant-user b/template/testdata/openchat.gotmpl/user-assistant-user new file mode 100644 index 00000000..f97b02b9 --- /dev/null +++ b/template/testdata/openchat.gotmpl/user-assistant-user @@ -0,0 +1 @@ +GPT Correct User: Hello, how are you?<|end_of_turn|>GPT Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT Correct Assistant: \ No newline at end of file diff --git a/template/testdata/phi-3.gotmpl/system-user-assistant-user b/template/testdata/phi-3.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..6109a9a2 --- /dev/null +++ b/template/testdata/phi-3.gotmpl/system-user-assistant-user @@ -0,0 +1,9 @@ +<|system|> +You are a helpful assistant.<|end|> +<|user|> +Hello, how are you?<|end|> +<|assistant|> +I'm doing great. How can I help you today?<|end|> +<|user|> +I'd like to show off how chat templating works!<|end|> +<|assistant|> diff --git a/template/testdata/phi-3.gotmpl/user b/template/testdata/phi-3.gotmpl/user new file mode 100644 index 00000000..feb96e7c --- /dev/null +++ b/template/testdata/phi-3.gotmpl/user @@ -0,0 +1,3 @@ +<|user|> +Hello, how are you?<|end|> +<|assistant|> diff --git a/template/testdata/phi-3.gotmpl/user-assistant-user b/template/testdata/phi-3.gotmpl/user-assistant-user new file mode 100644 index 00000000..db79d01c --- /dev/null +++ b/template/testdata/phi-3.gotmpl/user-assistant-user @@ -0,0 +1,7 @@ +<|user|> +Hello, how are you?<|end|> +<|assistant|> +I'm doing great. How can I help you today?<|end|> +<|user|> +I'd like to show off how chat templating works!<|end|> +<|assistant|> diff --git a/template/testdata/solar-instruct.gotmpl/system-user-assistant-user b/template/testdata/solar-instruct.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..28c1730a --- /dev/null +++ b/template/testdata/solar-instruct.gotmpl/system-user-assistant-user @@ -0,0 +1,13 @@ +### System: +You are a helpful assistant. + +### User: +Hello, how are you? + +### Assistant: +I'm doing great. How can I help you today? + +### User: +I'd like to show off how chat templating works! + +### Assistant: diff --git a/template/testdata/solar-instruct.gotmpl/user b/template/testdata/solar-instruct.gotmpl/user new file mode 100644 index 00000000..3a43382a --- /dev/null +++ b/template/testdata/solar-instruct.gotmpl/user @@ -0,0 +1,4 @@ +### User: +Hello, how are you? + +### Assistant: diff --git a/template/testdata/solar-instruct.gotmpl/user-assistant-user b/template/testdata/solar-instruct.gotmpl/user-assistant-user new file mode 100644 index 00000000..8553e73b --- /dev/null +++ b/template/testdata/solar-instruct.gotmpl/user-assistant-user @@ -0,0 +1,10 @@ +### User: +Hello, how are you? + +### Assistant: +I'm doing great. How can I help you today? + +### User: +I'd like to show off how chat templating works! 
+ +### Assistant: diff --git a/template/testdata/starcoder2-instruct.gotmpl/system-user-assistant-user b/template/testdata/starcoder2-instruct.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..5b718b3e --- /dev/null +++ b/template/testdata/starcoder2-instruct.gotmpl/system-user-assistant-user @@ -0,0 +1,12 @@ +You are a helpful assistant. + +### Instruction +Hello, how are you? + +### Response +I'm doing great. How can I help you today?<|endoftext|> + +### Instruction +I'd like to show off how chat templating works! + +### Response diff --git a/template/testdata/starcoder2-instruct.gotmpl/user b/template/testdata/starcoder2-instruct.gotmpl/user new file mode 100644 index 00000000..11b0be1f --- /dev/null +++ b/template/testdata/starcoder2-instruct.gotmpl/user @@ -0,0 +1,4 @@ +### Instruction +Hello, how are you? + +### Response diff --git a/template/testdata/starcoder2-instruct.gotmpl/user-assistant-user b/template/testdata/starcoder2-instruct.gotmpl/user-assistant-user new file mode 100644 index 00000000..d99feabb --- /dev/null +++ b/template/testdata/starcoder2-instruct.gotmpl/user-assistant-user @@ -0,0 +1,10 @@ +### Instruction +Hello, how are you? + +### Response +I'm doing great. How can I help you today?<|endoftext|> + +### Instruction +I'd like to show off how chat templating works! + +### Response diff --git a/template/testdata/vicuna.gotmpl/system-user-assistant-user b/template/testdata/vicuna.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..50d2f92c --- /dev/null +++ b/template/testdata/vicuna.gotmpl/system-user-assistant-user @@ -0,0 +1,6 @@ +You are a helpful assistant. + +USER: Hello, how are you? +ASSISTANT: I'm doing great. How can I help you today? +USER: I'd like to show off how chat templating works! +ASSISTANT: \ No newline at end of file diff --git a/template/testdata/vicuna.gotmpl/user b/template/testdata/vicuna.gotmpl/user new file mode 100644 index 00000000..cbe5ef70 --- /dev/null +++ b/template/testdata/vicuna.gotmpl/user @@ -0,0 +1,2 @@ +USER: Hello, how are you? +ASSISTANT: \ No newline at end of file diff --git a/template/testdata/vicuna.gotmpl/user-assistant-user b/template/testdata/vicuna.gotmpl/user-assistant-user new file mode 100644 index 00000000..9172547e --- /dev/null +++ b/template/testdata/vicuna.gotmpl/user-assistant-user @@ -0,0 +1,4 @@ +USER: Hello, how are you? +ASSISTANT: I'm doing great. How can I help you today? +USER: I'd like to show off how chat templating works! +ASSISTANT: \ No newline at end of file diff --git a/template/testdata/zephyr.gotmpl/system-user-assistant-user b/template/testdata/zephyr.gotmpl/system-user-assistant-user new file mode 100644 index 00000000..03d43fc3 --- /dev/null +++ b/template/testdata/zephyr.gotmpl/system-user-assistant-user @@ -0,0 +1,9 @@ +<|system|> +You are a helpful assistant. +<|user|> +Hello, how are you? +<|assistant|> +I'm doing great. How can I help you today? +<|user|> +I'd like to show off how chat templating works! +<|assistant|> diff --git a/template/testdata/zephyr.gotmpl/user b/template/testdata/zephyr.gotmpl/user new file mode 100644 index 00000000..6cefdaa0 --- /dev/null +++ b/template/testdata/zephyr.gotmpl/user @@ -0,0 +1,3 @@ +<|user|> +Hello, how are you? +<|assistant|> diff --git a/template/testdata/zephyr.gotmpl/user-assistant-user b/template/testdata/zephyr.gotmpl/user-assistant-user new file mode 100644 index 00000000..3937b006 --- /dev/null +++ b/template/testdata/zephyr.gotmpl/user-assistant-user @@ -0,0 +1,7 @@ +<|user|> +Hello, how are you? 
+<|assistant|> +I'm doing great. How can I help you today? +<|user|> +I'd like to show off how chat templating works! +<|assistant|> diff --git a/template/vicuna.gotmpl b/template/vicuna.gotmpl index 174c1a35..2e13e990 100644 --- a/template/vicuna.gotmpl +++ b/template/vicuna.gotmpl @@ -1,3 +1,14 @@ +{{- if .Messages }} +{{- if .System }}{{ .System }} + +{{ end }} +{{- range .Messages }} +{{- if eq .Role "user" }}USER: {{ .Content }} +{{ else if eq .Role "assistant" }}ASSISTANT: {{ .Content }} +{{ end }} +{{- end }}ASSISTANT: +{{- else }} {{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}USER: {{ .Prompt }} -{{ end }}ASSISTANT: {{ .Response }} \ No newline at end of file +{{ end }}ASSISTANT: {{ .Response }} +{{- end }} \ No newline at end of file diff --git a/template/zephyr.gotmpl b/template/zephyr.gotmpl index aac0c7a1..e6668848 100644 --- a/template/zephyr.gotmpl +++ b/template/zephyr.gotmpl @@ -1,6 +1,15 @@ +{{- if .Messages }} +{{- if .System }}<|system|> +{{ .System }} +{{ end }} +{{- range .Messages }}<|{{ .Role }}|> +{{ .Content }} +{{ end }}<|assistant|> +{{ else }} {{ if .System }}<|system|> {{ .System }} {{ end }}{{ if .Prompt }}<|user|> {{ .Prompt }} {{ end }}<|assistant|> -{{ .Response }} \ No newline at end of file +{{ .Response }} +{{- end }} \ No newline at end of file From 6cea0360276e5fc7e2fecbe0cadf89cc72615279 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Sat, 6 Jul 2024 15:10:48 -0400 Subject: [PATCH 07/48] Revert "llm: only statically link libstdc++" This reverts commit 5796bfc4013f4ebe26cdbf13554332a25c405027. --- .github/workflows/release.yaml | 4 ---- llm/llm.go | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 1042c684..61ca3c43 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -304,10 +304,6 @@ jobs: write-host "Installing plugin" & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet write-host "plugin installed" - - name: remove unwanted mingw dll.a files - run: | - Remove-Item "C:\mingw64\x86_64-w64-mingw32\lib\libpthread.dll.a" - Remove-Item "C:\mingw64\x86_64-w64-mingw32\lib\libwinpthread.dll.a" - uses: actions/setup-go@v5 with: go-version-file: go.mod diff --git a/llm/llm.go b/llm/llm.go index ac6a5249..3cd162e0 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -1,8 +1,8 @@ package llm // #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include -// #cgo windows LDFLAGS: -static-libstdc++ // #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread +// #cgo windows LDFLAGS: -static-libstdc++ -static-libgcc -static // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src // #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src From a08f20d910194edff79d45315330a088fda3f136 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Sat, 6 Jul 2024 15:21:15 -0400 Subject: [PATCH 08/48] release: remove unwanted mingw dll.a files --- .github/workflows/release.yaml | 5 +++++ llm/llm.go | 1 - 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 61ca3c43..d1faf9f5 100644 --- 
a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -85,6 +85,11 @@ jobs: write-host "Installing plugin" & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet write-host "plugin installed" + - name: remove unwanted mingw dll.a files + run: | + Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libpthread.dll.a" -File | Remove-Item -Force + Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libwinpthread.dll.a" -File | Remove-Item -Force + Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libstdc++.dll.a" -File | Remove-Item -Force - uses: actions/setup-go@v5 with: go-version-file: go.mod diff --git a/llm/llm.go b/llm/llm.go index 3cd162e0..88c0258d 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -2,7 +2,6 @@ package llm // #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include // #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread -// #cgo windows LDFLAGS: -static-libstdc++ -static-libgcc -static // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src // #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src From c12f1c5b99c9d9f9388f464aa77063987fdb8f0f Mon Sep 17 00:00:00 2001 From: jmorganca Date: Sat, 6 Jul 2024 16:12:29 -0400 Subject: [PATCH 09/48] release: move mingw library cleanup to correct job --- .github/workflows/release.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index d1faf9f5..0005c69d 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -85,11 +85,6 @@ jobs: write-host "Installing plugin" & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet write-host "plugin installed" - - name: remove unwanted mingw dll.a files - run: | - Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libpthread.dll.a" -File | Remove-Item -Force - Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libwinpthread.dll.a" -File | Remove-Item -Force - Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libstdc++.dll.a" -File | Remove-Item -Force - uses: actions/setup-go@v5 with: go-version-file: go.mod @@ -309,6 +304,11 @@ jobs: write-host "Installing plugin" & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet write-host "plugin installed" + - name: remove unwanted mingw dll.a files + run: | + Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libpthread.dll.a" -File | Remove-Item -Force + Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libwinpthread.dll.a" -File | Remove-Item -Force + Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libstdc++.dll.a" -File | Remove-Item -Force - uses: actions/setup-go@v5 with: go-version-file: go.mod From 4607c706413f1354d0e762d25a9a0a933edc14ec Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sat, 6 Jul 2024 18:58:16 -0400 Subject: [PATCH 10/48] llm: add `-DBUILD_SHARED_LIBS=off` to common cpu cmake flags (#5520) --- llm/generate/gen_linux.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 2bea1c4e..d3e2d13b 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -77,7 +77,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then 
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then init_vars echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" - CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" + CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" BUILD_DIR="../build/linux/${ARCH}/cpu" echo "Building custom CPU" build @@ -93,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake - COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off" + COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off" if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then # # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) From f8241bfba384cf8c888847dc44b73d7f43a42d82 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sat, 6 Jul 2024 19:35:04 -0400 Subject: [PATCH 11/48] gpu: report system free memory instead of 0 (#5521) --- gpu/gpu_darwin.go | 2 +- gpu/gpu_info_darwin.h | 1 + gpu/gpu_info_darwin.m | 26 ++++++++++++++++++++++++-- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index f26d23c1..39d8fcf8 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -56,7 +56,7 @@ func GetCPUInfo() GpuInfoList { func GetCPUMem() (memInfo, error) { return memInfo{ TotalMemory: uint64(C.getPhysicalMemory()), - FreeMemory: 0, + FreeMemory: uint64(C.getFreeMemory()), }, nil } diff --git a/gpu/gpu_info_darwin.h b/gpu/gpu_info_darwin.h index 3edca237..415e7922 100644 --- a/gpu/gpu_info_darwin.h +++ b/gpu/gpu_info_darwin.h @@ -2,3 +2,4 @@ #include <stdint.h> uint64_t getRecommendedMaxVRAM(); uint64_t getPhysicalMemory(); +uint64_t getFreeMemory(); diff --git a/gpu/gpu_info_darwin.m b/gpu/gpu_info_darwin.m index a145ac07..5ca139e0 100644 --- a/gpu/gpu_info_darwin.m +++ b/gpu/gpu_info_darwin.m @@ -1,4 +1,5 @@ -// go:build darwin +#import <Foundation/Foundation.h> +#import <mach/mach.h> #include "gpu_info_darwin.h" uint64_t getRecommendedMaxVRAM() { @@ -8,6 +9,27 @@ uint64_t getRecommendedMaxVRAM() { return result; } +// getPhysicalMemory returns the total physical memory in bytes uint64_t getPhysicalMemory() { - return [[NSProcessInfo processInfo] physicalMemory]; + return [NSProcessInfo processInfo].physicalMemory; +} + +// getFreeMemory returns the total free memory in bytes, including inactive +// memory that can be reclaimed by the system.
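+// It sums the free, speculative, and inactive page counts reported by
+// host_statistics64, scaled by the kernel page size.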
+uint64_t getFreeMemory() { + mach_port_t host_port = mach_host_self(); + mach_msg_type_number_t host_size = sizeof(vm_statistics64_data_t) / sizeof(integer_t); + vm_size_t pagesize; + vm_statistics64_data_t vm_stat; + + host_page_size(host_port, &pagesize); + if (host_statistics64(host_port, HOST_VM_INFO64, (host_info64_t)&vm_stat, &host_size) != KERN_SUCCESS) { + return 0; + } + + uint64_t free_memory = (uint64_t)vm_stat.free_count * pagesize; + free_memory += (uint64_t)vm_stat.speculative_count * pagesize; + free_memory += (uint64_t)vm_stat.inactive_count * pagesize; + + return free_memory; } From 0ee87615c74c69d8fbc3cad8f3ea5a2364b1a876 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sat, 6 Jul 2024 22:01:52 -0400 Subject: [PATCH 12/48] sched: don't error if paging to disk on Windows and macOS (#5523) --- server/sched.go | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/server/sched.go b/server/sched.go index 8c054c6b..9dff2ae0 100644 --- a/server/sched.go +++ b/server/sched.go @@ -197,25 +197,36 @@ func (s *Scheduler) processPending(ctx context.Context) { break } - // Block attempting to load a model larger than system memory + GPU memory estimate := llm.EstimateGPULayers(gpus, ggml, pending.model.ProjectorPaths, pending.opts) maxSize := systemMem.FreeMemory - for _, gpu := range gpus { - if gpu.Library == "cpu" { - continue - } - if loadedCount == 0 { - // If no other models are loaded, set the limit based on what's available - maxSize += gpu.FreeMemory - } else { - // Other models could be unloaded, favor total memory for limit - maxSize += gpu.TotalMemory + + // Add available GPU memory to the total pool + // macOS hardware has unified memory so don't double count + if runtime.GOOS != "darwin" { + for _, gpu := range gpus { + if gpu.Library == "cpu" { + continue + } + if loadedCount == 0 { + // If no other models are loaded, set the limit based on what's available + maxSize += gpu.FreeMemory + } else { + // Other models could be unloaded, favor total memory for limit + maxSize += gpu.TotalMemory + } } } + + // Block attempting to load a model larger than system memory + GPU memory if estimate.TotalSize > maxSize { slog.Warn("model request too large for system", "requested", format.HumanBytes2(estimate.TotalSize), "system", format.HumanBytes2(maxSize)) - pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize)) - break + + // Linux will crash if over-allocating memory - return an error to the user. 
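+ // macOS and Windows will page to disk instead, so the load is allowed to proceed on those platforms.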
+ // TODO (jmorganca): add reasonable upper limits for darwin and windows as well + if runtime.GOOS == "linux" { + pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize)) + break + } } // Evaluate if the model will fit in the available system memory, or if we should unload a model first From 0e09c380fcae8b81db3c3447d70d721cfad00dbd Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 7 Jul 2024 12:38:04 -0400 Subject: [PATCH 13/48] llm: print caching notices in debug only (#5533) --- llm/ext_server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 00a15b4a..7ae58e38 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -1413,7 +1413,7 @@ struct llama_server_context return get_slot(-1); } - LOG_INFO("slot with common prefix found", {{ + LOG_DEBUG("slot with common prefix found", {{ "slot_id", slot->id, "characters", longest }}); From 571dc61955ced560a45e9d32b1cd2a52d9803c8c Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 7 Jul 2024 13:03:09 -0400 Subject: [PATCH 14/48] Update llama.cpp submodule to `a8db2a9c` (#5530) --- llm/llama.cpp | 2 +- llm/patches/05-default-pretokenizer.diff | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llm/llama.cpp b/llm/llama.cpp index d7fd29ff..a8db2a9c 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit d7fd29fff16456ce9c3a23fd2d09a66256b05aff +Subproject commit a8db2a9ce64cd4417f6a312ab61858f17f0f8584 diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff index f4eaced7..341a6f59 100644 --- a/llm/patches/05-default-pretokenizer.diff +++ b/llm/patches/05-default-pretokenizer.diff @@ -1,11 +1,11 @@ diff --git a/src/llama.cpp b/src/llama.cpp -index 73f52435..2b81b4bd 100644 +index 2b9ace28..172640e2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp -@@ -5092,16 +5092,7 @@ static void llm_load_vocab( - - // for now, only BPE models have pre-tokenizers +@@ -5357,16 +5357,7 @@ static void llm_load_vocab( if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { + vocab.tokenizer_add_space_prefix = false; + vocab.tokenizer_clean_spaces = true; - if (tokenizer_pre.empty()) { - LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__); - LLAMA_LOG_WARN("%s: \n", __func__); @@ -20,7 +20,7 @@ index 73f52435..2b81b4bd 100644 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || -@@ -5164,7 +5155,8 @@ static void llm_load_vocab( +@@ -5439,7 +5430,8 @@ static void llm_load_vocab( tokenizer_pre == "jais") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; } else { From d8def1ff9432ef60d1067e5e6dde0d700dd95021 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 7 Jul 2024 13:41:51 -0400 Subject: [PATCH 15/48] llm: allow gemma 2 to context shift (#5534) --- llm/ext_server/server.cpp | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 7ae58e38..0ef3956e 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -1688,22 +1688,8 @@ struct llama_server_context } slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - char buf[256]; - llama_model_meta_val_str(model, "general.architecture", buf, 256); - bool gemma2 = strcmp(buf, "gemma2") == 0; - - int32_t truncate_at = slot.n_ctx; - - // truncate at 2/3 
of the context length for gemma2 models - // as they do not support context shifts (from the sliding window implementation). - // this way, prompts that almost fit the context length can still generate a full - // response without a sudden stop from hitting the context limit - if (gemma2) { - truncate_at = 2 * slot.n_ctx / 3; - } - // if input prompt is too big, truncate it, if group attention self-extend is disabled - if (slot.ga_n == 1 && slot.n_prompt_tokens >= truncate_at) + if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) { const int n_left = slot.n_ctx - slot.params.n_keep; const int n_shift = n_left / 2; @@ -1731,19 +1717,6 @@ struct llama_server_context GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); } - // Models with sliding window attention do not work with context shifts, so - // limit their prediction to the context length - if (gemma2) { - int32_t limit = slot.n_ctx - slot.n_prompt_tokens; - slot.n_predict = limit; - slot.params.n_predict = limit; - LOG_INFO("model does not support sliding window, limiting generation", { - {"n_ctx", slot.n_ctx}, - {"n_prompt_tokens", slot.n_prompt_tokens}, - {"n_predict", slot.n_predict} - }); - } - if (!slot.params.cache_prompt) { llama_sampling_reset(slot.ctx_sampling); From 53da2c69654769c0c086af695722e1d9b9ee6ecc Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 7 Jul 2024 14:32:05 -0400 Subject: [PATCH 16/48] llm: remove ambiguous comment when putting upper limit on predictions to avoid infinite generation (#5535) --- llm/server.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llm/server.go b/llm/server.go index 206f9e39..54fad92c 100644 --- a/llm/server.go +++ b/llm/server.go @@ -699,10 +699,9 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu } defer s.sem.Release(1) - // only allow maximum 10 "context shifts" to avoid infinite generation + // put an upper limit on num_predict to avoid the model running on forever if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx { req.Options.NumPredict = 10 * s.options.NumCtx - slog.Debug("setting token limit to 10x num_ctx", "num_ctx", s.options.NumCtx, "num_predict", req.Options.NumPredict) } request := map[string]any{ From 0bacb300071ba4baa928075b142633f2e85281ab Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 5 Jul 2024 12:46:28 -0700 Subject: [PATCH 17/48] Workaround broken ROCm p2p copy Enable the build flag for llama.cpp to use CPU copy for multi-GPU scenarios. --- llm/generate/gen_linux.sh | 2 +- llm/generate/gen_windows.ps1 | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index d3e2d13b..304eadbd 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -254,7 +254,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. 
|| true) fi init_vars - CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)" + CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)" # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\"" diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 5c694350..26bc4fa3 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -366,6 +366,7 @@ function build_rocm() { "-DCMAKE_C_COMPILER=clang.exe", "-DCMAKE_CXX_COMPILER=clang++.exe", "-DGGML_HIPBLAS=on", + "-DLLAMA_CUDA_NO_PEER_COPY=on", "-DHIP_PLATFORM=amd", "-DGGML_AVX=on", "-DGGML_AVX2=off", From b44320db1302baea88e2f318d984218c68faa5f1 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 8 Jul 2024 18:24:21 -0700 Subject: [PATCH 18/48] Bundle missing CRT libraries Some users are experiencing runner startup errors due to not having these msvc redist libraries on their host --- scripts/build_windows.ps1 | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index b3991ce1..edc73759 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -107,9 +107,12 @@ function gatherDependencies() { # TODO - this varies based on host build system and MSVC version - drive from dumpbin output # currently works for Win11 + MSVC 2019 + Cuda V11 - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140.dll" "${script:DEPS_DIR}\ollama_runners\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_runners\" cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\" cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\" + foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) { + cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_runners\" + } cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\" From e4ff73297db2f53f1ea4b603df5670c5bde6a944 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 8 Jul 2024 22:32:15 -0700 Subject: [PATCH 19/48] server: fix model reloads when setting `OLLAMA_NUM_PARALLEL` (#5560) * server: fix unneeded model reloads when setting `OLLAMA_NUM_PARALLEL` * remove whitespace change * undo some changes --- server/sched.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/server/sched.go b/server/sched.go index 9dff2ae0..48047bfe 100644 --- a/server/sched.go +++ b/server/sched.go @@ -133,10 +133,6 @@ func (s *Scheduler) processPending(ctx context.Context) { numParallel = 1 slog.Warn("multimodal models don't support parallel requests yet") } - // Keep NumCtx and numParallel in sync - if numParallel > 1 { - pending.opts.NumCtx = pending.origNumCtx * numParallel - } for { cpus := s.getCpuFn() @@ -234,9 +230,10 @@ func (s *Scheduler) processPending(ctx context.Context) { // simplifying assumption of defaultParallel
when in CPU mode if numParallel <= 0 { numParallel = defaultParallel - pending.opts.NumCtx = pending.origNumCtx * numParallel } + pending.opts.NumCtx = pending.origNumCtx * numParallel + if loadedCount == 0 { slog.Debug("cpu mode with first model, loading") s.loadFn(pending, ggml, gpus, numParallel) From b51e3b63ac7bc995e99f3a8f7c1b507a1f8fb5d9 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 9 Jul 2024 11:17:44 -0700 Subject: [PATCH 20/48] Statically link c++ and thread lib This makes sure we statically link the c++ and thread library on windows to avoid unnecessary runtime dependencies on non-standard DLLs --- .github/workflows/release.yaml | 5 ----- llm/llm.go | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 0005c69d..61ca3c43 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -304,11 +304,6 @@ jobs: write-host "Installing plugin" & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet write-host "plugin installed" - - name: remove unwanted mingw dll.a files - run: | - Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libpthread.dll.a" -File | Remove-Item -Force - Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libwinpthread.dll.a" -File | Remove-Item -Force - Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libstdc++.dll.a" -File | Remove-Item -Force - uses: actions/setup-go@v5 with: go-version-file: go.mod diff --git a/llm/llm.go b/llm/llm.go index 88c0258d..f2a5e557 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -4,8 +4,8 @@ package llm // #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src -// #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src -// #cgo windows,arm64 LDFLAGS: -L${SRCDIR}/build/windows/arm64_static -L${SRCDIR}/build/windows/arm64_static/src -L${SRCDIR}/build/windows/arm64_static/ggml/src +// #cgo windows,amd64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src +// #cgo windows,arm64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/arm64_static -L${SRCDIR}/build/windows/arm64_static/src -L${SRCDIR}/build/windows/arm64_static/ggml/src // #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux/x86_64_static -L${SRCDIR}/build/linux/x86_64_static/src -L${SRCDIR}/build/linux/x86_64_static/ggml/src // #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux/arm64_static -L${SRCDIR}/build/linux/arm64_static/src -L${SRCDIR}/build/linux/arm64_static/ggml/src // #include <stdlib.h> From f6f759fc5fb4868125b8a25c28ce96d2c0980ef7 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 9 Jul 2024 10:27:53 -0700 Subject: [PATCH 21/48] Detect CUDA OS Overhead This adds logic to detect skew between the driver and the management library, which can be attributed to OS overhead, and records it so we can adjust subsequent management library free VRAM updates and avoid OOM scenarios.
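Roughly, the bookkeeping amounts to the following (an illustrative Go sketch only; the helper names are made up and the real change is in gpu/gpu.go below):

    package main

    import "fmt"

    // At bootstrap, any surplus free VRAM the management library (NVML)
    // reports over what the driver (CUDART) sees is treated as memory
    // the OS has already claimed.
    func detectOverhead(driverFree, nvmlFree uint64) uint64 {
        if nvmlFree > driverFree {
            return nvmlFree - driverFree
        }
        return 0
    }

    // Later NVML readings are discounted by the recorded overhead.
    func adjustedFree(nvmlFree, overhead uint64) uint64 {
        if overhead >= nvmlFree {
            return 0
        }
        return nvmlFree - overhead
    }

    func main() {
        overhead := detectOverhead(7<<30, 8<<30)   // driver sees 7 GiB free, NVML sees 8 GiB
        fmt.Println(adjustedFree(8<<30, overhead)) // 7 GiB, matching the driver's view
    }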
--- gpu/gpu.go | 27 +++++++++++++++++++++++++++ gpu/types.go | 3 ++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/gpu/gpu.go b/gpu/gpu.go index 29a3c103..58144991 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -274,6 +274,28 @@ func GetGPUInfo() GpuInfoList { gpuInfo.DriverMajor = driverMajor gpuInfo.DriverMinor = driverMinor + // query the management library as well so we can record any skew between the two + // which represents overhead on the GPU we must set aside on subsequent updates + if cHandles.nvml != nil { + C.nvml_get_free(*cHandles.nvml, C.int(gpuInfo.index), &memInfo.free, &memInfo.total, &memInfo.used) + if memInfo.err != nil { + slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err)) + C.free(unsafe.Pointer(memInfo.err)) + } else { + if memInfo.free != 0 && uint64(memInfo.free) > gpuInfo.FreeMemory { + gpuInfo.OSOverhead = uint64(memInfo.free) - gpuInfo.FreeMemory + slog.Info("detected OS VRAM overhead", + "id", gpuInfo.ID, + "library", gpuInfo.Library, + "compute", gpuInfo.Compute, + "driver", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor), + "name", gpuInfo.Name, + "overhead", format.HumanBytes2(gpuInfo.OSOverhead), + ) + } + } + } + // TODO potentially sort on our own algorithm instead of what the underlying GPU library does... cudaGPUs = append(cudaGPUs, gpuInfo) } @@ -374,9 +396,14 @@ func GetGPUInfo() GpuInfoList { slog.Warn("error looking up nvidia GPU memory") continue } + if cHandles.nvml != nil && gpu.OSOverhead > 0 { + // When using the management library update based on recorded overhead + memInfo.free -= C.uint64_t(gpu.OSOverhead) + } slog.Debug("updating cuda memory data", "gpu", gpu.ID, "name", gpu.Name, + "overhead", format.HumanBytes2(gpu.OSOverhead), slog.Group( "before", "total", format.HumanBytes2(gpu.TotalMemory), diff --git a/gpu/types.go b/gpu/types.go index 2eaa9bae..7a7749b8 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -52,7 +52,8 @@ type CPUInfo struct { type CudaGPUInfo struct { GpuInfo - index int //nolint:unused,nolintlint + OSOverhead uint64 // Memory overhead between the driver library and management library + index int //nolint:unused,nolintlint } type CudaGPUInfoList []CudaGPUInfo From 0aff67877ed01adc00056742c9a88143eeabf0c5 Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Tue, 9 Jul 2024 13:48:31 -0700 Subject: [PATCH 22/48] separate request tests (#5578) --- openai/openai_test.go | 194 +++++++++++++++++------------------------- 1 file changed, 78 insertions(+), 116 deletions(-) diff --git a/openai/openai_test.go b/openai/openai_test.go index 4d21382c..39e8dc58 100644 --- a/openai/openai_test.go +++ b/openai/openai_test.go @@ -3,7 +3,6 @@ package openai import ( "bytes" "encoding/json" - "fmt" "io" "net/http" "net/http/httptest" @@ -16,49 +15,33 @@ import ( "github.com/stretchr/testify/assert" ) -func TestMiddleware(t *testing.T) { +func TestMiddlewareRequests(t *testing.T) { type testCase struct { Name string Method string Path string - TestPath string Handler func() gin.HandlerFunc - Endpoint func(c *gin.Context) Setup func(t *testing.T, req *http.Request) - Expected func(t *testing.T, resp *httptest.ResponseRecorder) + Expected func(t *testing.T, req *http.Request) + } + + var capturedRequest *http.Request + + captureRequestMiddleware := func() gin.HandlerFunc { + return func(c *gin.Context) { + bodyBytes, _ := io.ReadAll(c.Request.Body) + c.Request.Body = io.NopCloser(bytes.NewReader(bodyBytes)) + capturedRequest = c.Request + c.Next() + } 
} testCases := []testCase{ { - Name: "chat handler", - Method: http.MethodPost, - Path: "/api/chat", - TestPath: "/api/chat", - Handler: ChatMiddleware, - Endpoint: func(c *gin.Context) { - var chatReq api.ChatRequest - if err := c.ShouldBindJSON(&chatReq); err != nil { - c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request"}) - return - } - - userMessage := chatReq.Messages[0].Content - var assistantMessage string - - switch userMessage { - case "Hello": - assistantMessage = "Hello!" - default: - assistantMessage = "I'm not sure how to respond to that." - } - - c.JSON(http.StatusOK, api.ChatResponse{ - Message: api.Message{ - Role: "assistant", - Content: assistantMessage, - }, - }) - }, + Name: "chat handler", + Method: http.MethodPost, + Path: "/api/chat", + Handler: ChatMiddleware, Setup: func(t *testing.T, req *http.Request) { body := ChatCompletionRequest{ Model: "test-model", @@ -70,88 +53,26 @@ func TestMiddleware(t *testing.T) { req.Body = io.NopCloser(bytes.NewReader(bodyBytes)) req.Header.Set("Content-Type", "application/json") }, - Expected: func(t *testing.T, resp *httptest.ResponseRecorder) { - assert.Equal(t, http.StatusOK, resp.Code) - - var chatResp ChatCompletion - if err := json.NewDecoder(resp.Body).Decode(&chatResp); err != nil { + Expected: func(t *testing.T, req *http.Request) { + var chatReq api.ChatRequest + if err := json.NewDecoder(req.Body).Decode(&chatReq); err != nil { t.Fatal(err) } - if chatResp.Object != "chat.completion" { - t.Fatalf("expected chat.completion, got %s", chatResp.Object) + if chatReq.Messages[0].Role != "user" { + t.Fatalf("expected 'user', got %s", chatReq.Messages[0].Role) } - if chatResp.Choices[0].Message.Content != "Hello!" { - t.Fatalf("expected Hello!, got %s", chatResp.Choices[0].Message.Content) + if chatReq.Messages[0].Content != "Hello" { + t.Fatalf("expected 'Hello', got %s", chatReq.Messages[0].Content) } }, }, { - Name: "completions handler", - Method: http.MethodPost, - Path: "/api/generate", - TestPath: "/api/generate", - Handler: CompletionsMiddleware, - Endpoint: func(c *gin.Context) { - c.JSON(http.StatusOK, api.GenerateResponse{ - Response: "Hello!", - }) - }, - Setup: func(t *testing.T, req *http.Request) { - body := CompletionRequest{ - Model: "test-model", - Prompt: "Hello", - } - - bodyBytes, _ := json.Marshal(body) - - req.Body = io.NopCloser(bytes.NewReader(bodyBytes)) - req.Header.Set("Content-Type", "application/json") - }, - Expected: func(t *testing.T, resp *httptest.ResponseRecorder) { - assert.Equal(t, http.StatusOK, resp.Code) - var completionResp Completion - if err := json.NewDecoder(resp.Body).Decode(&completionResp); err != nil { - t.Fatal(err) - } - - if completionResp.Object != "text_completion" { - t.Fatalf("expected text_completion, got %s", completionResp.Object) - } - - if completionResp.Choices[0].Text != "Hello!" 
{ - t.Fatalf("expected Hello!, got %s", completionResp.Choices[0].Text) - } - }, - }, - { - Name: "completions handler with params", - Method: http.MethodPost, - Path: "/api/generate", - TestPath: "/api/generate", - Handler: CompletionsMiddleware, - Endpoint: func(c *gin.Context) { - var generateReq api.GenerateRequest - if err := c.ShouldBindJSON(&generateReq); err != nil { - c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request"}) - return - } - - temperature := generateReq.Options["temperature"].(float64) - var assistantMessage string - - switch temperature { - case 1.6: - assistantMessage = "Received temperature of 1.6" - default: - assistantMessage = fmt.Sprintf("Received temperature of %f", temperature) - } - - c.JSON(http.StatusOK, api.GenerateResponse{ - Response: assistantMessage, - }) - }, + Name: "completions handler", + Method: http.MethodPost, + Path: "/api/generate", + Handler: CompletionsMiddleware, Setup: func(t *testing.T, req *http.Request) { temp := float32(0.8) body := CompletionRequest{ @@ -165,24 +86,65 @@ func TestMiddleware(t *testing.T) { req.Body = io.NopCloser(bytes.NewReader(bodyBytes)) req.Header.Set("Content-Type", "application/json") }, - Expected: func(t *testing.T, resp *httptest.ResponseRecorder) { - assert.Equal(t, http.StatusOK, resp.Code) - var completionResp Completion - if err := json.NewDecoder(resp.Body).Decode(&completionResp); err != nil { + Expected: func(t *testing.T, req *http.Request) { + var genReq api.GenerateRequest + if err := json.NewDecoder(req.Body).Decode(&genReq); err != nil { t.Fatal(err) } - if completionResp.Object != "text_completion" { - t.Fatalf("expected text_completion, got %s", completionResp.Object) + if genReq.Prompt != "Hello" { + t.Fatalf("expected 'Hello', got %s", genReq.Prompt) } - if completionResp.Choices[0].Text != "Received temperature of 1.6" { - t.Fatalf("expected Received temperature of 1.6, got %s", completionResp.Choices[0].Text) + if genReq.Options["temperature"] != 1.6 { + t.Fatalf("expected 1.6, got %f", genReq.Options["temperature"]) } }, }, + } + + gin.SetMode(gin.TestMode) + router := gin.New() + + endpoint := func(c *gin.Context) { + c.Status(http.StatusOK) + } + + for _, tc := range testCases { + t.Run(tc.Name, func(t *testing.T) { + router = gin.New() + router.Use(captureRequestMiddleware()) + router.Use(tc.Handler()) + router.Handle(tc.Method, tc.Path, endpoint) + req, _ := http.NewRequest(tc.Method, tc.Path, nil) + + if tc.Setup != nil { + tc.Setup(t, req) + } + + resp := httptest.NewRecorder() + router.ServeHTTP(resp, req) + + tc.Expected(t, capturedRequest) + }) + } +} + +func TestMiddlewareResponses(t *testing.T) { + type testCase struct { + Name string + Method string + Path string + TestPath string + Handler func() gin.HandlerFunc + Endpoint func(c *gin.Context) + Setup func(t *testing.T, req *http.Request) + Expected func(t *testing.T, resp *httptest.ResponseRecorder) + } + + testCases := []testCase{ { - Name: "completions handler with error", + Name: "completions handler error forwarding", Method: http.MethodPost, Path: "/api/generate", TestPath: "/api/generate", From 4918fae535cb3d146100bacc0eff67a8579a8a7f Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Tue, 9 Jul 2024 14:01:26 -0700 Subject: [PATCH 23/48] OpenAI v1/completions: allow stop token list (#5551) * stop token parsing fix * add stop test --- openai/openai.go | 14 +++++++++----- openai/openai_test.go | 11 +++++++++++ 2 files changed, 20 insertions(+), 5 deletions(-) diff --git 
a/openai/openai.go b/openai/openai.go index f1e75bf2..1707da14 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -338,12 +338,16 @@ func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) { switch stop := r.Stop.(type) { case string: options["stop"] = []string{stop} - case []string: - options["stop"] = stop - default: - if r.Stop != nil { - return api.GenerateRequest{}, fmt.Errorf("invalid type for 'stop' field: %T", r.Stop) + case []any: + var stops []string + for _, s := range stop { + if str, ok := s.(string); ok { + stops = append(stops, str) + } else { + return api.GenerateRequest{}, fmt.Errorf("invalid type for 'stop' field: %T", s) + } } + options["stop"] = stops } if r.MaxTokens != nil { diff --git a/openai/openai_test.go b/openai/openai_test.go index 39e8dc58..5f1ae52e 100644 --- a/openai/openai_test.go +++ b/openai/openai_test.go @@ -79,6 +79,7 @@ func TestMiddlewareRequests(t *testing.T) { Model: "test-model", Prompt: "Hello", Temperature: &temp, + Stop: []string{"\n", "stop"}, } bodyBytes, _ := json.Marshal(body) @@ -99,6 +100,16 @@ func TestMiddlewareRequests(t *testing.T) { if genReq.Options["temperature"] != 1.6 { t.Fatalf("expected 1.6, got %f", genReq.Options["temperature"]) } + + stopTokens, ok := genReq.Options["stop"].([]any) + + if !ok { + t.Fatalf("expected stop tokens to be a list") + } + + if stopTokens[0] != "\n" || stopTokens[1] != "stop" { + t.Fatalf("expected ['\\n', 'stop'], got %v", stopTokens) + } }, }, } From 22c81f62ec845bd8f77215ae5599be14117ec8db Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 10 Jul 2024 09:01:33 -0700 Subject: [PATCH 24/48] Remove duplicate merge glitch --- llm/server.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llm/server.go b/llm/server.go index 08dc04d5..aa504d19 100644 --- a/llm/server.go +++ b/llm/server.go @@ -254,10 +254,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr params = append(params, "--tensor-split", estimate.TensorSplit) } - if estimate.TensorSplit != "" { - params = append(params, "--tensor-split", estimate.TensorSplit) - } - for i := range len(servers) { dir := availableServers[servers[i]] if dir == "" { From 1f50356e8e3c3a2956c5ffacc3b9fa33b8285541 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 10 Jul 2024 11:01:22 -0700 Subject: [PATCH 25/48] Bump ROCm on windows to 6.1.2 This also adjusts our algorithm to favor our bundled ROCm. I've confirmed VRAM reporting still doesn't work properly so we can't yet enable concurrency by default. 
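The practical effect on library resolution is roughly the following ordering (a simplified Go sketch with a stand-in usable check; the actual logic is commonAMDValidateLibDir in the diff below):

    package main

    import (
        "fmt"
        "os"
        "path/filepath"
    )

    // usable stands in for the hipblas.dll/rocblas file checks.
    func usable(dir string) bool {
        info, err := os.Stat(dir)
        return err == nil && info.IsDir()
    }

    // The bundled ROCm payload next to the ollama executable now wins,
    // then an explicit HIP_PATH, then well-known system locations.
    func rocmLibDir(exeDir string, systemDirs []string) (string, error) {
        candidates := append([]string{
            filepath.Join(exeDir, "rocm"),
            os.Getenv("HIP_PATH"),
        }, systemDirs...)
        for _, dir := range candidates {
            if dir != "" && usable(dir) {
                return dir, nil
            }
        }
        return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
    }

    func main() {
        exe, _ := os.Executable()
        dir, err := rocmLibDir(filepath.Dir(exe), []string{`C:\Program Files\AMD\ROCm\6.1\bin`})
        fmt.Println(dir, err)
    }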
--- .github/workflows/release.yaml | 2 +- .github/workflows/test.yaml | 2 +- docs/faq.md | 2 +- gpu/amd_common.go | 23 +++++++++++------------ gpu/amd_windows.go | 4 ++-- llm/generate/gen_windows.ps1 | 12 +----------- 6 files changed, 17 insertions(+), 28 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 61ca3c43..5ae630c3 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -147,7 +147,7 @@ jobs: run: | $ErrorActionPreference = "Stop" write-host "downloading AMD HIP Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP" Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait write-host "Completed AMD HIP" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 13d1c957..977d8da1 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -169,7 +169,7 @@ jobs: run: | $ErrorActionPreference = "Stop" write-host "downloading AMD HIP Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP" Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait write-host "Completed AMD HIP" diff --git a/docs/faq.md b/docs/faq.md index 57411246..da1848f7 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -272,4 +272,4 @@ The following server settings may be used to adjust how Ollama handles concurren - `OLLAMA_NUM_PARALLEL` - The maximum number of parallel requests each model will process at the same time. The default will auto-select either 4 or 1 based on available memory. - `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512 -Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM. \ No newline at end of file +Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs VRAM. \ No newline at end of file diff --git a/gpu/amd_common.go b/gpu/amd_common.go index 27a81e3f..7d1cab7c 100644 --- a/gpu/amd_common.go +++ b/gpu/amd_common.go @@ -49,9 +49,17 @@ func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { } func commonAMDValidateLibDir() (string, error) { - // We try to favor system paths first, so that we can wire up the subprocess to use - // the system version. 
Only use our bundled version if the system version doesn't work - // This gives users a more recovery options if versions have subtle problems at runtime + // Favor our bundled version + + // Installer payload location if we're running the installed binary + exe, err := os.Executable() + if err == nil { + rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm") + if rocmLibUsable(rocmTargetDir) { + slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir) + return rocmTargetDir, nil + } + } // Prefer explicit HIP env var hipPath := os.Getenv("HIP_PATH") @@ -87,14 +95,5 @@ func commonAMDValidateLibDir() (string, error) { } } - // Installer payload location if we're running the installed binary - exe, err := os.Executable() - if err == nil { - rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm") - if rocmLibUsable(rocmTargetDir) { - slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir) - return rocmTargetDir, nil - } - } return "", fmt.Errorf("no suitable rocm found, falling back to CPU") } diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go index 8b6fabeb..5d09be8b 100644 --- a/gpu/amd_windows.go +++ b/gpu/amd_windows.go @@ -22,8 +22,8 @@ const ( var ( // Used to validate if the given ROCm lib is usable - ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here... - RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob? + ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // This is not sufficient to discern v5 vs v6 + RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"} // TODO glob? ) func AMDGetGPUInfo() []RocmGPUInfo { diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 26bc4fa3..beb964f9 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -6,18 +6,9 @@ function amdGPUs { if ($env:AMDGPU_TARGETS) { return $env:AMDGPU_TARGETS } - # TODO - load from some common data file for linux + windows build consistency + # Current supported rocblas list from ROCm v6.1.2 on windows $GPU_LIST = @( - "gfx900" "gfx906:xnack-" - "gfx908:xnack-" - "gfx90a:xnack+" - "gfx90a:xnack-" - "gfx940" - "gfx941" - "gfx942" - "gfx1010" - "gfx1012" "gfx1030" "gfx1100" "gfx1101" @@ -395,7 +386,6 @@ function build_rocm() { sign install - # Assumes v5.7, may need adjustments for v6 rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" From 4e262eb2a8aaee31e228febc216c2a83a9a7e4d8 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Wed, 10 Jul 2024 13:17:13 -0700 Subject: [PATCH 26/48] remove `GGML_CUDA_FORCE_MMQ=on` from build (#5588) --- llm/generate/gen_linux.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 304eadbd..5589f1ea 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -178,7 +178,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}" echo "Building custom CUDA GPU" else - CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DGGML_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} -DCMAKE_LIBRARY_PATH=/usr/local/cuda/compat" + 
CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} -DCMAKE_LIBRARY_PATH=/usr/local/cuda/compat" fi CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" From 5a739ff4cb27f7804903adfb674f8a1e197ea86f Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 10 Jul 2024 13:18:04 -0700 Subject: [PATCH 27/48] chatglm graph --- llm/ggml.go | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/llm/ggml.go b/llm/ggml.go index cfead450..fddb5039 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -424,6 +424,32 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui 4*batch*(3*embedding+vocab)+embedding*vocab*105/128, 4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16, ) + case "chatglm": + fullOffload = 4 * batch * (embedding + vocab) + partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128 + if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok { + fullOffload = max( + fullOffload, + 4*batch*(2+ + 2*embedding+ + context+ + context*heads+ + embeddingHeadsK*heads+ + qkvBias.Shape[0]), + ) + + partialOffload = max( + partialOffload, + 4*batch*(1+ + 2*embedding+ + embeddingHeadsK*heads+ + context+ + context*heads)+ + 4*embeddingHeadsK*context+ + 4*context*embeddingHeadsK+ + 4*qkvBias.Shape[0], + ) + } } return From 41be28096aa597ded1ef91774ba3e6dfc0a8ccbb Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 10 Jul 2024 11:00:07 -0700 Subject: [PATCH 28/48] add system prompt to first legacy template --- server/prompt_test.go | 2 +- server/routes_create_test.go | 4 +- template/template.go | 101 +++++++++++++++++++++++++++++++---- template/template_test.go | 61 ++++++++++++++++----- 4 files changed, 140 insertions(+), 28 deletions(-) diff --git a/server/prompt_test.go b/server/prompt_test.go index d4cee98c..1435b143 100644 --- a/server/prompt_test.go +++ b/server/prompt_test.go @@ -161,7 +161,7 @@ func TestChatPrompt(t *testing.T) { {Role: "user", Content: "A test. And a thumping good one at that, I'd wager."}, }, expect: expect{ - prompt: "You're a test, Harry! I-I'm a what? You are the Test Who Lived. A test. And a thumping good one at that, I'd wager. ", + prompt: "You are the Test Who Lived. You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. 
", }, }, } diff --git a/server/routes_create_test.go b/server/routes_create_test.go index 269a0ba1..40477937 100644 --- a/server/routes_create_test.go +++ b/server/routes_create_test.go @@ -546,8 +546,8 @@ func TestCreateDetectTemplate(t *testing.T) { checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{ filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"), - filepath.Join(p, "blobs", "sha256-9512c372dfc7d84d6065b8dd2b601aeed8cc1a78e7a7aa784a42fff37f5524b7"), - filepath.Join(p, "blobs", "sha256-b8b78cb8c6eefd14c06f1af042e6161255bf87bbf2dd14fce57cdac893db8139"), + filepath.Join(p, "blobs", "sha256-68b0323b2f21572bc09ba07554b16b379a5713ee48ef8c25a7661a1f71cfce77"), + filepath.Join(p, "blobs", "sha256-eb72fb7c550ee1f1dec4039bd65382acecf5f7536a30fb7ccace39a8d0cb590b"), }) }) diff --git a/template/template.go b/template/template.go index b133b97e..0b8f2434 100644 --- a/template/template.go +++ b/template/template.go @@ -143,11 +143,14 @@ func (t *Template) Vars() []string { type Values struct { Messages []api.Message + + // forceLegacy is a flag used to test compatibility with legacy templates + forceLegacy bool } func (t *Template) Execute(w io.Writer, v Values) error { system, collated := collate(v.Messages) - if slices.Contains(t.Vars(), "messages") { + if !v.forceLegacy && slices.Contains(t.Vars(), "messages") { return t.Template.Execute(w, map[string]any{ "System": system, "Messages": collated, @@ -157,15 +160,19 @@ func (t *Template) Execute(w io.Writer, v Values) error { var b bytes.Buffer var prompt, response string for i, m := range collated { - if m.Role == "user" { + switch m.Role { + case "user": prompt = m.Content - } else { + if i != 0 { + system = "" + } + case "assistant": response = m.Content } if i != len(collated)-1 && prompt != "" && response != "" { if err := t.Template.Execute(&b, map[string]any{ - "System": "", + "System": system, "Prompt": prompt, "Response": response, }); err != nil { @@ -178,18 +185,21 @@ func (t *Template) Execute(w io.Writer, v Values) error { } var cut bool - tree := t.Template.Copy() - // for the last message, cut everything after "{{ .Response }}" - tree.Root.Nodes = slices.DeleteFunc(tree.Root.Nodes, func(n parse.Node) bool { - if slices.Contains(parseNode(n), "Response") { - cut = true + nodes := deleteNode(t.Template.Root.Copy(), func(n parse.Node) bool { + switch t := n.(type) { + case *parse.ActionNode: + case *parse.FieldNode: + if slices.Contains(t.Ident, "Response") { + cut = true + } } return cut }) - if err := template.Must(template.New("").AddParseTree("", tree)).Execute(&b, map[string]any{ - "System": system, + tree := parse.Tree{Root: nodes.(*parse.ListNode)} + if err := template.Must(template.New("").AddParseTree("", &tree)).Execute(&b, map[string]any{ + "System": "", "Prompt": prompt, }); err != nil { return err @@ -286,3 +296,72 @@ func parseNode(n parse.Node) []string { return nil } + +// deleteNode walks the node list and deletes nodes that match the predicate +// this is currently to remove the {{ .Response }} node from templates +func deleteNode(n parse.Node, fn func(parse.Node) bool) parse.Node { + var walk func(n parse.Node) parse.Node + walk = func(n parse.Node) parse.Node { + if fn(n) { + return nil + } + + switch t := n.(type) { + case *parse.ListNode: + var nodes []parse.Node + for _, c := range t.Nodes { + if n := walk(c); n != nil { + nodes = append(nodes, n) + } + } + + t.Nodes = nodes + return t + case *parse.IfNode: + t.BranchNode = 
*(walk(&t.BranchNode).(*parse.BranchNode)) + case *parse.WithNode: + t.BranchNode = *(walk(&t.BranchNode).(*parse.BranchNode)) + case *parse.RangeNode: + t.BranchNode = *(walk(&t.BranchNode).(*parse.BranchNode)) + case *parse.BranchNode: + t.List = walk(t.List).(*parse.ListNode) + if t.ElseList != nil { + t.ElseList = walk(t.ElseList).(*parse.ListNode) + } + case *parse.ActionNode: + n := walk(t.Pipe) + if n == nil { + return nil + } + + t.Pipe = n.(*parse.PipeNode) + case *parse.PipeNode: + var commands []*parse.CommandNode + for _, c := range t.Cmds { + var args []parse.Node + for _, a := range c.Args { + if n := walk(a); n != nil { + args = append(args, n) + } + } + + if len(args) == 0 { + return nil + } + + c.Args = args + commands = append(commands, c) + } + + if len(commands) == 0 { + return nil + } + + t.Cmds = commands + } + + return n + } + + return walk(n) +} diff --git a/template/template_test.go b/template/template_test.go index 428cdc77..e702a186 100644 --- a/template/template_test.go +++ b/template/template_test.go @@ -105,8 +105,8 @@ func TestTemplate(t *testing.T) { } for n, tt := range cases { + var actual bytes.Buffer t.Run(n, func(t *testing.T) { - var actual bytes.Buffer if err := tmpl.Execute(&actual, Values{Messages: tt}); err != nil { t.Fatal(err) } @@ -120,6 +120,25 @@ func TestTemplate(t *testing.T) { t.Errorf("mismatch (-got +want):\n%s", diff) } }) + + t.Run("legacy", func(t *testing.T) { + var legacy bytes.Buffer + if err := tmpl.Execute(&legacy, Values{Messages: tt, forceLegacy: true}); err != nil { + t.Fatal(err) + } + + legacyBytes := legacy.Bytes() + if slices.Contains([]string{"chatqa.gotmpl", "openchat.gotmpl", "vicuna.gotmpl"}, match) && legacyBytes[len(legacyBytes)-1] == ' ' { + t.Log("removing trailing space from legacy output") + legacyBytes = legacyBytes[:len(legacyBytes)-1] + } else if slices.Contains([]string{"codellama-70b-instruct.gotmpl", "llama2-chat.gotmpl", "mistral-instruct.gotmpl"}, match) { + t.Skip("legacy outputs cannot be compared to messages outputs") + } + + if diff := cmp.Diff(legacyBytes, actual.Bytes()); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + }) } }) } @@ -136,6 +155,21 @@ func TestParse(t *testing.T) { {"{{ with .Tools }}{{ . 
}}{{ end }} {{ .System }} {{ .Prompt }}", []string{"prompt", "response", "system", "tools"}}, {"{{ range .Messages }}{{ .Role }} {{ .Content }}{{ end }}", []string{"content", "messages", "role"}}, {"{{ range .Messages }}{{ if eq .Role \"system\" }}SYSTEM: {{ .Content }}{{ else if eq .Role \"user\" }}USER: {{ .Content }}{{ else if eq .Role \"assistant\" }}ASSISTANT: {{ .Content }}{{ end }}{{ end }}", []string{"content", "messages", "role"}}, + {`{{- if .Messages }} +{{- if .System }}<|im_start|>system +{{ .System }}<|im_end|> +{{ end }} +{{- range .Messages }}<|im_start|>{{ .Role }} +{{ .Content }}<|im_end|> +{{ end }}<|im_start|>assistant +{{ else -}} +{{ if .System }}<|im_start|>system +{{ .System }}<|im_end|> +{{ end }}{{ if .Prompt }}<|im_start|>user +{{ .Prompt }}<|im_end|> +{{ end }}<|im_start|>assistant +{{ .Response }}<|im_end|> +{{- end -}}`, []string{"content", "messages", "prompt", "response", "role", "system"}}, } for _, tt := range cases { @@ -145,9 +179,8 @@ func TestParse(t *testing.T) { t.Fatal(err) } - vars := tmpl.Vars() - if !slices.Equal(tt.vars, vars) { - t.Errorf("expected %v, got %v", tt.vars, vars) + if diff := cmp.Diff(tmpl.Vars(), tt.vars); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) } }) } @@ -170,7 +203,7 @@ func TestExecuteWithMessages(t *testing.T) { {"no response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] `}, {"response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`}, {"messages", `{{- range $index, $_ := .Messages }} -{{- if eq .Role "user" }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }}{{ "\n\n" }} +{{- if eq .Role "user" }}[INST] {{ if and (eq $index 0) $.System }}{{ $.System }}{{ "\n\n" }} {{- end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }} {{- end }} {{- end }}`}, @@ -191,7 +224,7 @@ func TestExecuteWithMessages(t *testing.T) { {"response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`}, {"messages", ` {{- range $index, $_ := .Messages }} -{{- if eq .Role "user" }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }}{{ "\n\n" }} +{{- if eq .Role "user" }}[INST] {{ if and (eq $index 0) $.System }}{{ $.System }}{{ "\n\n" }} {{- end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }} {{- end }} {{- end }}`}, @@ -204,9 +237,9 @@ func TestExecuteWithMessages(t *testing.T) { {Role: "user", Content: "What is your name?"}, }, }, - `[INST] Hello friend![/INST] Hello human![INST] You are a helpful assistant! + `[INST] You are a helpful assistant! 
-What is your name?[/INST] `, +Hello friend![/INST] Hello human![INST] What is your name?[/INST] `, }, { "chatml", @@ -221,7 +254,7 @@ What is your name?[/INST] `, `}, {"messages", ` {{- range $index, $_ := .Messages }} -{{- if and (eq .Role "user") (eq (len (slice $.Messages $index)) 1) $.System }}<|im_start|>system +{{- if and (eq .Role "user") (eq $index 0) $.System }}<|im_start|>system {{ $.System }}<|im_end|>{{ "\n" }} {{- end }}<|im_start|>{{ .Role }} {{ .Content }}<|im_end|>{{ "\n" }} @@ -236,12 +269,12 @@ What is your name?[/INST] `, {Role: "user", Content: "What is your name?"}, }, }, - `<|im_start|>user + `<|im_start|>system +You are a helpful assistant!<|im_end|> +<|im_start|>user Hello friend!<|im_end|> <|im_start|>assistant Hello human!<|im_end|> -<|im_start|>system -You are a helpful assistant!<|im_end|> <|im_start|>user What is your name?<|im_end|> <|im_start|>assistant @@ -300,8 +333,8 @@ Answer: `, t.Fatal(err) } - if b.String() != tt.expected { - t.Errorf("expected\n%s,\ngot\n%s", tt.expected, b.String()) + if diff := cmp.Diff(b.String(), tt.expected); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) } }) } From 19753c18c01183b4c974e36e89b0c7cbdcc3c38a Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 10 Jul 2024 11:00:29 -0700 Subject: [PATCH 29/48] update embedded templates --- template/alfred.gotmpl | 4 ++-- template/alpaca.gotmpl | 8 +++++--- template/chatml.gotmpl | 4 ++-- template/chatqa.gotmpl | 7 ++++--- template/codellama-70b-instruct.gotmpl | 10 +++++----- template/falcon-instruct.gotmpl | 12 +++++++----- template/gemma-instruct.gotmpl | 7 ++++--- template/granite-instruct.gotmpl | 8 ++++---- template/llama2-chat.gotmpl | 8 ++++---- template/llama3-instruct.gotmpl | 4 ++-- template/magicoder.gotmpl | 5 +++-- template/mistral-instruct.gotmpl | 5 +++-- template/openchat.gotmpl | 12 ++++++------ template/phi-3.gotmpl | 4 ++-- template/solar-instruct.gotmpl | 7 ++++--- template/starcoder2-instruct.gotmpl | 5 ++--- .../alpaca.gotmpl/system-user-assistant-user | 4 +++- .../system-user-assistant-user | 1 + template/testdata/codellama-70b-instruct.gotmpl/user | 1 + .../user-assistant-user | 1 + .../openchat.gotmpl/system-user-assistant-user | 2 +- template/testdata/openchat.gotmpl/user | 2 +- .../testdata/openchat.gotmpl/user-assistant-user | 2 +- template/vicuna.gotmpl | 7 ++++--- template/zephyr.gotmpl | 4 ++-- 25 files changed, 74 insertions(+), 60 deletions(-) diff --git a/template/alfred.gotmpl b/template/alfred.gotmpl index 44284f04..71bc6706 100644 --- a/template/alfred.gotmpl +++ b/template/alfred.gotmpl @@ -3,6 +3,6 @@ {{- end }} {{- range .Messages }}{{ .Content }} {{- end }} -{{- else }} +{{- else -}} {{ if .System }}{{ .System }}{{ end }}{{ if .Prompt }}{{ .Prompt }}{{ end }}{{ .Response }} -{{- end }} \ No newline at end of file +{{- end -}} \ No newline at end of file diff --git a/template/alpaca.gotmpl b/template/alpaca.gotmpl index c1f69dc9..e9becb3d 100644 --- a/template/alpaca.gotmpl +++ b/template/alpaca.gotmpl @@ -1,6 +1,7 @@ {{- if .Messages }} {{- if .System }}{{ .System }} -{{- end }} + +{{ end }} {{- range .Messages }} {{- if eq .Role "user" }}### Instruction: {{- else if eq .Role "assistant" }}### Response: @@ -8,7 +9,7 @@ {{ .Content }} {{ end }}### Response: -{{ else }} +{{ else -}} {{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}### Instruction: @@ -16,4 +17,5 @@ {{ end }}### Response: {{ .Response }} -{{- end }} \ No newline at end of file + +{{ end -}} \ No newline at end of file diff --git a/template/chatml.gotmpl 
b/template/chatml.gotmpl index d945547c..eb8ab0dc 100644 --- a/template/chatml.gotmpl +++ b/template/chatml.gotmpl @@ -5,11 +5,11 @@ {{- range .Messages }}<|im_start|>{{ .Role }} {{ .Content }}<|im_end|> {{ end }}<|im_start|>assistant -{{ else }} +{{ else -}} {{ if .System }}<|im_start|>system {{ .System }}<|im_end|> {{ end }}{{ if .Prompt }}<|im_start|>user {{ .Prompt }}<|im_end|> {{ end }}<|im_start|>assistant {{ .Response }}<|im_end|> -{{- end }} \ No newline at end of file +{{ end -}} \ No newline at end of file diff --git a/template/chatqa.gotmpl b/template/chatqa.gotmpl index 7022c479..41c6ced5 100644 --- a/template/chatqa.gotmpl +++ b/template/chatqa.gotmpl @@ -8,10 +8,11 @@ {{- end }} {{ .Content }} {{ end }}Assistant: -{{- else }} +{{- else -}} {{ if .System }}System: {{ .System }} {{ end }}{{ if .Prompt }}User: {{ .Prompt }} -{{ end }}Assistant: <|begin_of_text|>{{ .Response }} -{{- end }} \ No newline at end of file +{{ end }}Assistant: {{ .Response }} + +{{ end -}} \ No newline at end of file diff --git a/template/codellama-70b-instruct.gotmpl b/template/codellama-70b-instruct.gotmpl index 392d839e..0a313d38 100644 --- a/template/codellama-70b-instruct.gotmpl +++ b/template/codellama-70b-instruct.gotmpl @@ -7,13 +7,13 @@ {{ .Content }} {{ end }}Source: assistant Destination: user -{{ else }} -{{ if .System }} Source: system + {{ else -}} +{{ if .System }}Source: system - {{ .System }} {{ end }} Source: user + {{ .System }} {{ end }}Source: user {{ .Prompt }} Source: assistant Destination: user - {{ .Response }} -{{- end }} \ No newline at end of file + {{ .Response }} +{{- end -}} \ No newline at end of file diff --git a/template/falcon-instruct.gotmpl b/template/falcon-instruct.gotmpl index 99d67f93..3a403007 100644 --- a/template/falcon-instruct.gotmpl +++ b/template/falcon-instruct.gotmpl @@ -6,8 +6,10 @@ {{ else if eq .Role "assistant" }}Falcon: {{ end }}{{ .Content }} {{ end }}Falcon: -{{ else }} -{{ if .System }}{{ .System }} -{{ end }}{{ if .Prompt }}User: {{ .Prompt }} -{{ end }}Assistant: {{ .Response }} -{{- end }} \ No newline at end of file +{{ else -}} +{{ if .System }}System: {{ .System }} +{{ end }}{{ if .Prompt }}User: +{{ .Prompt }} +{{ end }}Falcon: +{{ .Response }} +{{ end -}} \ No newline at end of file diff --git a/template/gemma-instruct.gotmpl b/template/gemma-instruct.gotmpl index 870a8f2e..6d778a70 100644 --- a/template/gemma-instruct.gotmpl +++ b/template/gemma-instruct.gotmpl @@ -8,9 +8,10 @@ {{- end }} {{ .Content }} {{ end }}model -{{ else }} +{{ else -}} user -{{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} +{{ if .System }}{{ .System }} +{{ end }}{{ .Prompt }} model {{ .Response }} -{{- end }} \ No newline at end of file +{{ end -}} \ No newline at end of file diff --git a/template/granite-instruct.gotmpl b/template/granite-instruct.gotmpl index 327ff3ee..4a85a97b 100644 --- a/template/granite-instruct.gotmpl +++ b/template/granite-instruct.gotmpl @@ -10,9 +10,8 @@ {{ .Content }} {{ end }}Answer: -{{ else }} -{{ if .System }} -System: +{{ else -}} +{{ if .System }}System: {{ .System }} {{ end }}{{ if .Prompt }}Question: @@ -20,4 +19,5 @@ System: {{ end }}Answer: {{ .Response }} -{{- end }} \ No newline at end of file + +{{ end -}} \ No newline at end of file diff --git a/template/llama2-chat.gotmpl b/template/llama2-chat.gotmpl index 6327d581..1816fefd 100644 --- a/template/llama2-chat.gotmpl +++ b/template/llama2-chat.gotmpl @@ -9,8 +9,8 @@ {{- else }} [/INST] {{ .Content }} {{- end }} {{- end }} [/INST] -{{- else }} -[INST] <>{{ .System 
}}<> +{{- else -}} +[INST] <>{{ if .System }}{{ .System }}{{ end }}<> -{{ .Prompt }} [/INST] {{ .Response }} -{{- end }} \ No newline at end of file +{{ .Prompt }} [/INST] {{ .Response }} +{{- end -}} \ No newline at end of file diff --git a/template/llama3-instruct.gotmpl b/template/llama3-instruct.gotmpl index 9c81a953..7947b8da 100644 --- a/template/llama3-instruct.gotmpl +++ b/template/llama3-instruct.gotmpl @@ -8,7 +8,7 @@ {{ .Content }}<|eot_id|> {{- end }}<|start_header_id|>assistant<|end_header_id|> -{{ else }} +{{ else -}} {{ if .System }}<|start_header_id|>system<|end_header_id|> {{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|> @@ -16,4 +16,4 @@ {{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|> {{ .Response }}<|eot_id|> -{{- end }} \ No newline at end of file +{{- end -}} \ No newline at end of file diff --git a/template/magicoder.gotmpl b/template/magicoder.gotmpl index 73a58127..9227b666 100644 --- a/template/magicoder.gotmpl +++ b/template/magicoder.gotmpl @@ -9,7 +9,7 @@ {{ .Content }} {{ end }}@@ Response -{{ else }} +{{ else -}} {{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}@@ Instruction @@ -17,4 +17,5 @@ {{ end }}@@ Response {{ .Response }} -{{- end }} \ No newline at end of file + +{{ end -}} \ No newline at end of file diff --git a/template/mistral-instruct.gotmpl b/template/mistral-instruct.gotmpl index eb3d5ced..1d746dfd 100644 --- a/template/mistral-instruct.gotmpl +++ b/template/mistral-instruct.gotmpl @@ -5,5 +5,6 @@ {{- else if eq .Role "assistant" }}[/INST] {{ .Content }} {{- end }} {{- end }}[/INST] -{{- else }}[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} [/INST] {{ .Response }} -{{- end }} \ No newline at end of file +{{- else -}} +[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }}[/INST] {{ .Response }} +{{- end -}} \ No newline at end of file diff --git a/template/openchat.gotmpl b/template/openchat.gotmpl index d5e1cbb0..649f0509 100644 --- a/template/openchat.gotmpl +++ b/template/openchat.gotmpl @@ -1,11 +1,11 @@ {{- if .Messages }} -{{- if .System }}GPT Correct System: {{ .System }}<|end_of_turn|> +{{- if .System }}GPT4 Correct System: {{ .System }}<|end_of_turn|> {{- end }} -{{- range .Messages }}GPT Correct +{{- range .Messages }}GPT4 Correct {{- if eq .Role "user" }} User: {{- else if eq .Role "assistant" }} Assistant: {{- end }} {{ .Content }}<|end_of_turn|> -{{- end }}GPT Correct Assistant: -{{- else }} -{{ .System }}<|end_of_turn|>GPT4 Correct User: {{ .Prompt }}<|end_of_turn|>GPT4 Correct Assistant: {{ .Response }}<|end_of_turn|> -{{- end }} \ No newline at end of file +{{- end }}GPT4 Correct Assistant: +{{- else -}} +{{ if .System }}GPT4 Correct System: {{ .System }}<|end_of_turn|>{{ end }}GPT4 Correct User: {{ .Prompt }}<|end_of_turn|>GPT4 Correct Assistant: {{ .Response }}<|end_of_turn|> +{{- end -}} \ No newline at end of file diff --git a/template/phi-3.gotmpl b/template/phi-3.gotmpl index a3558d2b..4ca56e95 100644 --- a/template/phi-3.gotmpl +++ b/template/phi-3.gotmpl @@ -5,11 +5,11 @@ {{- range .Messages }}<|{{ .Role }}|> {{ .Content }}<|end|> {{ end }}<|assistant|> -{{ else }} +{{ else -}} {{ if .System }}<|system|> {{ .System }}<|end|> {{ end }}{{ if .Prompt }}<|user|> {{ .Prompt }}<|end|> {{ end }}<|assistant|> {{ .Response }}<|end|> -{{- end }} \ No newline at end of file +{{ end -}} \ No newline at end of file diff --git a/template/solar-instruct.gotmpl b/template/solar-instruct.gotmpl index caa6e8e7..8a8331ca 100644 --- 
a/template/solar-instruct.gotmpl +++ b/template/solar-instruct.gotmpl @@ -10,7 +10,7 @@ {{ .Content }} {{ end }} {{ end }}### Assistant: -{{ else }} +{{ else -}} {{ if .System }}### System: {{ .System }} @@ -18,5 +18,6 @@ {{ .Prompt }} {{ end }}### Assistant: -{{ .Response }} -{{- end }} \ No newline at end of file +{{ .Response }} + +{{ end -}} \ No newline at end of file diff --git a/template/starcoder2-instruct.gotmpl b/template/starcoder2-instruct.gotmpl index 7d7ff932..17c6ad75 100644 --- a/template/starcoder2-instruct.gotmpl +++ b/template/starcoder2-instruct.gotmpl @@ -11,14 +11,13 @@ {{ end }} {{- end }}### Response -{{ else }} +{{ else -}} {{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}### Instruction {{ .Prompt }} - {{ end }}### Response {{ .Response }}<|endoftext|> -{{- end }} \ No newline at end of file +{{ end -}} \ No newline at end of file diff --git a/template/testdata/alpaca.gotmpl/system-user-assistant-user b/template/testdata/alpaca.gotmpl/system-user-assistant-user index 20182d82..4caa8178 100644 --- a/template/testdata/alpaca.gotmpl/system-user-assistant-user +++ b/template/testdata/alpaca.gotmpl/system-user-assistant-user @@ -1,4 +1,6 @@ -You are a helpful assistant.### Instruction: +You are a helpful assistant. + +### Instruction: Hello, how are you? ### Response: diff --git a/template/testdata/codellama-70b-instruct.gotmpl/system-user-assistant-user b/template/testdata/codellama-70b-instruct.gotmpl/system-user-assistant-user index fdd0fc8b..d7528f80 100644 --- a/template/testdata/codellama-70b-instruct.gotmpl/system-user-assistant-user +++ b/template/testdata/codellama-70b-instruct.gotmpl/system-user-assistant-user @@ -9,3 +9,4 @@ Source: system I'd like to show off how chat templating works! Source: assistant Destination: user + \ No newline at end of file diff --git a/template/testdata/codellama-70b-instruct.gotmpl/user b/template/testdata/codellama-70b-instruct.gotmpl/user index 9e7174a8..8e07853c 100644 --- a/template/testdata/codellama-70b-instruct.gotmpl/user +++ b/template/testdata/codellama-70b-instruct.gotmpl/user @@ -3,3 +3,4 @@ Source: user Hello, how are you? Source: assistant Destination: user + \ No newline at end of file diff --git a/template/testdata/codellama-70b-instruct.gotmpl/user-assistant-user b/template/testdata/codellama-70b-instruct.gotmpl/user-assistant-user index b4ba1736..f732cc74 100644 --- a/template/testdata/codellama-70b-instruct.gotmpl/user-assistant-user +++ b/template/testdata/codellama-70b-instruct.gotmpl/user-assistant-user @@ -7,3 +7,4 @@ Source: user I'd like to show off how chat templating works! Source: assistant Destination: user + \ No newline at end of file diff --git a/template/testdata/openchat.gotmpl/system-user-assistant-user b/template/testdata/openchat.gotmpl/system-user-assistant-user index 1214c126..404b071a 100644 --- a/template/testdata/openchat.gotmpl/system-user-assistant-user +++ b/template/testdata/openchat.gotmpl/system-user-assistant-user @@ -1 +1 @@ -GPT Correct System: You are a helpful assistant.<|end_of_turn|>GPT Correct User: Hello, how are you?<|end_of_turn|>GPT Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT Correct Assistant: \ No newline at end of file +GPT4 Correct System: You are a helpful assistant.<|end_of_turn|>GPT4 Correct User: Hello, how are you?<|end_of_turn|>GPT4 Correct Assistant: I'm doing great. 
How can I help you today?<|end_of_turn|>GPT4 Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT4 Correct Assistant: \ No newline at end of file
diff --git a/template/testdata/openchat.gotmpl/user b/template/testdata/openchat.gotmpl/user
index 611daa83..48229cb0 100644
--- a/template/testdata/openchat.gotmpl/user
+++ b/template/testdata/openchat.gotmpl/user
@@ -1 +1 @@
-GPT Correct User: Hello, how are you?<|end_of_turn|>GPT Correct Assistant: \ No newline at end of file
+GPT4 Correct User: Hello, how are you?<|end_of_turn|>GPT4 Correct Assistant: \ No newline at end of file
diff --git a/template/testdata/openchat.gotmpl/user-assistant-user b/template/testdata/openchat.gotmpl/user-assistant-user
index f97b02b9..4719abb2 100644
--- a/template/testdata/openchat.gotmpl/user-assistant-user
+++ b/template/testdata/openchat.gotmpl/user-assistant-user
@@ -1 +1 @@
-GPT Correct User: Hello, how are you?<|end_of_turn|>GPT Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT Correct Assistant: \ No newline at end of file
+GPT4 Correct User: Hello, how are you?<|end_of_turn|>GPT4 Correct Assistant: I'm doing great. How can I help you today?<|end_of_turn|>GPT4 Correct User: I'd like to show off how chat templating works!<|end_of_turn|>GPT4 Correct Assistant: \ No newline at end of file
diff --git a/template/vicuna.gotmpl b/template/vicuna.gotmpl
index 2e13e990..01465b99 100644
--- a/template/vicuna.gotmpl
+++ b/template/vicuna.gotmpl
@@ -7,8 +7,9 @@
 {{ else if eq .Role "assistant" }}ASSISTANT: {{ .Content }}
 {{ end }}
 {{- end }}ASSISTANT:
-{{- else }}
+{{- else -}}
 {{ if .System }}{{ .System }}
+
 {{ end }}{{ if .Prompt }}USER: {{ .Prompt }}
-{{ end }}ASSISTANT: {{ .Response }}
-{{- end }} \ No newline at end of file
+{{ end }}ASSISTANT: {{ .Response }}
+{{ end -}} \ No newline at end of file
diff --git a/template/zephyr.gotmpl b/template/zephyr.gotmpl
index e6668848..3ca1d1a1 100644
--- a/template/zephyr.gotmpl
+++ b/template/zephyr.gotmpl
@@ -5,11 +5,11 @@
 {{- range .Messages }}<|{{ .Role }}|>
 {{ .Content }}
 {{ end }}<|assistant|>
-{{ else }}
+{{ else -}}
 {{ if .System }}<|system|>
 {{ .System }}
 {{ end }}{{ if .Prompt }}<|user|>
 {{ .Prompt }}
 {{ end }}<|assistant|>
 {{ .Response }}
-{{- end }} \ No newline at end of file
+{{ end -}} \ No newline at end of file
From efbf41ed8151098b942c142e2522b9ab8364f97a Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan 
Date: Wed, 10 Jul 2024 20:01:52 -0700
Subject: [PATCH 30/48] llm: don't link cuda with compat libs (#5621)

---
 llm/generate/gen_linux.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index 5589f1ea..db2c6c30 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -178,7 +178,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
         CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
         echo "Building custom CUDA GPU"
     else
-        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} -DCMAKE_LIBRARY_PATH=/usr/local/cuda/compat"
+        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
     fi
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
     BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
From 
791650ddef9eb11e011506dbd5d22ed6bfcb6a10 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Thu, 11 Jul 2024 00:53:12 -0700 Subject: [PATCH 31/48] sched: only error when over-allocating system memory (#5626) --- llm/server.go | 9 +++++++++ server/sched.go | 37 ------------------------------------- 2 files changed, 9 insertions(+), 37 deletions(-) diff --git a/llm/server.go b/llm/server.go index aa504d19..07c58cff 100644 --- a/llm/server.go +++ b/llm/server.go @@ -122,6 +122,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } } + // On linux, over-allocating CPU memory will almost always result in an error + if runtime.GOOS == "linux" { + systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize + if systemMemoryRequired > systemTotalMemory { + slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "system", format.HumanBytes2(systemTotalMemory)) + return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(systemTotalMemory)) + } + } + estimate.log() // Loop through potential servers diff --git a/server/sched.go b/server/sched.go index 48047bfe..2daed3ab 100644 --- a/server/sched.go +++ b/server/sched.go @@ -135,11 +135,6 @@ func (s *Scheduler) processPending(ctx context.Context) { } for { - cpus := s.getCpuFn() - var systemMem gpu.GpuInfo - if len(cpus) > 0 { - systemMem = cpus[0] - } var runnerToExpire *runnerRef s.loadedMu.Lock() runner := s.loaded[pending.model.ModelPath] @@ -193,38 +188,6 @@ func (s *Scheduler) processPending(ctx context.Context) { break } - estimate := llm.EstimateGPULayers(gpus, ggml, pending.model.ProjectorPaths, pending.opts) - maxSize := systemMem.FreeMemory - - // Add available GPU memory to the total pool - // macOS hardware has unified memory so don't double count - if runtime.GOOS != "darwin" { - for _, gpu := range gpus { - if gpu.Library == "cpu" { - continue - } - if loadedCount == 0 { - // If no other models are loaded, set the limit based on what's available - maxSize += gpu.FreeMemory - } else { - // Other models could be unloaded, favor total memory for limit - maxSize += gpu.TotalMemory - } - } - } - - // Block attempting to load a model larger than system memory + GPU memory - if estimate.TotalSize > maxSize { - slog.Warn("model request too large for system", "requested", format.HumanBytes2(estimate.TotalSize), "system", format.HumanBytes2(maxSize)) - - // Linux will crash if over-allocating memory - return an error to the user. 
-				// TODO (jmorganca): add reasonable upper limits for darwin and windows as well
-				if runtime.GOOS == "linux" {
-					pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize))
-					break
-				}
-			}
-
 			// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 			if len(gpus) == 1 && gpus[0].Library == "cpu" {
 				// simplifying assumption of defaultParallel when in CPU mode
From e64f9ebb44b584d94094274f62acd90a5195dd89 Mon Sep 17 00:00:00 2001
From: Michael Yang 
Date: Thu, 11 Jul 2024 13:10:13 -0700
Subject: [PATCH 32/48] do not automatically aggregate system messages

---
 template/template.go      | 39 ++++++++++++++++++++-------------------
 template/template_test.go | 11 +++++++----
 2 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/template/template.go b/template/template.go
index 0b8f2434..8d5ac51b 100644
--- a/template/template.go
+++ b/template/template.go
@@ -102,8 +102,21 @@ var response = parse.ActionNode{
 	},
 }
 
+var funcs = template.FuncMap{
+	"aggregate": func(v []*api.Message, role string) string {
+		var aggregated []string
+		for _, m := range v {
+			if m.Role == role {
+				aggregated = append(aggregated, m.Content)
+			}
+		}
+
+		return strings.Join(aggregated, "\n\n")
+	},
+}
+
 func Parse(s string) (*Template, error) {
-	tmpl := template.New("").Option("missingkey=zero")
+	tmpl := template.New("").Option("missingkey=zero").Funcs(funcs)
 
 	tmpl, err := tmpl.Parse(s)
 	if err != nil {
@@ -149,23 +162,21 @@ type Values struct {
 }
 
 func (t *Template) Execute(w io.Writer, v Values) error {
-	system, collated := collate(v.Messages)
+	collated := collate(v.Messages)
 	if !v.forceLegacy && slices.Contains(t.Vars(), "messages") {
 		return t.Template.Execute(w, map[string]any{
-			"System":   system,
 			"Messages": collated,
 		})
 	}
 
 	var b bytes.Buffer
-	var prompt, response string
+	var system, prompt, response string
 	for i, m := range collated {
 		switch m.Role {
+		case "system":
+			system = m.Content
 		case "user":
 			prompt = m.Content
-			if i != 0 {
-				system = ""
-			}
 		case "assistant":
 			response = m.Content
 		}
@@ -179,6 +190,7 @@ func (t *Template) Execute(w io.Writer, v Values) error {
 				return err
 			}
 
+			system = ""
 			prompt = ""
 			response = ""
 		}
@@ -209,25 +221,14 @@ func (t *Template) Execute(w io.Writer, v Values) error {
 	return err
 }
 
-type messages []*api.Message
-
 // collate messages based on role. consecutive messages of the same role are merged
 // into a single message. collate also pulls out and merges messages with Role == "system"
 // which are templated separately. 
As a side effect, it mangles message content adding image // tags ([img-%d]) as needed -func collate(msgs []api.Message) (system string, collated messages) { +func collate(msgs []api.Message) (collated []*api.Message) { var n int for i := range msgs { msg := msgs[i] - if msg.Role == "system" { - if system != "" { - system += "\n\n" - } - - system += msg.Content - continue - } - for range msg.Images { imageTag := fmt.Sprintf("[img-%d]", n) if !strings.Contains(msg.Content, "[img]") { diff --git a/template/template_test.go b/template/template_test.go index e702a186..b020eb67 100644 --- a/template/template_test.go +++ b/template/template_test.go @@ -122,6 +122,7 @@ func TestTemplate(t *testing.T) { }) t.Run("legacy", func(t *testing.T) { + t.Skip("legacy outputs are currently default outputs") var legacy bytes.Buffer if err := tmpl.Execute(&legacy, Values{Messages: tt, forceLegacy: true}); err != nil { t.Fatal(err) @@ -154,11 +155,13 @@ func TestParse(t *testing.T) { {"{{ .System }} {{ .Prompt }} {{ .Response }}", []string{"prompt", "response", "system"}}, {"{{ with .Tools }}{{ . }}{{ end }} {{ .System }} {{ .Prompt }}", []string{"prompt", "response", "system", "tools"}}, {"{{ range .Messages }}{{ .Role }} {{ .Content }}{{ end }}", []string{"content", "messages", "role"}}, - {"{{ range .Messages }}{{ if eq .Role \"system\" }}SYSTEM: {{ .Content }}{{ else if eq .Role \"user\" }}USER: {{ .Content }}{{ else if eq .Role \"assistant\" }}ASSISTANT: {{ .Content }}{{ end }}{{ end }}", []string{"content", "messages", "role"}}, + {`{{- range .Messages }} +{{- if eq .Role "system" }}SYSTEM: +{{- else if eq .Role "user" }}USER: +{{- else if eq .Role "assistant" }}ASSISTANT: +{{- end }} {{ .Content }} +{{- end }}`, []string{"content", "messages", "role"}}, {`{{- if .Messages }} -{{- if .System }}<|im_start|>system -{{ .System }}<|im_end|> -{{ end }} {{- range .Messages }}<|im_start|>{{ .Role }} {{ .Content }}<|im_end|> {{ end }}<|im_start|>assistant From 57ec6901eb59cca9d0c29adca3f0fd4b95c1c989 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 11 Jul 2024 13:11:40 -0700 Subject: [PATCH 33/48] revert embedded templates to use prompt/response This reverts commit 19753c18c01183b4c974e36e89b0c7cbdcc3c38a. for compat. 
messages will be added at a later date --- server/routes_create_test.go | 4 +- template/alfred.gotmpl | 9 +-- template/alpaca.gotmpl | 13 ---- template/chatml.gotmpl | 9 --- template/chatqa.gotmpl | 12 ---- template/codellama-70b-instruct.gotmpl | 15 +---- template/falcon-instruct.gotmpl | 10 ---- template/gemma-instruct.gotmpl | 12 ---- template/granite-instruct.gotmpl | 14 ----- template/llama2-chat.gotmpl | 18 ++---- template/llama3-instruct.gotmpl | 14 +---- template/magicoder.gotmpl | 13 ---- template/mistral-instruct.gotmpl | 13 +--- template/openchat.gotmpl | 12 +--- template/phi-3.gotmpl | 9 --- template/solar-instruct.gotmpl | 14 ----- template/starcoder2-instruct.gotmpl | 15 ----- template/template_test.go | 59 ++++++++++++------- .../system-user-assistant-user | 4 +- .../llama2-chat.gotmpl/user-assistant-user | 4 +- .../system-user-assistant-user | 5 +- template/vicuna.gotmpl | 11 ---- template/zephyr.gotmpl | 9 --- 23 files changed, 63 insertions(+), 235 deletions(-) diff --git a/server/routes_create_test.go b/server/routes_create_test.go index 40477937..04174b92 100644 --- a/server/routes_create_test.go +++ b/server/routes_create_test.go @@ -546,8 +546,8 @@ func TestCreateDetectTemplate(t *testing.T) { checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{ filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"), - filepath.Join(p, "blobs", "sha256-68b0323b2f21572bc09ba07554b16b379a5713ee48ef8c25a7661a1f71cfce77"), - filepath.Join(p, "blobs", "sha256-eb72fb7c550ee1f1dec4039bd65382acecf5f7536a30fb7ccace39a8d0cb590b"), + filepath.Join(p, "blobs", "sha256-c608dc615584cd20d9d830363dabf8a4783ae5d34245c3d8c115edb3bc7b28e4"), + filepath.Join(p, "blobs", "sha256-f836ee110db21567f826332e4cedd746c06d10664fd5a9ea3659e3683a944510"), }) }) diff --git a/template/alfred.gotmpl b/template/alfred.gotmpl index 71bc6706..cecb9d2c 100644 --- a/template/alfred.gotmpl +++ b/template/alfred.gotmpl @@ -1,8 +1 @@ -{{- if .Messages }} -{{- if .System }}{{ .System }} -{{- end }} -{{- range .Messages }}{{ .Content }} -{{- end }} -{{- else -}} -{{ if .System }}{{ .System }}{{ end }}{{ if .Prompt }}{{ .Prompt }}{{ end }}{{ .Response }} -{{- end -}} \ No newline at end of file +{{ if .System }}{{ .System }}{{ end }}{{ if .Prompt }}{{ .Prompt }}{{ end }}{{ .Response }} \ No newline at end of file diff --git a/template/alpaca.gotmpl b/template/alpaca.gotmpl index e9becb3d..ec7a8edc 100644 --- a/template/alpaca.gotmpl +++ b/template/alpaca.gotmpl @@ -1,15 +1,3 @@ -{{- if .Messages }} -{{- if .System }}{{ .System }} - -{{ end }} -{{- range .Messages }} -{{- if eq .Role "user" }}### Instruction: -{{- else if eq .Role "assistant" }}### Response: -{{- end }} -{{ .Content }} - -{{ end }}### Response: -{{ else -}} {{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}### Instruction: @@ -18,4 +6,3 @@ {{ end }}### Response: {{ .Response }} -{{ end -}} \ No newline at end of file diff --git a/template/chatml.gotmpl b/template/chatml.gotmpl index eb8ab0dc..fb672601 100644 --- a/template/chatml.gotmpl +++ b/template/chatml.gotmpl @@ -1,15 +1,6 @@ -{{- if .Messages }} -{{- if .System }}<|im_start|>system -{{ .System }}<|im_end|> -{{ end }} -{{- range .Messages }}<|im_start|>{{ .Role }} -{{ .Content }}<|im_end|> -{{ end }}<|im_start|>assistant -{{ else -}} {{ if .System }}<|im_start|>system {{ .System }}<|im_end|> {{ end }}{{ if .Prompt }}<|im_start|>user {{ .Prompt }}<|im_end|> {{ end }}<|im_start|>assistant {{ .Response }}<|im_end|> -{{ end -}} \ No newline at end of 
file diff --git a/template/chatqa.gotmpl b/template/chatqa.gotmpl index 41c6ced5..91679a72 100644 --- a/template/chatqa.gotmpl +++ b/template/chatqa.gotmpl @@ -1,18 +1,6 @@ -{{- if .Messages }} -{{- if .System }}System: {{ .System }} - -{{ end }} -{{- range .Messages }} -{{- if eq .Role "user" }}User: -{{- else if eq .Role "assistant" }}Assistant: -{{- end }} {{ .Content }} - -{{ end }}Assistant: -{{- else -}} {{ if .System }}System: {{ .System }} {{ end }}{{ if .Prompt }}User: {{ .Prompt }} {{ end }}Assistant: {{ .Response }} -{{ end -}} \ No newline at end of file diff --git a/template/codellama-70b-instruct.gotmpl b/template/codellama-70b-instruct.gotmpl index 0a313d38..e5856042 100644 --- a/template/codellama-70b-instruct.gotmpl +++ b/template/codellama-70b-instruct.gotmpl @@ -1,19 +1,10 @@ -{{- if .Messages }} -{{- if .System }}Source: system - - {{ .System }} {{ end }} -{{- range .Messages }}Source: {{ .Role }} - - {{ .Content }} {{ end }}Source: assistant -Destination: user - - {{ else -}} {{ if .System }}Source: system {{ .System }} {{ end }}Source: user {{ .Prompt }} Source: assistant +{{- if not .Response }} Destination: user +{{- end }} - {{ .Response }} -{{- end -}} \ No newline at end of file + {{ .Response }} \ No newline at end of file diff --git a/template/falcon-instruct.gotmpl b/template/falcon-instruct.gotmpl index 3a403007..0a5fe48e 100644 --- a/template/falcon-instruct.gotmpl +++ b/template/falcon-instruct.gotmpl @@ -1,15 +1,5 @@ -{{- if .Messages }} -{{- if .System }}System: {{ .System }} -{{ end }} -{{- range .Messages }} -{{- if eq .Role "user" }}User: -{{ else if eq .Role "assistant" }}Falcon: -{{ end }}{{ .Content }} -{{ end }}Falcon: -{{ else -}} {{ if .System }}System: {{ .System }} {{ end }}{{ if .Prompt }}User: {{ .Prompt }} {{ end }}Falcon: {{ .Response }} -{{ end -}} \ No newline at end of file diff --git a/template/gemma-instruct.gotmpl b/template/gemma-instruct.gotmpl index 6d778a70..3c3a8425 100644 --- a/template/gemma-instruct.gotmpl +++ b/template/gemma-instruct.gotmpl @@ -1,17 +1,5 @@ -{{- if .Messages }} -{{- range $index, $_ := .Messages }} -{{- if eq .Role "user" }}user -{{- if and $.System (eq $index 0) }} -{{ $.System }} -{{- end }} -{{- else if eq .Role "assistant" }}model -{{- end }} -{{ .Content }} -{{ end }}model -{{ else -}} user {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} model {{ .Response }} -{{ end -}} \ No newline at end of file diff --git a/template/granite-instruct.gotmpl b/template/granite-instruct.gotmpl index 4a85a97b..56690fce 100644 --- a/template/granite-instruct.gotmpl +++ b/template/granite-instruct.gotmpl @@ -1,16 +1,3 @@ -{{- if .Messages }} -{{- if .System }}System: -{{ .System }} - -{{ end }} -{{- range .Messages }} -{{- if eq .Role "user" }}Question: -{{- else if eq .Role "assistant" }}Answer: -{{- end }} -{{ .Content }} - -{{ end }}Answer: -{{ else -}} {{ if .System }}System: {{ .System }} @@ -20,4 +7,3 @@ {{ end }}Answer: {{ .Response }} -{{ end -}} \ No newline at end of file diff --git a/template/llama2-chat.gotmpl b/template/llama2-chat.gotmpl index 1816fefd..013b414e 100644 --- a/template/llama2-chat.gotmpl +++ b/template/llama2-chat.gotmpl @@ -1,16 +1,6 @@ -{{- if .Messages }} -{{- range $index, $_ := .Messages }} -{{- if eq .Role "user" }}[INST] {{ if eq $index 0 }}<> -{{- if $.System }} -{{ $.System }} +[INST] <> +{{- if .System }} +{{ .System }} {{ end }}<> -{{ end }}{{ .Content }} -{{- else }} [/INST] {{ .Content }} -{{- end }} -{{- end }} [/INST] -{{- else -}} -[INST] <>{{ if .System }}{{ .System }}{{ 
end }}<> - -{{ .Prompt }} [/INST] {{ .Response }} -{{- end -}} \ No newline at end of file +{{ .Prompt }} [/INST] {{ .Response }} \ No newline at end of file diff --git a/template/llama3-instruct.gotmpl b/template/llama3-instruct.gotmpl index 7947b8da..36d0218b 100644 --- a/template/llama3-instruct.gotmpl +++ b/template/llama3-instruct.gotmpl @@ -1,19 +1,7 @@ -{{- if .Messages }} -{{- if .System }}<|start_header_id|>system<|end_header_id|> - -{{ .System }}<|eot_id|> -{{- end }} -{{- range .Messages }}<|start_header_id|>{{ .Role }}<|end_header_id|> - -{{ .Content }}<|eot_id|> -{{- end }}<|start_header_id|>assistant<|end_header_id|> - -{{ else -}} {{ if .System }}<|start_header_id|>system<|end_header_id|> {{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|> {{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|> -{{ .Response }}<|eot_id|> -{{- end -}} \ No newline at end of file +{{ .Response }}<|eot_id|> \ No newline at end of file diff --git a/template/magicoder.gotmpl b/template/magicoder.gotmpl index 9227b666..52abc01a 100644 --- a/template/magicoder.gotmpl +++ b/template/magicoder.gotmpl @@ -1,15 +1,3 @@ -{{- if .Messages }} -{{- if .System }}{{ .System }} - -{{ end }} -{{- range .Messages }} -{{- if eq .Role "user" }}@@ Instruction -{{- else if eq .Role "assistant" }}@@ Response -{{- end }} -{{ .Content }} - -{{ end }}@@ Response -{{ else -}} {{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}@@ Instruction @@ -18,4 +6,3 @@ {{ end }}@@ Response {{ .Response }} -{{ end -}} \ No newline at end of file diff --git a/template/mistral-instruct.gotmpl b/template/mistral-instruct.gotmpl index 1d746dfd..e489bd4c 100644 --- a/template/mistral-instruct.gotmpl +++ b/template/mistral-instruct.gotmpl @@ -1,10 +1,3 @@ -{{- if .Messages }} -{{- range $index, $_ := .Messages }} -{{- if eq .Role "user" }}[INST] {{ if and $.System (eq (len (slice $.Messages $index)) 1) }}{{ $.System }} -{{ end }}{{ .Content }} -{{- else if eq .Role "assistant" }}[/INST] {{ .Content }} -{{- end }} -{{- end }}[/INST] -{{- else -}} -[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }}[/INST] {{ .Response }} -{{- end -}} \ No newline at end of file +[INST] {{ if .System }}{{ .System }} + +{{ end }}{{ .Prompt }}[/INST] {{ .Response }} \ No newline at end of file diff --git a/template/openchat.gotmpl b/template/openchat.gotmpl index 649f0509..9c183834 100644 --- a/template/openchat.gotmpl +++ b/template/openchat.gotmpl @@ -1,11 +1 @@ -{{- if .Messages }} -{{- if .System }}GPT4 Correct System: {{ .System }}<|end_of_turn|> -{{- end }} -{{- range .Messages }}GPT4 Correct -{{- if eq .Role "user" }} User: -{{- else if eq .Role "assistant" }} Assistant: -{{- end }} {{ .Content }}<|end_of_turn|> -{{- end }}GPT4 Correct Assistant: -{{- else -}} -{{ if .System }}GPT4 Correct System: {{ .System }}<|end_of_turn|>{{ end }}GPT4 Correct User: {{ .Prompt }}<|end_of_turn|>GPT4 Correct Assistant: {{ .Response }}<|end_of_turn|> -{{- end -}} \ No newline at end of file +{{ if .System }}GPT4 Correct System: {{ .System }}<|end_of_turn|>{{ end }}GPT4 Correct User: {{ .Prompt }}<|end_of_turn|>GPT4 Correct Assistant: {{ .Response }}<|end_of_turn|> \ No newline at end of file diff --git a/template/phi-3.gotmpl b/template/phi-3.gotmpl index 4ca56e95..6c3610dd 100644 --- a/template/phi-3.gotmpl +++ b/template/phi-3.gotmpl @@ -1,15 +1,6 @@ -{{- if .Messages }} -{{- if .System }}<|system|> -{{ .System }}<|end|> -{{ end }} -{{- range .Messages }}<|{{ .Role }}|> -{{ .Content }}<|end|> -{{ 
end }}<|assistant|> -{{ else -}} {{ if .System }}<|system|> {{ .System }}<|end|> {{ end }}{{ if .Prompt }}<|user|> {{ .Prompt }}<|end|> {{ end }}<|assistant|> {{ .Response }}<|end|> -{{ end -}} \ No newline at end of file diff --git a/template/solar-instruct.gotmpl b/template/solar-instruct.gotmpl index 8a8331ca..1c14960d 100644 --- a/template/solar-instruct.gotmpl +++ b/template/solar-instruct.gotmpl @@ -1,16 +1,3 @@ -{{- if .Messages }} -{{- if .System }}### System: -{{ .System }} - -{{ end }} -{{- range .Messages }} -{{- if eq .Role "user" }}### User: -{{ .Content }} -{{ else if eq .Role "assistant" }}### Assistant: -{{ .Content }} -{{ end }} -{{ end }}### Assistant: -{{ else -}} {{ if .System }}### System: {{ .System }} @@ -20,4 +7,3 @@ {{ end }}### Assistant: {{ .Response }} -{{ end -}} \ No newline at end of file diff --git a/template/starcoder2-instruct.gotmpl b/template/starcoder2-instruct.gotmpl index 17c6ad75..6c93a7ab 100644 --- a/template/starcoder2-instruct.gotmpl +++ b/template/starcoder2-instruct.gotmpl @@ -1,17 +1,3 @@ -{{- if .Messages }} -{{- if .System }}{{ .System }} - -{{ end }} -{{- range .Messages }} -{{- if eq .Role "user" }}### Instruction -{{ .Content }} - -{{ else if eq .Role "assistant" }}### Response -{{ .Content }}<|endoftext|> - -{{ end }} -{{- end }}### Response -{{ else -}} {{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}### Instruction @@ -20,4 +6,3 @@ {{ end }}### Response {{ .Response }}<|endoftext|> -{{ end -}} \ No newline at end of file diff --git a/template/template_test.go b/template/template_test.go index b020eb67..9cfa0bea 100644 --- a/template/template_test.go +++ b/template/template_test.go @@ -116,7 +116,14 @@ func TestTemplate(t *testing.T) { t.Fatal(err) } - if diff := cmp.Diff(actual.Bytes(), expect); diff != "" { + bts := actual.Bytes() + + if slices.Contains([]string{"chatqa.gotmpl", "llama2-chat.gotmpl", "mistral-instruct.gotmpl", "openchat.gotmpl", "vicuna.gotmpl"}, match) && bts[len(bts)-1] == ' ' { + t.Log("removing trailing space from output") + bts = bts[:len(bts)-1] + } + + if diff := cmp.Diff(bts, expect); diff != "" { t.Errorf("mismatch (-got +want):\n%s", diff) } }) @@ -203,11 +210,18 @@ func TestExecuteWithMessages(t *testing.T) { { "mistral", []template{ - {"no response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] `}, - {"response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`}, - {"messages", `{{- range $index, $_ := .Messages }} -{{- if eq .Role "user" }}[INST] {{ if and (eq $index 0) $.System }}{{ $.System }}{{ "\n\n" }} -{{- end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }} + {"no response", `[INST] {{ if .System }}{{ .System }} + +{{ end }}{{ .Prompt }}[/INST] `}, + {"response", `[INST] {{ if .System }}{{ .System }} + +{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`}, + {"messages", `{{- $system := aggregate $.Messages "system" -}} +{{- range $index, $_ := .Messages }} +{{- if eq .Role "user" }}[INST] {{ if $system }}{{ $system }} +{{- $system = "" }} + +{{ end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }} {{- end }} {{- end }}`}, }, @@ -223,12 +237,18 @@ func TestExecuteWithMessages(t *testing.T) { { "mistral system", []template{ - {"no response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] `}, - {"response", `[INST] {{ if .System }}{{ .System }}{{ "\n\n" }}{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`}, - {"messages", ` + {"no response", 
`[INST] {{ if .System }}{{ .System }} + +{{ end }}{{ .Prompt }}[/INST] `}, + {"response", `[INST] {{ if .System }}{{ .System }} + +{{ end }}{{ .Prompt }}[/INST] {{ .Response }}`}, + {"messages", `{{- $system := aggregate $.Messages "system" -}} {{- range $index, $_ := .Messages }} -{{- if eq .Role "user" }}[INST] {{ if and (eq $index 0) $.System }}{{ $.System }}{{ "\n\n" }} -{{- end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }} +{{- if eq .Role "user" }}[INST] {{ if $system }}{{ $system }} +{{- $system = "" }} + +{{ end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }} {{- end }} {{- end }}`}, }, @@ -256,12 +276,9 @@ Hello friend![/INST] Hello human![INST] What is your name?[/INST] `, {{ .Response }}<|im_end|> `}, {"messages", ` -{{- range $index, $_ := .Messages }} -{{- if and (eq .Role "user") (eq $index 0) $.System }}<|im_start|>system -{{ $.System }}<|im_end|>{{ "\n" }} -{{- end }}<|im_start|>{{ .Role }} -{{ .Content }}<|im_end|>{{ "\n" }} -{{- end }}<|im_start|>assistant +{{- range $index, $_ := .Messages }}<|im_start|>{{ .Role }} +{{ .Content }}<|im_end|> +{{ end }}<|im_start|>assistant `}, }, Values{ @@ -294,9 +311,11 @@ What is your name?<|im_end|> `}, {"messages", ` {{- range .Messages }} -{{- if eq .Role "user" }}Question: {{ .Content }}{{ "\n\n" }} -{{- else if eq .Role "assistant" }}Answer: {{ .Content }}{{ "\n\n" }} -{{- end }} +{{- if eq .Role "user" }}Question: {{ .Content }} + +{{ else if eq .Role "assistant" }}Answer: {{ .Content }} + +{{ end }} {{- end }}Answer: `}, }, Values{ diff --git a/template/testdata/llama2-chat.gotmpl/system-user-assistant-user b/template/testdata/llama2-chat.gotmpl/system-user-assistant-user index fc2679bf..9db81cb4 100644 --- a/template/testdata/llama2-chat.gotmpl/system-user-assistant-user +++ b/template/testdata/llama2-chat.gotmpl/system-user-assistant-user @@ -2,4 +2,6 @@ You are a helpful assistant. <> -Hello, how are you? [/INST] I'm doing great. How can I help you today?[INST] I'd like to show off how chat templating works! [/INST] \ No newline at end of file +Hello, how are you? [/INST] I'm doing great. How can I help you today?[INST] <><> + +I'd like to show off how chat templating works! [/INST] \ No newline at end of file diff --git a/template/testdata/llama2-chat.gotmpl/user-assistant-user b/template/testdata/llama2-chat.gotmpl/user-assistant-user index 42b4c529..ca58954f 100644 --- a/template/testdata/llama2-chat.gotmpl/user-assistant-user +++ b/template/testdata/llama2-chat.gotmpl/user-assistant-user @@ -1,3 +1,5 @@ [INST] <><> -Hello, how are you? [/INST] I'm doing great. How can I help you today?[INST] I'd like to show off how chat templating works! [/INST] \ No newline at end of file +Hello, how are you? [/INST] I'm doing great. How can I help you today?[INST] <><> + +I'd like to show off how chat templating works! [/INST] \ No newline at end of file diff --git a/template/testdata/mistral-instruct.gotmpl/system-user-assistant-user b/template/testdata/mistral-instruct.gotmpl/system-user-assistant-user index b6b4bf93..2f1edaec 100644 --- a/template/testdata/mistral-instruct.gotmpl/system-user-assistant-user +++ b/template/testdata/mistral-instruct.gotmpl/system-user-assistant-user @@ -1,2 +1,3 @@ -[INST] Hello, how are you?[/INST] I'm doing great. How can I help you today?[INST] You are a helpful assistant. -I'd like to show off how chat templating works![/INST] \ No newline at end of file +[INST] You are a helpful assistant. + +Hello, how are you?[/INST] I'm doing great. 
How can I help you today?[INST] I'd like to show off how chat templating works![/INST] \ No newline at end of file diff --git a/template/vicuna.gotmpl b/template/vicuna.gotmpl index 01465b99..515b2fe9 100644 --- a/template/vicuna.gotmpl +++ b/template/vicuna.gotmpl @@ -1,15 +1,4 @@ -{{- if .Messages }} -{{- if .System }}{{ .System }} - -{{ end }} -{{- range .Messages }} -{{- if eq .Role "user" }}USER: {{ .Content }} -{{ else if eq .Role "assistant" }}ASSISTANT: {{ .Content }} -{{ end }} -{{- end }}ASSISTANT: -{{- else -}} {{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}USER: {{ .Prompt }} {{ end }}ASSISTANT: {{ .Response }} -{{ end -}} \ No newline at end of file diff --git a/template/zephyr.gotmpl b/template/zephyr.gotmpl index 3ca1d1a1..1f889f26 100644 --- a/template/zephyr.gotmpl +++ b/template/zephyr.gotmpl @@ -1,15 +1,6 @@ -{{- if .Messages }} -{{- if .System }}<|system|> -{{ .System }} -{{ end }} -{{- range .Messages }}<|{{ .Role }}|> -{{ .Content }} -{{ end }}<|assistant|> -{{ else -}} {{ if .System }}<|system|> {{ .System }} {{ end }}{{ if .Prompt }}<|user|> {{ .Prompt }} {{ end }}<|assistant|> {{ .Response }} -{{ end -}} \ No newline at end of file From c4cf8ad55966cc61c73f119ab9cbfaf57264fc81 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Thu, 11 Jul 2024 16:42:57 -0700 Subject: [PATCH 34/48] llm: avoid loading model if system memory is too small (#5637) * llm: avoid loading model if system memory is too small * update log * Instrument swap free space On linux and windows, expose how much swap space is available so we can take that into consideration when scheduling models * use `systemSwapFreeMemory` in check --------- Co-authored-by: Daniel Hiltgen --- gpu/gpu.go | 3 +++ gpu/gpu_darwin.go | 1 + gpu/gpu_linux.go | 17 +++++++++-------- gpu/gpu_windows.go | 2 +- gpu/types.go | 1 + llm/server.go | 11 +++++++---- 6 files changed, 22 insertions(+), 13 deletions(-) diff --git a/gpu/gpu.go b/gpu/gpu.go index 58144991..6e25cb46 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -360,14 +360,17 @@ func GetGPUInfo() GpuInfoList { "before", "total", format.HumanBytes2(cpus[0].TotalMemory), "free", format.HumanBytes2(cpus[0].FreeMemory), + "free_swap", format.HumanBytes2(cpus[0].FreeSwap), ), slog.Group( "now", "total", format.HumanBytes2(mem.TotalMemory), "free", format.HumanBytes2(mem.FreeMemory), + "free_swap", format.HumanBytes2(mem.FreeSwap), ), ) cpus[0].FreeMemory = mem.FreeMemory + cpus[0].FreeSwap = mem.FreeSwap } var memInfo C.mem_info_t diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index 39d8fcf8..cb066e58 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -57,6 +57,7 @@ func GetCPUMem() (memInfo, error) { return memInfo{ TotalMemory: uint64(C.getPhysicalMemory()), FreeMemory: uint64(C.getFreeMemory()), + // FreeSwap omitted as Darwin uses dynamic paging }, nil } diff --git a/gpu/gpu_linux.go b/gpu/gpu_linux.go index a099bf82..0d08ce8d 100644 --- a/gpu/gpu_linux.go +++ b/gpu/gpu_linux.go @@ -50,7 +50,7 @@ var OneapiMgmtName = "libze_intel_gpu.so" func GetCPUMem() (memInfo, error) { var mem memInfo - var total, available, free, buffers, cached uint64 + var total, available, free, buffers, cached, freeSwap uint64 f, err := os.Open("/proc/meminfo") if err != nil { return mem, err @@ -70,20 +70,21 @@ func GetCPUMem() (memInfo, error) { _, err = fmt.Sscanf(line, "Buffers:%d", &buffers) case strings.HasPrefix(line, "Cached:"): _, err = fmt.Sscanf(line, "Cached:%d", &cached) + case strings.HasPrefix(line, "SwapFree:"): + _, err = fmt.Sscanf(line, "SwapFree:%d", 
&freeSwap) default: continue } if err != nil { return mem, err } - - if total > 0 && available > 0 { - mem.TotalMemory = total * format.KibiByte - mem.FreeMemory = available * format.KibiByte - return mem, nil - } } mem.TotalMemory = total * format.KibiByte - mem.FreeMemory = (free + buffers + cached) * format.KibiByte + mem.FreeSwap = freeSwap * format.KibiByte + if available > 0 { + mem.FreeMemory = available * format.KibiByte + } else { + mem.FreeMemory = (free + buffers + cached) * format.KibiByte + } return mem, nil } diff --git a/gpu/gpu_windows.go b/gpu/gpu_windows.go index f8c2e76f..cd0629da 100644 --- a/gpu/gpu_windows.go +++ b/gpu/gpu_windows.go @@ -51,5 +51,5 @@ func GetCPUMem() (memInfo, error) { if r1 == 0 { return memInfo{}, fmt.Errorf("GlobalMemoryStatusEx failed: %w", err) } - return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys}, nil + return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys, FreeSwap: memStatus.AvailPageFile}, nil } diff --git a/gpu/types.go b/gpu/types.go index 7a7749b8..8d22b06b 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -10,6 +10,7 @@ import ( type memInfo struct { TotalMemory uint64 `json:"total_memory,omitempty"` FreeMemory uint64 `json:"free_memory,omitempty"` + FreeSwap uint64 `json:"free_swap,omitempty"` } // Beginning of an `ollama info` command diff --git a/llm/server.go b/llm/server.go index 07c58cff..8f37aa23 100644 --- a/llm/server.go +++ b/llm/server.go @@ -88,6 +88,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr var estimate MemoryEstimate var systemTotalMemory uint64 var systemFreeMemory uint64 + var systemSwapFreeMemory uint64 systemMemInfo, err := gpu.GetCPUMem() if err != nil { @@ -95,7 +96,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr } else { systemTotalMemory = systemMemInfo.TotalMemory systemFreeMemory = systemMemInfo.FreeMemory - slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", systemFreeMemory) + systemSwapFreeMemory = systemMemInfo.FreeSwap + slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory)) } // If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info @@ -125,9 +127,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr // On linux, over-allocating CPU memory will almost always result in an error if runtime.GOOS == "linux" { systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize - if systemMemoryRequired > systemTotalMemory { - slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "system", format.HumanBytes2(systemTotalMemory)) - return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(systemTotalMemory)) + available := min(systemTotalMemory, systemFreeMemory+systemSwapFreeMemory) + if systemMemoryRequired > available { + slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory)) + return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), 
format.HumanBytes2(available)) } } From 5056bb9c010f06316b0ff280b879b9c36a7c995c Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 11 Jul 2024 16:06:57 -0700 Subject: [PATCH 35/48] rename aggregate to contents --- template/template.go | 11 ++++++----- template/template_test.go | 37 +++++++++++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/template/template.go b/template/template.go index 8d5ac51b..21e1614d 100644 --- a/template/template.go +++ b/template/template.go @@ -103,15 +103,16 @@ var response = parse.ActionNode{ } var funcs = template.FuncMap{ - "aggregate": func(v []*api.Message, role string) string { - var aggregated []string + // contents returns the contents of messages with an optional role filter + "contents": func(v []*api.Message, role ...string) string { + var parts []string for _, m := range v { - if m.Role == role { - aggregated = append(aggregated, m.Content) + if len(role) == 0 || role[0] == "" || m.Role == role[0] { + parts = append(parts, m.Content) } } - return strings.Join(aggregated, "\n\n") + return strings.Join(parts, "\n\n") }, } diff --git a/template/template_test.go b/template/template_test.go index 9cfa0bea..5e5f4257 100644 --- a/template/template_test.go +++ b/template/template_test.go @@ -216,7 +216,7 @@ func TestExecuteWithMessages(t *testing.T) { {"response", `[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }}[/INST] {{ .Response }}`}, - {"messages", `{{- $system := aggregate $.Messages "system" -}} + {"messages", `{{- $system := contents .Messages "system" -}} {{- range $index, $_ := .Messages }} {{- if eq .Role "user" }}[INST] {{ if $system }}{{ $system }} {{- $system = "" }} @@ -243,7 +243,7 @@ func TestExecuteWithMessages(t *testing.T) { {"response", `[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }}[/INST] {{ .Response }}`}, - {"messages", `{{- $system := aggregate $.Messages "system" -}} + {"messages", `{{- $system := contents .Messages "system" -}} {{- range $index, $_ := .Messages }} {{- if eq .Role "user" }}[INST] {{ if $system }}{{ $system }} {{- $system = "" }} @@ -363,3 +363,36 @@ Answer: `, }) } } + +func TestFuncs(t *testing.T) { + t.Run("contents", func(t *testing.T) { + cases := map[string]string{ + "": "A\n\nB\n\nC\n\nD\n\nE\n\nF", + "system": "A\n\nF", + "user": "B\n\nE", + "assistant": "C\n\nD", + } + + s := []*api.Message{ + {Role: "system", Content: "A"}, + {Role: "user", Content: "B"}, + {Role: "assistant", Content: "C"}, + {Role: "assistant", Content: "D"}, + {Role: "user", Content: "E"}, + {Role: "system", Content: "F"}, + } + + fn, ok := funcs["contents"].(func([]*api.Message, ...string) string) + if !ok { + t.Fatal("contents is not a function") + } + + for k, v := range cases { + t.Run(k, func(t *testing.T) { + if diff := cmp.Diff(fn(s, k), v); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + }) + } + }) +} From 10e768826c7d5a8f7d7fab13832299a466a01f87 Mon Sep 17 00:00:00 2001 From: Josh <76125168+joshyan1@users.noreply.github.com> Date: Thu, 11 Jul 2024 17:24:29 -0700 Subject: [PATCH 36/48] fix: quant err message (#5616) --- llm/llm.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llm.go b/llm/llm.go index f2a5e557..d24507cc 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -33,7 +33,7 @@ func Quantize(infile, outfile string, ftype fileType) error { params.ftype = ftype.Value() if rc := C.llama_model_quantize(cinfile, coutfile, ¶ms); rc != 0 { - return fmt.Errorf("llama_model_quantize: %d", rc) + return fmt.Errorf("failed to 
quantize model. This model architecture may not be supported, or you may need to upgrade Ollama to the latest version") } return nil From 179737feb7311fc57c507a93378a3ac15da3a346 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Thu, 11 Jul 2024 22:53:46 -0700 Subject: [PATCH 37/48] Clean up old files when installing on Windows (#5645) * app: always clean up install dir; force close applications * remove wildcard * revert `CloseApplications` * whitespace * update `LOCALAPPDATA` var --- app/ollama.iss | 3 +++ 1 file changed, 3 insertions(+) diff --git a/app/ollama.iss b/app/ollama.iss index e6502abd..fef4a7b2 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -127,6 +127,9 @@ Type: filesandordirs; Name: "{%USERPROFILE}\.ollama\models" Type: filesandordirs; Name: "{%USERPROFILE}\.ollama\history" ; NOTE: if the user has a custom OLLAMA_MODELS it will be preserved +[InstallDelete] +Type: filesandordirs; Name: "{%LOCALAPPDATA}\Programs\Ollama" + [Messages] WizardReady=Ollama Windows Preview ReadyLabel1=%nLet's get you up and running with your own large language models. From 36c87c433b7d880ef8b3a2b05ef93b0cd1675520 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 12 Jul 2024 11:48:06 -0700 Subject: [PATCH 38/48] template: preprocess message and collect system --- template/template.go | 37 +++++++++++---------------- template/template_test.go | 53 ++++++--------------------------------- 2 files changed, 23 insertions(+), 67 deletions(-) diff --git a/template/template.go b/template/template.go index 21e1614d..9b351666 100644 --- a/template/template.go +++ b/template/template.go @@ -102,22 +102,8 @@ var response = parse.ActionNode{ }, } -var funcs = template.FuncMap{ - // contents returns the contents of messages with an optional role filter - "contents": func(v []*api.Message, role ...string) string { - var parts []string - for _, m := range v { - if len(role) == 0 || role[0] == "" || m.Role == role[0] { - parts = append(parts, m.Content) - } - } - - return strings.Join(parts, "\n\n") - }, -} - func Parse(s string) (*Template, error) { - tmpl := template.New("").Option("missingkey=zero").Funcs(funcs) + tmpl := template.New("").Option("missingkey=zero") tmpl, err := tmpl.Parse(s) if err != nil { @@ -163,15 +149,16 @@ type Values struct { } func (t *Template) Execute(w io.Writer, v Values) error { - collated := collate(v.Messages) + system, collated := collate(v.Messages) if !v.forceLegacy && slices.Contains(t.Vars(), "messages") { return t.Template.Execute(w, map[string]any{ + "System": system, "Messages": collated, }) } var b bytes.Buffer - var system, prompt, response string + var prompt, response string for i, m := range collated { switch m.Role { case "system": @@ -223,11 +210,13 @@ func (t *Template) Execute(w io.Writer, v Values) error { } // collate messages based on role. consecutive messages of the same role are merged -// into a single message. collate also pulls out and merges messages with Role == "system" -// which are templated separately. As a side effect, it mangles message content adding image -// tags ([img-%d]) as needed -func collate(msgs []api.Message) (collated []*api.Message) { +// into a single message. collate also collects and returns all system messages. 
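// For example, given []api.Message{{Role: "system", Content: "A"},
// {Role: "user", Content: "B"}, {Role: "user", Content: "C"}}, collate
// returns ("A", collated), where collated still holds the system message
// followed by one merged user message whose Content is "B\n\nC".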
+// collate mutates message content adding image tags ([img-%d]) as needed +func collate(msgs []api.Message) (string, []*api.Message) { var n int + + var system []string + var collated []*api.Message for i := range msgs { msg := msgs[i] for range msg.Images { @@ -240,6 +229,10 @@ func collate(msgs []api.Message) (collated []*api.Message) { n++ } + if msg.Role == "system" { + system = append(system, msg.Content) + } + if len(collated) > 0 && collated[len(collated)-1].Role == msg.Role { collated[len(collated)-1].Content += "\n\n" + msg.Content } else { @@ -247,7 +240,7 @@ func collate(msgs []api.Message) (collated []*api.Message) { } } - return + return strings.Join(system, "\n\n"), collated } func parseNode(n parse.Node) []string { diff --git a/template/template_test.go b/template/template_test.go index 5e5f4257..c678f1b1 100644 --- a/template/template_test.go +++ b/template/template_test.go @@ -216,13 +216,11 @@ func TestExecuteWithMessages(t *testing.T) { {"response", `[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }}[/INST] {{ .Response }}`}, - {"messages", `{{- $system := contents .Messages "system" -}} -{{- range $index, $_ := .Messages }} -{{- if eq .Role "user" }}[INST] {{ if $system }}{{ $system }} -{{- $system = "" }} + {"messages", `[INST] {{ if .System }}{{ .System }} -{{ end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }} -{{- end }} +{{ end }} +{{- range .Messages }} +{{- if eq .Role "user" }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }}[INST] {{ end }} {{- end }}`}, }, Values{ @@ -243,13 +241,11 @@ func TestExecuteWithMessages(t *testing.T) { {"response", `[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }}[/INST] {{ .Response }}`}, - {"messages", `{{- $system := contents .Messages "system" -}} -{{- range $index, $_ := .Messages }} -{{- if eq .Role "user" }}[INST] {{ if $system }}{{ $system }} -{{- $system = "" }} + {"messages", `[INST] {{ if .System }}{{ .System }} -{{ end }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }} -{{- end }} +{{ end }} +{{- range .Messages }} +{{- if eq .Role "user" }}{{ .Content }}[/INST] {{ else if eq .Role "assistant" }}{{ .Content }}[INST] {{ end }} {{- end }}`}, }, Values{ @@ -363,36 +359,3 @@ Answer: `, }) } } - -func TestFuncs(t *testing.T) { - t.Run("contents", func(t *testing.T) { - cases := map[string]string{ - "": "A\n\nB\n\nC\n\nD\n\nE\n\nF", - "system": "A\n\nF", - "user": "B\n\nE", - "assistant": "C\n\nD", - } - - s := []*api.Message{ - {Role: "system", Content: "A"}, - {Role: "user", Content: "B"}, - {Role: "assistant", Content: "C"}, - {Role: "assistant", Content: "D"}, - {Role: "user", Content: "E"}, - {Role: "system", Content: "F"}, - } - - fn, ok := funcs["contents"].(func([]*api.Message, ...string) string) - if !ok { - t.Fatal("contents is not a function") - } - - for k, v := range cases { - t.Run(k, func(t *testing.T) { - if diff := cmp.Diff(fn(s, k), v); diff != "" { - t.Errorf("mismatch (-got +want):\n%s", diff) - } - }) - } - }) -} From 33627331a370755ff5033c0fcd71d1c9210c9d96 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Fri, 12 Jul 2024 12:29:23 -0700 Subject: [PATCH 39/48] app: also clean up tempdir runners on install (#5646) --- app/ollama.iss | 1 + 1 file changed, 1 insertion(+) diff --git a/app/ollama.iss b/app/ollama.iss index fef4a7b2..6bedb9ff 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -128,6 +128,7 @@ Type: filesandordirs; Name: "{%USERPROFILE}\.ollama\history" ; NOTE: if the user has a custom OLLAMA_MODELS it 
will be preserved [InstallDelete] +Type: filesandordirs; Name: "{%TEMP}\ollama*" Type: filesandordirs; Name: "{%LOCALAPPDATA}\Programs\Ollama" [Messages] From 9ac0a7a50b8d7a0f0627b037c7632181bfbcca97 Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Fri, 12 Jul 2024 15:41:31 -0700 Subject: [PATCH 40/48] remove template from tests --- cmd/interactive_test.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/cmd/interactive_test.go b/cmd/interactive_test.go index d9af01eb..711f3860 100644 --- a/cmd/interactive_test.go +++ b/cmd/interactive_test.go @@ -59,7 +59,6 @@ func TestModelfileBuilder(t *testing.T) { opts := runOptions{ Model: "hork", System: "You are part horse and part shark, but all hork. Do horklike things", - Template: "This is a template.", Messages: []api.Message{ {Role: "user", Content: "Hey there hork!"}, {Role: "assistant", Content: "Yes it is true, I am half horse, half shark."}, @@ -75,7 +74,6 @@ func TestModelfileBuilder(t *testing.T) { mf := buildModelfile(opts) expectedModelfile := `FROM {{.Model}} SYSTEM """{{.System}}""" -TEMPLATE """{{.Template}}""" PARAMETER penalize_newline false PARAMETER seed 42 PARAMETER stop [hi there] @@ -97,7 +95,6 @@ MESSAGE assistant """Yes it is true, I am half horse, half shark.""" mf = buildModelfile(opts) expectedModelfile = `FROM {{.ParentModel}} SYSTEM """{{.System}}""" -TEMPLATE """{{.Template}}""" PARAMETER penalize_newline false PARAMETER seed 42 PARAMETER stop [hi there] From 23ebbaa46ead40c44c20b707b0e53d954ea51dc5 Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Fri, 12 Jul 2024 15:47:17 -0700 Subject: [PATCH 41/48] Revert "remove template from tests" This reverts commit 9ac0a7a50b8d7a0f0627b037c7632181bfbcca97. --- cmd/interactive_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmd/interactive_test.go b/cmd/interactive_test.go index 711f3860..d9af01eb 100644 --- a/cmd/interactive_test.go +++ b/cmd/interactive_test.go @@ -59,6 +59,7 @@ func TestModelfileBuilder(t *testing.T) { opts := runOptions{ Model: "hork", System: "You are part horse and part shark, but all hork. 
Do horklike things", + Template: "This is a template.", Messages: []api.Message{ {Role: "user", Content: "Hey there hork!"}, {Role: "assistant", Content: "Yes it is true, I am half horse, half shark."}, @@ -74,6 +75,7 @@ func TestModelfileBuilder(t *testing.T) { mf := buildModelfile(opts) expectedModelfile := `FROM {{.Model}} SYSTEM """{{.System}}""" +TEMPLATE """{{.Template}}""" PARAMETER penalize_newline false PARAMETER seed 42 PARAMETER stop [hi there] @@ -95,6 +97,7 @@ MESSAGE assistant """Yes it is true, I am half horse, half shark.""" mf = buildModelfile(opts) expectedModelfile = `FROM {{.ParentModel}} SYSTEM """{{.System}}""" +TEMPLATE """{{.Template}}""" PARAMETER penalize_newline false PARAMETER seed 42 PARAMETER stop [hi there] From 22c5451fc28b20dd83a389c49d9caf6a1e50a9e3 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 12 Jul 2024 21:04:44 -0700 Subject: [PATCH 42/48] fix system prompt (#5662) * fix system prompt * execute template when hitting previous roles * fix tests --------- Co-authored-by: jmorganca --- server/prompt.go | 23 +++++++---------------- server/prompt_test.go | 18 ++++++++++++++++++ template/template.go | 40 ++++++++++++++++++++++++++-------------- 3 files changed, 51 insertions(+), 30 deletions(-) diff --git a/server/prompt.go b/server/prompt.go index 51d691a9..abc5e61e 100644 --- a/server/prompt.go +++ b/server/prompt.go @@ -4,7 +4,6 @@ import ( "bytes" "context" "log/slog" - "slices" "github.com/ollama/ollama/api" "github.com/ollama/ollama/llm" @@ -17,26 +16,18 @@ type tokenizeFunc func(context.Context, string) ([]int, error) // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the // latest message and 2) system messages func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message) (prompt string, images []llm.ImageData, _ error) { - // pull out any system messages which should always be included in the prompt var system []api.Message - msgs = slices.DeleteFunc(msgs, func(m api.Message) bool { - if m.Role == "system" { - system = append(system, m) - return true - } - - return false - }) - - if len(system) == 0 && m.System != "" { - // add model system prompt since it wasn't provided - system = append(system, api.Message{Role: "system", Content: m.System}) - } - // always include the last message n := len(msgs) - 1 // in reverse, find all messages that fit into context window for i := n - 1; i >= 0; i-- { + system = make([]api.Message, 0) + for j := range i { + if msgs[j].Role == "system" { + system = append(system, msgs[j]) + } + } + var b bytes.Buffer if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...)}); err != nil { return "", nil, err diff --git a/server/prompt_test.go b/server/prompt_test.go index 1435b143..d8caf3ed 100644 --- a/server/prompt_test.go +++ b/server/prompt_test.go @@ -6,6 +6,7 @@ import ( "strings" "testing" + "github.com/google/go-cmp/cmp" "github.com/ollama/ollama/api" "github.com/ollama/ollama/template" ) @@ -164,6 +165,19 @@ func TestChatPrompt(t *testing.T) { prompt: "You are the Test Who Lived. You're a test, Harry! I-I'm a what? A test. And a thumping good one at that, I'd wager. ", }, }, + { + name: "out of order system", + limit: 2048, + msgs: []api.Message{ + {Role: "user", Content: "You're a test, Harry!"}, + {Role: "assistant", Content: "I-I'm a what?"}, + {Role: "system", Content: "You are the Test Who Lived."}, + {Role: "user", Content: "A test. 
And a thumping good one at that, I'd wager."}, + }, + expect: expect{ + prompt: "You're a test, Harry! I-I'm a what? You are the Test Who Lived. A test. And a thumping good one at that, I'd wager. ", + }, + }, } tmpl, err := template.Parse(` @@ -187,6 +201,10 @@ func TestChatPrompt(t *testing.T) { t.Errorf("expected %q, got %q", tt.prompt, prompt) } + if diff := cmp.Diff(prompt, tt.prompt); diff != "" { + t.Errorf("mismatch (-got +want):\n%s", diff) + } + if len(images) != len(tt.images) { t.Fatalf("expected %d images, got %d", len(tt.images), len(images)) } diff --git a/template/template.go b/template/template.go index 9b351666..90014ec1 100644 --- a/template/template.go +++ b/template/template.go @@ -149,27 +149,19 @@ type Values struct { } func (t *Template) Execute(w io.Writer, v Values) error { - system, collated := collate(v.Messages) + system, messages := collate(v.Messages) if !v.forceLegacy && slices.Contains(t.Vars(), "messages") { return t.Template.Execute(w, map[string]any{ "System": system, - "Messages": collated, + "Messages": messages, }) } + system = "" var b bytes.Buffer var prompt, response string - for i, m := range collated { - switch m.Role { - case "system": - system = m.Content - case "user": - prompt = m.Content - case "assistant": - response = m.Content - } - - if i != len(collated)-1 && prompt != "" && response != "" { + for _, m := range messages { + execute := func () error { if err := t.Template.Execute(&b, map[string]any{ "System": system, "Prompt": prompt, @@ -181,6 +173,26 @@ func (t *Template) Execute(w io.Writer, v Values) error { system = "" prompt = "" response = "" + return nil + } + + switch m.Role { + case "system": + if prompt != "" || response != "" { + if err := execute(); err != nil { + return err + } + } + system = m.Content + case "user": + if response != "" { + if err := execute(); err != nil { + return err + } + } + prompt = m.Content + case "assistant": + response = m.Content } } @@ -199,7 +211,7 @@ func (t *Template) Execute(w io.Writer, v Values) error { tree := parse.Tree{Root: nodes.(*parse.ListNode)} if err := template.Must(template.New("").AddParseTree("", &tree)).Execute(&b, map[string]any{ - "System": "", + "System": system, "Prompt": prompt, }); err != nil { return err From 02fea420e5a0042d5e4cfbb5024a6d7e092dc789 Mon Sep 17 00:00:00 2001 From: Jarek Date: Sat, 13 Jul 2024 17:33:46 +0200 Subject: [PATCH 43/48] Add Kerlig AI, an app for macOS (#5675) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 62f5cd65..eb5e8532 100644 --- a/README.md +++ b/README.md @@ -293,6 +293,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS) - [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama) - [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama) +- [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS) ### Terminal From ef98803d63a4e4c56853688343f011256ced130d Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sat, 13 Jul 2024 09:20:05 -0700 Subject: [PATCH 44/48] llm: looser checks for minimum memory (#5677) --- llm/server.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/server.go b/llm/server.go index 8f37aa23..ffed9fc0 100644 --- a/llm/server.go +++ b/llm/server.go @@ -127,7 +127,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr // On linux, over-allocating CPU memory will almost always result in an error if runtime.GOOS == "linux" { systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize - available := min(systemTotalMemory, systemFreeMemory+systemSwapFreeMemory) + available := systemFreeMemory + systemSwapFreeMemory if systemMemoryRequired > available { slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "available", available, "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "swap", format.HumanBytes2(systemSwapFreeMemory)) return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(available)) From 1ed0aa8feab58a5cbdf2d79fdb718e3a5cc03525 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sat, 13 Jul 2024 09:25:31 -0700 Subject: [PATCH 45/48] server: fix `context`, `load_duration` and `total_duration` fields (#5676) * server: fix `context`, `load_duration` and `total_duration` fields * Update server/routes.go --- server/routes.go | 56 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/server/routes.go b/server/routes.go index 4059c7c5..5b6d0978 100644 --- a/server/routes.go +++ b/server/routes.go @@ -102,6 +102,7 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []Capabil } func (s *Server) GenerateHandler(c *gin.Context) { + checkpointStart := time.Now() var req api.GenerateRequest if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) { c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"}) @@ -129,6 +130,8 @@ func (s *Server) GenerateHandler(c *gin.Context) { return } + checkpointLoaded := time.Now() + if req.Prompt == "" { c.JSON(http.StatusOK, api.GenerateResponse{ Model: req.Model, @@ -191,26 +194,48 @@ func (s *Server) GenerateHandler(c *gin.Context) { ch := make(chan any) go func() { + // TODO (jmorganca): avoid building the response twice both here and below + var sb strings.Builder defer close(ch) if err := r.Completion(c.Request.Context(), llm.CompletionRequest{ Prompt: prompt, Images: images, Format: req.Format, Options: opts, -}, func(r llm.CompletionResponse) { - ch <- api.GenerateResponse{ + }, func(cr llm.CompletionResponse) { + res := api.GenerateResponse{ Model: req.Model, CreatedAt: time.Now().UTC(), - Response: r.Content, - Done: r.Done, - DoneReason: r.DoneReason, + Response: cr.Content, + Done: cr.Done, + DoneReason: cr.DoneReason, Metrics: api.Metrics{ - PromptEvalCount: r.PromptEvalCount, - PromptEvalDuration:
r.PromptEvalDuration, - EvalCount: r.EvalCount, - EvalDuration: r.EvalDuration, + PromptEvalCount: cr.PromptEvalCount, + PromptEvalDuration: cr.PromptEvalDuration, + EvalCount: cr.EvalCount, + EvalDuration: cr.EvalDuration, }, } + + if _, err := sb.WriteString(cr.Content); err != nil { + ch <- gin.H{"error": err.Error()} + } + + if cr.Done { + res.TotalDuration = time.Since(checkpointStart) + res.LoadDuration = checkpointLoaded.Sub(checkpointStart) + + if !req.Raw { + tokens, err := r.Tokenize(c.Request.Context(), prompt+sb.String()) + if err != nil { + ch <- gin.H{"error": err.Error()} + return + } + res.Context = append(req.Context, tokens...) + } + } + + ch <- res }); err != nil { ch <- gin.H{"error": err.Error()} } @@ -1122,6 +1147,8 @@ func (s *Server) ProcessHandler(c *gin.Context) { } func (s *Server) ChatHandler(c *gin.Context) { + checkpointStart := time.Now() + var req api.ChatRequest if err := c.ShouldBindJSON(&req); errors.Is(err, io.EOF) { c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "missing request body"}) @@ -1141,6 +1168,8 @@ func (s *Server) ChatHandler(c *gin.Context) { return } + checkpointLoaded := time.Now() + if len(req.Messages) == 0 { c.JSON(http.StatusOK, api.ChatResponse{ Model: req.Model, @@ -1169,7 +1198,7 @@ func (s *Server) ChatHandler(c *gin.Context) { Format: req.Format, Options: opts, }, func(r llm.CompletionResponse) { - ch <- api.ChatResponse{ + res := api.ChatResponse{ Model: req.Model, CreatedAt: time.Now().UTC(), Message: api.Message{Role: "assistant", Content: r.Content}, @@ -1182,6 +1211,13 @@ func (s *Server) ChatHandler(c *gin.Context) { EvalDuration: r.EvalDuration, }, } + + if r.Done { + res.TotalDuration = time.Since(checkpointStart) + res.LoadDuration = checkpointLoaded.Sub(checkpointStart) + } + + ch <- res }); err != nil { ch <- gin.H{"error": err.Error()} } From f7ee0123008dbdb3fd5954438d12196951b58b78 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Sat, 13 Jul 2024 15:08:00 -0700 Subject: [PATCH 46/48] server: prepend system message in chat handler --- server/routes.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/server/routes.go b/server/routes.go index 5b6d0978..edaec691 100644 --- a/server/routes.go +++ b/server/routes.go @@ -1181,6 +1181,10 @@ func (s *Server) ChatHandler(c *gin.Context) { return } + if req.Messages[0].Role != "system" { + req.Messages = append([]api.Message{{Role: "system", Content: m.System}}, req.Messages...) 
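// Indexing req.Messages[0] is safe here: the empty-messages case returned
// earlier in this handler. When m.System is "" this still prepends an empty
// system message, which templates guarded by {{ if .System }} render as
// nothing.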
+ } + prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, req.Messages) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) From 057d31861e3514b60a7eedf694899067b72bd2fa Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Sat, 13 Jul 2024 20:56:24 -0700 Subject: [PATCH 47/48] remove template (#5655) --- api/types.go | 2 ++ cmd/cmd.go | 2 -- cmd/interactive.go | 52 +++++++++++------------------------------ cmd/interactive_test.go | 3 --- server/routes.go | 7 ------ 5 files changed, 16 insertions(+), 50 deletions(-) diff --git a/api/types.go b/api/types.go index 87844c67..91c97c71 100644 --- a/api/types.go +++ b/api/types.go @@ -221,6 +221,8 @@ type DeleteRequest struct { type ShowRequest struct { Model string `json:"model"` System string `json:"system"` + + // Template is deprecated Template string `json:"template"` Verbose bool `json:"verbose"` diff --git a/cmd/cmd.go b/cmd/cmd.go index c898c7db..2252a905 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -843,7 +843,6 @@ type runOptions struct { WordWrap bool Format string System string - Template string Images []api.ImageData Options map[string]interface{} MultiModal bool @@ -1037,7 +1036,6 @@ func generate(cmd *cobra.Command, opts runOptions) error { Images: opts.Images, Format: opts.Format, System: opts.System, - Template: opts.Template, Options: opts.Options, KeepAlive: opts.KeepAlive, } diff --git a/cmd/interactive.go b/cmd/interactive.go index 9214f2db..adbc3e9f 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -27,7 +27,6 @@ const ( MultilineNone MultilineState = iota MultilinePrompt MultilineSystem - MultilineTemplate ) func loadModel(cmd *cobra.Command, opts *runOptions) error { @@ -94,7 +93,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { fmt.Fprintln(os.Stderr, "Available Commands:") fmt.Fprintln(os.Stderr, " /set parameter ... 
Set a parameter") fmt.Fprintln(os.Stderr, " /set system Set system message") - fmt.Fprintln(os.Stderr, " /set template Set prompt template") fmt.Fprintln(os.Stderr, " /set history Enable history") fmt.Fprintln(os.Stderr, " /set nohistory Disable history") fmt.Fprintln(os.Stderr, " /set wordwrap Enable wordwrap") @@ -204,10 +202,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { opts.Messages = append(opts.Messages, api.Message{Role: "system", Content: opts.System}) fmt.Println("Set system message.") sb.Reset() - case MultilineTemplate: - opts.Template = sb.String() - fmt.Println("Set prompt template.") - sb.Reset() } multiline = MultilineNone @@ -326,17 +320,13 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { } fmt.Printf("Set parameter '%s' to '%s'\n", args[2], strings.Join(params, ", ")) opts.Options[args[2]] = fp[args[2]] - case "system", "template": + case "system": if len(args) < 3 { usageSet() continue } - if args[1] == "system" { - multiline = MultilineSystem - } else if args[1] == "template" { - multiline = MultilineTemplate - } + multiline = MultilineSystem line := strings.Join(args[2:], " ") line, ok := strings.CutPrefix(line, `"""`) @@ -356,23 +346,17 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { continue } - if args[1] == "system" { - opts.System = sb.String() // for display in modelfile - newMessage := api.Message{Role: "system", Content: sb.String()} - // Check if the slice is not empty and the last message is from 'system' - if len(opts.Messages) > 0 && opts.Messages[len(opts.Messages)-1].Role == "system" { - // Replace the last message - opts.Messages[len(opts.Messages)-1] = newMessage - } else { - opts.Messages = append(opts.Messages, newMessage) - } - fmt.Println("Set system message.") - sb.Reset() - } else if args[1] == "template" { - opts.Template = sb.String() - fmt.Println("Set prompt template.") - sb.Reset() + opts.System = sb.String() // for display in modelfile + newMessage := api.Message{Role: "system", Content: sb.String()} + // Check if the slice is not empty and the last message is from 'system' + if len(opts.Messages) > 0 && opts.Messages[len(opts.Messages)-1].Role == "system" { + // Replace the last message + opts.Messages[len(opts.Messages)-1] = newMessage + } else { + opts.Messages = append(opts.Messages, newMessage) } + fmt.Println("Set system message.") + sb.Reset() sb.Reset() continue @@ -393,7 +377,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { req := &api.ShowRequest{ Name: opts.Model, System: opts.System, - Template: opts.Template, Options: opts.Options, } resp, err := client.Show(cmd.Context(), req) @@ -437,12 +420,9 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { fmt.Println("No system message was specified for this model.") } case "template": - switch { - case opts.Template != "": - fmt.Println(opts.Template + "\n") - case resp.Template != "": + if resp.Template != "" { fmt.Println(resp.Template) - default: + } else { fmt.Println("No prompt template was specified for this model.") } default: @@ -536,10 +516,6 @@ func buildModelfile(opts runOptions) string { fmt.Fprintf(&mf, "SYSTEM \"\"\"%s\"\"\"\n", opts.System) } - if opts.Template != "" { - fmt.Fprintf(&mf, "TEMPLATE \"\"\"%s\"\"\"\n", opts.Template) - } - keys := make([]string, 0) for k := range opts.Options { keys = append(keys, k) diff --git a/cmd/interactive_test.go b/cmd/interactive_test.go index d9af01eb..711f3860 100644 --- a/cmd/interactive_test.go +++ 
b/cmd/interactive_test.go @@ -59,7 +59,6 @@ func TestModelfileBuilder(t *testing.T) { opts := runOptions{ Model: "hork", System: "You are part horse and part shark, but all hork. Do horklike things", - Template: "This is a template.", Messages: []api.Message{ {Role: "user", Content: "Hey there hork!"}, {Role: "assistant", Content: "Yes it is true, I am half horse, half shark."}, @@ -75,7 +74,6 @@ func TestModelfileBuilder(t *testing.T) { mf := buildModelfile(opts) expectedModelfile := `FROM {{.Model}} SYSTEM """{{.System}}""" -TEMPLATE """{{.Template}}""" PARAMETER penalize_newline false PARAMETER seed 42 PARAMETER stop [hi there] @@ -97,7 +95,6 @@ MESSAGE assistant """Yes it is true, I am half horse, half shark.""" mf = buildModelfile(opts) expectedModelfile = `FROM {{.ParentModel}} SYSTEM """{{.System}}""" -TEMPLATE """{{.Template}}""" PARAMETER penalize_newline false PARAMETER seed 42 PARAMETER stop [hi there] diff --git a/server/routes.go b/server/routes.go index edaec691..0a00d9e2 100644 --- a/server/routes.go +++ b/server/routes.go @@ -574,13 +574,6 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) { m.System = req.System } - if req.Template != "" { - m.Template, err = template.Parse(req.Template) - if err != nil { - return nil, err - } - } - msgs := make([]api.Message, len(m.Messages)) for i, msg := range m.Messages { msgs[i] = api.Message{Role: msg.Role, Content: msg.Content} From e9f7f3602961d2b0beaff27144ec89301c2173ca Mon Sep 17 00:00:00 2001 From: royjhan <65097070+royjhan@users.noreply.github.com> Date: Sat, 13 Jul 2024 22:07:45 -0700 Subject: [PATCH 48/48] Support image input for OpenAI chat compatibility (#5208) * OpenAI v1 models * Refactor Writers * Add Test Co-Authored-By: Attila Kerekes * Credit Co-Author Co-Authored-By: Attila Kerekes <439392+keriati@users.noreply.github.com> * Empty List Testing * Use Namespace for Ownedby * Update Test * Add back envconfig * v1/models docs * Use ModelName Parser * Test Names * Remove Docs * Clean Up * Test name Co-authored-by: Jeffrey Morgan * Add Middleware for Chat and List * Testing Cleanup * Test with Fatal * Add functionality to chat test * Support image input for OpenAI chat * Decoding * Fix message processing logic * openai vision test * type errors * clean up * redundant check * merge conflicts * merge conflicts * merge conflicts * flattening and smaller image * add test * support python and js SDKs and mandate prefixing * clean up --------- Co-authored-by: Attila Kerekes <439392+keriati@users.noreply.github.com> Co-authored-by: Jeffrey Morgan --- openai/openai.go | 76 +++++++++++++++++++++++++++++++++++++++---- openai/openai_test.go | 49 ++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 6 deletions(-) diff --git a/openai/openai.go b/openai/openai.go index 1707da14..b289d73e 100644 --- a/openai/openai.go +++ b/openai/openai.go @@ -3,11 +3,13 @@ package openai import ( "bytes" + "encoding/base64" "encoding/json" "fmt" "io" "math/rand" "net/http" + "strings" "time" "github.com/gin-gonic/gin" @@ -28,7 +30,7 @@ type ErrorResponse struct { type Message struct { Role string `json:"role"` - Content string `json:"content"` + Content any `json:"content"` } type Choice struct { @@ -269,10 +271,66 @@ func toModel(r api.ShowResponse, m string) Model { } } -func fromChatRequest(r ChatCompletionRequest) api.ChatRequest { +func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) { var messages []api.Message for _, msg := range r.Messages { - messages = append(messages, api.Message{Role: 
msg.Role, Content: msg.Content}) + switch content := msg.Content.(type) { + case string: + messages = append(messages, api.Message{Role: msg.Role, Content: content}) + case []any: + message := api.Message{Role: msg.Role} + for _, c := range content { + data, ok := c.(map[string]any) + if !ok { + return nil, fmt.Errorf("invalid message format") + } + switch data["type"] { + case "text": + text, ok := data["text"].(string) + if !ok { + return nil, fmt.Errorf("invalid message format") + } + message.Content = text + case "image_url": + var url string + if urlMap, ok := data["image_url"].(map[string]any); ok { + if url, ok = urlMap["url"].(string); !ok { + return nil, fmt.Errorf("invalid message format") + } + } else { + if url, ok = data["image_url"].(string); !ok { + return nil, fmt.Errorf("invalid message format") + } + } + + types := []string{"jpeg", "jpg", "png"} + valid := false + for _, t := range types { + prefix := "data:image/" + t + ";base64," + if strings.HasPrefix(url, prefix) { + url = strings.TrimPrefix(url, prefix) + valid = true + break + } + } + + if !valid { + return nil, fmt.Errorf("invalid image input") + } + + img, err := base64.StdEncoding.DecodeString(url) + if err != nil { + return nil, fmt.Errorf("invalid message format") + } + message.Images = append(message.Images, img) + default: + return nil, fmt.Errorf("invalid message format") + } + } + messages = append(messages, message) + default: + return nil, fmt.Errorf("invalid message content type: %T", content) + } } options := make(map[string]interface{}) @@ -323,13 +381,13 @@ func fromChatRequest(r ChatCompletionRequest) api.ChatRequest { format = "json" } - return api.ChatRequest{ + return &api.ChatRequest{ Model: r.Model, Messages: messages, Format: format, Options: options, Stream: &r.Stream, - } + }, nil } func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) { @@ -656,7 +714,13 @@ func ChatMiddleware() gin.HandlerFunc { } var b bytes.Buffer - if err := json.NewEncoder(&b).Encode(fromChatRequest(req)); err != nil { + + chatReq, err := fromChatRequest(req) + if err != nil { + c.AbortWithStatusJSON(http.StatusBadRequest, NewError(http.StatusBadRequest, err.Error())) + } + + if err := json.NewEncoder(&b).Encode(chatReq); err != nil { c.AbortWithStatusJSON(http.StatusInternalServerError, NewError(http.StatusInternalServerError, err.Error())) return } diff --git a/openai/openai_test.go b/openai/openai_test.go index 5f1ae52e..99f8baaf 100644 --- a/openai/openai_test.go +++ b/openai/openai_test.go @@ -2,6 +2,7 @@ package openai import ( "bytes" + "encoding/base64" "encoding/json" "io" "net/http" @@ -15,6 +16,10 @@ import ( "github.com/stretchr/testify/assert" ) +const prefix = `data:image/jpeg;base64,` +const image = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=` +const imageURL = prefix + image + func TestMiddlewareRequests(t *testing.T) { type testCase struct { Name string @@ -112,6 +117,50 @@ func TestMiddlewareRequests(t *testing.T) { } }, }, + { + Name: "chat handler with image content", + Method: http.MethodPost, + Path: "/api/chat", + Handler: ChatMiddleware, + Setup: func(t *testing.T, req *http.Request) { + body := ChatCompletionRequest{ + Model: "test-model", + Messages: []Message{ + { + Role: "user", Content: []map[string]any{ + {"type": "text", "text": "Hello"}, + {"type": "image_url", "image_url": map[string]string{"url": imageURL}}, + }, + }, + }, + } + + bodyBytes, _ := json.Marshal(body) + + req.Body = 
io.NopCloser(bytes.NewReader(bodyBytes)) + req.Header.Set("Content-Type", "application/json") + }, + Expected: func(t *testing.T, req *http.Request) { + var chatReq api.ChatRequest + if err := json.NewDecoder(req.Body).Decode(&chatReq); err != nil { + t.Fatal(err) + } + + if chatReq.Messages[0].Role != "user" { + t.Fatalf("expected 'user', got %s", chatReq.Messages[0].Role) + } + + if chatReq.Messages[0].Content != "Hello" { + t.Fatalf("expected 'Hello', got %s", chatReq.Messages[0].Content) + } + + img, _ := base64.StdEncoding.DecodeString(imageURL[len(prefix):]) + + if !bytes.Equal(chatReq.Messages[0].Images[0], img) { + t.Fatalf("expected image encoding, got %s", chatReq.Messages[0].Images[0]) + } + }, + }, } gin.SetMode(gin.TestMode)
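For reference, a minimal client-side sketch of the new image input path, mirroring the structured content shape that fromChatRequest decodes. The endpoint path and port follow Ollama's OpenAI compatibility layer; the model name and image filename are placeholders for illustration, not part of the patch.

package main

import (
	"bytes"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"net/http"
	"os"
)

func main() {
	// Read and base64-encode a local image (placeholder filename).
	img, err := os.ReadFile("photo.png")
	if err != nil {
		panic(err)
	}

	// fromChatRequest mandates a data:image/{jpeg,jpg,png};base64, prefix;
	// bare base64 or other media types are rejected as "invalid image input".
	url := "data:image/png;base64," + base64.StdEncoding.EncodeToString(img)

	// One text part plus one image_url part: the structured content form
	// decoded by the middleware above.
	body, err := json.Marshal(map[string]any{
		"model": "llava", // placeholder multimodal model
		"messages": []map[string]any{{
			"role": "user",
			"content": []map[string]any{
				{"type": "text", "text": "What is in this image?"},
				{"type": "image_url", "image_url": map[string]any{"url": url}},
			},
		}},
	})
	if err != nil {
		panic(err)
	}

	resp, err := http.Post("http://localhost:11434/v1/chat/completions", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}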