package server

import (
	"bytes"
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"log/slog"
	"strings"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/llm"
	"github.com/ollama/ollama/server/imageproc"
	"github.com/ollama/ollama/template"
)
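
// tokenizeFunc converts a rendered prompt into model token IDs so a candidate
// prompt's length can be checked against the context window before it is used.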
type tokenizeFunc func(context.Context, string) ([]int, error)

var errTooManyImages = errors.New("vision model only supports a single image per message")
// chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn.
// chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
// latest message and 2) system messages.
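//
// Each image in the included messages is referenced in the prompt by an
// [img-<id>] tag and returned as llm.ImageData so the caller can hand both
// pieces to the runner together.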
func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) {
	var system []api.Message

	isMllama := checkMllamaModelFamily(m)

	var imageNumTokens int
	// TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
	if isMllama {
		// Our mllama implementation packs all of the embeddings into a single token
		imageNumTokens = 1
	} else {
		// CLIP images are represented as 768 tokens, each an embedding
		imageNumTokens = 768
	}
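
	// n tracks the oldest message that still fits; system messages that fall
	// outside the window are re-collected on every pass so they are never lost.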
	n := len(msgs) - 1
	// in reverse, find all messages that fit into context window
	for i := n; i >= 0; i-- {
		if isMllama && len(msgs[i].Images) > 1 {
			return "", nil, errTooManyImages
		}

		// always include the last message
		if i == n {
			continue
		}

		system = make([]api.Message, 0)
		for j := range i {
			if msgs[j].Role == "system" {
				system = append(system, msgs[j])
			}
		}
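
		// Render this candidate window through the model's template and tokenize
		// the result to get its true size, including any template overhead.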
		var b bytes.Buffer
		if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools}); err != nil {
			return "", nil, err
		}

		s, err := tokenize(ctx, b.String())
		if err != nil {
			return "", nil, err
		}

		ctxLen := len(s)
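
		// Image embeddings never appear in the templated text, so their estimated
		// token cost is added separately when the model has a projector.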
		if m.ProjectorPaths != nil {
			for _, m := range msgs[i:] {
				ctxLen += imageNumTokens * len(m.Images)
			}
		}

		if ctxLen > opts.NumCtx {
			slog.Debug("truncating input messages which exceed context length", "truncated", len(msgs[i:]))
			break
		} else {
			n = i
		}
	}
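
	// Second pass over the surviving messages: give every image a stable
	// [img-<id>] tag in the text and a matching entry in the images slice.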
	currMsgIdx := n

	for cnt, msg := range msgs[currMsgIdx:] {
		prefix := ""
		imgPrompt := ""
		prompt := msg.Content
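
		// mllama images are preprocessed and serialized (little-endian) along with
		// an aspect-ratio ID; other projectors receive the raw image bytes as-is.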
		for _, i := range msg.Images {
			var imgData llm.ImageData
			if isMllama {
				data, aspectRatioID, err := imageproc.Preprocess(i)
				if err != nil {
					return "", nil, err
				}

				buf := new(bytes.Buffer)
				err = binary.Write(buf, binary.LittleEndian, data)
				if err != nil {
					return "", nil, err
				}

				imgData = llm.ImageData{
					ID:            len(images),
					Data:          buf.Bytes(),
					AspectRatioID: aspectRatioID,
				}
				imgPrompt = "<|image|>"
			} else {
				imgData = llm.ImageData{
					ID:   len(images),
					Data: i,
				}
				imgPrompt = " "
			}
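
			// Place the image tag: substitute an explicit [img] placeholder if the
			// message has one, otherwise prepend the tag ahead of the text.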
			imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
			if !strings.Contains(prompt, "[img]") {
				prefix += imgTag
			} else {
				prompt = strings.Replace(prompt, "[img]", imgTag, 1)
			}

			images = append(images, imgData)
		}

		msgs[currMsgIdx+cnt].Content = strings.TrimSpace(prefix + imgPrompt + prompt)
	}

	// render the final prompt from the remaining messages, all of which fit within the context window
	var b bytes.Buffer
	if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[currMsgIdx:]...), Tools: tools}); err != nil {
		return "", nil, err
	}

	return b.String(), images, nil
}
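
// checkMllamaModelFamily reports whether the model's configuration lists
// "mllama" among its model families.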
func checkMllamaModelFamily(m *Model) bool {
	for _, arch := range m.Config.ModelFamilies {
		if arch == "mllama" {
			return true
		}
	}

	return false
}