From 34a75102f7b128ba9675be544ac86dc6e2fc8392 Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Mon, 4 Nov 2024 17:30:20 -0800
Subject: [PATCH] prompt: Use a single token when estimating mllama context
 size

Currently we assume that images take 768 tokens of context size for
the purposes of clipping old messages that exceed the context window.
However, our mllama implementation stores the full image embedding in
a single token. As a result, there is significant waste of context
space.

Ideally, we would handle this more generically and have the
implementation report the number of tokens. However, at the moment
this would just result in a similar set of 'if' conditions in the
runner plus APIs to report it back. So for now, we just keep this
simple.
---
 server/prompt.go | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/server/prompt.go b/server/prompt.go
index f91b94d8..a6401983 100644
--- a/server/prompt.go
+++ b/server/prompt.go
@@ -27,6 +27,16 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 
 	isMllama := checkMllamaModelFamily(m)
 
+	var imageNumTokens int
+	// TODO: Ideally we would compute this from the projector metadata but some pieces are implementation dependent
+	if isMllama {
+		// Our mllama implementation packs all of the embeddings into a single token
+		imageNumTokens = 1
+	} else {
+		// Clip images are represented as 768 tokens, each an embedding
+		imageNumTokens = 768
+	}
+
 	n := len(msgs) - 1
 	// in reverse, find all messages that fit into context window
 	for i := n; i >= 0; i-- {
@@ -59,9 +69,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 		ctxLen := len(s)
 		if m.ProjectorPaths != nil {
 			for _, m := range msgs[i:] {
-				// images are represented as 768 sized embeddings
-				// TODO: get embedding length from project metadata
-				ctxLen += 768 * len(m.Images)
+				ctxLen += imageNumTokens * len(m.Images)
 			}
 		}
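
For illustration, here is a minimal, self-contained Go sketch of the
estimation logic this patch changes. The names `Message`,
`estimateImageTokens`, and `truncateToContext` are hypothetical and not
part of the server code, and the real `chatPrompt` derives text length
by tokenizing the rendered prompt rather than from a precomputed count;
this only shows how the per-model image token estimate feeds the
reverse-order clipping walk.

```go
package main

import "fmt"

// Message is a hypothetical stand-in for api.Message: a pre-tokenized
// text length plus the number of attached images.
type Message struct {
	TextTokens int
	NumImages  int
}

// estimateImageTokens mirrors the patch's per-family switch: mllama
// packs the whole image embedding into a single token, while CLIP-style
// projectors represent each image as 768 embedding tokens.
func estimateImageTokens(isMllama bool) int {
	if isMllama {
		return 1
	}
	return 768
}

// truncateToContext walks messages newest-to-oldest (as chatPrompt does)
// and returns the suffix of msgs whose estimated size fits in numCtx.
func truncateToContext(msgs []Message, numCtx int, isMllama bool) []Message {
	imageNumTokens := estimateImageTokens(isMllama)
	ctxLen := 0
	for i := len(msgs) - 1; i >= 0; i-- {
		need := msgs[i].TextTokens + imageNumTokens*msgs[i].NumImages
		if ctxLen+need > numCtx {
			// Older messages no longer fit; keep only the newer suffix.
			return msgs[i+1:]
		}
		ctxLen += need
	}
	return msgs
}

func main() {
	msgs := []Message{
		{TextTokens: 500, NumImages: 1},
		{TextTokens: 200, NumImages: 1},
	}
	// With the old blanket 768-token estimate, the two images alone are
	// counted as 1536 tokens and the older message is clipped; with
	// mllama's single-token representation, both messages fit.
	fmt.Println(len(truncateToContext(msgs, 2048, false))) // 1
	fmt.Println(len(truncateToContext(msgs, 2048, true)))  // 2
}
```

The design tradeoff is the one the commit message names: a per-family
constant in one place is less general than having each implementation
report its own image token count, but it avoids adding runner APIs just
to move the same 'if' conditions elsewhere.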