From c4b34f2a2af5ce3fe7b05ae2d3334e155029ce6b Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Wed, 20 Nov 2024 10:39:56 -0800
Subject: [PATCH] runner.go: Truncate inputs that exceed context rather than shifting

Previous versions of the runner would truncate inputs to the context
window before beginning processing. The main processing loop relied
on this behavior if the context needed to be shifted later (due to
token generation). If truncation did not occur then invariants
would be broken, causing crashes or infinite loops.

Later versions attempted to fix these bugs and make the logic less
subtle so that all inputs could be handled. Truncation was removed
to make things consistent.

However, truncation is much faster than processing and shifting, so
removing it caused performance problems when the input vastly exceeded
the context size. This restores the input truncation as a performance
optimization while keeping the more robust processing logic.

Fixes #7762
---
 llama/runner/runner.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index 1ed25c27..c7662b33 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -122,7 +122,10 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
 	params.numKeep = min(params.numKeep, s.cache.numCtx-1)
 
 	if len(inputs) > s.cache.numCtx {
-		slog.Warn("input exceeds context length", "prompt", len(inputs), "limit", s.cache.numCtx)
+		slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "numKeep", params.numKeep)
+		newInputs := inputs[:params.numKeep]
+		newInputs = append(newInputs, inputs[len(inputs)-s.cache.numCtx+params.numKeep:]...)
+		inputs = newInputs
 	}
 
 	var sc *llama.SamplingContext