From c4b34f2a2af5ce3fe7b05ae2d3334e155029ce6b Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Wed, 20 Nov 2024 10:39:56 -0800
Subject: [PATCH] runner.go: Truncate inputs that exceed context rather than
 shifting

Previous versions of the runner would truncate inputs to the context
window before beginning processing. The main processing loop relied
on this behavior if the context needed to be shifted later (due to
token generation). If truncation did not occur then invariants
would be broken, causing crashes or infinite loops.

Later versions attempted to fix these bugs and make the logic less
subtle so that all inputs could be handled. Truncation was removed
to make things consistent.

However, truncation is much faster than processing and shifting, so
removing it caused performance problems when the input vastly exceeded
the context size. This restores the input truncation as a performance
optimization while keeping the more robust processing logic.

Fixes #7762
---
 llama/runner/runner.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index 1ed25c27..c7662b33 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -122,7 +122,10 @@ func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequen
 	params.numKeep = min(params.numKeep, s.cache.numCtx-1)
 
 	if len(inputs) > s.cache.numCtx {
-		slog.Warn("input exceeds context length", "prompt", len(inputs), "limit", s.cache.numCtx)
+		slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "numKeep", params.numKeep)
+		newInputs := inputs[:params.numKeep]
+		newInputs = append(newInputs, inputs[len(inputs)-s.cache.numCtx+params.numKeep:]...)
+		inputs = newInputs
 	}
 
 	var sc *llama.SamplingContext