Merge pull request #7657 from ollama/mxyng/sync

fix(mllama): sync backend between batches
2024-11-14 09:40:04 -08:00 · 2024-11-14 09:40:04 -08:00 · 549c2bdfcf
commit 549c2bdfcf
parent 67691e410d 5b3393b6a2
2 changed files with 11 additions and 0 deletions
--- a/llama/llama.go
+++ b/llama/llama.go
@ -600,6 +600,10 @@ func (c *Context) SetCrossAttention(state bool) {
 	C.llama_set_cross_attention(c.c, C.bool(state))
 }
 // Synchronize calls llama_synchronize on the underlying C llama context.
 //
 // NOTE(review): upstream llama.h describes llama_synchronize as waiting until
 // all computations queued on the context have finished; confirm against the
 // vendored header. It takes no arguments and returns nothing, so any failure
 // inside the C call is not reported to the Go caller.
 func (c *Context) Synchronize() {
 	C.llama_synchronize(c.c)
 }
 // sampling
 // TODO: this is a temporary wrapper to allow calling C++ code from CGo
 type SamplingContext struct {
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@ -409,6 +409,13 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		return
 	}
 	if crossAttention {
 		// synchronize state to ensure the cross attention batch is complete.
 		// needed specifically for multi-GPU systems otherwise an inflight
 		// task may be incorrectly invalidated causing a crash
 		s.lc.Synchronize()
 	}
 	for i, seq := range s.seqs {
 		if seq == nil {
 			continue