Merge pull request #6260 from ollama/mxyng/mem

llama3.1 memory
This commit is contained in:
Michael Yang 2024-09-05 13:22:08 -07:00 committed by GitHub
commit bf612cd608
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@@ -360,11 +360,13 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
switch llm.KV().Architecture() {
case "llama":
fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
fullOffload = max(
4*batch*(1+4*embedding+context*(1+heads)),
4*batch*(embedding+vocab),
)
partialOffload = 4 * batch * embedding
partialOffload += max(
// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
4*batch*(embedding+vocab)+embedding*vocab*105/128,
)