diff --git a/llm/patches/01-cache.diff b/llm/patches/01-cache.diff
index 9565d60b..9ccea86f 100644
--- a/llm/patches/01-cache.diff
+++ b/llm/patches/01-cache.diff
@@ -1,30 +1,21 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index d86d7e04..7d71c766 100644
+index d86d7e04..2694e92e 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -1598,12 +1598,6 @@ struct llama_server_context
-                     LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
-                 }
+@@ -901,13 +901,15 @@ struct llama_server_context
+                 slot.sent_count += result.text_to_send.size();
+                 // add the token to slot queue and cache
+             }
+-            slot.add_token_string(result);
++
+             if (slot.params.stream)
+             {
+                 send_partial_response(slot, result);
+             }
+         }
  
--                LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
--
--                llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
--
--                slot.cache_tokens = prompt_tokens;
--
-                 if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
-                 {
-                     // we have to evaluate at least 1 token to generate logits.
-@@ -1615,6 +1609,12 @@ struct llama_server_context
-                     }
-                 }
- 
-+                LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
++        slot.add_token_string(result);
 +
-+                llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
-+
-+                slot.cache_tokens = prompt_tokens;
-+
-                 LOG_VERBOSE("prompt ingested", {
-                     {"n_past", slot.n_past},
-                     {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
+         if (incomplete)
+         {
+             slot.has_next_token = true;