From 93ac3760cb4abdd0f54ad755ed15ec4254026282 Mon Sep 17 00:00:00 2001
From: Jesse Gross
Date: Wed, 11 Sep 2024 14:00:20 -0700
Subject: [PATCH] runner: Flush pending responses before returning

If there are any pending responses (such as from potential stop
tokens) then we should send them back before ending the sequence.
Otherwise, we can be missing tokens at the end of a response.

Fixes #6707
---
 llm/ext_server/server.cpp | 60 +++++++++++++++++++++------------------
 1 file changed, 33 insertions(+), 27 deletions(-)

diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index fc673c47..6ce457ae 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -913,7 +913,9 @@ struct llama_server_context
         slot.sampled = result.tok;
 
         // search stop word and delete it
-        slot.generated_text += token_str;
+        if (!llama_token_is_eog(model, result.tok))
+            slot.generated_text += token_str;
+
         slot.has_next_token = true;
 
         if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
@@ -954,30 +956,36 @@ struct llama_server_context
         if (!incomplete)
         {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            const std::string str_test = slot.generated_text.substr(pos);
-            bool is_stop_full = false;
-            size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-            if (stop_pos != std::string::npos)
-            {
-                is_stop_full = true;
-                slot.generated_text.erase(
-                    slot.generated_text.begin() + pos + stop_pos,
-                    slot.generated_text.end());
-                pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            }
-            else
-            {
-                is_stop_full = false;
-                stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
-            }
-            // check if there is any token to predict
-            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
-            {
-                // no send the stop word in the response
-                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-                slot.n_sent_text += result.text_to_send.size();
-                // add the token to slot queue and cache
+            if (!llama_token_is_eog(model, result.tok)) {
+                const std::string str_test = slot.generated_text.substr(pos);
+                bool is_stop_full = false;
+                size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
+                if (stop_pos != std::string::npos)
+                {
+                    is_stop_full = true;
+                    slot.generated_text.erase(
+                        slot.generated_text.begin() + pos + stop_pos,
+                        slot.generated_text.end());
+                    pos = std::min(slot.n_sent_text, slot.generated_text.size());
+                }
+                else
+                {
+                    is_stop_full = false;
+                    stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+                }
+
+                // check if there is any token to predict
+                if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
+                {
+                    // no send the stop word in the response
+                    result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                    slot.n_sent_text += result.text_to_send.size();
+                    // add the token to slot queue and cache
+                }
+            } else {
+                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                slot.n_sent_text += result.text_to_send.size();
             }
 
             if (slot.params.stream)
             {
@@ -1117,9 +1125,7 @@ struct llama_server_context
             {"multimodal", multimodal}
         };
 
-        if (!llama_token_is_eog(model, tkn.tok)) {
-            res.result_json["content"] = tkn.text_to_send;
-        }
+        res.result_json["content"] = tkn.text_to_send;
 
         if (slot.sparams.n_probs > 0)
         {