runner: Flush pending responses before returning

If there are any pending responses (such as from potential stop
tokens) then we should send them back before ending the sequence.
Otherwise, we can be missing tokens at the end of a response.

Fixes #6707
This commit is contained in:
Jesse Gross 2024-09-11 14:00:20 -07:00
parent abed273de3
commit 93ac3760cb

View file

@ -913,7 +913,9 @@ struct llama_server_context
slot.sampled = result.tok; slot.sampled = result.tok;
// search stop word and delete it // search stop word and delete it
if (!llama_token_is_eog(model, result.tok))
slot.generated_text += token_str; slot.generated_text += token_str;
slot.has_next_token = true; slot.has_next_token = true;
if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
@ -954,6 +956,8 @@ struct llama_server_context
if (!incomplete) if (!incomplete)
{ {
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
if (!llama_token_is_eog(model, result.tok)) {
const std::string str_test = slot.generated_text.substr(pos); const std::string str_test = slot.generated_text.substr(pos);
bool is_stop_full = false; bool is_stop_full = false;
size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot); size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
@ -979,6 +983,10 @@ struct llama_server_context
slot.n_sent_text += result.text_to_send.size(); slot.n_sent_text += result.text_to_send.size();
// add the token to slot queue and cache // add the token to slot queue and cache
} }
} else {
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
slot.n_sent_text += result.text_to_send.size();
}
if (slot.params.stream) if (slot.params.stream)
{ {
@ -1117,9 +1125,7 @@ struct llama_server_context
{"multimodal", multimodal} {"multimodal", multimodal}
}; };
if (!llama_token_is_eog(model, tkn.tok)) {
res.result_json["content"] = tkn.text_to_send; res.result_json["content"] = tkn.text_to_send;
}
if (slot.sparams.n_probs > 0) if (slot.sparams.n_probs > 0)
{ {