runner: Flush pending responses before returning

If there are any pending responses (such as from potential stop
tokens), then we should send them back before ending the sequence.
Otherwise, tokens can be missing from the end of a response.

Fixes #6707
Jesse Gross 2024-09-11 14:00:20 -07:00
parent abed273de3
commit 93ac3760cb

@@ -913,7 +913,9 @@ struct llama_server_context
         slot.sampled = result.tok;
 
         // search stop word and delete it
-        slot.generated_text += token_str;
+        if (!llama_token_is_eog(model, result.tok))
+            slot.generated_text += token_str;
+
         slot.has_next_token = true;
 
         if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
@@ -954,30 +956,36 @@ struct llama_server_context
         if (!incomplete)
         {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            const std::string str_test = slot.generated_text.substr(pos);
-            bool is_stop_full = false;
-            size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-            if (stop_pos != std::string::npos)
-            {
-                is_stop_full = true;
-                slot.generated_text.erase(
-                    slot.generated_text.begin() + pos + stop_pos,
-                    slot.generated_text.end());
-                pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            }
-            else
-            {
-                is_stop_full = false;
-                stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
-            }
-            // check if there is any token to predict
-            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
-            {
-                // no send the stop word in the response
-                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-                slot.n_sent_text += result.text_to_send.size();
-                // add the token to slot queue and cache
-            }
+
+            if (!llama_token_is_eog(model, result.tok)) {
+                const std::string str_test = slot.generated_text.substr(pos);
+                bool is_stop_full = false;
+                size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
+                if (stop_pos != std::string::npos)
+                {
+                    is_stop_full = true;
+                    slot.generated_text.erase(
+                        slot.generated_text.begin() + pos + stop_pos,
+                        slot.generated_text.end());
+                    pos = std::min(slot.n_sent_text, slot.generated_text.size());
+                }
+                else
+                {
+                    is_stop_full = false;
+                    stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+                }
+                // check if there is any token to predict
+                if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
+                {
+                    // no send the stop word in the response
+                    result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                    slot.n_sent_text += result.text_to_send.size();
+                    // add the token to slot queue and cache
+                }
+            } else {
+                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                slot.n_sent_text += result.text_to_send.size();
+            }
         }
 
         if (slot.params.stream)
@@ -1117,9 +1125,7 @@ struct llama_server_context
             {"multimodal", multimodal}
         };
 
-        if (!llama_token_is_eog(model, tkn.tok)) {
-            res.result_json["content"] = tkn.text_to_send;
-        }
+        res.result_json["content"] = tkn.text_to_send;
 
         if (slot.sparams.n_probs > 0)
        {
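
For background on what this fixes: the server holds back any tail of the generated text that could still turn out to be the start of a stop string, and only sends it once the match is ruled out. Before this change, whatever was still held back when the end-of-generation token arrived was simply dropped. Below is a minimal, self-contained sketch of that hold-back-and-flush idea, not the actual server code; the helper held_back_len, the stop list, and the token stream in main are all made up for illustration.

// Sketch only: buffer text while it might be a stop-string prefix,
// and flush whatever is still buffered when generation ends.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Length of the longest suffix of `text` that is a prefix of any stop
// string, i.e. the amount of text we must hold back for now.
static size_t held_back_len(const std::string &text,
                            const std::vector<std::string> &stops) {
    size_t held = 0;
    for (const auto &stop : stops) {
        for (size_t n = std::min(stop.size() - 1, text.size()); n > 0; n--) {
            if (text.compare(text.size() - n, n, stop, 0, n) == 0) {
                held = std::max(held, n);
                break;
            }
        }
    }
    return held;
}

int main() {
    const std::vector<std::string> stops = {"<|end|>"};        // hypothetical stop string
    const std::vector<std::string> tokens = {"Hello", " wor", "ld", "<"};
    std::string buffered;                                       // generated but not yet sent

    for (const auto &tok : tokens) {
        buffered += tok;
        size_t hold = held_back_len(buffered, stops);
        // Send everything except the part that might still become a stop string.
        std::cout << buffered.substr(0, buffered.size() - hold);
        buffered.erase(0, buffered.size() - hold);
    }

    // End of generation (EOG token seen). Without the fix, the trailing "<"
    // held back above would be lost; flushing it here mirrors the new behavior.
    std::cout << buffered << std::endl;
    return 0;
}

The sketch prints "Hello world<": the final "<" is exactly the kind of pending text that used to go missing at the end of a response.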