runner: Flush pending responses before returning
If there are any pending responses (such as from potential stop tokens), then we should send them back before ending the sequence. Otherwise, we can be missing tokens at the end of a response.

Fixes #6707
parent abed273de3
commit 93ac3760cb
1 changed file with 33 additions and 27 deletions
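
For context, here is the behavior being fixed as a minimal, self-contained sketch. The `Slot` struct, `next_chunk` function, and `is_eog` flag are hypothetical stand-ins for the server's slot state, the token-processing path in `llama_server_context`, and `llama_token_is_eog`; this illustrates the flush logic, it is not the server's actual code:

    #include <algorithm>
    #include <cstddef>
    #include <string>

    // Hypothetical, simplified stand-in for the server's per-request slot state.
    struct Slot {
        std::string generated_text;       // all text sampled so far
        std::size_t n_sent_text = 0;      // how much of it has already been streamed
    };

    // Sketch of the fixed control flow. While generating, the stop-string
    // search may hold text back, so generated_text can run ahead of what
    // was actually sent to the client.
    std::string next_chunk(Slot &slot, const std::string &token_str, bool is_eog) {
        std::string text_to_send;
        if (!is_eog) {
            slot.generated_text += token_str;
            // ... stop-string search decides how much of the text past
            // n_sent_text is safe to send, possibly holding some back ...
        } else {
            // The fix: on the end-of-generation token, flush whatever is
            // still pending instead of ending the sequence without it.
            std::size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
            text_to_send = slot.generated_text.substr(pos, std::string::npos);
        }
        slot.n_sent_text += text_to_send.size();
        return text_to_send;
    }

Before this commit there was no flush on end of generation: text still held back by the stop-string search was never sent, which is how tokens went missing at the end of a response.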
llm/ext_server/server.cpp (vendored)

@@ -913,7 +913,9 @@ struct llama_server_context
         slot.sampled = result.tok;
 
         // search stop word and delete it
-        slot.generated_text += token_str;
+        if (!llama_token_is_eog(model, result.tok)) {
+            slot.generated_text += token_str;
+        }
         slot.has_next_token = true;
 
         if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)

@@ -954,6 +956,8 @@ struct llama_server_context
         if (!incomplete)
         {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
+
+            if (!llama_token_is_eog(model, result.tok)) {
             const std::string str_test = slot.generated_text.substr(pos);
             bool is_stop_full = false;
             size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);

@@ -979,6 +983,10 @@ struct llama_server_context
                 slot.n_sent_text += result.text_to_send.size();
                 // add the token to slot queue and cache
             }
+            } else {
+                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                slot.n_sent_text += result.text_to_send.size();
+            }
 
         if (slot.params.stream)
         {

@@ -1117,9 +1125,7 @@ struct llama_server_context
             {"multimodal", multimodal}
         };
 
-        if (!llama_token_is_eog(model, tkn.tok)) {
         res.result_json["content"] = tkn.text_to_send;
-        }
 
         if (slot.sparams.n_probs > 0)
         {
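
Taken together: the first three hunks route end-of-generation tokens around the stop-string search and instead flush the unsent tail of generated_text into result.text_to_send, which is what lets the last hunk drop the old guard in the send path. Roughly, paraphrasing the before/after of that last hunk:

    // Before: content was suppressed for the end-of-generation token, so any
    // text still buffered by the stop-string search was silently dropped.
    if (!llama_token_is_eog(model, tkn.tok)) {
        res.result_json["content"] = tkn.text_to_send;
    }

    // After: content is always attached; on EOG, text_to_send now carries
    // the flushed remainder of the response.
    res.result_json["content"] = tkn.text_to_send;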