diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index fc673c47..6ce457ae 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -913,7 +913,9 @@ struct llama_server_context
         slot.sampled = result.tok;
 
         // search stop word and delete it
-        slot.generated_text += token_str;
+        if (!llama_token_is_eog(model, result.tok))
+            slot.generated_text += token_str;
+
         slot.has_next_token = true;
 
         if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
@@ -954,30 +956,36 @@ struct llama_server_context
 
         if (!incomplete)
         {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            const std::string str_test = slot.generated_text.substr(pos);
-            bool is_stop_full = false;
-            size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-            if (stop_pos != std::string::npos)
-            {
-                is_stop_full = true;
-                slot.generated_text.erase(
-                    slot.generated_text.begin() + pos + stop_pos,
-                    slot.generated_text.end());
-                pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            }
-            else
-            {
-                is_stop_full = false;
-                stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
-            }
-            // check if there is any token to predict
-            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
-            {
-                // no send the stop word in the response
-                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-                slot.n_sent_text += result.text_to_send.size();
-                // add the token to slot queue and cache
+            if (!llama_token_is_eog(model, result.tok)) {
+                const std::string str_test = slot.generated_text.substr(pos);
+                bool is_stop_full = false;
+                size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
+                if (stop_pos != std::string::npos)
+                {
+                    is_stop_full = true;
+                    slot.generated_text.erase(
+                        slot.generated_text.begin() + pos + stop_pos,
+                        slot.generated_text.end());
+                    pos = std::min(slot.n_sent_text, slot.generated_text.size());
+                }
+                else
+                {
+                    is_stop_full = false;
+                    stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+                }
+
+                // check if there is any token to predict
+                if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
+                {
+                    // no send the stop word in the response
+                    result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                    slot.n_sent_text += result.text_to_send.size();
+                    // add the token to slot queue and cache
+                }
+            } else {
+                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                slot.n_sent_text += result.text_to_send.size();
             }
 
             if (slot.params.stream)
@@ -1117,9 +1125,7 @@ struct llama_server_context
             {"multimodal", multimodal}
         };
 
-        if (!llama_token_is_eog(model, tkn.tok)) {
-            res.result_json["content"] = tkn.text_to_send;
-        }
+        res.result_json["content"] = tkn.text_to_send;
 
         if (slot.sparams.n_probs > 0)
        {
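Taken together, the three hunks make end-of-generation (EOG) tokens invisible to the stop-string machinery: the first hunk stops appending the EOG token's text to `slot.generated_text`; the second skips the `find_stopping_strings` scan for EOG tokens and instead flushes whatever text was held back while a partial stop word was pending; the third can then set `content` unconditionally, since `text_to_send` no longer carries EOG text.

Below is a minimal, self-contained sketch of the streaming behavior the patch implements. It is not ollama's code: `Slot`, `find_stopping_string`, `on_token`, and the stop words `"</s>"` / `"User:"` are simplified stand-ins for the server's slot state and its `find_stopping_strings` helper.

```cpp
// Standalone sketch (assumed names, not server.cpp's API) of streaming with
// partial stop-word hold-back, where EOG tokens bypass the stop-string scan.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

enum StopType { STOP_FULL, STOP_PARTIAL };

struct Slot {
    std::string generated_text;   // all text kept so far
    size_t      n_sent_text = 0;  // bytes already streamed to the client
    std::vector<std::string> stop_words = {"</s>", "User:"};
};

// Find a stop word in `text`. STOP_FULL looks for a complete match anywhere;
// STOP_PARTIAL looks for a stop-word prefix at the end of `text`, so those
// bytes can be held back until the next token resolves the ambiguity.
static size_t find_stopping_string(const std::string &text, StopType type, const Slot &slot) {
    for (const std::string &word : slot.stop_words) {
        if (type == STOP_FULL) {
            const size_t pos = text.find(word);
            if (pos != std::string::npos) {
                return pos;
            }
        } else {
            for (size_t len = word.size() - 1; len > 0; len--) {
                if (text.size() >= len &&
                    text.compare(text.size() - len, len, word, 0, len) == 0) {
                    return text.size() - len;
                }
            }
        }
    }
    return std::string::npos;
}

// Process one sampled token, mirroring the patched control flow: an EOG token's
// text is never appended to generated_text, so it can neither trigger nor hide
// a stop sequence; it only flushes whatever text was held back before it.
static std::string on_token(Slot &slot, const std::string &token_str, bool is_eog) {
    if (!is_eog) {
        slot.generated_text += token_str;
    }

    const size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
    std::string to_send;
    if (!is_eog) {
        const std::string pending = slot.generated_text.substr(pos);
        size_t stop_pos = find_stopping_string(pending, STOP_FULL, slot);
        if (stop_pos != std::string::npos) {
            // A full stop word arrived: trim it (and anything after) from the kept text.
            slot.generated_text.erase(slot.generated_text.begin() + pos + stop_pos,
                                      slot.generated_text.end());
        } else {
            stop_pos = find_stopping_string(pending, STOP_PARTIAL, slot);
        }
        if (stop_pos == std::string::npos) {
            to_send = slot.generated_text.substr(pos);  // nothing held back
            slot.n_sent_text += to_send.size();
        }
        // else: a (partial) stop word is pending; hold those bytes back for now
    } else {
        // EOG: flush the held-back remainder, without the EOG token's own text.
        to_send = slot.generated_text.substr(pos);
        slot.n_sent_text += to_send.size();
    }
    return to_send;
}

int main() {
    Slot slot;
    const std::vector<std::pair<std::string, bool>> tokens = {
        {"Hello", false}, {" wor", false}, {"ld", false}, {"</", false}, {"<eog>", true},
    };
    for (const auto &[tok, is_eog] : tokens) {
        std::cout << '[' << on_token(slot, tok, is_eog) << ']';
    }
    std::cout << '\n';  // prints: [Hello][ wor][ld][][</]
}
```

Compiled with `-std=c++17`, the empty `[]` for the fourth token shows `"</"` being withheld as a possible start of `"</s>"`, and the final `[</]` shows the patch's else branch flushing that remainder when EOG arrives instead of searching it for stop words.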