diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 61ef20dc..52ac653f 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -1032,7 +1032,7 @@ struct llama_server_context
             slot.has_next_token = false;
         }
 
-        if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
+        if (llama_token_is_eog(model, result.tok))
         {
             slot.stopped_eos = true;
             slot.has_next_token = false;
@@ -1144,12 +1144,15 @@ struct llama_server_context
 
         res.result_json = json
         {
-            {"content", tkn.text_to_send},
             {"stop", false},
             {"slot_id", slot.id},
             {"multimodal", multimodal}
         };
 
+        if (!llama_token_is_eog(model, tkn.tok)) {
+            res.result_json["content"] = tkn.text_to_send;
+        }
+
         if (slot.sparams.n_probs > 0)
         {
             std::vector probs_output = {};
diff --git a/llm/llama.cpp b/llm/llama.cpp
index f4ab2a41..f364eb6f 160000
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@@ -1 +1 @@
-Subproject commit f4ab2a41476600a98067a9474ea8f9e6db41bcfa
+Subproject commit f364eb6fb5d46118a76fa045f487318de4c24961