runner: Flush pending responses before returning
If there are any pending responses (such as text held back while checking for potential stop tokens), they should be sent back before the sequence ends. Otherwise, tokens can be missing from the end of a response. Fixes #6707
commit 93ac3760cb
parent abed273de3
1 changed file with 33 additions and 27 deletions
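In short, the change routes any text still buffered for a slot (for example, text withheld because it might be the start of a stop sequence) into the final response when an end-of-generation token arrives, instead of dropping it. A minimal standalone sketch of that flushing idea, independent of the actual server.cpp structures (the stream_state/text_to_send names below are illustrative, not the real slot types):

#include <iostream>
#include <string>

// Hypothetical, simplified per-request state (not the real server.cpp slot).
struct stream_state {
    std::string generated_text;  // everything generated so far
    size_t      n_sent_text = 0; // how much of generated_text was already sent
};

// Returns the text that should be emitted for this token.
// is_eog marks an end-of-generation token (EOS / stop token).
static std::string text_to_send(stream_state &s, const std::string &token_str, bool is_eog) {
    if (!is_eog) {
        s.generated_text += token_str;
        // The real server runs stop-string matching here and may hold text
        // back until a partial match is resolved; that part is elided.
        return std::string();
    }
    // End of generation: flush whatever is still pending so the tokens that
    // were held back (e.g. as a potential stop-string prefix) are not lost.
    std::string pending = s.generated_text.substr(s.n_sent_text);
    s.n_sent_text = s.generated_text.size();
    return pending;
}

int main() {
    stream_state s;
    // Pretend "Hel" was already streamed and "lo" is still held back when an
    // end-of-generation token arrives.
    s.generated_text = "Hello";
    s.n_sent_text = 3;
    std::cout << text_to_send(s, "", /*is_eog=*/true) << "\n"; // prints "lo"
}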
llm/ext_server/server.cpp (vendored) — 60 changed lines (33 additions, 27 deletions)
@@ -913,7 +913,9 @@ struct llama_server_context
         slot.sampled = result.tok;
 
         // search stop word and delete it
-        slot.generated_text += token_str;
+        if (!llama_token_is_eog(model, result.tok))
+            slot.generated_text += token_str;
+
         slot.has_next_token = true;
 
         if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
@@ -954,30 +956,36 @@ struct llama_server_context
         if (!incomplete)
         {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            const std::string str_test = slot.generated_text.substr(pos);
-            bool is_stop_full = false;
-            size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-            if (stop_pos != std::string::npos)
-            {
-                is_stop_full = true;
-                slot.generated_text.erase(
-                    slot.generated_text.begin() + pos + stop_pos,
-                    slot.generated_text.end());
-                pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            }
-            else
-            {
-                is_stop_full = false;
-                stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
-            }
-
-            // check if there is any token to predict
-            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
-            {
-                // no send the stop word in the response
-                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-                slot.n_sent_text += result.text_to_send.size();
-                // add the token to slot queue and cache
-            }
+
+            if (!llama_token_is_eog(model, result.tok)) {
+                const std::string str_test = slot.generated_text.substr(pos);
+                bool is_stop_full = false;
+                size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
+                if (stop_pos != std::string::npos)
+                {
+                    is_stop_full = true;
+                    slot.generated_text.erase(
+                        slot.generated_text.begin() + pos + stop_pos,
+                        slot.generated_text.end());
+                    pos = std::min(slot.n_sent_text, slot.generated_text.size());
+                }
+                else
+                {
+                    is_stop_full = false;
+                    stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+                }
+
+                // check if there is any token to predict
+                if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
+                {
+                    // no send the stop word in the response
+                    result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                    slot.n_sent_text += result.text_to_send.size();
+                    // add the token to slot queue and cache
+                }
+            } else {
+                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                slot.n_sent_text += result.text_to_send.size();
+            }
 
             if (slot.params.stream)
@@ -1117,9 +1125,7 @@ struct llama_server_context
             {"multimodal", multimodal}
         };
 
-        if (!llama_token_is_eog(model, tkn.tok)) {
-            res.result_json["content"] = tkn.text_to_send;
-        }
+        res.result_json["content"] = tkn.text_to_send;
 
         if (slot.sparams.n_probs > 0)
         {