Bump llama.cpp to b2081

2024-02-06 12:06:43 -08:00 · 2024-02-06 12:06:43 -08:00 · de76b95dd4
commit de76b95dd4
parent 59ec837ef6
3 changed files with 17 additions and 22 deletions
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@@ -1 +1 @@
-Subproject commit d2f650cb5b04ee2726663e79b47da5efe196ce00
+Subproject commit f57fadc009cbff741a1961cb7896c47d73978d2c
--- a/llm/patches/01-cache.diff
+++ b/llm/patches/01-cache.diff
@@ -1,8 +1,8 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index a48582ad..9fffffd8 100644
+index d86d7e04..7d71c766 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -1564,12 +1564,6 @@ struct llama_server_context
+@@ -1598,12 +1598,6 @@ struct llama_server_context
                         LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
                     }
 
@@ -15,7 +15,7 @@ index a48582ad..9fffffd8 100644
                     if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
                     {
                         // we have to evaluate at least 1 token to generate logits.
-@@ -1581,6 +1575,12 @@ struct llama_server_context
+@@ -1615,6 +1609,12 @@ struct llama_server_context
                         }
                     }
 
--- a/llm/patches/02-shutdown.diff
+++ b/llm/patches/02-shutdown.diff
@@ -37,26 +37,18 @@ index 11dd82c3..311495a8 100644
 
     llama_backend_free();
 diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
-index 70cce072..2acb1eab 100644
+index 70cce072..9124869a 100644
 --- a/examples/server/utils.hpp
 +++ b/examples/server/utils.hpp
-@@ -6,6 +6,7 @@
- #include <mutex>
- #include <condition_variable>
- #include <unordered_map>
-+#include <atomic>
- 
- #include "json.hpp"
- 
-@@ -190,6 +191,7 @@ inline std::string format_chatml(std::vector<json> messages)
+@@ -190,6 +190,7 @@ inline std::string format_chatml(std::vector<json> messages)
 struct llama_server_queue {
     int id = 0;
     std::mutex mutex_tasks;
-+    std::atomic<bool> running;
+    bool running;
     // queues
     std::vector<task_server> queue_tasks;
     std::vector<task_server> queue_tasks_deferred;
-@@ -248,9 +250,15 @@ struct llama_server_queue {
+@@ -248,9 +249,18 @@ struct llama_server_queue {
         queue_tasks_deferred.clear();
     }
 
@@ -64,7 +56,10 @@ index 70cce072..2acb1eab 100644
 -    [[noreturn]]
 +    // end the start_loop routine
 +    void terminate() {
+        {
+            std::unique_lock<std::mutex> lock(mutex_tasks);
 +            running = false;
+        }
 +        condition_tasks.notify_all();
 +    }
 +
@@ -74,17 +69,17 @@ index 70cce072..2acb1eab 100644
         while (true) {
             // new task arrived
             LOG_VERBOSE("have new task", {});
-@@ -294,8 +302,12 @@ struct llama_server_queue {
+@@ -294,8 +304,12 @@ struct llama_server_queue {
             {
                 std::unique_lock<std::mutex> lock(mutex_tasks);
                 if (queue_tasks.empty()) {
-+                    if (!running.load()) {
+                    if (!running) {
 +                        LOG_VERBOSE("ending start_loop", {});
 +                        return;
 +                    }
                     condition_tasks.wait(lock, [&]{
 -                        return !queue_tasks.empty();
-+                        return (!queue_tasks.empty() || !running.load());
+                        return (!queue_tasks.empty() || !running);
                     });
                 }
             }