From de76b95dd454798d041cdec18c927ae7c5f1e7a3 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Tue, 6 Feb 2024 12:06:43 -0800
Subject: [PATCH] Bump llama.cpp to b2081

---
 llm/llama.cpp                |  2 +-
 llm/patches/01-cache.diff    | 10 +++++-----
 llm/patches/02-shutdown.diff | 27 +++++++++++----------------
 3 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/llm/llama.cpp b/llm/llama.cpp
index d2f650cb..f57fadc0 160000
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@@ -1 +1 @@
-Subproject commit d2f650cb5b04ee2726663e79b47da5efe196ce00
+Subproject commit f57fadc009cbff741a1961cb7896c47d73978d2c
diff --git a/llm/patches/01-cache.diff b/llm/patches/01-cache.diff
index 79f8d002..9565d60b 100644
--- a/llm/patches/01-cache.diff
+++ b/llm/patches/01-cache.diff
@@ -1,8 +1,8 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index a48582ad..9fffffd8 100644
+index d86d7e04..7d71c766 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -1564,12 +1564,6 @@ struct llama_server_context
+@@ -1598,12 +1598,6 @@ struct llama_server_context
                  LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
              }
 
@@ -15,7 +15,7 @@ index a48582ad..9fffffd8 100644
              if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
              {
                  // we have to evaluate at least 1 token to generate logits.
-@@ -1581,6 +1575,12 @@ struct llama_server_context
+@@ -1615,6 +1609,12 @@ struct llama_server_context
                  }
              }
 
@@ -26,5 +26,5 @@ index a48582ad..9fffffd8 100644
 +            slot.cache_tokens = prompt_tokens;
 +
 +            LOG_VERBOSE("prompt ingested", {
-                                                {"n_past", slot.n_past},
-                                                {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
+                                                {"n_past", slot.n_past},
+                                                {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
diff --git a/llm/patches/02-shutdown.diff b/llm/patches/02-shutdown.diff
index 4c247cc0..c588dcfb 100644
--- a/llm/patches/02-shutdown.diff
+++ b/llm/patches/02-shutdown.diff
@@ -37,26 +37,18 @@ index 11dd82c3..311495a8 100644
 
      llama_backend_free();
 diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
-index 70cce072..2acb1eab 100644
+index 70cce072..9124869a 100644
 --- a/examples/server/utils.hpp
 +++ b/examples/server/utils.hpp
-@@ -6,6 +6,7 @@
- #include
- #include
- #include
-+#include <atomic>
- 
- #include "json.hpp"
- 
-@@ -190,6 +191,7 @@ inline std::string format_chatml(std::vector<json> messages)
+@@ -190,6 +190,7 @@ inline std::string format_chatml(std::vector<json> messages)
  struct llama_server_queue {
      int id = 0;
      std::mutex mutex_tasks;
-+    std::atomic<bool> running;
++    bool running;
      // queues
      std::vector<task_server> queue_tasks;
      std::vector<task_server> queue_tasks_deferred;
-@@ -248,9 +250,15 @@ struct llama_server_queue {
+@@ -248,9 +249,18 @@ struct llama_server_queue {
          queue_tasks_deferred.clear();
      }
@@ -64,7 +56,10 @@ index 70cce072..2acb1eab 100644
 -    [[noreturn]]
 +    // end the start_loop routine
 +    void terminate() {
-+        running = false;
++        {
++            std::unique_lock<std::mutex> lock(mutex_tasks);
++            running = false;
++        }
 +        condition_tasks.notify_all();
 +    }
 +
@@ -74,17 +69,17 @@ index 70cce072..2acb1eab 100644
          while (true) {
              // new task arrived
              LOG_VERBOSE("have new task", {});
-@@ -294,8 +302,12 @@ struct llama_server_queue {
+@@ -294,8 +304,12 @@ struct llama_server_queue {
              {
                  std::unique_lock<std::mutex> lock(mutex_tasks);
                  if (queue_tasks.empty()) {
-+                    if (!running.load()) {
++                    if (!running) {
 +                        LOG_VERBOSE("ending start_loop", {});
 +                        return;
 +                    }
                      condition_tasks.wait(lock, [&]{
 -                        return !queue_tasks.empty();
-+                        return (!queue_tasks.empty() || !running);
++                        return (!queue_tasks.empty() || !running);
                      });
                  }
              }
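
For context, the reworked 02-shutdown.diff drops the std::atomic<bool> flag in favor of a plain bool that is only read and written while holding mutex_tasks, the same mutex the condition variable waits on: terminate() flips the flag under the lock and then calls notify_all(), and the wait predicate re-checks the flag so start_loop() can return cleanly. Below is a minimal standalone sketch of that cooperative-shutdown pattern; the names used here (worker_queue, post, run_loop) are illustrative placeholders, not the llama.cpp identifiers.

#include <condition_variable>
#include <deque>
#include <functional>
#include <iostream>
#include <mutex>
#include <thread>

// Minimal worker queue: run_loop() drains tasks until terminate() is called.
// The bool flag is only touched while holding the same mutex the condition
// variable waits on, so no separate atomic is needed.
struct worker_queue {
    std::mutex mutex_tasks;
    std::condition_variable condition_tasks;
    std::deque<std::function<void()>> tasks;
    bool running = true;

    void post(std::function<void()> fn) {
        {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            tasks.push_back(std::move(fn));
        }
        condition_tasks.notify_one();
    }

    // End run_loop(): flip the flag under the lock, then wake all waiters.
    void terminate() {
        {
            std::unique_lock<std::mutex> lock(mutex_tasks);
            running = false;
        }
        condition_tasks.notify_all();
    }

    void run_loop() {
        while (true) {
            std::function<void()> task;
            {
                std::unique_lock<std::mutex> lock(mutex_tasks);
                // Wake up when there is work or when shutdown was requested.
                condition_tasks.wait(lock, [&] {
                    return !tasks.empty() || !running;
                });
                if (!running && tasks.empty()) {
                    return;  // drained and asked to stop
                }
                task = std::move(tasks.front());
                tasks.pop_front();
            }
            task();  // run outside the lock
        }
    }
};

int main() {
    worker_queue q;
    std::thread worker([&] { q.run_loop(); });
    q.post([] { std::cout << "task 1\n"; });
    q.post([] { std::cout << "task 2\n"; });
    q.terminate();
    worker.join();
}

Because the flag is only ever modified under the lock the condition variable uses, a waiting run_loop() cannot miss the shutdown signal: either it observes the updated predicate when it re-evaluates it, or it is woken by notify_all() after the flag has been set.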