ollama/llm/patches/02-shutdown.diff

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a0b46970..7800c6e7 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -28,6 +28,7 @@
 #include <chrono>
 #include <condition_variable>
 #include <atomic>
+#include <signal.h>
 
 using json = nlohmann::json;
 
@@ -2511,6 +2512,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
     }
 }
 
+std::function<void(int)> shutdown_handler; // assigned in main() before the signal/console handlers are installed
+inline void signal_handler(int signal) { shutdown_handler(signal); } // plain-function trampoline handed to sigaction / the console ctrl handler; NOTE(review): calling std::function (and locking a mutex in terminate()) from a signal handler is not async-signal-safe - confirm this is acceptable
+
 int main(int argc, char **argv)
 {
 #if SERVER_VERBOSE != 1
@@ -3128,8 +3132,25 @@ int main(int argc, char **argv)
         std::placeholders::_2,
         std::placeholders::_3
     ));
-    llama.queue_tasks.start_loop();
 
+    shutdown_handler = [&](int) {
+        llama.queue_tasks.terminate();
+    };
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+    struct sigaction sigint_action;
+    sigint_action.sa_handler = signal_handler;
+    sigemptyset (&sigint_action.sa_mask);
+    sigint_action.sa_flags = 0;
+    sigaction(SIGINT, &sigint_action, NULL);
+#elif defined (_WIN32)
+    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
+    };
+    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+    llama.queue_tasks.start_loop();
+    svr.stop();
     t.join();
 
     llama_backend_free();
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 54854896..0ee670db 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)
 struct llama_server_queue {
     int id = 0;
     std::mutex mutex_tasks;
+    bool running = false; // true while start_loop() should keep looping; cleared by terminate()
     // queues
     std::vector<task_server> queue_tasks;
     std::vector<task_server> queue_tasks_deferred;
@@ -278,9 +279,18 @@ struct llama_server_queue {
         queue_tasks_deferred.clear();
     }
 
-    // Start the main loop. This call is blocking
-    [[noreturn]]
+    // Ask start_loop() to return: clear `running` under mutex_tasks, then wake the waiting loop
+    void terminate() {
+        {
+            std::unique_lock<std::mutex> lock(mutex_tasks); // inner scope releases the mutex before notifying
+            running = false;
+        }
+        condition_tasks.notify_all(); // wakes the condition_tasks.wait() inside start_loop()
+    }
+
+    // Start the main loop. This call blocks; it returns once terminate() has been called and the task queue is empty.
     void start_loop() {
+        running = true;
         while (true) {
             // new task arrived
             LOG_VERBOSE("have new task", {});
@@ -324,8 +334,12 @@ struct llama_server_queue {
             {
                 std::unique_lock<std::mutex> lock(mutex_tasks);
                 if (queue_tasks.empty()) {
+                    if (!running) {
+                        LOG_VERBOSE("ending start_loop", {});
+                        return;
+                    }
                     condition_tasks.wait(lock, [&]{
-                        return !queue_tasks.empty();
+                        return (!queue_tasks.empty() || !running);
                     });
                 }
             }
Bump llama.cpp to b1999 This requires an upstream change to support graceful termination, carried as a patch. 2024-01-29 20:58:17 +00:00			`diff --git a/examples/server/server.cpp b/examples/server/server.cpp`
Fix cuda leaks This should resolve the problem where we don't fully unload from the GPU when we go idle. 2024-02-18 23:50:38 +00:00			`index a0b46970..7800c6e7 100644`
Bump llama.cpp to b1999 This requires an upstream change to support graceful termination, carried as a patch. 2024-01-29 20:58:17 +00:00			`--- a/examples/server/server.cpp`
			`+++ b/examples/server/server.cpp`
			`@@ -28,6 +28,7 @@`
			`#include <chrono>`
			`#include <condition_variable>`
			`#include <atomic>`
			`+#include <signal.h>`

			`using json = nlohmann::json;`

Fix cuda leaks This should resolve the problem where we don't fully unload from the GPU when we go idle. 2024-02-18 23:50:38 +00:00			`@@ -2511,6 +2512,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con`
Bump llama.cpp to b1999 This requires an upstream change to support graceful termination, carried as a patch. 2024-01-29 20:58:17 +00:00			`}`
			`}`

			`+std::function<void(int)> shutdown_handler;`
			`+inline void signal_handler(int signal) { shutdown_handler(signal); }`
			`+`
			`int main(int argc, char **argv)`
			`{`
			`#if SERVER_VERBOSE != 1`
Fix cuda leaks This should resolve the problem where we don't fully unload from the GPU when we go idle. 2024-02-18 23:50:38 +00:00			`@@ -3128,8 +3132,25 @@ int main(int argc, char **argv)`
Bump llama.cpp to b1999 This requires an upstream change to support graceful termination, carried as a patch. 2024-01-29 20:58:17 +00:00			`std::placeholders::_2,`
			`std::placeholders::_3`
			`));`
			`- llama.queue_tasks.start_loop();`

			`+ shutdown_handler = [&](int) {`
			`+ llama.queue_tasks.terminate();`
			`+ };`
Fix cuda leaks This should resolve the problem where we don't fully unload from the GPU when we go idle. 2024-02-18 23:50:38 +00:00			`+`
			`+#if defined (__unix__) \|\| (defined (__APPLE__) && defined (__MACH__))`
			`+ struct sigaction sigint_action;`
			`+ sigint_action.sa_handler = signal_handler;`
			`+ sigemptyset (&sigint_action.sa_mask);`
			`+ sigint_action.sa_flags = 0;`
			`+ sigaction(SIGINT, &sigint_action, NULL);`
			`+#elif defined (_WIN32)`
			`+ auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {`
			`+ return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;`
			`+ };`
			`+ SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);`
			`+#endif`
Bump llama.cpp to b1999 This requires an upstream change to support graceful termination, carried as a patch. 2024-01-29 20:58:17 +00:00			`+ llama.queue_tasks.start_loop();`
			`+ svr.stop();`
			`t.join();`

			`llama_backend_free();`
			`diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp`
Fix cuda leaks This should resolve the problem where we don't fully unload from the GPU when we go idle. 2024-02-18 23:50:38 +00:00			`index 54854896..0ee670db 100644`
Bump llama.cpp to b1999 This requires an upstream change to support graceful termination, carried as a patch. 2024-01-29 20:58:17 +00:00			`--- a/examples/server/utils.hpp`
			`+++ b/examples/server/utils.hpp`
Fix cuda leaks This should resolve the problem where we don't fully unload from the GPU when we go idle. 2024-02-18 23:50:38 +00:00			`@@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)`
Bump llama.cpp to b1999 This requires an upstream change to support graceful termination, carried as a patch. 2024-01-29 20:58:17 +00:00			`struct llama_server_queue {`
			`int id = 0;`
			`std::mutex mutex_tasks;`
Bump llama.cpp to b2081 2024-02-06 20:06:43 +00:00			`+ bool running;`
Bump llama.cpp to b1999 This requires an upstream change to support graceful termination, carried as a patch. 2024-01-29 20:58:17 +00:00			`// queues`
			`std::vector<task_server> queue_tasks;`
			`std::vector<task_server> queue_tasks_deferred;`
Fix cuda leaks This should resolve the problem where we don't fully unload from the GPU when we go idle. 2024-02-18 23:50:38 +00:00			`@@ -278,9 +279,18 @@ struct llama_server_queue {`
Bump llama.cpp to b1999 This requires an upstream change to support graceful termination, carried as a patch. 2024-01-29 20:58:17 +00:00			`queue_tasks_deferred.clear();`
			`}`

			`- // Start the main loop. This call is blocking`
			`- [[noreturn]]`
			`+ // end the start_loop routine`
			`+ void terminate() {`
Bump llama.cpp to b2081 2024-02-06 20:06:43 +00:00			`+ {`
			`+ std::unique_lock<std::mutex> lock(mutex_tasks);`
			`+ running = false;`
			`+ }`
Bump llama.cpp to b1999 This requires an upstream change to support graceful termination, carried as a patch. 2024-01-29 20:58:17 +00:00			`+ condition_tasks.notify_all();`
			`+ }`
			`+`
			`+ // Start the main loop.`
			`void start_loop() {`
			`+ running = true;`
			`while (true) {`
			`// new task arrived`
			`LOG_VERBOSE("have new task", {});`
Fix cuda leaks This should resolve the problem where we don't fully unload from the GPU when we go idle. 2024-02-18 23:50:38 +00:00			`@@ -324,8 +334,12 @@ struct llama_server_queue {`
Bump llama.cpp to b1999 This requires an upstream change to support graceful termination, carried as a patch. 2024-01-29 20:58:17 +00:00			`{`
			`std::unique_lock<std::mutex> lock(mutex_tasks);`
			`if (queue_tasks.empty()) {`
Bump llama.cpp to b2081 2024-02-06 20:06:43 +00:00			`+ if (!running) {`
Bump llama.cpp to b1999 This requires an upstream change to support graceful termination, carried as a patch. 2024-01-29 20:58:17 +00:00			`+ LOG_VERBOSE("ending start_loop", {});`
			`+ return;`
			`+ }`
			`condition_tasks.wait(lock, [&]{`
			`- return !queue_tasks.empty();`
Bump llama.cpp to b2081 2024-02-06 20:06:43 +00:00			`+ return (!queue_tasks.empty() \|\| !running);`
Bump llama.cpp to b1999 This requires an upstream change to support graceful termination, carried as a patch. 2024-01-29 20:58:17 +00:00			`});`
			`}`
			`}`