diff --git a/llm/patches/02-shutdown.diff b/llm/patches/02-shutdown.diff
index c588dcfb..fc13e328 100644
--- a/llm/patches/02-shutdown.diff
+++ b/llm/patches/02-shutdown.diff
@@ -1,5 +1,5 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 11dd82c3..311495a8 100644
+index a0b46970..7800c6e7 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
 @@ -28,6 +28,7 @@
@@ -10,7 +10,7 @@ index 11dd82c3..311495a8 100644
 
  using json = nlohmann::json;
 
-@@ -2394,6 +2395,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
+@@ -2511,6 +2512,9 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
      }
  }
 
@@ -20,7 +20,7 @@ index 11dd82c3..311495a8 100644
  int main(int argc, char **argv)
  {
  #if SERVER_VERBOSE != 1
-@@ -3014,8 +3018,14 @@ int main(int argc, char **argv)
+@@ -3128,8 +3132,25 @@ int main(int argc, char **argv)
                  std::placeholders::_2,
                  std::placeholders::_3
              ));
@@ -29,18 +29,29 @@ index 11dd82c3..311495a8 100644
 +    shutdown_handler = [&](int) {
 +        llama.queue_tasks.terminate();
 +    };
-+    signal(SIGTERM, signal_handler);
-+    signal(SIGINT, signal_handler);
++
++#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
++    struct sigaction sigint_action;
++    sigint_action.sa_handler = signal_handler;
++    sigemptyset (&sigint_action.sa_mask);
++    sigint_action.sa_flags = 0;
++    sigaction(SIGINT, &sigint_action, NULL);
++#elif defined (_WIN32)
++    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
++        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
++    };
++    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
++#endif
 +    llama.queue_tasks.start_loop();
 +
      svr.stop();
      t.join();
      llama_backend_free();
 diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
-index 70cce072..9124869a 100644
+index 54854896..0ee670db 100644
 --- a/examples/server/utils.hpp
 +++ b/examples/server/utils.hpp
-@@ -190,6 +190,7 @@ inline std::string format_chatml(std::vector<json> messages)
+@@ -220,6 +220,7 @@ inline std::string format_chatml(std::vector<json> messages)
  struct llama_server_queue {
      int id = 0;
      std::mutex mutex_tasks;
@@ -48,7 +59,7 @@ index 70cce072..9124869a 100644
      // queues
      std::vector<task_server> queue_tasks;
      std::vector<task_server> queue_tasks_deferred;
-@@ -248,9 +249,18 @@ struct llama_server_queue {
+@@ -278,9 +279,18 @@ struct llama_server_queue {
          queue_tasks_deferred.clear();
      }
 
@@ -69,7 +80,7 @@ index 70cce072..9124869a 100644
          while (true) {
              // new task arrived
              LOG_VERBOSE("have new task", {});
-@@ -294,8 +304,12 @@ struct llama_server_queue {
+@@ -324,8 +334,12 @@ struct llama_server_queue {
          {
              std::unique_lock<std::mutex> lock(mutex_tasks);
              if (queue_tasks.empty()) {
diff --git a/llm/patches/03-cudaleaks.diff b/llm/patches/03-cudaleaks.diff
new file mode 100644
index 00000000..674f8b1a
--- /dev/null
+++ b/llm/patches/03-cudaleaks.diff
@@ -0,0 +1,116 @@
+diff --git a/examples/server/server.cpp b/examples/server/server.cpp
+index 3102762c..568ac1d0 100644
+--- a/examples/server/server.cpp
++++ b/examples/server/server.cpp
+@@ -307,6 +307,10 @@ struct llama_client_slot
+     }
+ };
+ 
++#ifdef GGML_USE_CUBLAS
++extern "C" GGML_CALL void ggml_free_cublas(void);
++#endif
++
+ struct llama_server_context
+ {
+     llama_model *model = nullptr;
+@@ -353,6 +357,10 @@ struct llama_server_context
+             llama_free_model(model);
+             model = nullptr;
+         }
++#ifdef GGML_USE_CUBLAS
++        ggml_free_cublas();
++#endif
++
+     }
+ 
+     bool load_model(const gpt_params &params_)
+@@ -3093,6 +3101,7 @@ int main(int argc, char **argv)
+     sigemptyset (&sigint_action.sa_mask);
+     sigint_action.sa_flags = 0;
+     sigaction(SIGINT, &sigint_action, NULL);
++    sigaction(SIGUSR1, &sigint_action, NULL);
+ #elif defined (_WIN32)
+     auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+         return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
+@@ -3106,3 +3115,4 @@ int main(int argc, char **argv)
+     llama_backend_free();
+     return 0;
+ }
++
+diff --git a/ggml-cuda.cu b/ggml-cuda.cu
+index 96976f24..3543920e 100644
+--- a/ggml-cuda.cu
++++ b/ggml-cuda.cu
+@@ -39,6 +39,7 @@
+ #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+ #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
+ #define cublasCreate hipblasCreate
++#define cublasDestroy hipblasDestroy
+ #define cublasGemmEx hipblasGemmEx
+ #define cublasGemmBatchedEx hipblasGemmBatchedEx
+ #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
+@@ -7928,10 +7929,11 @@ GGML_CALL bool ggml_cublas_loaded(void) {
+     return g_cublas_loaded;
+ }
+ 
++static bool g_cublas_initialized = false;
++
+ GGML_CALL void ggml_init_cublas() {
+-    static bool initialized = false;
+ 
+-    if (!initialized) {
++    if (!g_cublas_initialized) {
+ 
+ #ifdef __HIP_PLATFORM_AMD__
+         // Workaround for a rocBLAS bug when using multiple graphics cards:
+@@ -7941,7 +7943,7 @@ GGML_CALL void ggml_init_cublas() {
+ #endif
+ 
+         if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
+-            initialized = true;
++            g_cublas_initialized = true;
+             g_cublas_loaded = false;
+             return;
+         }
+@@ -8011,7 +8013,7 @@ GGML_CALL void ggml_init_cublas() {
+         // configure logging to stdout
+         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
+ 
+-        initialized = true;
++        g_cublas_initialized = true;
+         g_cublas_loaded = true;
+     }
+ }
+@@ -11528,3 +11530,17 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
+     }
+     return device_count;
+ }
++
++extern "C" GGML_CALL void ggml_free_cublas(void);
++GGML_CALL void ggml_free_cublas(void) {
++    for (int id = 0; id < g_device_count; ++id) {
++#if !defined(GGML_USE_HIPBLAS)
++        CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
++        g_cuda_pool_size[id] = 0;
++        g_cuda_pool_addr[id] = 0;
++#endif
++        CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id]));
++        g_cublas_handles[id] = nullptr;
++    }
++    g_cublas_initialized = false;
++}
+\ No newline at end of file
+diff --git a/ggml-cuda.h b/ggml-cuda.h
+index b1ebd61d..b4c80c2c 100644
+--- a/ggml-cuda.h
++++ b/ggml-cuda.h
+@@ -20,6 +20,9 @@ extern "C" {
+ // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
+ GGML_API GGML_CALL void ggml_init_cublas(void);
+ 
++// Release CUDA resources
++GGML_API GGML_CALL void ggml_free_cublas(void);
++
+ // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
+ GGML_API GGML_CALL bool ggml_cublas_loaded(void);