ollama/llm/patches/02-cudaleaks.diff
2024-03-11 12:57:48 -07:00

117 lines
3.9 KiB
Diff

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8fe5e0b1..53bf39c1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -31,6 +31,10 @@
#include <atomic>
#include <signal.h>
+#ifdef GGML_USE_CUBLAS
+extern "C" GGML_CALL void ggml_free_cublas(void);
+#endif
+
using json = nlohmann::json;
struct server_params {
@@ -363,6 +367,10 @@ struct llama_server_context
llama_free_model(model);
model = nullptr;
}
+
+#ifdef GGML_USE_CUBLAS
+ ggml_free_cublas();
+#endif
}
bool load_model(const gpt_params &params_)
@@ -3543,6 +3551,7 @@ int main(int argc, char **argv)
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
+ sigaction(SIGUSR1, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 72bcec8c..6c934e8c 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -43,6 +43,7 @@
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
+#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
@@ -8751,10 +8752,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
return g_cublas_loaded;
}
-GGML_CALL void ggml_init_cublas() {
- static bool initialized = false;
+static bool g_cublas_initialized = false;
- if (!initialized) {
+GGML_CALL void ggml_init_cublas() {
+ if (!g_cublas_initialized) {
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
@@ -8764,7 +8765,7 @@ GGML_CALL void ggml_init_cublas() {
#endif
if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = false;
fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
return;
@@ -8835,7 +8836,7 @@ GGML_CALL void ggml_init_cublas() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
- initialized = true;
+ g_cublas_initialized = true;
g_cublas_loaded = true;
}
}
@@ -12490,3 +12491,23 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
}
return device_count;
}
+
+
+extern "C" GGML_CALL void ggml_free_cublas(void);
+GGML_CALL void ggml_free_cublas(void) {
+ for (int id = 0; id < g_device_count; ++id) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+ if (g_device_caps[id].vmm) {
+ CU_CHECK(cuMemUnmap(g_cuda_pool_addr[id], g_cuda_pool_size[id]));
+ g_cuda_pool_size[id] = 0;
+ g_cuda_pool_addr[id] = 0;
+ }
+#endif
+ // TODO: free legacy non-vmm memory
+ // destroy cublas handle
+ CUBLAS_CHECK(cublasDestroy(g_cublas_handles[id]));
+ g_cublas_handles[id] = nullptr;
+ }
+
+ g_cublas_initialized = false;
+}
\ No newline at end of file
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b1ebd61d..6dd58ddf 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -23,6 +23,9 @@ GGML_API GGML_CALL void ggml_init_cublas(void);
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
GGML_API GGML_CALL bool ggml_cublas_loaded(void);
+// Release CUDA resources
+GGML_API GGML_CALL void ggml_free_cublas(void);
+
GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);