diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
index 71bb6dcf..08be0895 100644
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@@ -34,6 +34,8 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_typ
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
 
+GGML_API GGML_CALL int ggml_backend_cuda_reg_devices();
+
 GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
 GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
 GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index ba280e06..d5c3fe49 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -83,7 +83,12 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
     if (buffer->iface.free_buffer != NULL) {
         buffer->iface.free_buffer(buffer);
     }
+
+// TODO: this needs to be freed in the cuda and hipblas backends because
+// the cuda backend implementation is compiled with msvc
+#if !defined(GGML_USE_CUDA) && !defined(GGML_USE_HIPBLAS)
     free(buffer);
+#endif
 }
 
 size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 6efdab14..809d6ab1 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -469,6 +469,10 @@ GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer)
 GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     delete ctx;
+
+    // TODO: this needs to be freed in the cuda and hipblas backends because
+    // the cuda backend implementation is compiled with msvc
+    free(buffer);
 }
 
 GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -3204,8 +3208,6 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params,
     GGML_UNUSED(params);
 }
 
-extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
-
 GGML_CALL int ggml_backend_cuda_reg_devices() {
     int device_count = ggml_backend_cuda_get_device_count();
     //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization