From ec3764538d4755448b326400093608040219fd5f Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Sun, 21 Jan 2024 15:39:59 -0800
Subject: [PATCH] Probe GPUs before backend init

Detect potential error scenarios so we can fall back to CPU mode without
hitting asserts.
---
 llm/ext_server/ext_server.cpp | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/llm/ext_server/ext_server.cpp b/llm/ext_server/ext_server.cpp
index c6ecf719..ab6fa7e8 100644
--- a/llm/ext_server/ext_server.cpp
+++ b/llm/ext_server/ext_server.cpp
@@ -3,6 +3,27 @@
 // Necessary evil since the server types are not defined in a header
 #include "server.cpp"
 
+// Low level API access to verify GPU access
+#if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_HIPBLAS)
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif // __HIP_PLATFORM_AMD__
+#define cudaGetDevice hipGetDevice
+#define cudaError_t hipError_t
+#define cudaSuccess hipSuccess
+#define cudaGetErrorString hipGetErrorString
+#else
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cuda_fp16.h>
+#endif // defined(GGML_USE_HIPBLAS)
+#endif // GGML_USE_CUBLAS
+
 // Expose the llama server as a callable extern "C" API
 llama_server_context *llama = NULL;
 std::atomic<bool> ext_server_running(false);
@@ -12,7 +33,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
 #if SERVER_VERBOSE != 1
   log_disable();
 #endif
-  LOG_TEE("system info: %s", llama_print_system_info());
+  LOG_TEE("system info: %s\n", llama_print_system_info());
   assert(err != NULL && sparams != NULL);
   err->id = 0;
   err->msg[0] = '\0';
@@ -60,6 +81,18 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
     params.mmproj = std::string(sparams->mmproj);
   }
 
+#if defined(GGML_USE_CUBLAS)
+  // Before attempting to init the backend, which will assert on error, verify the CUDA/ROCM GPU is accessible
+  LOG_TEE("Performing pre-initialization of GPU\n");
+  int id;
+  cudaError_t cudaErr = cudaGetDevice(&id);
+  if (cudaErr != cudaSuccess) {
+    err->id = -1;
+    snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr));
+    return;
+  }
+#endif
+
  llama_backend_init(params.numa);
 
  // load the model
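
The patch only reports the probe failure through err; deciding to fall back to CPU is left
to whatever calls llama_server_init. A minimal caller-side sketch of that fallback follows.
It assumes the ext_server.h declarations (ext_server_resp_t with a caller-allocated msg
buffer of msg_len bytes) and an n_gpu_layers field on ext_server_params; neither appears in
this diff, so treat those names as illustrative rather than the actual fallback code.

// Illustrative caller-side fallback. Assumes ext_server.h declares
// llama_server_init, ext_server_params (with an n_gpu_layers field), and
// ext_server_resp_t (caller-allocated msg buffer of msg_len bytes); the real
// fallback decision is made outside this file.
#include <cstdio>
#include "ext_server.h"

static bool init_with_cpu_fallback(ext_server_params *sparams) {
  char msg[1024];
  ext_server_resp_t err;
  err.id = 0;
  err.msg = msg;
  err.msg_len = sizeof(msg);

  llama_server_init(sparams, &err);
  if (err.id == 0) {
    return true;  // GPU probe and init succeeded (or no GPU backend compiled in)
  }

  // The GPU probe failed before llama_backend_init could assert; retry with
  // every layer on the CPU instead of crashing.
  fprintf(stderr, "GPU init failed (%s), retrying with CPU only\n", err.msg);
  sparams->n_gpu_layers = 0;
  llama_server_init(sparams, &err);
  return err.id == 0;
}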