From 0e4669b04f1553e46221d8e8e789a91b664015da Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Fri, 8 Mar 2024 00:26:20 -0800
Subject: [PATCH] update llama.cpp submodule to `6cdabe6` (#2999)

---
 llm/ext_server/ext_server.cpp | 24 ++++++++++++------------
 llm/llama.cpp                 |  2 +-
 llm/patches/01-cache.diff     | 14 ++++++--------
 llm/patches/02-cudaleaks.diff | 25 +++++++++++++------------
 4 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/llm/ext_server/ext_server.cpp b/llm/ext_server/ext_server.cpp
index 679029d9..0d879e1e 100644
--- a/llm/ext_server/ext_server.cpp
+++ b/llm/ext_server/ext_server.cpp
@@ -26,7 +26,7 @@
 #endif // GGML_USE_CUBLAS

 // Expose the llama server as a callable extern "C" API
-llama_server_context *llama = NULL;
+server_context *llama = NULL;
 std::thread ext_server_thread;
 bool shutting_down = false;
 std::atomic_int recv_counter;
@@ -57,7 +57,7 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
   err->id = 0;
   err->msg[0] = '\0';
   try {
-    llama = new llama_server_context;
+    llama = new server_context;
     gpt_params params;
     params.n_ctx = sparams->n_ctx;
     params.n_batch = sparams->n_batch;
@@ -144,13 +144,13 @@ void llama_server_start() {
     LOG_TEE("llama server main loop starting\n");
     ggml_time_init();
     llama->queue_tasks.on_new_task(std::bind(
-        &llama_server_context::process_single_task, llama, std::placeholders::_1));
+        &server_context::process_single_task, llama, std::placeholders::_1));
     llama->queue_tasks.on_finish_multitask(std::bind(
-        &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
+        &server_context::on_finish_multitask, llama, std::placeholders::_1));
     llama->queue_tasks.on_run_slots(std::bind(
-        &llama_server_context::update_slots, llama));
+        &server_context::update_slots, llama));
     llama->queue_results.on_multitask_update(std::bind(
-        &llama_server_queue::update_multitask,
+        &server_queue::update_multitask,
         &llama->queue_tasks,
         std::placeholders::_1,
         std::placeholders::_2,
@@ -198,7 +198,7 @@ void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
     json data = json::parse(json_req);
     resp->id = llama->queue_tasks.get_new_id();
     llama->queue_results.add_waiting_task_id(resp->id);
-    llama->request_completion(resp->id, data, false, false, -1);
+    llama->request_completion(resp->id, -1, data, false, false);
   } catch (std::exception &e) {
     snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
   } catch (...) {
@@ -216,9 +216,9 @@ void llama_server_completion_next_result(const int task_id,
   std::string result_json;
   try {
     atomicRecv ar(recv_counter);
-    task_result result = llama->queue_results.recv(task_id);
+    server_task_result result = llama->queue_results.recv(task_id);
     result_json =
-        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
+        result.data.dump(-1, ' ', false, json::error_handler_t::replace);
     resp->id = result.id;
     resp->stop = result.stop;
     resp->error = result.error;
@@ -363,10 +363,10 @@ void llama_server_embedding(const char *json_req, char **json_resp,
     }
     const int task_id = llama->queue_tasks.get_new_id();
     llama->queue_results.add_waiting_task_id(task_id);
-    llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
+    llama->request_completion(task_id, -1, {{"prompt", prompt}, {"n_predict", 0}}, false, true);
     atomicRecv ar(recv_counter);
-    task_result result = llama->queue_results.recv(task_id);
-    std::string result_json = result.result_json.dump();
+    server_task_result result = llama->queue_results.recv(task_id);
+    std::string result_json = result.data.dump();
     const std::string::size_type size = result_json.size() + 1;
     *json_resp = new char[size];
     snprintf(*json_resp, size, "%s", result_json.c_str());
diff --git a/llm/llama.cpp b/llm/llama.cpp
index c29af7e2..6cdabe65 160000
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@@ -1 +1 @@
-Subproject commit c29af7e2252d288f2ea58a7d437c1cb7c0abf160
+Subproject commit 6cdabe652695167263c8b447520987b11856f7ca
diff --git a/llm/patches/01-cache.diff b/llm/patches/01-cache.diff
index ab81ee9d..dbef08de 100644
--- a/llm/patches/01-cache.diff
+++ b/llm/patches/01-cache.diff
@@ -1,21 +1,19 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 2b2f4a0f..afac49af 100644
+index f255ad76..914ecfdd 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -997,13 +997,15 @@ struct llama_server_context
-             slot.n_sent_text += result.text_to_send.size();
+@@ -1101,12 +1101,13 @@ struct server_context {
              // add the token to slot queue and cache
          }
 +
 -        slot.add_token_string(result);
-+
-         if (slot.params.stream)
-         {
+         if (slot.params.stream) {
              send_partial_response(slot, result);
          }
      }

 +    slot.add_token_string(result);
 +
-     if (incomplete)
-     {
+     if (incomplete) {
          slot.has_next_token = true;
+     }
diff --git a/llm/patches/02-cudaleaks.diff b/llm/patches/02-cudaleaks.diff
index 206bb270..0c4298ba 100644
--- a/llm/patches/02-cudaleaks.diff
+++ b/llm/patches/02-cudaleaks.diff
@@ -1,9 +1,9 @@
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 2b2f4a0f..25857bdd 100644
+index f255ad76..5b83acb1 100644
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
-@@ -31,6 +31,10 @@
- #include
+@@ -28,6 +28,10 @@
+ #include
  #include

 +#ifdef GGML_USE_CUBLAS
@@ -12,18 +12,19 @@ index 2b2f4a0f..25857bdd 100644
 +
  using json = nlohmann::json;

- struct server_params {
+ bool server_verbose = false;
+@@ -648,6 +652,10 @@ struct server_context {
              llama_free_model(model);
              model = nullptr;
          }
 +
 +#ifdef GGML_USE_CUBLAS
 +        ggml_free_cublas();
 +#endif
      }

-     bool load_model(const gpt_params &params_)
-@@ -3494,6 +3501,7 @@ int main(int argc, char **argv)
+     bool load_model(const gpt_params & params_) {
+@@ -3339,6 +3347,7 @@ int main(int argc, char ** argv) {
          sigemptyset (&sigint_action.sa_mask);
          sigint_action.sa_flags = 0;
          sigaction(SIGINT, &sigint_action, NULL);
@@ -32,7 +33,7 @@ index 2b2f4a0f..25857bdd 100644
          auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
              return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
 diff --git a/ggml-cuda.cu b/ggml-cuda.cu
-index 0c6501e9..75c12723 100644
+index 72bcec8c..50a45e3d 100644
 --- a/ggml-cuda.cu
 +++ b/ggml-cuda.cu
 @@ -43,6 +43,7 @@
  #define cublasGemmEx hipblasGemmEx
  #define cublasGemmBatchedEx hipblasGemmBatchedEx
  #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-@@ -8694,10 +8695,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
+@@ -8751,10 +8752,10 @@ GGML_CALL bool ggml_cublas_loaded(void) {
      return g_cublas_loaded;
  }

  #ifdef __HIP_PLATFORM_AMD__
          // Workaround for a rocBLAS bug when using multiple graphics cards:
-@@ -8707,7 +8708,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8764,7 +8765,7 @@ GGML_CALL void ggml_init_cublas() {
  #endif

          if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
              g_cublas_loaded = false;
              fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
              return;
-@@ -8778,7 +8779,7 @@ GGML_CALL void ggml_init_cublas() {
+@@ -8835,7 +8836,7 @@ GGML_CALL void ggml_init_cublas() {
          // configure logging to stdout
          // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));

          g_cublas_loaded = true;
      }
  }
-@@ -12345,3 +12346,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
+@@ -12490,3 +12491,22 @@ GGML_CALL int ggml_backend_cuda_reg_devices() {
      }
      return device_count;
  }