diff --git a/llm/dyn_ext_server.go b/llm/dyn_ext_server.go
index fa5a3477..e981be94 100644
--- a/llm/dyn_ext_server.go
+++ b/llm/dyn_ext_server.go
@@ -149,7 +149,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
 	slog.Info("Initializing llama server")
 	slog.Debug(fmt.Sprintf("server params: %+v", sparams))
 
-	initResp := newExtServerResp(128)
+	initResp := newExtServerResp(512)
 	defer freeExtServerResp(initResp)
 	C.dyn_llama_server_init(llm.s, &sparams, &initResp)
 	if initResp.id < 0 {
diff --git a/llm/ext_server/ext_server.cpp b/llm/ext_server/ext_server.cpp
index 679029d9..4a9d120d 100644
--- a/llm/ext_server/ext_server.cpp
+++ b/llm/ext_server/ext_server.cpp
@@ -114,16 +114,12 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    // load the model
-    if (!llama->load_model(params)) {
-      // TODO - consider modifying the logging logic or patching load_model so
-      // we can capture more detailed error messages and pass them back to the
-      // caller for better UX
-      err->id = -1;
-      snprintf(err->msg, err->msg_len, "error loading model %s",
-               params.model.c_str());
-      return;
-    }
+    if (!llama->load_model(params)) {
+      // an error occurred that was not thrown
+      err->id = -1;
+      snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
+      return;
+    }
 
     llama->initialize();
   } catch (std::exception &e) {
diff --git a/llm/patches/03-load_exception.diff b/llm/patches/03-load_exception.diff
new file mode 100644
index 00000000..9e838fa9
--- /dev/null
+++ b/llm/patches/03-load_exception.diff
@@ -0,0 +1,44 @@
+diff --git a/llama.cpp b/llama.cpp
+index 4225f955..7b762f86 100644
+--- a/llama.cpp
++++ b/llama.cpp
+@@ -4756,7 +4756,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
+         }
+     } catch (const std::exception & err) {
+         LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
+-        return -1;
++        throw;
+     }
+ 
+     return 0;
+@@ -12102,16 +12102,22 @@ struct llama_model * llama_load_model_from_file(
+         };
+     }
+ 
+-    int status = llama_model_load(path_model, *model, params);
+-    GGML_ASSERT(status <= 0);
+-    if (status < 0) {
+-        if (status == -1) {
+-            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+-        } else if (status == -2) {
+-            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
++    try {
++        int status = llama_model_load(path_model, *model, params);
++        GGML_ASSERT(status <= 0);
++        if (status < 0) {
++            if (status == -1) {
++                LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
++            } else if (status == -2) {
++                LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
++            }
++            delete model;
++            return nullptr;
+         }
++    } catch (...) {
++        LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
+         delete model;
+-        return nullptr;
++        throw;
+     }
+ 
+     return model;
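
Taken together, the three changes form one error path for load failures: the bundled llama.cpp patch makes llama_model_load rethrow after logging and lets the exception escape llama_load_model_from_file (after deleting the partially built model), the existing catch (std::exception &e) block in llama_server_init (visible as trailing context above) is then in a position to report the original message through the ext_server_resp_t buffer, and the Go side grows that buffer from 128 to 512 bytes so a longer message fits. The following is a minimal standalone sketch of that pattern, not the actual ollama/llama.cpp code; the names resp_t, load_or_throw, and server_init are hypothetical stand-ins.

    #include <cstddef>
    #include <cstdio>
    #include <stdexcept>
    #include <string>

    // Hypothetical stand-in for ext_server_resp_t: a fixed-size message buffer
    // owned by the caller (in ollama it is allocated on the Go side).
    struct resp_t {
      int id;
      size_t msg_len;
      char *msg;
    };

    // Stand-in for the patched loader: failures are reported by throwing,
    // so the specific error text is preserved instead of being reduced to -1.
    static void load_or_throw(const std::string &path) {
      std::FILE *f = std::fopen(path.c_str(), "rb");
      if (!f) {
        throw std::runtime_error("failed to open " + path);
      }
      std::fclose(f);
      // real loading would continue here and throw on parse errors
    }

    // Stand-in for the init entry point: convert the exception into the
    // caller's fixed-size buffer instead of letting it cross the C boundary.
    static void server_init(const char *model, resp_t *err) {
      err->id = 0;
      err->msg[0] = '\0';
      try {
        load_or_throw(model);
      } catch (const std::exception &e) {
        err->id = -1;
        std::snprintf(err->msg, err->msg_len, "exception %s", e.what());
      }
    }

    int main() {
      char buf[512];  // same size the Go side now allocates for init errors
      resp_t err{0, sizeof(buf), buf};
      server_init("model.gguf", &err);
      if (err.id < 0) {
        std::fprintf(stderr, "init failed: %s\n", err.msg);
      }
      return 0;
    }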