diff --git a/llm/dyn_ext_server.go b/llm/dyn_ext_server.go
index fa5a3477..e981be94 100644
--- a/llm/dyn_ext_server.go
+++ b/llm/dyn_ext_server.go
@@ -149,7 +149,7 @@ func newDynExtServer(library, model string, adapters, projectors []string, opts
 	slog.Info("Initializing llama server")
 	slog.Debug(fmt.Sprintf("server params: %+v", sparams))
 
-	initResp := newExtServerResp(128)
+	initResp := newExtServerResp(512)
 	defer freeExtServerResp(initResp)
 	C.dyn_llama_server_init(llm.s, &sparams, &initResp)
 	if initResp.id < 0 {
diff --git a/llm/ext_server/ext_server.cpp b/llm/ext_server/ext_server.cpp
index 679029d9..4a9d120d 100644
--- a/llm/ext_server/ext_server.cpp
+++ b/llm/ext_server/ext_server.cpp
@@ -114,16 +114,12 @@ void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    // load the model
-    if (!llama->load_model(params)) {
-      // TODO - consider modifying the logging logic or patching load_model so
-      // we can capture more detailed error messages and pass them back to the
-      // caller for better UX
-      err->id = -1;
-      snprintf(err->msg, err->msg_len, "error loading model %s",
-               params.model.c_str());
-      return;
-    }
+    if (!llama->load_model(params)) {
+      // an error occurred that was not thrown
+      err->id = -1;
+      snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
+      return;
+    }
 
     llama->initialize();
   } catch (std::exception &e) {
diff --git a/llm/patches/03-load_exception.diff b/llm/patches/03-load_exception.diff
new file mode 100644
index 00000000..9e838fa9
--- /dev/null
+++ b/llm/patches/03-load_exception.diff
@@ -0,0 +1,44 @@
+diff --git a/llama.cpp b/llama.cpp
+index 4225f955..7b762f86 100644
+--- a/llama.cpp
++++ b/llama.cpp
+@@ -4756,7 +4756,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
+         }
+     } catch (const std::exception & err) {
+         LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
+-        return -1;
++        throw;
+     }
+ 
+     return 0;
+@@ -12102,16 +12102,22 @@ struct llama_model * llama_load_model_from_file(
+         };
+     }
+ 
+-    int status = llama_model_load(path_model, *model, params);
+-    GGML_ASSERT(status <= 0);
+-    if (status < 0) {
+-        if (status == -1) {
+-            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+-        } else if (status == -2) {
+-            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
++    try {
++        int status = llama_model_load(path_model, *model, params);
++        GGML_ASSERT(status <= 0);
++        if (status < 0) {
++            if (status == -1) {
++                LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
++            } else if (status == -2) {
++                LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
++            }
++            delete model;
++            return nullptr;
+         }
++    } catch (...) {
++        LLAMA_LOG_ERROR("%s: exception loading model\n", __func__);
+         delete model;
+-        return nullptr;
++        throw;
+     }
+ 
+     return model;
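
Taken together, the three changes form one error path for load failures: the bundled llama.cpp patch makes llama_model_load rethrow after logging and lets the exception escape llama_load_model_from_file (after deleting the partially built model), the existing catch (std::exception &e) block in llama_server_init (visible as trailing context above) is then in a position to report the original message through the ext_server_resp_t buffer, and the Go side grows that buffer from 128 to 512 bytes so a longer message fits. The following is a minimal standalone sketch of that pattern, not the actual ollama/llama.cpp code; the names resp_t, load_or_throw, and server_init are hypothetical stand-ins.

    #include <cstddef>
    #include <cstdio>
    #include <stdexcept>
    #include <string>

    // Hypothetical stand-in for ext_server_resp_t: a fixed-size message buffer
    // owned by the caller (in ollama it is allocated on the Go side).
    struct resp_t {
      int id;
      size_t msg_len;
      char *msg;
    };

    // Stand-in for the patched loader: failures are reported by throwing,
    // so the specific error text is preserved instead of being reduced to -1.
    static void load_or_throw(const std::string &path) {
      std::FILE *f = std::fopen(path.c_str(), "rb");
      if (!f) {
        throw std::runtime_error("failed to open " + path);
      }
      std::fclose(f);
      // real loading would continue here and throw on parse errors
    }

    // Stand-in for the init entry point: convert the exception into the
    // caller's fixed-size buffer instead of letting it cross the C boundary.
    static void server_init(const char *model, resp_t *err) {
      err->id = 0;
      err->msg[0] = '\0';
      try {
        load_or_throw(model);
      } catch (const std::exception &e) {
        err->id = -1;
        std::snprintf(err->msg, err->msg_len, "exception %s", e.what());
      }
    }

    int main() {
      char buf[512];  // same size the Go side now allocates for init errors
      resp_t err{0, sizeof(buf), buf};
      server_init("model.gguf", &err);
      if (err.id < 0) {
        std::fprintf(stderr, "init failed: %s\n", err.msg);
      }
      return 0;
    }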