update llama.cpp submodule to 1e6f6554 (#6208)

parent d4a7216c82
commit e04c7012c2

4 changed files with 25 additions and 45 deletions

llm/ext_server/server.cpp (vendored): 14 changes
@@ -403,7 +403,9 @@ struct llama_server_context
             }
         }
 
-        std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        auto init_result = llama_init_from_gpt_params(params);
+        model = init_result.model;
+        ctx = init_result.context;
         if (model == nullptr)
         {
             LOG_ERROR("unable to load model", {{"model", params.model}});
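Note: upstream llama.cpp changed llama_init_from_gpt_params() to return a result struct instead of a std::tuple, so the vendored server now reads the model and context from named members. A minimal caller-side sketch, assuming the struct is llama_init_result with model and context fields, as the hunk above and the patched common.cpp signature further down indicate:

    #include "common.h"   // llama_init_from_gpt_params, gpt_params
    #include "llama.h"    // llama_model, llama_context

    // Returns true when both the model and the context were created.
    static bool load_model_and_ctx(gpt_params & params, llama_model *& model, llama_context *& ctx) {
        // old API: std::tie(model, ctx) = llama_init_from_gpt_params(params);
        auto init_result = llama_init_from_gpt_params(params);
        model = init_result.model;
        ctx   = init_result.context;
        return model != nullptr && ctx != nullptr;
    }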
@@ -2422,7 +2424,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter.emplace_back(argv[i], 1.0f);
+            params.lora_adapters.push_back({
+                std::string(argv[i]),
+                1.0,
+            });
             params.use_mmap = false;
         }
         else if (arg == "--lora-scaled")
@@ -2438,7 +2443,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+            params.lora_adapters.push_back({
+                lora_adapter,
+                std::stof(argv[i])
+            });
             params.use_mmap = false;
         }
         else if (arg == "-v" || arg == "--verbose")
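Note: the gpt_params field changes from a vector of (path, scale) tuples (lora_adapter) to a vector of small structs (lora_adapters), which is why the parser now brace-initializes an element instead of calling emplace_back. A minimal sketch of the parsing behaviour, using a hypothetical stand-in struct since the upstream element type is not named in this diff:

    #include <string>
    #include <vector>

    struct lora_entry {        // stand-in for the upstream adapter-info struct
        std::string path;
        float scale;
    };

    // --lora <path> uses a default scale of 1.0;
    // --lora-scaled <path> <scale> parses the scale from the next argument.
    void add_lora_arg(std::vector<lora_entry> & adapters, const char * path, const char * scale_arg) {
        if (scale_arg == nullptr) {
            adapters.push_back({std::string(path), 1.0f});
        } else {
            adapters.push_back({std::string(path), std::stof(scale_arg)});
        }
    }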
llama.cpp submodule:

@@ -1 +1 @@
-Subproject commit 6eeaeba126ff701f3e8f79f246805b7023709972
+Subproject commit 1e6f6554aa11fa10160a5fda689e736c3c34169f
Vendored llama.cpp patch file (LoRA loading):

@@ -1,40 +1,32 @@
 diff --git a/common/common.cpp b/common/common.cpp
-index dbb724fb..c26fe6ee 100644
+index 2e8374d5..70d0afde 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -2087,14 +2087,27 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
-     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-         float lora_scale = std::get<1>(params.lora_adapter[i]);
-+
-+        // try to load as gguf
-         auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
-         if (adapter == nullptr) {
--            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+@@ -2110,9 +2110,21 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+         loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+         if (loaded_la.adapter == nullptr) {
+             fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
 -            llama_free(lctx);
 -            llama_free_model(model);
--            return std::make_tuple(nullptr, nullptr);
-+            fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__);
+-            return iparams;
 +
 +            // if that fails, try loading as ggla for compatibility
 +            int err = llama_model_apply_lora_from_file(model,
-+                                                       lora_adapter.c_str(),
-+                                                       lora_scale,
++                                                       la.path.c_str(),
++                                                       la.scale,
 +                                                       nullptr,
 +                                                       params.n_threads);
 +            if (err != 0) {
 +                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
 +                llama_free(lctx);
 +                llama_free_model(model);
--                return std::make_tuple(nullptr, nullptr);
++                return iparams;
++            } else {
++                break;
 +            }
-+        } else {
-+            llama_lora_adapter_set(lctx, adapter, lora_scale);
          }
--        llama_lora_adapter_set(lctx, adapter, lora_scale);
+         iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
      }
-
-    if (params.ignore_eos) {
 diff --git a/include/llama.h b/include/llama.h
 index 93fd77ca..b0fb37a6 100644
 --- a/include/llama.h
@@ -355,4 +347,4 @@ index 80a0dd0f..9d7b0e17 100644
 +    return 1;
 +  }
 +}
 \ No newline at end of file
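Note: the updated patch keeps a compatibility fallback around the new upstream LoRA API: it first tries llama_lora_adapter_init() (GGUF adapters) and only falls back to llama_model_apply_lora_from_file() for legacy ggla files when that returns null. A minimal control-flow sketch, with hypothetical helper names standing in for the two calls shown in the hunk above:

    #include <string>

    // hypothetical wrappers: the real calls are llama_lora_adapter_init() and
    // llama_model_apply_lora_from_file(), as in the patch hunk above
    bool try_load_gguf_lora(const std::string & path, float scale);
    bool try_load_ggla_lora(const std::string & path, float scale);

    bool apply_lora_with_fallback(const std::string & path, float scale) {
        if (try_load_gguf_lora(path, scale)) {
            return true;               // preferred path: GGUF-format adapter
        }
        // fall back to the legacy ggla loader for older adapter files
        return try_load_ggla_lora(path, scale);
    }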
Removed llama.cpp patch file (Gemma 2 2B support):

@@ -1,20 +0,0 @@
-diff --git a/src/llama.cpp b/src/llama.cpp
-index a207451f..fba6b175 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -4969,6 +4969,7 @@ static void llm_load_hparams(
-        hparams.attn_soft_cap = true;
-
-        switch (hparams.n_layer) {
-+            case 26: model.type = e_model::MODEL_2B; break;
-            case 42: model.type = e_model::MODEL_9B; break;
-            case 46: model.type = e_model::MODEL_27B; break;
-            default: model.type = e_model::MODEL_UNKNOWN;
-@@ -11736,6 +11737,7 @@ struct llm_build_context {
-
-    // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
-    switch (model.type) {
-+        case e_model::MODEL_2B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
-        case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
-        case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
-        default: GGML_ABORT("fatal error");
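Note: this deleted patch added the 26-layer (2B) variant to the Gemma 2 size table and gave it the same query scaling as the 9B model; it is removed in this commit, presumably because the equivalent change now ships in the updated submodule. An illustrative restatement of the scaling rule, using a hypothetical local enum rather than the upstream e_model type:

    #include <cmath>

    enum class gemma2_size { B2, B9, B27 };   // hypothetical stand-in for e_model

    // 2B and 9B scale Q by 1/sqrt(n_embd_head_k); 27B scales by 1/sqrt(n_embd / n_head).
    float gemma2_q_scale(gemma2_size size, int n_embd, int n_head, int n_embd_head_k) {
        switch (size) {
            case gemma2_size::B2:
            case gemma2_size::B9:  return 1.0f / std::sqrt(float(n_embd_head_k));
            case gemma2_size::B27: return 1.0f / std::sqrt(float(n_embd / n_head));
        }
        return 1.0f / std::sqrt(float(n_embd_head_k));
    }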