update llama.cpp submodule to 6eeaeba1 (#6039)

commit 68ee42f995
parent f26aef9a8b

5 changed files with 8 additions and 89 deletions
llm/ext_server/server.cpp (vendored) | 9

@@ -2438,15 +2438,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
             params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
             params.use_mmap = false;
         }
-        else if (arg == "--lora-base")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.lora_base = argv[i];
-        }
         else if (arg == "-v" || arg == "--verbose")
         {
             server_verbose = true;
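Net effect of this hunk: the ext server no longer accepts a separate base-model path for LoRA; adapters collected by the surviving branches above are applied directly. A minimal, self-contained sketch of the new behaviour (flag names and error handling here are assumptions for illustration, not the server's actual code):

#include <cstdio>
#include <string>

int main(int argc, char ** argv) {
    bool invalid_param = false;
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];
        if (arg == "--lora") {                    // adapter path: still supported
            if (++i >= argc) { invalid_param = true; break; }
            std::printf("lora adapter: %s\n", argv[i]);
        } else {                                  // "--lora-base" now lands here
            std::fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            return 1;
        }
    }
    return invalid_param ? 1 : 0;
}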
llama.cpp submodule:

@@ -1 +1 @@
-Subproject commit d94c6e0ccbd29ee1ba4f44e9caa8682ad94df9fa
+Subproject commit 6eeaeba126ff701f3e8f79f246805b7023709972
Patch to src/llama.cpp (default pre-tokenizer fallback), rebased onto the new submodule commit:

@@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 8fe51971..7113ba64 100644
+index a207451f..2ddf431d 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -5433,16 +5433,7 @@ static void llm_load_vocab(
+@@ -5347,16 +5347,7 @@ static void llm_load_vocab(
      if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
          vocab.tokenizer_add_space_prefix = false;
          vocab.tokenizer_clean_spaces = true;
@@ -20,9 +20,9 @@ index 8fe51971..7113ba64 100644
          vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
      } else if (
          tokenizer_pre == "llama3" ||
-@@ -5526,7 +5517,8 @@ static void llm_load_vocab(
-         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
-         vocab.tokenizer_clean_spaces = false;
+@@ -5443,7 +5434,8 @@ static void llm_load_vocab(
+         tokenizer_pre == "codeshell") {
+         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
      } else {
 -        throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +        LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
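What the vendored patch does, illustrated with a small self-contained stand-in (the enum and function below are illustrative only, not llama.cpp's real types): where upstream llm_load_vocab throws and aborts the load when GGUF metadata names a pre-tokenizer the build does not recognize, the patched code logs a warning and continues with the default pre-tokenizer.

#include <cstdio>
#include <stdexcept>
#include <string>

enum vocab_pre_type { PRE_TYPE_DEFAULT, PRE_TYPE_LLAMA3 };

static vocab_pre_type pick_pre_type(const std::string & tokenizer_pre, bool patched) {
    if (tokenizer_pre == "llama3") return PRE_TYPE_LLAMA3;
    if (!patched) {
        // upstream behaviour: unknown metadata aborts the model load
        throw std::runtime_error("unknown pre-tokenizer type: '" + tokenizer_pre + "'");
    }
    // patched behaviour: warn and keep going with the default pre-tokenizer
    std::fprintf(stderr, "missing or unrecognized pre-tokenizer type, using: 'default'\n");
    return PRE_TYPE_DEFAULT;
}

int main() {
    // a GGUF whose metadata names a pre-tokenizer this build does not know about
    pick_pre_type("some-future-tokenizer", /*patched=*/true); // warns instead of throwing
    return 0;
}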
Patch to common/common.cpp (LoRA adapter loading), updated so the base-model path is no longer passed:

@@ -2,7 +2,7 @@ diff --git a/common/common.cpp b/common/common.cpp
 index dbb724fb..c26fe6ee 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -2087,14 +2087,29 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
+@@ -2087,14 +2087,27 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
      for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
          const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
          float lora_scale = std::get<1>(params.lora_adapter[i]);
@@ -20,9 +20,7 @@ index dbb724fb..c26fe6ee 100644
 +        int err = llama_model_apply_lora_from_file(model,
 +                                                   lora_adapter.c_str(),
 +                                                   lora_scale,
-+                                                   ((i > 0) || params.lora_base.empty())
-+                                                      ? NULL
-+                                                      : params.lora_base.c_str(),
++                                                   nullptr,
 +                                                   params.n_threads);
 +        if (err != 0) {
 +            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
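Read as applied code rather than a nested diff, the '+' lines above amount to the loop body below (reconstructed from the patch text; anything past the fprintf is outside this hunk, so further error handling is not shown). With params.lora_base gone, the base-model argument is now always nullptr, which matches the removal of --lora-base from server.cpp earlier in this commit.

for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
    const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
    float lora_scale                 = std::get<1>(params.lora_adapter[i]);

    int err = llama_model_apply_lora_from_file(model,
                                               lora_adapter.c_str(),
                                               lora_scale,
                                               nullptr,           // base-model path no longer used
                                               params.n_threads);
    if (err != 0) {
        fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
        // handling beyond this point is not part of the hunk shown above
    }
}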
Deleted vendored patch: "llama 3.1 rope scaling" (the whole file is removed):

@@ -1,70 +0,0 @@
-From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Tue, 23 Jul 2024 14:33:29 -0700
-Subject: [PATCH] llama 3.1 rope scaling
-
----
- src/llama.cpp | 14 ++++++++++++--
- 1 file changed, 12 insertions(+), 2 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 8fe51971..a9969df8 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -2472,6 +2472,7 @@ struct llama_layer {
-     // long rope factors
-     struct ggml_tensor * rope_long  = nullptr;
-     struct ggml_tensor * rope_short = nullptr;
-+    struct ggml_tensor * rope_freqs = nullptr;
-
-     // bitnet scale
-     struct ggml_tensor * wq_scale;
-@@ -6143,6 +6144,8 @@ static bool llm_load_tensors(
-
-         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-
-+        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
-+
-         if (n_expert == 0) {
-             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
-             layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
-@@ -8620,6 +8623,10 @@ struct llm_build_context {
-         // choose long/short freq factors based on the context size
-         const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
-
-+        if (model.layers[il].rope_freqs != nullptr) {
-+            return model.layers[il].rope_freqs;
-+        }
-+
-         if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
-             return model.layers[il].rope_long;
-         }
-@@ -8814,6 +8821,9 @@ struct llm_build_context {
-
-         // self-attention
-         {
-+            // rope freq factors for llama3; may return nullptr for llama2 and other models
-+            struct ggml_tensor * rope_factors = build_rope_factors(il);
-+
-             // compute Q and K and RoPE them
-             struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
-             cb(Qcur, "Qcur", il);
-@@ -8837,14 +8847,14 @@ struct llm_build_context {
-         }
-
-         Qcur = ggml_rope_ext(
--            ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
-+            ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
-             n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-             ext_factor, attn_factor, beta_fast, beta_slow
-         );
-         cb(Qcur, "Qcur", il);
-
-         Kcur = ggml_rope_ext(
--            ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
-+            ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
-             n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-             ext_factor, attn_factor, beta_fast, beta_slow
-         );
---
-2.45.2
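Dropping this file suggests the rebased submodule already carries equivalent per-layer rope frequency-factor support, so the vendored patch is no longer needed; that rationale is an inference from the commit, not stated in it. Condensed, the logic the patch had added looks like the fragment below (names taken from the deleted patch above; the final rope_short fallback is assumed, since it lies outside the hunk context):

// Not a standalone program: a condensed view of the deleted patch's rope-factor
// selection inside llm_build_context.
struct ggml_tensor * build_rope_factors(int il) {
    // prefer per-layer rope_freqs when the model ships one (llama 3.1 style)
    if (model.layers[il].rope_freqs != nullptr) {
        return model.layers[il].rope_freqs;
    }
    // otherwise choose long/short freq factors based on the context size
    const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
    if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
        return model.layers[il].rope_long;
    }
    return model.layers[il].rope_short;   // assumed fallback, not shown in the hunk
}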