diff --git a/llm/patches/0001-llama-3.1-rope-scaling.diff b/llm/patches/0001-llama-3.1-rope-scaling.diff
new file mode 100644
index 00000000..45dcb4f5
--- /dev/null
+++ b/llm/patches/0001-llama-3.1-rope-scaling.diff
@@ -0,0 +1,71 @@
+From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001
+From: Michael Yang
+Date: Tue, 23 Jul 2024 14:33:29 -0700
+Subject: [PATCH] llama 3.1 rope scaling
+
+---
+ src/llama.cpp | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 8fe51971..a9969df8 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -2472,6 +2472,7 @@ struct llama_layer {
+     // long rope factors
+     struct ggml_tensor * rope_long = nullptr;
+     struct ggml_tensor * rope_short = nullptr;
++    struct ggml_tensor * rope_freqs = nullptr;
+ 
+     // bitnet scale
+     struct ggml_tensor * wq_scale;
+@@ -6143,6 +6144,8 @@ static bool llm_load_tensors(
+ 
+                     layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ 
++                    layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
++
+                     if (n_expert == 0) {
+                         layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                         layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+@@ -8620,6 +8623,10 @@ struct llm_build_context {
+         // choose long/short freq factors based on the context size
+         const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
+ 
++        if (model.layers[il].rope_freqs != nullptr) {
++            return model.layers[il].rope_freqs;
++        }
++
+         if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
+             return model.layers[il].rope_long;
+         }
+@@ -8814,6 +8821,9 @@ struct llm_build_context {
+ 
+             // self-attention
+             {
++                // rope freq factors for llama3; may return nullptr for llama2 and other models
++                struct ggml_tensor * rope_factors = build_rope_factors(il);
++
+                 // compute Q and K and RoPE them
+                 struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                 cb(Qcur, "Qcur", il);
+@@ -8837,14 +8847,14 @@ struct llm_build_context {
+                 }
+ 
+                 Qcur = ggml_rope_ext(
+-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
++                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+                 cb(Qcur, "Qcur", il);
+ 
+                 Kcur = ggml_rope_ext(
+-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
++                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+-- 
+2.45.2
+
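
The patch above only wires an optional per-layer rope_freqs tensor through to ggml_rope_ext; it does not show how those per-dimension factors are produced. As a rough sketch (not part of the patch), the function below derives Llama 3.1 style factors, one per rotary pair, matching the { n_embd/n_head/2 } shape used when loading the tensor. The function name llama31_rope_factors and the constants (scale factor 8, low/high frequency factors 1 and 4, original context 8192, base 500000) are assumptions taken from Meta's published rope-scaling configuration, not from this diff.

// Hypothetical helper: derive Llama 3.1 style per-dimension RoPE factors.
#include <cmath>
#include <vector>

static std::vector<float> llama31_rope_factors(int head_dim, float freq_base = 500000.0f) {
    const float scale_factor     = 8.0f;    // assumed rope_scaling "factor"
    const float low_freq_factor  = 1.0f;    // assumed
    const float high_freq_factor = 4.0f;    // assumed
    const float old_context_len  = 8192.0f; // assumed original training context

    const float low_freq_wavelen  = old_context_len / low_freq_factor;
    const float high_freq_wavelen = old_context_len / high_freq_factor;
    const float pi = 3.14159265358979323846f;

    std::vector<float> factors(head_dim / 2);
    for (int i = 0; i < head_dim / 2; ++i) {
        // base rotary frequency for this dimension pair, and its wavelength
        const float freq    = 1.0f / std::pow(freq_base, (2.0f * i) / head_dim);
        const float wavelen = 2.0f * pi / freq;
        if (wavelen < high_freq_wavelen) {
            factors[i] = 1.0f;          // high-frequency dims: leave unscaled
        } else if (wavelen > low_freq_wavelen) {
            factors[i] = scale_factor;  // low-frequency dims: fully scaled
        } else {
            // smooth blend between the two regimes
            const float smooth = (old_context_len / wavelen - low_freq_factor) /
                                 (high_freq_factor - low_freq_factor);
            factors[i] = 1.0f / ((1.0f - smooth) / scale_factor + smooth);
        }
    }
    return factors;
}

In ggml's rope implementation each dimension's rotation angle is divided by its frequency factor, so a factor of 1 leaves the high-frequency dimensions untouched while larger factors stretch the low-frequency ones, which is what lets Llama 3.1 extend its context beyond the original training length.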