Fix embeddings memory corruption (#6467)

* Fix embeddings memory corruption The patch was leading to a buffer overrun corruption. Once removed though, parallelism in server.cpp led to hitting an assert due to slot/seq IDs being >= token count. To work around this, only use slot 0 for embeddings. * Fix embed integration test assumption The token eval count has changed with recent llama.cpp bumps (0.3.5+)
2024-08-22 14:51:42 -07:00 · 2024-08-22 14:51:42 -07:00 · 90ca84172c
commit 90ca84172c
parent 6bd8a4b0a1
4 changed files with 16 additions and 65 deletions
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
 	}
-	if res.PromptEvalCount != 8 {
+	if res.PromptEvalCount != 6 {
-		t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
+		t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }
@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
 	}
-	if res.PromptEvalCount != 16 {
+	if res.PromptEvalCount != 12 {
-		t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
+		t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@ -1429,7 +1429,13 @@ struct llama_server_context
        switch (task.type)
        {
            case TASK_TYPE_COMPLETION: {
-                server_slot *slot = prefix_slot(task.data["prompt"]);
+                server_slot *slot = nullptr;
                if (task.embedding_mode) {
                    // Embedding seq_id (aka slot id) must always be < token count (llama.cpp asserts seq_id < n_tokens), so always use slot 0
                    slot = slots[0].available() ? &slots[0] : nullptr;
                } else {
                    slot = prefix_slot(task.data["prompt"]);
                }
                if (slot == nullptr)
                {
                    // if no slot is available, we defer this task for processing later
--- a/llm/patches/08-pooling.diff
+++ b/llm/patches/08-pooling.diff
@ -1,60 +0,0 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
 index 721b8f4e..cfe7ac40 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
@@ -8420,14 +8420,14 @@ struct llm_build_context {
     }
     struct ggml_tensor * build_inp_mean() {
 -        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
 +        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max);
         cb(lctx.inp_mean, "inp_mean", -1);
         ggml_set_input(lctx.inp_mean);
         return lctx.inp_mean;
     }
     struct ggml_tensor * build_inp_cls() {
 -        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
 +        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max);
         cb(lctx.inp_cls, "inp_cls", -1);
         ggml_set_input(lctx.inp_cls);
         return lctx.inp_cls;
@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
         float * data = (float *) lctx.inp_mean->data;
 -        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
 +        memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean));
         std::vector<uint64_t> sum(n_tokens, 0);
         for (int i = 0; i < n_tokens; ++i) {
             const llama_seq_id seq_id = batch.seq_id[i][0];
 -
 -            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
 -
             sum[seq_id] += 1;
         }
 -        std::vector<float> div(n_tokens, 0.0f);
 -        for (int i = 0; i < n_tokens; ++i) {
 +        std::vector<float> div(cparams.n_seq_max, 0.0f);
 +        for (uint32_t i = 0; i < cparams.n_seq_max; ++i) {
             const uint64_t s = sum[i];
             if (s > 0) {
                 div[i] = 1.0f/float(s);
@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
         uint32_t * data = (uint32_t *) lctx.inp_cls->data;
 -        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
 +        memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls));
         for (int i = 0; i < n_tokens; ++i) {
             const llama_seq_id seq_id = batch.seq_id[i][0];
             const llama_pos    pos    = batch.pos[i];
 -
 -            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
 -
             if (pos == 0) {
                 data[seq_id] = i;
             }
--- a/server/sched.go
+++ b/server/sched.go
@ -193,6 +193,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}
 					// Embedding models should always be loaded with parallel=1
 					if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
 						numParallel = 1
 					}
 					// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 					if len(gpus) == 1 && gpus[0].Library == "cpu" {
 						// simplifying assumption of defaultParallel when in CPU mode