ollama/llm/llama.cpp/ggml_patch/0004-metal-add-missing-barriers-for-mul-mat-2699.patch

From 14b1d7e6f720dee41ce5a826376df738096d9033 Mon Sep 17 00:00:00 2001
From: Shouzheng Liu <lshzh.hi@gmail.com>
Date: Tue, 22 Aug 2023 02:18:40 -0400
Subject: [PATCH] metal : add missing barriers for mul-mat (#2699)

---
 ggml-metal.metal | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ggml-metal.metal b/ggml-metal.metal
index 88d48f6..ce3541f 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const  uchar * src0,
         //load data and store to threadgroup memory
         half4x4 temp_a;
         dequantize_func(x, il, temp_a);
+        threadgroup_barrier(mem_flags::mem_threadgroup);
         #pragma unroll(16)
         for (int i = 0; i < 16; i++) {
             *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
@@ -1895,14 +1896,14 @@ kernel void kernel_mul_mm(device const  uchar * src0,
         }
     } else {
         // block is smaller than 64x32, we should avoid writing data outside of the matrix
+        threadgroup_barrier(mem_flags::mem_threadgroup);
         threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
                                       + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
         for (int i = 0; i < 8; i++) {
-            threadgroup_barrier(mem_flags::mem_device);
             simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
         }
 
-        threadgroup_barrier(mem_flags::mem_device);
+        threadgroup_barrier(mem_flags::mem_threadgroup);
         device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
         if (sgitg==0) {
             for (int i = 0; i < n_rows; i++) {
-- 
2.41.0
metal: add missing barriers for mul-mat (#469) 2023-09-05 23:37:13 +00:00			`From 14b1d7e6f720dee41ce5a826376df738096d9033 Mon Sep 17 00:00:00 2001`
			`From: Shouzheng Liu <lshzh.hi@gmail.com>`
			`Date: Tue, 22 Aug 2023 02:18:40 -0400`
			`Subject: [PATCH] metal : add missing barriers for mul-mat (#2699)`

			`---`
			`ggml-metal.metal \| 5 +++--`
			`1 file changed, 3 insertions(+), 2 deletions(-)`

			`diff --git a/ggml-metal.metal b/ggml-metal.metal`
			`index 88d48f6..ce3541f 100644`
			`--- a/ggml-metal.metal`
			`+++ b/ggml-metal.metal`
			`@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,`
			`//load data and store to threadgroup memory`
			`half4x4 temp_a;`
			`dequantize_func(x, il, temp_a);`
			`+ threadgroup_barrier(mem_flags::mem_threadgroup);`
			`#pragma unroll(16)`
			`for (int i = 0; i < 16; i++) {`
			`*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \`
			`@@ -1895,14 +1896,14 @@ kernel void kernel_mul_mm(device const uchar * src0,`
			`}`
			`} else {`
			`// block is smaller than 64x32, we should avoid writing data outside of the matrix`
			`+ threadgroup_barrier(mem_flags::mem_threadgroup);`
			`threadgroup float *temp_str = ((threadgroup float *)shared_memory) \`
			`+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;`
			`for (int i = 0; i < 8; i++) {`
			`- threadgroup_barrier(mem_flags::mem_device);`
			`simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);`
			`}`

			`- threadgroup_barrier(mem_flags::mem_device);`
			`+ threadgroup_barrier(mem_flags::mem_threadgroup);`
			`device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;`
			`if (sgitg==0) {`
			`for (int i = 0; i < n_rows; i++) {`
			`--`
			`2.41.0`