diff --git a/llm/ggml-alloc.c b/llm/ggml-alloc.c index a5008b9c..66456cd3 100644 --- a/llm/ggml-alloc.c +++ b/llm/ggml-alloc.c @@ -1,5 +1,5 @@ /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * diff --git a/llm/ggml-alloc.h b/llm/ggml-alloc.h index fa37e60f..03b41784 100644 --- a/llm/ggml-alloc.h +++ b/llm/ggml-alloc.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * diff --git a/llm/ggml-cuda.cu b/llm/ggml-cuda.cu index 08fc6d34..4fc9cbeb 100644 --- a/llm/ggml-cuda.cu +++ b/llm/ggml-cuda.cu @@ -1,5 +1,5 @@ /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * @@ -1779,7 +1779,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( } // contiguous u/y values -// also used for q5_K static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { @@ -1789,19 +1788,18 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( float sumf_m = 0.0f; #pragma unroll - for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) { + for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) { int sumi_d = 0; #pragma unroll - for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) { - sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product - sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product } - const float2 ds8f = __half22float2(ds8[i0 / 4]); + const float2 ds8f = __half22float2(ds8[i]); - sumf_d += ds8f.x * (sc[i0/4] * sumi_d); - sumf_m += ds8f.y * m[i0/4]; // sum of q8_1 block * q4_K min val + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val } const float2 dm4f = __half22float2(dm4); @@ -1818,7 +1816,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( #define VDR_Q5_K_Q8_1_MMQ 8 // contiguous v/x values -static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl( +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) { @@ -1855,6 +1853,40 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl( #endif // __CUDA_ARCH__ >= MIN_CC_DP4A } +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product + } + + const float2 ds8f = __half22float2(ds8[i]); + + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + assert(false); + return 0.0f; // only to satisfy the compiler +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + #define VDR_Q6_K_Q8_1_MMVQ 1 #define VDR_Q6_K_Q8_1_MMQ 8 @@ -2850,18 +2882,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - int v[QR4_K*VDR_Q4_K_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) { - v[l + 0] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F; - v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F; - } - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; - return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); + return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_q5_K_q8_1( @@ -2908,7 +2933,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( u[2*i+1] = q8[4]; } - return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8); + return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); #else @@ -3051,7 +3076,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; - return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); + return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_q6_K_q8_1( @@ -3327,7 +3353,11 @@ template static __global__ void mul_mat_q4_0( #define MMQ_Y_Q4_1_PASCAL 64 #define NWARPS_Q4_1_PASCAL 8 -template static __global__ void mul_mat_q4_1( +template static __global__ void +#if __CUDA_ARCH__ < CC_TURING + __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_TURING + mul_mat_q4_1( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { @@ -3497,7 +3527,11 @@ template static __global__ void mul_mat_q2_K( #define MMQ_Y_Q3_K_PASCAL 64 #define NWARPS_Q3_K_PASCAL 8 -template static __global__ void mul_mat_q3_K( +template static __global__ void +#if __CUDA_ARCH__ < CC_TURING + __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_TURING + mul_mat_q3_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { @@ -3527,11 +3561,15 @@ template static __global__ void mul_mat_q3_K( #define MMQ_X_Q4_K_AMPERE 64 #define MMQ_Y_Q4_K_AMPERE 128 #define NWARPS_Q4_K_AMPERE 4 -#define MMQ_X_Q4_K_PASCAL 32 +#define MMQ_X_Q4_K_PASCAL 64 #define MMQ_Y_Q4_K_PASCAL 64 #define NWARPS_Q4_K_PASCAL 8 -template static __global__ void mul_mat_q4_K( +template static __global__ void +#if __CUDA_ARCH__ < CC_TURING + __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_TURING + mul_mat_q4_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { @@ -3595,11 +3633,15 @@ template static __global__ void mul_mat_q5_K( #define MMQ_X_Q6_K_AMPERE 64 #define MMQ_Y_Q6_K_AMPERE 64 #define NWARPS_Q6_K_AMPERE 4 -#define MMQ_X_Q6_K_PASCAL 32 +#define MMQ_X_Q6_K_PASCAL 64 #define MMQ_Y_Q6_K_PASCAL 64 #define NWARPS_Q6_K_PASCAL 8 -template static __global__ void mul_mat_q6_K( +template static __global__ void +#if __CUDA_ARCH__ < CC_TURING + __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_TURING + mul_mat_q6_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { diff --git a/llm/ggml-cuda.h b/llm/ggml-cuda.h index 52db8a76..1fc27125 100644 --- a/llm/ggml-cuda.h +++ b/llm/ggml-cuda.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * diff --git a/llm/ggml-metal.h b/llm/ggml-metal.h index 0df0c8db..74cdc77f 100644 --- a/llm/ggml-metal.h +++ b/llm/ggml-metal.h @@ -1,7 +1,7 @@ //go:build darwin /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * diff --git a/llm/ggml-metal.m b/llm/ggml-metal.m index b772b8b7..78eb5b8a 100644 --- a/llm/ggml-metal.m +++ b/llm/ggml-metal.m @@ -1,7 +1,7 @@ //go:build darwin /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * @@ -154,7 +154,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error]; if (error) { fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); - exit(1); + return NULL; } } #else @@ -172,7 +172,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error]; if (error) { fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); - exit(1); + return NULL; } #ifdef GGML_QKK_64 @@ -184,7 +184,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { #endif if (error) { fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); - exit(1); + return NULL; } } #endif diff --git a/llm/ggml-metal.metal b/llm/ggml-metal.metal index 36daafbd..51467d63 100644 --- a/llm/ggml-metal.metal +++ b/llm/ggml-metal.metal @@ -1,7 +1,7 @@ //go:build darwin /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * diff --git a/llm/ggml-mpi.c b/llm/ggml-mpi.c index 9d9f81dd..30e3f557 100644 --- a/llm/ggml-mpi.c +++ b/llm/ggml-mpi.c @@ -1,7 +1,7 @@ //go:build mpi /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * diff --git a/llm/ggml-mpi.h b/llm/ggml-mpi.h index c2c240ed..6390aead 100644 --- a/llm/ggml-mpi.h +++ b/llm/ggml-mpi.h @@ -1,7 +1,7 @@ //go:build mpi /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * diff --git a/llm/ggml-opencl.cpp b/llm/ggml-opencl.cpp index 24c46afc..f3670812 100644 --- a/llm/ggml-opencl.cpp +++ b/llm/ggml-opencl.cpp @@ -1,7 +1,7 @@ //go:build opencl /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * diff --git a/llm/ggml-opencl.h b/llm/ggml-opencl.h index c48028da..bad827d0 100644 --- a/llm/ggml-opencl.h +++ b/llm/ggml-opencl.h @@ -1,7 +1,7 @@ //go:build opencl /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * diff --git a/llm/ggml.c b/llm/ggml.c index 28d74a95..4cf22755 100644 --- a/llm/ggml.c +++ b/llm/ggml.c @@ -1,5 +1,5 @@ /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * diff --git a/llm/ggml.h b/llm/ggml.h index b3359e23..2506bd34 100644 --- a/llm/ggml.h +++ b/llm/ggml.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * diff --git a/llm/k_quants.c b/llm/k_quants.c index 26d5eeb7..3241be1d 100644 --- a/llm/k_quants.c +++ b/llm/k_quants.c @@ -1,5 +1,5 @@ /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * diff --git a/llm/k_quants.h b/llm/k_quants.h index 36b75341..bd363319 100644 --- a/llm/k_quants.h +++ b/llm/k_quants.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * diff --git a/llm/llama-util.h b/llm/llama-util.h index 5dcef45d..f65eefb4 100644 --- a/llm/llama-util.h +++ b/llm/llama-util.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * @@ -297,20 +297,29 @@ struct llama_mmap { throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); } - #if _WIN32_WINNT >= _WIN32_WINNT_WIN8 if (prefetch) { - // Advise the kernel to preload the mapped memory - WIN32_MEMORY_RANGE_ENTRY range; - range.VirtualAddress = addr; - range.NumberOfBytes = (SIZE_T)size; - if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { - fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); + // The PrefetchVirtualMemory API is only present on Windows 8 and above, so we + // will dynamically load it using GetProcAddress. + BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG); + HMODULE hKernel32; + + // This call is guaranteed to succeed. + hKernel32 = GetModuleHandleW(L"kernel32.dll"); + + // This call may fail if on a pre-Win8 system. + pPrefetchVirtualMemory = reinterpret_cast (GetProcAddress(hKernel32, "PrefetchVirtualMemory")); + + if (pPrefetchVirtualMemory) { + // Advise the kernel to preload the mapped memory. + WIN32_MEMORY_RANGE_ENTRY range; + range.VirtualAddress = addr; + range.NumberOfBytes = (SIZE_T)size; + if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { + fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } } } - #else - #pragma message("warning: You are building for pre-Windows 8; prefetch not supported") - #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8 } ~llama_mmap() { diff --git a/llm/llama.cpp b/llm/llama.cpp index 89ef7550..e6a6d36e 100644 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1,5 +1,5 @@ /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License * @@ -3363,6 +3363,12 @@ struct llama_context * llama_new_context_with_model( // this allocates all Metal resources and memory buffers ctx->ctx_metal = ggml_metal_init(1); + if (!ctx->ctx_metal) { + LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__); + llama_free(ctx); + return NULL; + } + void * data_ptr = NULL; size_t data_size = 0; diff --git a/llm/llama.h b/llm/llama.h index 06aa9cf0..77c6330f 100644 --- a/llm/llama.h +++ b/llm/llama.h @@ -1,5 +1,5 @@ /** - * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e + * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380 * * MIT License *