/** * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file * * MIT License * * Copyright (c) 2023-2024 The ggml authors * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include "ggml.h" // GGML internal header #include #include // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ #include #include #include // memcpy #include // fabsf #undef MIN #undef MAX #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) #if defined(_MSC_VER) #define m512bh(p) p #define m512i(p) p #else #define m512bh(p) (__m512bh)(p) #define m512i(p) (__m512i)(p) #endif /** * Converts brain16 to float32. * * The bfloat16 floating point format has the following structure: * * ┌sign * │ * │ ┌exponent * │ │ * │ │ ┌mantissa * │ │ │ * │┌──┴───┐┌─┴───┐ * 0b0000000000000000 brain16 * * Since bf16 has the same number of exponent bits as a 32bit float, * encoding and decoding numbers becomes relatively straightforward. * * ┌sign * │ * │ ┌exponent * │ │ * │ │ ┌mantissa * │ │ │ * │┌──┴───┐┌─┴───────────────────┐ * 0b00000000000000000000000000000000 IEEE binary32 * * For comparison, the standard fp16 format has fewer exponent bits. * * ┌sign * │ * │ ┌exponent * │ │ * │ │ ┌mantissa * │ │ │ * │┌─┴─┐┌─┴──────┐ * 0b0000000000000000 IEEE binary16 * * @see IEEE 754-2008 */ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { union { float f; uint32_t i; } u; u.i = (uint32_t)h.bits << 16; return u.f; } /** * Converts float32 to brain16. * * This is binary identical with Google Brain float conversion. * Floats shall round to nearest even, and NANs shall be quiet. * Subnormals aren't flushed to zero, except perhaps when used. * This code should vectorize nicely if using modern compilers. */ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { ggml_bf16_t h; union { float f; uint32_t i; } u; u.f = s; if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */ h.bits = (u.i >> 16) | 64; /* force to quiet */ return h; } h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; return h; } #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) #ifdef __cplusplus extern "C" { #endif // static_assert should be a #define, but if it's not, // fall back to the _Static_assert C11 keyword. // if C99 - static_assert is noop // ref: https://stackoverflow.com/a/53923785/4039976 #ifndef __cplusplus #ifndef static_assert #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) #define static_assert(cond, msg) _Static_assert(cond, msg) #else #define static_assert(cond, msg) struct global_scope_noop_trick #endif #endif #endif // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) #ifndef __FMA__ #define __FMA__ #endif #ifndef __F16C__ #define __F16C__ #endif #endif // __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available #if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)) #ifndef __SSE3__ #define __SSE3__ #endif #ifndef __SSSE3__ #define __SSSE3__ #endif #endif #if defined(__ARM_FEATURE_SVE) #include #include #endif // 16-bit float // on Arm, we use __fp16 // on x86, we use uint16_t #if defined(__ARM_NEON) // if YCM cannot find , make a symbolic link to it, for example: // // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ // #include #ifdef _MSC_VER typedef uint16_t ggml_fp16_internal_t; #define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } #else typedef __fp16 ggml_fp16_internal_t; #define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } #endif // _MSC_VER #if !defined(__aarch64__) // 32-bit ARM compatibility // vaddvq_s16 // vpaddq_s16 // vpaddq_s32 // vaddvq_s32 // vaddvq_f32 // vmaxvq_f32 // vcvtnq_s32_f32 // vzip1_u8 // vzip2_u8 inline static int32_t vaddvq_s16(int16x8_t v) { return (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); } inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); return vcombine_s16(a0, b0); } inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); return vcombine_s32(a0, b0); } inline static int32_t vaddvq_s32(int32x4_t v) { return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); } inline static float vaddvq_f32(float32x4_t v) { return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); } inline static float vmaxvq_f32(float32x4_t v) { return MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); } inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { int32x4_t res; res[0] = roundf(vgetq_lane_f32(v, 0)); res[1] = roundf(vgetq_lane_f32(v, 1)); res[2] = roundf(vgetq_lane_f32(v, 2)); res[3] = roundf(vgetq_lane_f32(v, 3)); return res; } inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { uint8x8_t res; res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1]; res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3]; return res; } inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { uint8x8_t res; res[0] = a[4]; res[1] = b[4]; res[2] = a[5]; res[3] = b[5]; res[4] = a[6]; res[5] = b[6]; res[6] = a[7]; res[7] = b[7]; return res; } // vld1q_s16_x2 // vld1q_u8_x2 // vld1q_u8_x4 // vld1q_s8_x2 // vld1q_s8_x4 // TODO: double-check these work correctly typedef struct ggml_int16x8x2_t { int16x8_t val[2]; } ggml_int16x8x2_t; inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) { ggml_int16x8x2_t res; res.val[0] = vld1q_s16(ptr + 0); res.val[1] = vld1q_s16(ptr + 8); return res; } typedef struct ggml_uint8x16x2_t { uint8x16_t val[2]; } ggml_uint8x16x2_t; inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { ggml_uint8x16x2_t res; res.val[0] = vld1q_u8(ptr + 0); res.val[1] = vld1q_u8(ptr + 16); return res; } typedef struct ggml_uint8x16x4_t { uint8x16_t val[4]; } ggml_uint8x16x4_t; inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { ggml_uint8x16x4_t res; res.val[0] = vld1q_u8(ptr + 0); res.val[1] = vld1q_u8(ptr + 16); res.val[2] = vld1q_u8(ptr + 32); res.val[3] = vld1q_u8(ptr + 48); return res; } typedef struct ggml_int8x16x2_t { int8x16_t val[2]; } ggml_int8x16x2_t; inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { ggml_int8x16x2_t res; res.val[0] = vld1q_s8(ptr + 0); res.val[1] = vld1q_s8(ptr + 16); return res; } typedef struct ggml_int8x16x4_t { int8x16_t val[4]; } ggml_int8x16x4_t; inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { ggml_int8x16x4_t res; res.val[0] = vld1q_s8(ptr + 0); res.val[1] = vld1q_s8(ptr + 16); res.val[2] = vld1q_s8(ptr + 32); res.val[3] = vld1q_s8(ptr + 48); return res; } // NOTE: not tested inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { int8x16_t res; res[ 0] = a[b[ 0]]; res[ 1] = a[b[ 1]]; res[ 2] = a[b[ 2]]; res[ 3] = a[b[ 3]]; res[ 4] = a[b[ 4]]; res[ 5] = a[b[ 5]]; res[ 6] = a[b[ 6]]; res[ 7] = a[b[ 7]]; res[ 8] = a[b[ 8]]; res[ 9] = a[b[ 9]]; res[10] = a[b[10]]; res[11] = a[b[11]]; res[12] = a[b[12]]; res[13] = a[b[13]]; res[14] = a[b[14]]; res[15] = a[b[15]]; return res; } // NOTE: not tested inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { uint8x16_t res; res[ 0] = a[b[ 0]]; res[ 1] = a[b[ 1]]; res[ 2] = a[b[ 2]]; res[ 3] = a[b[ 3]]; res[ 4] = a[b[ 4]]; res[ 5] = a[b[ 5]]; res[ 6] = a[b[ 6]]; res[ 7] = a[b[ 7]]; res[ 8] = a[b[ 8]]; res[ 9] = a[b[ 9]]; res[10] = a[b[10]]; res[11] = a[b[11]]; res[12] = a[b[12]]; res[13] = a[b[13]]; res[14] = a[b[14]]; res[15] = a[b[15]]; return res; } #else #define ggml_int16x8x2_t int16x8x2_t #define ggml_uint8x16x2_t uint8x16x2_t #define ggml_uint8x16x4_t uint8x16x4_t #define ggml_int8x16x2_t int8x16x2_t #define ggml_int8x16x4_t int8x16x4_t #define ggml_vld1q_s16_x2 vld1q_s16_x2 #define ggml_vld1q_u8_x2 vld1q_u8_x2 #define ggml_vld1q_u8_x4 vld1q_u8_x4 #define ggml_vld1q_s8_x2 vld1q_s8_x2 #define ggml_vld1q_s8_x4 vld1q_s8_x4 #define ggml_vqtbl1q_s8 vqtbl1q_s8 #define ggml_vqtbl1q_u8 vqtbl1q_u8 #endif // !defined(__aarch64__) #if !defined(__ARM_FEATURE_DOTPROD) inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); } #else #define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) #endif // !defined(__ARM_FEATURE_DOTPROD) #endif // defined(__ARM_NEON) #if defined(__ARM_NEON) && !defined(_MSC_VER) #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { ggml_fp16_internal_t tmp; memcpy(&tmp, &h, sizeof(ggml_fp16_t)); return (float)tmp; } static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { ggml_fp16_t res; ggml_fp16_internal_t tmp = f; memcpy(&res, &tmp, sizeof(ggml_fp16_t)); return res; } #else #ifdef __wasm_simd128__ #include #else #ifdef __POWER9_VECTOR__ #include #undef bool #define bool _Bool #else #if defined(_MSC_VER) || defined(__MINGW32__) #include #else #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__) #if !defined(__riscv) #include #endif #endif #endif #endif #endif #ifdef __riscv_v_intrinsic #include #endif #if defined(__loongarch64) #if defined(__loongarch_asx) #include #endif #if defined(__loongarch_sx) #include #endif #endif #if defined(__loongarch_asx) typedef union { int32_t i; float f; } ft_union; /* float type data load instructions */ static __m128 __lsx_vreplfr2vr_s(float val) { ft_union fi_tmpval = {.f = val}; return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i); } static __m256 __lasx_xvreplfr2vr_s(float val) { ft_union fi_tmpval = {.f = val}; return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i); } #endif #ifdef __F16C__ #ifdef _MSC_VER #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) #else #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) #endif #elif defined(__POWER9_VECTOR__) #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) /* the inline asm below is about 12% faster than the lookup method */ #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { register float f; register double d; __asm__( "mtfprd %0,%2\n" "xscvhpdp %0,%0\n" "frsp %1,%0\n" : /* temp */ "=d"(d), /* out */ "=f"(f): /* in */ "r"(h)); return f; } static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { register double d; register ggml_fp16_t r; __asm__( /* xscvdphp can work on double or single precision */ "xscvdphp %0,%2\n" "mffprd %1,%0\n" : /* temp */ "=d"(d), /* out */ "=r"(r): /* in */ "f"(f)); return r; } #else // FP16 <-> FP32 // ref: https://github.com/Maratyszcza/FP16 static inline float fp32_from_bits(uint32_t w) { union { uint32_t as_bits; float as_value; } fp32; fp32.as_bits = w; return fp32.as_value; } static inline uint32_t fp32_to_bits(float f) { union { float as_value; uint32_t as_bits; } fp32; fp32.as_value = f; return fp32.as_bits; } static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { const uint32_t w = (uint32_t) h << 16; const uint32_t sign = w & UINT32_C(0x80000000); const uint32_t two_w = w + w; const uint32_t exp_offset = UINT32_C(0xE0) << 23; #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) const float exp_scale = 0x1.0p-112f; #else const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); #endif const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; const uint32_t magic_mask = UINT32_C(126) << 23; const float magic_bias = 0.5f; const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; const uint32_t denormalized_cutoff = UINT32_C(1) << 27; const uint32_t result = sign | (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); return fp32_from_bits(result); } static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) const float scale_to_inf = 0x1.0p+112f; const float scale_to_zero = 0x1.0p-110f; #else const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); #endif float base = (fabsf(f) * scale_to_inf) * scale_to_zero; const uint32_t w = fp32_to_bits(f); const uint32_t shl1_w = w + w; const uint32_t sign = w & UINT32_C(0x80000000); uint32_t bias = shl1_w & UINT32_C(0xFF000000); if (bias < UINT32_C(0x71000000)) { bias = UINT32_C(0x71000000); } base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; const uint32_t bits = fp32_to_bits(base); const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); const uint32_t nonsign = exp_bits + mantissa_bits; return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); } #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) #endif // __F16C__ #endif // defined(__ARM_NEON) && (!defined(__MSC_VER) #ifdef __ARM_FEATURE_SVE #include #endif // __ARM_FEATURE_SVE // precomputed f32 table for f16 (256 KB) // defined in ggml.c, initialized in ggml_init() extern float ggml_table_f32_f16[1 << 16]; // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. // This is also true for POWER9. #if !defined(GGML_FP16_TO_FP32) inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { uint16_t s; memcpy(&s, &f, sizeof(uint16_t)); return ggml_table_f32_f16[s]; } #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) #endif #if !defined(GGML_FP32_TO_FP16) #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) #endif // bitset static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated"); #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8) #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1) static size_t ggml_bitset_size(size_t n) { return (n + BITSET_MASK) >> BITSET_SHR; } static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) { return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK))); } static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) { bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK)); } static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) { bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK)); } // hash set #define GGML_HASHSET_FULL ((size_t)-1) #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2) struct ggml_hash_set ggml_hash_set_new(size_t size); void ggml_hash_set_free(struct ggml_hash_set * hash_set); // returns the minimum size for a hash set that can hold min_sz elements size_t ggml_hash_size(size_t min_sz); // remove all elements from the hash set void ggml_hash_set_reset(struct ggml_hash_set * hash_set); // returns true if key is in the hash set static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key); // returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key); // returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key); // return index, asserts if table is full static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key); // hash function for ggml_tensor static inline size_t ggml_hash(const struct ggml_tensor * p) { // the last 4 bits are always zero due to alignment return (size_t)(uintptr_t)p >> 4; } static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) { size_t h = ggml_hash(key) % hash_set->size; // linear probing size_t i = h; while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) { i = (i + 1) % hash_set->size; if (i == h) { // visited all hash table entries -> not found return GGML_HASHSET_FULL; } } return i; } static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) { size_t i = ggml_hash_find(hash_set, key); return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i); } static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) { size_t h = ggml_hash(key) % hash_set->size; // linear probing size_t i = h; do { if (!ggml_bitset_get(hash_set->used, i)) { ggml_bitset_set(hash_set->used, i); hash_set->keys[i] = key; return i; } if (hash_set->keys[i] == key) { return GGML_HASHSET_ALREADY_EXISTS; } i = (i + 1) % hash_set->size; } while (i != h); // visited all hash table entries -> not found GGML_ABORT("fatal error"); } static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) { size_t h = ggml_hash(key) % hash_set->size; // linear probing size_t i = h; do { if (!ggml_bitset_get(hash_set->used, i)) { ggml_bitset_set(hash_set->used, i); hash_set->keys[i] = key; return i; } if (hash_set->keys[i] == key) { return i; } i = (i + 1) % hash_set->size; } while (i != h); // visited all hash table entries -> not found GGML_ABORT("fatal error"); } #ifdef __cplusplus } #endif