ollama/llama/sgemm.cpp

// Copyright 2024 Mozilla Foundation
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

//
//                   _   _          ___ _      _   ___
//                  | |_(_)_ _ _  _| _ ) |    /_\ / __|
//                  |  _| | ' \ || | _ \ |__ / _ \\__ \.
//                   \__|_|_||_\_, |___/____/_/ \_\___/
//                             |__/
//
//                    BASIC LINEAR ALGEBRA SUBPROGRAMS
//
//
// This file implements multithreaded CPU matrix multiplication for the
// common contiguous use case C = Aᵀ * B. These kernels are designed to
// have excellent performance[1] for matrices that fit in the CPU cache
// without imposing any overhead such as cache filling or malloc calls.
//
// This implementation does not guarantee any upper bound with rounding
// errors, which grow along with k. Our goal's to maximally exploit the
// hardware for performance, and then use whatever resources remain for
// improving numerical accuracy.
//
// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].

#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wpedantic"
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif

#include "sgemm.h"
#include "ggml-impl.h"
#include "ggml-cpu-impl.h"
#include "ggml-quants.h"

#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
#else
#define NOINLINE __attribute__((__noinline__))
#endif

#if defined(__ARM_NEON) || defined(__AVX512F__)
#define VECTOR_REGISTERS 32
#else
#define VECTOR_REGISTERS 16
#endif

#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

namespace {

inline float unhalf(ggml_fp16_t d) {
    return GGML_FP16_TO_FP32(d);
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED ARITHMETIC OPERATIONS

#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
inline __m128 add(__m128 x, __m128 y) { return _mm_add_ps(x, y); }
inline __m128 sub(__m128 x, __m128 y) { return _mm_sub_ps(x, y); }
inline __m128 mul(__m128 x, __m128 y) { return _mm_mul_ps(x, y); }
#endif  // __SSE__

#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
inline __m256 add(__m256 x, __m256 y) { return _mm256_add_ps(x, y); }
inline __m256 sub(__m256 x, __m256 y) { return _mm256_sub_ps(x, y); }
inline __m256 mul(__m256 x, __m256 y) { return _mm256_mul_ps(x, y); }
#endif // __AVX__

#if defined(__AVX512F__)
inline __m512 add(__m512 x, __m512 y) { return _mm512_add_ps(x, y); }
inline __m512 sub(__m512 x, __m512 y) { return _mm512_sub_ps(x, y); }
inline __m512 mul(__m512 x, __m512 y) { return _mm512_mul_ps(x, y); }
#endif // __AVX512F__

#if defined(__ARM_NEON)
inline float32x4_t add(float32x4_t x, float32x4_t y) { return vaddq_f32(x, y); }
inline float32x4_t sub(float32x4_t x, float32x4_t y) { return vsubq_f32(x, y); }
inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vmulq_f32(x, y); }
#endif // __ARM_NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
inline float16x8_t add(float16x8_t x, float16x8_t y) { return vaddq_f16(x, y); }
inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); }
inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED FUSED MULTIPLY ADD

/**
 * Computes a * b + c.
 */
template <typename T, typename U>
inline U madd(T a, T b, U c) {
    return add(mul(a, b), c);
}

#if defined(__FMA__)
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
template <>
inline __m256 madd(__m256 a, __m256 b, __m256 c) {
    return _mm256_fmadd_ps(a, b, c);
}
#endif
#if defined(__AVX512F__)
template <>
inline __m512 madd(__m512 a, __m512 b, __m512 c) {
    return _mm512_fmadd_ps(a, b, c);
}
#endif
#endif

#if defined(__ARM_FEATURE_FMA)
template <>
inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
    return vfmaq_f32(c, b, a);
}
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
template <>
inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
    return vfmaq_f16(c, b, a);
}
#endif
#endif

////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED HORIZONTAL SUM

#if defined(__ARM_NEON)
inline float hsum(float32x4_t x) {
    return vaddvq_f32(x);
}
#endif // __ARM_NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
inline float hsum(float16x8_t x) {
    return vaddvq_f32(vaddq_f32(vcvt_f32_f16(vget_low_f16(x)),
                                vcvt_f32_f16(vget_high_f16(x))));
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
inline float hsum(__m128 x) {
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
    x = _mm_add_ps(x, _mm_movehl_ps(x, x));
    x = _mm_add_ss(x, _mm_movehdup_ps(x));
#else
    __m128 t;
    t = _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1));
    x = _mm_add_ps(x, t);
    t = _mm_movehl_ps(t, x);
    x = _mm_add_ss(x, t);
#endif
    return _mm_cvtss_f32(x);
}
#endif

#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
inline float hsum(__m256 x) {
    return hsum(_mm_add_ps(_mm256_extractf128_ps(x, 1),
                           _mm256_castps256_ps128(x)));
}
#endif // __AVX__

#if defined(__AVX512F__)
inline float hsum(__m512 x) {
    return _mm512_reduce_add_ps(x);
}
#endif // __AVX512F__

////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED MEMORY LOADING

template <typename T, typename U> T load(const U *);

#if defined(__ARM_NEON)
template <> inline float32x4_t load(const float *p) {
    return vld1q_f32(p);
}
#if !defined(_MSC_VER)
template <> inline float16x8_t load(const ggml_fp16_t *p) {
    return vld1q_f16((const float16_t *)p);
}
template <> inline float32x4_t load(const ggml_fp16_t *p) {
    return vcvt_f32_f16(vld1_f16((const float16_t *)p));
}
#endif // _MSC_VER
#endif // __ARM_NEON

#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
template <> inline __m128 load(const float *p) {
    return _mm_loadu_ps(p);
}
#endif  // __SSE__

#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
template <> inline __m256 load(const float *p) {
    return _mm256_loadu_ps(p);
}
#endif // __AVX__

#if defined(__F16C__)
template <> inline __m256 load(const ggml_fp16_t *p) {
    return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
}
#endif // __F16C__

#if defined(__AVX512F__)
template <> inline __m512 load(const float *p) {
    return _mm512_loadu_ps(p);
}
template <> inline __m512 load(const ggml_fp16_t *p) {
    return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
}
#endif // __AVX512F__

////////////////////////////////////////////////////////////////////////////////////////////////////
// CONSTANTS

#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
#endif

////////////////////////////////////////////////////////////////////////////////////////////////////
// FLOATING POINT MATRIX MULTIPLICATION

template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
class tinyBLAS {
  public:
    tinyBLAS(int64_t k,
             const TA *A, int64_t lda,
             const TB *B, int64_t ldb,
             TC *C, int64_t ldc,
             int ith, int nth)
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

    void matmul(int64_t m, int64_t n) {
        mnpack(0, m, 0, n);
    }

  private:
    NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t mc, nc, mp, np;
        switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
#if VECTOR_REGISTERS == 32
        case 0x55:
            mc = 5;
            nc = 5;
            gemm<5, 5>(m0, m, n0, n);
            break;
        case 0x45:
            mc = 4;
            nc = 5;
            gemm<4, 5>(m0, m, n0, n);
            break;
        case 0x54:
            mc = 5;
            nc = 4;
            gemm<5, 4>(m0, m, n0, n);
            break;
        case 0x44:
            mc = 4;
            nc = 4;
            gemm<4, 4>(m0, m, n0, n);
            break;
        case 0x53:
            mc = 5;
            nc = 3;
            gemm<5, 3>(m0, m, n0, n);
            break;
        case 0x35:
            mc = 3;
            nc = 5;
            gemm<3, 5>(m0, m, n0, n);
            break;
        case 0x43:
            mc = 4;
            nc = 3;
            gemm<4, 3>(m0, m, n0, n);
            break;
#else
        case 0x55:
        case 0x54:
        case 0x53:
        case 0x45:
        case 0x44:
        case 0x43:
            mc = 4;
            nc = 3;
            gemm<4, 3>(m0, m, n0, n);
            break;
        case 0x35:
#endif
        case 0x34:
            mc = 3;
            nc = 4;
            gemm<3, 4>(m0, m, n0, n);
            break;
        case 0x52:
            mc = 5;
            nc = 2;
            gemm<5, 2>(m0, m, n0, n);
            break;
        case 0x33:
            mc = 3;
            nc = 3;
            gemm<3, 3>(m0, m, n0, n);
            break;
        case 0x25:
            mc = 2;
            nc = 5;
            gemm<2, 5>(m0, m, n0, n);
            break;
        case 0x42:
            mc = 4;
            nc = 2;
            gemm<4, 2>(m0, m, n0, n);
            break;
        case 0x24:
            mc = 2;
            nc = 4;
            gemm<2, 4>(m0, m, n0, n);
            break;
        case 0x32:
            mc = 3;
            nc = 2;
            gemm<3, 2>(m0, m, n0, n);
            break;
        case 0x23:
            mc = 2;
            nc = 3;
            gemm<2, 3>(m0, m, n0, n);
            break;
        case 0x51:
            mc = 5;
            nc = 1;
            gemm<5, 1>(m0, m, n0, n);
            break;
        case 0x41:
            mc = 4;
            nc = 1;
            gemm<4, 1>(m0, m, n0, n);
            break;
        case 0x22:
            mc = 2;
            nc = 2;
            gemm<2, 2>(m0, m, n0, n);
            break;
        case 0x15:
            mc = 1;
            nc = 5;
            gemm<1, 5>(m0, m, n0, n);
            break;
        case 0x14:
            mc = 1;
            nc = 4;
            gemm<1, 4>(m0, m, n0, n);
            break;
        case 0x31:
            mc = 3;
            nc = 1;
            gemm<3, 1>(m0, m, n0, n);
            break;
        case 0x13:
            mc = 1;
            nc = 3;
            gemm<1, 3>(m0, m, n0, n);
            break;
        case 0x21:
            mc = 2;
            nc = 1;
            gemm<2, 1>(m0, m, n0, n);
            break;
        case 0x12:
            mc = 1;
            nc = 2;
            gemm<1, 2>(m0, m, n0, n);
            break;
        case 0x11:
            mc = 1;
            nc = 1;
            gemm<1, 1>(m0, m, n0, n);
            break;
        default:
            return;
        }
        mp = m0 + (m - m0) / mc * mc;
        np = n0 + (n - n0) / nc * nc;
        mnpack(mp, m, n0, np);
        mnpack(m0, m, np, n);
    }

    template <int RM, int RN>
    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t ytiles = (m - m0) / RM;
        int64_t xtiles = (n - n0) / RN;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * RN;
            D Cv[RN][RM] = {};
            for (int64_t l = 0; l < k; l += KN)
                for (int64_t j = 0; j < RN; ++j)
                    for (int64_t i = 0; i < RM; ++i)
                        Cv[j][i] = madd(load<V>(A + lda * (ii + i) + l),
                                        load<V>(B + ldb * (jj + j) + l),
                                        Cv[j][i]);
            for (int64_t j = 0; j < RN; ++j)
                for (int64_t i = 0; i < RM; ++i)
                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
        }
    }

    const TA *const A;
    const TB *const B;
    TC *const C;
    const int64_t k;
    const int64_t lda;
    const int64_t ldb;
    const int64_t ldc;
    const int ith;
    const int nth;
};

//////////////////////////////////////////////////////////////////////////////////////////
// QUANT ZERO MATRIX MULTIPLICATION

#if defined(__ARM_FEATURE_DOTPROD)
template <typename TA>
class tinyBLAS_Q0_ARM {
  public:
    tinyBLAS_Q0_ARM(int64_t k,
                    const TA *A, int64_t lda,
                    const block_q8_0 *B, int64_t ldb,
                    float *C, int64_t ldc,
                    int ith, int nth)
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

    void matmul(int64_t m, int64_t n) {
        mnpack(0, m, 0, n);
    }

  private:
    NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t mc, nc, mp, np;
        switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3ll)) {
        case 0x33:
            mc = 3;
            nc = 3;
            gemm<3, 3>(m0, m, n0, n);
            break;
        case 0x32:
            mc = 3;
            nc = 2;
            gemm<3, 2>(m0, m, n0, n);
            break;
        case 0x23:
            mc = 2;
            nc = 3;
            gemm<2, 3>(m0, m, n0, n);
            break;
        case 0x22:
            mc = 2;
            nc = 2;
            gemm<2, 2>(m0, m, n0, n);
            break;
        case 0x31:
            mc = 3;
            nc = 1;
            gemm<3, 1>(m0, m, n0, n);
            break;
        case 0x13:
            mc = 1;
            nc = 3;
            gemm<1, 3>(m0, m, n0, n);
            break;
        case 0x21:
            mc = 2;
            nc = 1;
            gemm<2, 1>(m0, m, n0, n);
            break;
        case 0x12:
            mc = 1;
            nc = 2;
            gemm<1, 2>(m0, m, n0, n);
            break;
        case 0x11:
            mc = 1;
            nc = 1;
            gemm<1, 1>(m0, m, n0, n);
            break;
        default:
            return;
        }
        mp = m0 + (m - m0) / mc * mc;
        np = n0 + (n - n0) / nc * nc;
        mnpack(mp, m, n0, np);
        mnpack(m0, m, np, n);
    }

    template <int RM, int RN>
    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t ytiles = (m - m0) / RM;
        int64_t xtiles = (n - n0) / RN;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * RN;
            float32x4_t Cv[RN][RM] = {};
            for (int64_t l = 0; l < k; ++l)
                for (int64_t j = 0; j < RN; ++j)
                    for (int64_t i = 0; i < RM; ++i)
                        Cv[j][i] = vmlaq_n_f32(Cv[j][i],
                                               vcvtq_f32_s32(vdotq_s32(
                                                   vdotq_s32(vdupq_n_s32(0),
                                                             load_lo(A + lda * (ii + i) + l),
                                                             load_lo(B + ldb * (jj + j) + l)),
                                                   load_hi(A + lda * (ii + i) + l),
                                                   load_hi(B + ldb * (jj + j) + l))),
                                               unhalf(A[lda * (ii + i) + l].d) *
                                               unhalf(B[ldb * (jj + j) + l].d));
            for (int64_t j = 0; j < RN; ++j)
                for (int64_t i = 0; i < RM; ++i)
                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
        }
    }

    inline int8x16_t load_lo(const block_q8_0 *b) {
        return vld1q_s8(b->qs);
    }

    inline int8x16_t load_hi(const block_q8_0 *b) {
        return vld1q_s8(b->qs + 16);
    }

    inline int8x16_t load_lo(const block_q4_0 *b) {
        return vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vld1q_u8(b->qs),
                                                     vdupq_n_u8(0x0f))),
                        vdupq_n_s8(0x8));
    }

    inline int8x16_t load_hi(const block_q4_0 *b) {
        return vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(vld1q_u8(b->qs), 4)),
                        vdupq_n_s8(0x8));
    }

    const TA *const A;
    const block_q8_0 *const B;
    float *const C;
    const int64_t k;
    const int64_t lda;
    const int64_t ldb;
    const int64_t ldc;
    const int ith;
    const int nth;
};
#endif // __ARM_FEATURE_DOTPROD

#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
template <typename TA, typename TB, typename TC>
class tinyBLAS_Q0_AVX {
  public:
    tinyBLAS_Q0_AVX(int64_t k,
                    const TA *A, int64_t lda,
                    const TB *B, int64_t ldb,
                    TC *C, int64_t ldc,
                    int ith, int nth)
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

    void matmul(int64_t m, int64_t n) {
        mnpack(0, m, 0, n);
    }

  private:
    void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t mc, nc, mp, np;
        switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 4)) {
#if VECTOR_REGISTERS == 32
        case 0x44:
            mc = 4;
            nc = 4;
#if defined(__AVX2__) && defined(__F16C__)
            gemm4xN<4>(m0, m, n0, n);
#else
            gemm<4, 4>(m0, m, n0, n);
#endif
            break;
        case 0x43:
            mc = 4;
            nc = 3;
#if defined(__AVX2__) && defined(__F16C__)
            gemm4xN<3>(m0, m, n0, n);
#else
            gemm<4, 3>(m0, m, n0, n);
#endif
            break;
        case 0x34:
            mc = 3;
            nc = 4;
#if defined(__AVX2__) && defined(__F16C__)
            gemmMx4<3>(m0, m, n0, n);
#else
            gemm<3, 4>(m0, m, n0, n);
#endif
            break;
        case 0x33:
            mc = 3;
            nc = 3;
            gemm<3, 3>(m0, m, n0, n);
            break;
        case 0x42:
            mc = 4;
            nc = 2;
#if defined(__AVX2__) && defined(__F16C__)
            gemm4xN<2>(m0, m, n0, n);
#else
            gemm<4, 2>(m0, m, n0, n);
#endif
            break;
        case 0x24:
            mc = 2;
            nc = 4;
#if defined(__AVX2__) && defined(__F16C__)
            gemmMx4<2>(m0, m, n0, n);
#else
            gemm<2, 4>(m0, m, n0, n);
#endif
            break;
#else
        case 0x44:
        case 0x43:
        case 0x42:
            mc = 4;
            nc = 2;
#if defined(__AVX2__) && defined(__F16C__)
            gemm4xN<2>(m0, m, n0, n);
#else
            gemm<4, 2>(m0, m, n0, n);
#endif
            break;
        case 0x34:
        case 0x24:
            mc = 2;
            nc = 4;
#if defined(__AVX2__) && defined(__F16C__)
            gemmMx4<2>(m0, m, n0, n);
#else
            gemm<2, 4>(m0, m, n0, n);
#endif
            break;
        case 0x33:
#endif
        case 0x32:
            mc = 3;
            nc = 2;
            gemm<3, 2>(m0, m, n0, n);
            break;
        case 0x23:
            mc = 2;
            nc = 3;
            gemm<2, 3>(m0, m, n0, n);
            break;
        case 0x41:
            mc = 4;
            nc = 1;
#if defined(__AVX2__) && defined(__F16C__)
            gemm4xN<1>(m0, m, n0, n);
#else
            gemm<4, 1>(m0, m, n0, n);
#endif
            break;
        case 0x22:
            mc = 2;
            nc = 2;
            gemm<2, 2>(m0, m, n0, n);
            break;
        case 0x14:
            mc = 1;
            nc = 4;
#if defined(__AVX2__) && defined(__F16C__)
            gemmMx4<1>(m0, m, n0, n);
#else
            gemm<1, 4>(m0, m, n0, n);
#endif
            break;
        case 0x31:
            mc = 3;
            nc = 1;
            gemm<3, 1>(m0, m, n0, n);
            break;
        case 0x13:
            mc = 1;
            nc = 3;
            gemm<1, 3>(m0, m, n0, n);
            break;
        case 0x21:
            mc = 2;
            nc = 1;
            gemm<2, 1>(m0, m, n0, n);
            break;
        case 0x12:
            mc = 1;
            nc = 2;
            gemm<1, 2>(m0, m, n0, n);
            break;
        case 0x11:
            mc = 1;
            nc = 1;
            gemm<1, 1>(m0, m, n0, n);
            break;
        default:
            return;
        }
        mp = m0 + (m - m0) / mc * mc;
        np = n0 + (n - n0) / nc * nc;
        mnpack(mp, m, n0, np);
        mnpack(m0, m, np, n);
    }

#if defined(__AVX2__) && defined(__F16C__)
// Templated functions for gemm of dimensions 4xN
    template <int RN>
    NOINLINE void gemm4xN(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t ytiles = (m - m0) / 4;
        int64_t xtiles = (n - n0) / RN;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * 4;
            int64_t jj = n0 + job % xtiles * RN;
            __m256 Cv[RN][4] = {};
            for (int64_t l = 0; l < k; ++l) {
                uint64_t a_delta = ((uint64_t)A[lda * (ii + 3) + l].d << 48) | ((uint64_t)A[lda * (ii + 2) + l].d << 32) | ((uint64_t)A[lda * (ii + 1) + l].d << 16) | (A[lda * (ii + 0) + l].d);
                // Convert delta values for four blocks to float values
                __m128 da = _mm_cvtph_ps(_mm_set_epi64x(0, a_delta));
                __m256i avec0 = load(A + lda * (ii + 0) + l);
                __m256i avec1 = load(A + lda * (ii + 1) + l);
                __m256i avec2 = load(A + lda * (ii + 2) + l);
                __m256i avec3 = load(A + lda * (ii + 3) + l);
                for (int64_t j = 0; j < RN; ++j) {
                        __m128 db = _mm_set1_ps(unhalf(B[ldb * (jj + j) + l].d));
                        // Computation of product of delta values for four blocks and replicate it across 256 bit lane
                        __m256 dvec =  _mm256_castps128_ps256(_mm_mul_ps(da, db));
                        dvec = _mm256_permute2f128_ps(dvec ,dvec, 0);
                        // Computation of dot product and multiplication with appropriate delta value products
                        Cv[j][0] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
                                    updot(_mm256_sign_epi8(avec0, avec0),
                                          _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec0)),
                                    Cv[j][0]);
                        Cv[j][1] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
                                    updot(_mm256_sign_epi8(avec1, avec1),
                                            _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec1)),
                                    Cv[j][1]);
                        Cv[j][2] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
                                    updot(_mm256_sign_epi8(avec2, avec2),
                                            _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec2)),
                                    Cv[j][2]);
                        Cv[j][3] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
                                    updot(_mm256_sign_epi8(avec3, avec3),
                                            _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec3)),
                                    Cv[j][3]);
                }
            }

            for (int64_t j = 0; j < RN; ++j)
                for (int64_t i = 0; i < 4; ++i)
                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
        }
    }

    // Templated functions for gemm of dimensions Mx4
    template <int RM>
    NOINLINE void gemmMx4(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t ytiles = (m - m0) / RM;
        int64_t xtiles = (n - n0) / 4;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * 4;
            __m256 Cv[4][RM] = {};
            for (int64_t l = 0; l < k; ++l) {
                uint64_t b_delta = ((uint64_t)B[ldb * (jj + 3) + l].d << 48) | ((uint64_t)B[ldb * (jj + 2) + l].d << 32) | ((uint64_t)B[ldb * (jj + 1) + l].d << 16) | (B[ldb * (jj + 0) + l].d);
                // Convert delta values for four blocks to float values
                __m128 db = _mm_cvtph_ps(_mm_set_epi64x(0, b_delta));
                __m256i bvec0 = load(B + ldb * (jj + 0) + l);
                __m256i bvec1 = load(B + ldb * (jj + 1) + l);
                __m256i bvec2 = load(B + ldb * (jj + 2) + l);
                __m256i bvec3 = load(B + ldb * (jj + 3) + l);
                for (int64_t i = 0; i < RM; ++i) {
                    __m128 da = _mm_set1_ps(unhalf((A[lda * (ii + i) + l].d)));
                    // Computation of product of delta values for four blocks and replicate it across 256 bit lane
                    __m256 dvec =  _mm256_castps128_ps256(_mm_mul_ps(da, db));
                    dvec = _mm256_permute2f128_ps(dvec ,dvec, 0);
                    // Computation of dot product and multiplication with appropriate delta value products
                    Cv[0][i] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
                                                            load(A + lda * (ii + i) + l)),
                                            _mm256_sign_epi8(bvec0, load(A + lda * (ii + i) + l))),
                                    Cv[0][i]);
                    Cv[1][i] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
                                                            load(A + lda * (ii + i) + l)),
                                            _mm256_sign_epi8(bvec1, load(A + lda * (ii + i) + l))),
                                    Cv[1][i]);
                    Cv[2][i] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
                                                            load(A + lda * (ii + i) + l)),
                                            _mm256_sign_epi8(bvec2, load(A + lda * (ii + i) + l))),
                                    Cv[2][i]);
                    Cv[3][i] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
                                                            load(A + lda * (ii + i) + l)),
                                            _mm256_sign_epi8(bvec3, load(A + lda * (ii + i) + l))),
                                    Cv[3][i]);
                }
            }
            for (int64_t j = 0; j < 4; ++j)
                for (int64_t i = 0; i < RM; ++i)
                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
        }
    }
#endif

    template <int RM, int RN>
    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
        int64_t ytiles = (m - m0) / RM;
        int64_t xtiles = (n - n0) / RN;
        int64_t tiles = xtiles * ytiles;
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * RN;
            __m256 Cv[RN][RM] = {};
            for (int64_t l = 0; l < k; ++l)
                for (int64_t j = 0; j < RN; ++j)
                    for (int64_t i = 0; i < RM; ++i) {
#if defined(__AVX2__)
                        __m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
                                                              load(A + lda * (ii + i) + l)),
                                             _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
                                                              load(A + lda * (ii + i) + l)));
#else
                        __m128i ali0 = load0(A + lda * (ii + i) + l);
                        __m128i ali1 = load1(A + lda * (ii + i) + l);
                        __m128i blj0 = load0(B + ldb * (jj + j) + l);
                        __m128i blj1 = load1(B + ldb * (jj + j) + l);

                        __m128i sepAA0 = _mm_sign_epi8(ali0, ali0);
                        __m128i sepAA1 = _mm_sign_epi8(ali1, ali1);
                        __m128i sepBA0 = _mm_sign_epi8(blj0, ali0);
                        __m128i sepBA1 = _mm_sign_epi8(blj1, ali1);

                        // updot
                        const __m128i oneFill = _mm_set1_epi16(1);
                        __m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0);
                        __m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1);
                        __m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0)));
#endif
                        Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
                                                       unhalf(B[ldb * (jj + j) + l].d)),
                                                       udTmp,
                                                       Cv[j][i]);
                    }
            for (int64_t j = 0; j < RN; ++j)
                for (int64_t i = 0; i < RM; ++i)
                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
        }
    }

    inline __m256i load(const block_q8_0 *b) {
        return _mm256_loadu_si256((const __m256i *)b->qs);
    }

    inline __m128i load0(const block_q8_0 *b) {
        return _mm_loadu_si128((const __m128i *)b->qs);
    }

    inline __m128i load1(const block_q8_0 *b) {
        return _mm_loadu_si128(((const __m128i *)b->qs) + 1);
    }

    inline __m256i load(const block_q4_0 *b) {
        return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8));
    }

    inline __m128i load0(const block_q4_0 *b) {
        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8));
    }

    inline __m128i load1(const block_q4_0 *b) {
        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
    }

    inline __m256i load(const block_iq4_nl *b) {
        return MM256_SET_M128I(load1(b), load0(b));
    }

    inline __m128i load0(const block_iq4_nl *b) {
        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
        return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x));
    }

    inline __m128i load1(const block_iq4_nl *b) {
        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
        return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)));
    }

    inline __m256 updot(__m256i u, __m256i s) {
        __m256i res;
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
        res = _mm256_dpbusd_epi32(_mm256_setzero_si256(), u, s);
#else
        res = _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(u, s));
#endif
        return _mm256_cvtepi32_ps(res);
    }

    static inline __m256i denibble(const uint8_t *p) {
        __m128i x = _mm_loadu_si128((const __m128i *)p);
        return _mm256_and_si256(_mm256_set1_epi8(15),
                                _mm256_insertf128_si256(_mm256_castsi128_si256(x),
                                                        _mm_srli_epi16(x, 4), 1));
    }

    const TA *const A;
    const TB *const B;
    TC *const C;
    const int64_t k;
    const int64_t lda;
    const int64_t ldb;
    const int64_t ldc;
    const int ith;
    const int nth;
};
#endif // __AVX__

} // namespace

/**
 * Performs optimized matrix multiplication on CPU.
 *
 * This subroutine may compute C = Aᵀ * B with column major ordering.
 * Despite its name, this isn't a generalized implementation. Work is
 * only performed when a handwritten kernel is written and available.
 * Otherwise the caller should fall back to a general matmul routine.
 *
 * For example, for single-threaded single-precision GEMM you can say
 *
 *     llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
 *                     0, 1,
 *                     GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
 *
 * @param m is rows in `A` and `C`
 * @param n is cols in `B` and `C`
 * @param k is cols in `A` and rows in `B`
 * @param A is first input matrix (always transposed)
 * @param lda is row stride of `A`
 * @param B is second input matrix (never transposed)
 * @param ldb is row stride of `B`
 * @param C is input/output array of output matrices
 * @param ldc is row stride of `C`
 * @param ith is thread id (must be less than `nth`)
 * @param nth is number of threads (must be greater than zero)
 * @param Atype is GGML data type of `A`
 * @param Btype is GGML data type of `B`
 * @param Ctype is GGML data type of `C`
 * @return true if this function was able to service the matmul request
 */
bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
                     int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {

    assert(m >= 0);
    assert(n >= 0);
    assert(k >= 0);
    assert(lda >= k);
    assert(ldb >= k);
    assert(ldc >= m);
    assert(nth > 0);
    assert(ith < nth);

    // only enable sgemm for prompt processing
    if (n < 2)
        return false;

    if (Ctype != GGML_TYPE_F32)
        return false;

    switch (Atype) {

    case GGML_TYPE_F32: {
        if (Btype != GGML_TYPE_F32)
            return false;
#if defined(__AVX512F__)
        if (k % 16)
            return false;
        tinyBLAS<16, __m512, __m512, float, float, float> tb{
            k, (const float *)A, lda,
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n);
        return true;
#elif defined(__AVX__) || defined(__AVX2__)
        if (k % 8)
            return false;
        tinyBLAS<8, __m256, __m256, float, float, float> tb{
            k, (const float *)A, lda,
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n);
        return true;
#elif defined(__ARM_NEON)
        if (n < 4)
            return false;
        if (k % 4)
            return false;
        tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{
            k, (const float *)A, lda,
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n);
        return true;
#else
        return false;
#endif
    }

    case GGML_TYPE_F16: {
#if defined(__AVX512F__)
        if (k % 16)
            return false;
        if (Btype != GGML_TYPE_F32)
            return false;
        tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{
            k, (const ggml_fp16_t *)A, lda,
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n);
        return true;
#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
        if (k % 8)
            return false;
        if (Btype != GGML_TYPE_F32)
            return false;
        tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{
            k, (const ggml_fp16_t *)A, lda,
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n);
        return true;
#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
        if (n < 8)
            return false;
        if (k % 8)
            return false;
        if (Btype != GGML_TYPE_F16)
            return false;
        tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{
            k, (const ggml_fp16_t *)A, lda,
            (const ggml_fp16_t *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n);
        return true;
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
        if (k % 4)
            return false;
        if (Btype != GGML_TYPE_F32)
            return false;
        tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{
            k, (const ggml_fp16_t *)A, lda,
            (const float *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n);
        return true;
#else
        return false;
#endif
    }

    case GGML_TYPE_Q8_0: {
        if (Btype != GGML_TYPE_Q8_0)
           return false;
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
        tinyBLAS_Q0_AVX<block_q8_0, block_q8_0, float> tb{
            k, (const block_q8_0 *)A, lda,
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n);
        return true;
#elif defined(__ARM_FEATURE_DOTPROD)
        tinyBLAS_Q0_ARM<block_q8_0> tb{
            k, (const block_q8_0 *)A, lda,
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n);
        return true;
#else
        return false;
#endif
    }

    case GGML_TYPE_Q4_0: {
        if (Btype != GGML_TYPE_Q8_0)
            return false;
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
        tinyBLAS_Q0_AVX<block_q4_0, block_q8_0, float> tb{
            k, (const block_q4_0 *)A, lda,
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n);
        return true;
#elif defined(__ARM_FEATURE_DOTPROD)
        tinyBLAS_Q0_ARM<block_q4_0> tb{
            k, (const block_q4_0 *)A, lda,
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n);
        return true;
#else
        return false;
#endif
    }

    case GGML_TYPE_IQ4_NL: {
        if (Btype != GGML_TYPE_Q8_0)
            return false;
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
        tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
            k, (const block_iq4_nl *)A, lda,
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
            ith, nth};
        tb.matmul(m, n);
        return true;
#else
        return false;
#endif
    }

    default:
        return false;
    }

    (void)m;
    (void)n;
    (void)k;
    (void)A;
    (void)lda;
    (void)B;
    (void)ldb;
    (void)C;
    (void)ldc;
    (void)ith;
    (void)nth;
    (void)Atype;
    (void)Btype;
    (void)Ctype;
}
-												Re-introduce the `llama` package (#5034)

* Re-introduce the llama package

This PR brings back the llama package, making it possible to call llama.cpp and
ggml APIs from Go directly via CGo. This has a few advantages:

- C APIs can be called directly from Go without needing to use the previous
  "server" REST API
- On macOS and for CPU builds on Linux and Windows, Ollama can be built without
  a go generate ./... step, making it easy to get up and running to hack on
  parts of Ollama that don't require fast inference
- Faster build times for AVX,AVX2,CUDA and ROCM (a full build of all runners
  takes <5 min on a fast CPU)
- No git submodule making it easier to clone and build from source

This is a big PR, but much of it is vendor code except for:

- llama.go CGo bindings
- example/: a simple example of running inference
- runner/: a subprocess server designed to replace the llm/ext_server package
- Makefile an as minimal as possible Makefile to build the runner package for
  different targets (cpu, avx, avx2, cuda, rocm)

Co-authored-by: Jesse Gross <jesse@ollama.com>
Co-authored-by: Daniel Hiltgen <daniel@ollama.com>

* cache: Clear old KV cache entries when evicting a slot

When forking a cache entry, if no empty slots are available we
evict the least recently used one and copy over the KV entries
from the closest match. However, this copy does not overwrite
existing values but only adds new ones. Therefore, we need to
clear the old slot first.

This change fixes two issues:
 - The KV cache fills up and runs out of space even though we think
   we are managing it correctly
 - Performance gets worse over time as we use new cache entries that
   are not hot in the processor caches

* doc: explain golang objc linker warning (#6830)

* llama: gather transitive dependencies for rocm for dist packaging (#6848)

* Refine go server makefiles to be more DRY (#6924)

This breaks up the monolithic Makefile for the Go based runners into a
set of utility files as well as recursive Makefiles for the runners.
Files starting with the name "Makefile" are buildable, while files that
end with ".make" are utilities to include in other Makefiles.  This
reduces the amount of nearly identical targets and helps set a pattern
for future community contributions for new GPU runner architectures.

When we are ready to switch over to the Go runners, these files should
move to the top of the repo, and we should add targets for the main CLI,
as well as a helper "install" (put all the built binaries on the local
system in a runnable state) and "dist" target (generate the various
tar/zip files for distribution) for local developer use.

* llama: don't create extraneous directories (#6988)

* llama: Exercise the new build in CI (#6989)

Wire up some basic sanity testing in CI for the Go runner.  GPU runners are not covered yet.

* llama: Refine developer docs for Go server (#6842)

This enhances the documentation for development focusing on the new Go
server.  After we complete the transition further doc refinements
can remove the "transition" discussion.

* runner.go: Allocate batches for all sequences during init

We should tell the model that we could have full batches for all
sequences. We already do this when we allocate the batches but it was
missed during initialization.

* llama.go: Don't return nil from Tokenize on zero length input

Potentially receiving nil in a non-error condition is surprising to
most callers - it's better to return an empty slice.

* runner.go: Remove stop tokens from cache

If the last token is EOG then we don't return this and it isn't
present in the cache (because it was never submitted to Decode).
This works well for extending the cache entry with a new sequence.

However, for multi-token stop sequences, we won't return any of the
tokens but all but the last one will be in the cache. This means
when the conversation continues the cache will contain tokens that
don't overlap with the new prompt.

This works (we will pick up the portion where there is overlap) but
it causes unnecessary cache thrashing because we will fork the original
cache entry as it is not a perfect match.

By trimming the cache to the tokens that we actually return this
issue can be avoided.

* runner.go: Simplify flushing of pending tokens

* runner.go: Update TODOs

* runner.go: Don't panic when processing sequences

If there is an error processing a sequence, we should return a
clean HTTP error back to Ollama rather than panicing. This will
make us more resilient to transient failures.

Panics can still occur during startup as there is no way to serve
requests if that fails.

Co-authored-by: jmorganca <jmorganca@gmail.com>

* runner.go: More accurately capture timings

Currently prompt processing time doesn't capture the that it takes
to tokenize the input, only decoding time. We should capture the
full process to more accurately reflect reality. This is especially
true once we start processing images where the initial processing
can take significant time. This is also more consistent with the
existing C++ runner.

* runner.go: Support for vision models

In addition to bringing feature parity with the C++ runner, this also
incorporates several improvements:
 - Cache prompting works with images, avoiding the need to re-decode
   embeddings for every message in a conversation
 - Parallelism is supported, avoiding the need to restrict to one
   sequence at a time. (Though for now Ollama will not schedule
   them while we might need to fall back to the old runner.)

Co-authored-by: jmorganca <jmorganca@gmail.com>

* runner.go: Move Unicode checking code and add tests

* runner.go: Export external cache members

Runner and cache are in the same package so the change doesn't
affect anything but it is more internally consistent.

* runner.go: Image embedding cache

Generating embeddings from images can take significant time (on
my machine between 100ms and 8s depending on the model). Although
we already cache the result of decoding these images, the embeddings
need to be regenerated every time. This is not necessary if we get
the same image over and over again, for example, during a conversation.

This currently uses a very small cache with a very simple algorithm
but it is easy to improve as is warranted.

* llama: catch up on patches

Carry forward solar-pro and cli-unicode patches

* runner.go: Don't re-allocate memory for every batch

We can reuse memory allocated from batch to batch since batch
size is fixed. This both saves the cost of reallocation as well
keeps the cache lines hot.

This results in a roughly 1% performance improvement for token
generation with Nvidia GPUs on Linux.

* runner.go: Default to classic input cache policy

The input cache as part of the go runner implemented a cache
policy that aims to maximize hit rate in both single and multi-
user scenarios. When there is a cache hit, the response is
very fast.

However, performance is actually slower when there is an input
cache miss due to worse GPU VRAM locality. This means that
performance is generally better overall for multi-user scenarios
(better input cache hit rate, locality was relatively poor already).
But worse for single users (input cache hit rate is about the same,
locality is now worse).

This defaults the policy back to the old one to avoid a regression
but keeps the new one available through an environment variable
OLLAMA_MULTIUSER_CACHE. This is left undocumented as the goal is
to improve this in the future to get the best of both worlds
without user configuration.

For inputs that result in cache misses, on Nvidia/Linux this
change improves performance by 31% for prompt processing and
13% for token generation.

* runner.go: Increase size of response channel

Generally the CPU can easily keep up with handling reponses that
are generated but there's no reason not to let generation continue
and handle things in larger batches if needed.

* llama: Add CI to verify all vendored changes have patches (#7066)

Make sure we don't accidentally merge changes in the vendored code
that aren't also reflected in the patches.

* llama: adjust clip patch for mingw utf-16 (#7065)

* llama: adjust clip patch for mingw utf-16

* llama: ensure static linking of runtime libs

Avoid runtime dependencies on non-standard libraries

* runner.go: Enable llamafile (all platforms) and BLAS (Mac OS)

These are two features that are shown on llama.cpp's system info
that are currently different between the two runners. On my test
systems the performance difference is very small to negligible
but it is probably still good to equalize the features.

* llm: Don't add BOS/EOS for tokenize requests

This is consistent with what server.cpp currently does. It affects
things like token processing counts for embedding requests.

* runner.go: Don't cache prompts for embeddings

Our integration with server.cpp implicitly disables prompt caching
because it is not part of the JSON object being parsed, this makes
the Go runner behavior similarly.

Prompt caching has been seen to affect the results of text completions
on certain hardware. The results are not wrong either way but they
are non-deterministic. However, embeddings seem to be affected even
on hardware that does not show this behavior for completions. For
now, it is best to maintain consistency with the existing behavior.

* runner.go: Adjust debug log levels

Add system info printed at startup and quiet down noisier logging.

* llama: fix compiler flag differences (#7082)

Adjust the flags for the new Go server to more closely match the
generate flow

* llama: refine developer docs (#7121)

* llama: doc and example clean up (#7122)

* llama: doc and example clean up

* llama: Move new dockerfile into llama dir

Temporary home until we fully transition to the Go server

* llama: runner doc cleanup

* llama.go: Add description for Tokenize error case

---------

Co-authored-by: Jesse Gross <jesse@ollama.com>
Co-authored-by: Daniel Hiltgen <daniel@ollama.com>
Co-authored-by: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
											
										
										
											2024-10-08 15:53:54 +00:00
+								// Copyright 2024 Mozilla Foundation
 								//
 								// Permission is hereby granted, free of charge, to any person obtaining
 								// a copy of this software and associated documentation files (the
 								// "Software"), to deal in the Software without restriction, including
 								// without limitation the rights to use, copy, modify, merge, publish,
 								// distribute, sublicense, and/or sell copies of the Software, and to
 								// permit persons to whom the Software is furnished to do so, subject to
 								// the following conditions:
 								//
 								// The above copyright notice and this permission notice shall be
 								// included in all copies or substantial portions of the Software.
 								//
 								// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 								// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 								// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 								// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 								// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 								// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 								// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 								// SOFTWARE.
 								//
 								//                   _   _          ___ _      _   ___
 								//                  | |_(_)_ _ _  _| _ ) |    /_\ / __|
 								//                  |  _| | ' \ || | _ \ |__ / _ \\__ \.
 								//                   \__|_|_||_\_, |___/____/_/ \_\___/
 								//                             |__/
 								//
 								//                    BASIC LINEAR ALGEBRA SUBPROGRAMS
 								//
 								//
 								// This file implements multithreaded CPU matrix multiplication for the
 								// common contiguous use case C = Aᵀ * B. These kernels are designed to
 								// have excellent performance[1] for matrices that fit in the CPU cache
 								// without imposing any overhead such as cache filling or malloc calls.
 								//
 								// This implementation does not guarantee any upper bound with rounding
 								// errors, which grow along with k. Our goal's to maximally exploit the
 								// hardware for performance, and then use whatever resources remain for
 								// improving numerical accuracy.
 								//
 								// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
 								//     Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
 								#if defined(__GNUC__)
 								#pragma GCC diagnostic ignored "-Wpedantic"
 								#pragma GCC diagnostic ignored "-Wignored-attributes"
 								#endif
 								#include "sgemm.h"
 								#include "ggml-impl.h"
-												IBM granite/granitemoe architecture support (#6760)

* fix(ext_server): Port llama.cpp sampling refactors to ext_server

This was a fairly large changeset. I closely followed the changes here:
https://github.com/ggerganov/llama.cpp/commit/df270ef74596da8f1178f08991f4c51f18c9ee82

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(server.cpp): Refactor server.cpp logging for llama.cpp overhaul

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Bump llama.cpp to the latest master with `granite` support

This does not yet have granite MoE support, but that can come in a
follow up PR

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(patches): Update all patches (except solar-pro) to work with bumped llama.cpp

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(solar): Update solar patch for llama.cpp bump

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama.cpp): Bump llama.cpp for granitemoe support

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama.cpp): Bump llama.cpp for granitemoe support

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(solar): Update the solar-pro patch for latest llama.cpp bump

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama.cpp): Bump to the latest master of llama.cpp

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(patches): Update all patches for latest bump

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama): Always run sync.sh from the right directory

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama/patches): Update llama patches

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama)!: Rough sync with llama.cpp submodule

There are a number of changes that will need to be propagated to llama.go
before any of this works!

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama/patches): Add a patch and update for missing ggml-impl.h include

This include is where the ggml_cgraph struct is defined. It is included in
many of the .c files to define the forward declartion in ggml.h. It seems
that with the subset of code included here, the import was somehow lost (or
out-of-order) when building, so adding this include to llama.cpp fixes the
missing definition.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama/sync): Add missing ggml-cpu-impl.h copy-over in sync.sh

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Add missing log.cpp

This was added as part of the logging overhaul done in llama.cpp

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Overhaul use of sampling module for llama.cpp changes

The changes here reflect the changes made in the big llama.cpp sampling PR
https://github.com/ggerganov/llama.cpp/pull/9294

The sampling functionality is now broken into the base interface
(llama_sampler) and the generation implementation (gpt_sampler). The
changes here reflect that. Since the sampling.h/sampling.cpp code uses c++
STL headers, the sampling_ext.[h|cpp] wrapper is maintained to allow go to
access a pure-C interface.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Fix the impl of SampleTokenGreedy for new sampling

I don't think this method is currently used, so it could probably just be
removed so that all sampling goes through the GPT interface, but in the
interest of doing no harm, this should keep the method working as expected.

Branch: IBMGraniteArchitectureSupport

* fix(llama): Remove unused SampleTokenGreedy

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(sync): Remove bash-specific change to sync.sh

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* chore(gofumpt): Format on llama.go to pass linting

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llm): Fix missing <thread> include in ext_server

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Remove TODO about grammar_first

This feature was not used/needed previously so should be fine without
plumbing it through now.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Better naming for sampling wrapper and args

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Fix patch 05 to use new wrapper api and re-sync

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* runner: Flush pending responses before returning

If there are any pending reponses (such as from potential stop
tokens) then we should send them back before ending the sequence.
Otherwise, we can be missing tokens at the end of a response.

Fixes #6707

* fix(llama/sampling): Use gpt_sampler with a forward declaration

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Remove unnecessary patch for gguf impl header

This was caused by an earlier mistake in the embeddings patch that was
dereferencing the pointer instead of using the wrapper API.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llm): Remove use of deprecated --log-disable flag

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
											
										
										
											2024-10-17 18:59:52 +00:00
+								#include "ggml-cpu-impl.h"
-												Re-introduce the `llama` package (#5034)

* Re-introduce the llama package

This PR brings back the llama package, making it possible to call llama.cpp and
ggml APIs from Go directly via CGo. This has a few advantages:

- C APIs can be called directly from Go without needing to use the previous
  "server" REST API
- On macOS and for CPU builds on Linux and Windows, Ollama can be built without
  a go generate ./... step, making it easy to get up and running to hack on
  parts of Ollama that don't require fast inference
- Faster build times for AVX,AVX2,CUDA and ROCM (a full build of all runners
  takes <5 min on a fast CPU)
- No git submodule making it easier to clone and build from source

This is a big PR, but much of it is vendor code except for:

- llama.go CGo bindings
- example/: a simple example of running inference
- runner/: a subprocess server designed to replace the llm/ext_server package
- Makefile an as minimal as possible Makefile to build the runner package for
  different targets (cpu, avx, avx2, cuda, rocm)

Co-authored-by: Jesse Gross <jesse@ollama.com>
Co-authored-by: Daniel Hiltgen <daniel@ollama.com>

* cache: Clear old KV cache entries when evicting a slot

When forking a cache entry, if no empty slots are available we
evict the least recently used one and copy over the KV entries
from the closest match. However, this copy does not overwrite
existing values but only adds new ones. Therefore, we need to
clear the old slot first.

This change fixes two issues:
 - The KV cache fills up and runs out of space even though we think
   we are managing it correctly
 - Performance gets worse over time as we use new cache entries that
   are not hot in the processor caches

* doc: explain golang objc linker warning (#6830)

* llama: gather transitive dependencies for rocm for dist packaging (#6848)

* Refine go server makefiles to be more DRY (#6924)

This breaks up the monolithic Makefile for the Go based runners into a
set of utility files as well as recursive Makefiles for the runners.
Files starting with the name "Makefile" are buildable, while files that
end with ".make" are utilities to include in other Makefiles.  This
reduces the amount of nearly identical targets and helps set a pattern
for future community contributions for new GPU runner architectures.

When we are ready to switch over to the Go runners, these files should
move to the top of the repo, and we should add targets for the main CLI,
as well as a helper "install" (put all the built binaries on the local
system in a runnable state) and "dist" target (generate the various
tar/zip files for distribution) for local developer use.

* llama: don't create extraneous directories (#6988)

* llama: Exercise the new build in CI (#6989)

Wire up some basic sanity testing in CI for the Go runner.  GPU runners are not covered yet.

* llama: Refine developer docs for Go server (#6842)

This enhances the documentation for development focusing on the new Go
server.  After we complete the transition further doc refinements
can remove the "transition" discussion.

* runner.go: Allocate batches for all sequences during init

We should tell the model that we could have full batches for all
sequences. We already do this when we allocate the batches but it was
missed during initialization.

* llama.go: Don't return nil from Tokenize on zero length input

Potentially receiving nil in a non-error condition is surprising to
most callers - it's better to return an empty slice.

* runner.go: Remove stop tokens from cache

If the last token is EOG then we don't return this and it isn't
present in the cache (because it was never submitted to Decode).
This works well for extending the cache entry with a new sequence.

However, for multi-token stop sequences, we won't return any of the
tokens but all but the last one will be in the cache. This means
when the conversation continues the cache will contain tokens that
don't overlap with the new prompt.

This works (we will pick up the portion where there is overlap) but
it causes unnecessary cache thrashing because we will fork the original
cache entry as it is not a perfect match.

By trimming the cache to the tokens that we actually return this
issue can be avoided.

* runner.go: Simplify flushing of pending tokens

* runner.go: Update TODOs

* runner.go: Don't panic when processing sequences

If there is an error processing a sequence, we should return a
clean HTTP error back to Ollama rather than panicing. This will
make us more resilient to transient failures.

Panics can still occur during startup as there is no way to serve
requests if that fails.

Co-authored-by: jmorganca <jmorganca@gmail.com>

* runner.go: More accurately capture timings

Currently prompt processing time doesn't capture the that it takes
to tokenize the input, only decoding time. We should capture the
full process to more accurately reflect reality. This is especially
true once we start processing images where the initial processing
can take significant time. This is also more consistent with the
existing C++ runner.

* runner.go: Support for vision models

In addition to bringing feature parity with the C++ runner, this also
incorporates several improvements:
 - Cache prompting works with images, avoiding the need to re-decode
   embeddings for every message in a conversation
 - Parallelism is supported, avoiding the need to restrict to one
   sequence at a time. (Though for now Ollama will not schedule
   them while we might need to fall back to the old runner.)

Co-authored-by: jmorganca <jmorganca@gmail.com>

* runner.go: Move Unicode checking code and add tests

* runner.go: Export external cache members

Runner and cache are in the same package so the change doesn't
affect anything but it is more internally consistent.

* runner.go: Image embedding cache

Generating embeddings from images can take significant time (on
my machine between 100ms and 8s depending on the model). Although
we already cache the result of decoding these images, the embeddings
need to be regenerated every time. This is not necessary if we get
the same image over and over again, for example, during a conversation.

This currently uses a very small cache with a very simple algorithm
but it is easy to improve as is warranted.

* llama: catch up on patches

Carry forward solar-pro and cli-unicode patches

* runner.go: Don't re-allocate memory for every batch

We can reuse memory allocated from batch to batch since batch
size is fixed. This both saves the cost of reallocation as well
keeps the cache lines hot.

This results in a roughly 1% performance improvement for token
generation with Nvidia GPUs on Linux.

* runner.go: Default to classic input cache policy

The input cache as part of the go runner implemented a cache
policy that aims to maximize hit rate in both single and multi-
user scenarios. When there is a cache hit, the response is
very fast.

However, performance is actually slower when there is an input
cache miss due to worse GPU VRAM locality. This means that
performance is generally better overall for multi-user scenarios
(better input cache hit rate, locality was relatively poor already).
But worse for single users (input cache hit rate is about the same,
locality is now worse).

This defaults the policy back to the old one to avoid a regression
but keeps the new one available through an environment variable
OLLAMA_MULTIUSER_CACHE. This is left undocumented as the goal is
to improve this in the future to get the best of both worlds
without user configuration.

For inputs that result in cache misses, on Nvidia/Linux this
change improves performance by 31% for prompt processing and
13% for token generation.

* runner.go: Increase size of response channel

Generally the CPU can easily keep up with handling reponses that
are generated but there's no reason not to let generation continue
and handle things in larger batches if needed.

* llama: Add CI to verify all vendored changes have patches (#7066)

Make sure we don't accidentally merge changes in the vendored code
that aren't also reflected in the patches.

* llama: adjust clip patch for mingw utf-16 (#7065)

* llama: adjust clip patch for mingw utf-16

* llama: ensure static linking of runtime libs

Avoid runtime dependencies on non-standard libraries

* runner.go: Enable llamafile (all platforms) and BLAS (Mac OS)

These are two features that are shown on llama.cpp's system info
that are currently different between the two runners. On my test
systems the performance difference is very small to negligible
but it is probably still good to equalize the features.

* llm: Don't add BOS/EOS for tokenize requests

This is consistent with what server.cpp currently does. It affects
things like token processing counts for embedding requests.

* runner.go: Don't cache prompts for embeddings

Our integration with server.cpp implicitly disables prompt caching
because it is not part of the JSON object being parsed, this makes
the Go runner behavior similarly.

Prompt caching has been seen to affect the results of text completions
on certain hardware. The results are not wrong either way but they
are non-deterministic. However, embeddings seem to be affected even
on hardware that does not show this behavior for completions. For
now, it is best to maintain consistency with the existing behavior.

* runner.go: Adjust debug log levels

Add system info printed at startup and quiet down noisier logging.

* llama: fix compiler flag differences (#7082)

Adjust the flags for the new Go server to more closely match the
generate flow

* llama: refine developer docs (#7121)

* llama: doc and example clean up (#7122)

* llama: doc and example clean up

* llama: Move new dockerfile into llama dir

Temporary home until we fully transition to the Go server

* llama: runner doc cleanup

* llama.go: Add description for Tokenize error case

---------

Co-authored-by: Jesse Gross <jesse@ollama.com>
Co-authored-by: Daniel Hiltgen <daniel@ollama.com>
Co-authored-by: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
											
										
										
											2024-10-08 15:53:54 +00:00
+								#include "ggml-quants.h"
 								#ifdef _MSC_VER
 								#define NOINLINE __declspec(noinline)
 								#else
 								#define NOINLINE __attribute__((__noinline__))
 								#endif
 								#if defined(__ARM_NEON) || defined(__AVX512F__)
 								#define VECTOR_REGISTERS 32
 								#else
 								#define VECTOR_REGISTERS 16
 								#endif
 								#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
 								namespace {
 								inline float unhalf(ggml_fp16_t d) {
 								    return GGML_FP16_TO_FP32(d);
 								}
 								////////////////////////////////////////////////////////////////////////////////////////////////////
 								// VECTORIZED ARITHMETIC OPERATIONS
 								#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 								inline __m128 add(__m128 x, __m128 y) { return _mm_add_ps(x, y); }
 								inline __m128 sub(__m128 x, __m128 y) { return _mm_sub_ps(x, y); }
 								inline __m128 mul(__m128 x, __m128 y) { return _mm_mul_ps(x, y); }
 								#endif  // __SSE__
 								#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 								inline __m256 add(__m256 x, __m256 y) { return _mm256_add_ps(x, y); }
 								inline __m256 sub(__m256 x, __m256 y) { return _mm256_sub_ps(x, y); }
 								inline __m256 mul(__m256 x, __m256 y) { return _mm256_mul_ps(x, y); }
 								#endif // __AVX__
 								#if defined(__AVX512F__)
 								inline __m512 add(__m512 x, __m512 y) { return _mm512_add_ps(x, y); }
 								inline __m512 sub(__m512 x, __m512 y) { return _mm512_sub_ps(x, y); }
 								inline __m512 mul(__m512 x, __m512 y) { return _mm512_mul_ps(x, y); }
 								#endif // __AVX512F__
 								#if defined(__ARM_NEON)
 								inline float32x4_t add(float32x4_t x, float32x4_t y) { return vaddq_f32(x, y); }
 								inline float32x4_t sub(float32x4_t x, float32x4_t y) { return vsubq_f32(x, y); }
 								inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vmulq_f32(x, y); }
 								#endif // __ARM_NEON
 								#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 								inline float16x8_t add(float16x8_t x, float16x8_t y) { return vaddq_f16(x, y); }
 								inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); }
 								inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
 								#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 								////////////////////////////////////////////////////////////////////////////////////////////////////
 								// VECTORIZED FUSED MULTIPLY ADD
 								/**
 								 * Computes a * b + c.
 								 */
 								template <typename T, typename U>
 								inline U madd(T a, T b, U c) {
 								    return add(mul(a, b), c);
 								}
 								#if defined(__FMA__)
 								#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 								template <>
 								inline __m256 madd(__m256 a, __m256 b, __m256 c) {
 								    return _mm256_fmadd_ps(a, b, c);
 								}
 								#endif
 								#if defined(__AVX512F__)
 								template <>
 								inline __m512 madd(__m512 a, __m512 b, __m512 c) {
 								    return _mm512_fmadd_ps(a, b, c);
 								}
 								#endif
 								#endif
 								#if defined(__ARM_FEATURE_FMA)
 								template <>
 								inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
 								    return vfmaq_f32(c, b, a);
 								}
 								#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
 								template <>
 								inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
 								    return vfmaq_f16(c, b, a);
 								}
 								#endif
 								#endif
 								////////////////////////////////////////////////////////////////////////////////////////////////////
 								// VECTORIZED HORIZONTAL SUM
 								#if defined(__ARM_NEON)
 								inline float hsum(float32x4_t x) {
 								    return vaddvq_f32(x);
 								}
 								#endif // __ARM_NEON
 								#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
 								inline float hsum(float16x8_t x) {
 								    return vaddvq_f32(vaddq_f32(vcvt_f32_f16(vget_low_f16(x)),
 								                                vcvt_f32_f16(vget_high_f16(x))));
 								}
 								#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 								#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 								inline float hsum(__m128 x) {
 								#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 								    x = _mm_add_ps(x, _mm_movehl_ps(x, x));
 								    x = _mm_add_ss(x, _mm_movehdup_ps(x));
 								#else
 								    __m128 t;
 								    t = _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1));
 								    x = _mm_add_ps(x, t);
 								    t = _mm_movehl_ps(t, x);
 								    x = _mm_add_ss(x, t);
 								#endif
 								    return _mm_cvtss_f32(x);
 								}
 								#endif
 								#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 								inline float hsum(__m256 x) {
 								    return hsum(_mm_add_ps(_mm256_extractf128_ps(x, 1),
 								                           _mm256_castps256_ps128(x)));
 								}
 								#endif // __AVX__
 								#if defined(__AVX512F__)
 								inline float hsum(__m512 x) {
 								    return _mm512_reduce_add_ps(x);
 								}
 								#endif // __AVX512F__
 								////////////////////////////////////////////////////////////////////////////////////////////////////
 								// VECTORIZED MEMORY LOADING
 								template <typename T, typename U> T load(const U *);
 								#if defined(__ARM_NEON)
 								template <> inline float32x4_t load(const float *p) {
 								    return vld1q_f32(p);
 								}
 								#if !defined(_MSC_VER)
 								template <> inline float16x8_t load(const ggml_fp16_t *p) {
 								    return vld1q_f16((const float16_t *)p);
 								}
 								template <> inline float32x4_t load(const ggml_fp16_t *p) {
 								    return vcvt_f32_f16(vld1_f16((const float16_t *)p));
 								}
 								#endif // _MSC_VER
 								#endif // __ARM_NEON
 								#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 								template <> inline __m128 load(const float *p) {
 								    return _mm_loadu_ps(p);
 								}
 								#endif  // __SSE__
 								#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 								template <> inline __m256 load(const float *p) {
 								    return _mm256_loadu_ps(p);
 								}
 								#endif // __AVX__
 								#if defined(__F16C__)
 								template <> inline __m256 load(const ggml_fp16_t *p) {
 								    return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
 								}
 								#endif // __F16C__
 								#if defined(__AVX512F__)
 								template <> inline __m512 load(const float *p) {
 								    return _mm512_loadu_ps(p);
 								}
 								template <> inline __m512 load(const ggml_fp16_t *p) {
 								    return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
 								}
 								#endif // __AVX512F__
-												IBM granite/granitemoe architecture support (#6760)

* fix(ext_server): Port llama.cpp sampling refactors to ext_server

This was a fairly large changeset. I closely followed the changes here:
https://github.com/ggerganov/llama.cpp/commit/df270ef74596da8f1178f08991f4c51f18c9ee82

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(server.cpp): Refactor server.cpp logging for llama.cpp overhaul

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Bump llama.cpp to the latest master with `granite` support

This does not yet have granite MoE support, but that can come in a
follow up PR

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(patches): Update all patches (except solar-pro) to work with bumped llama.cpp

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(solar): Update solar patch for llama.cpp bump

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama.cpp): Bump llama.cpp for granitemoe support

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama.cpp): Bump llama.cpp for granitemoe support

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(solar): Update the solar-pro patch for latest llama.cpp bump

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama.cpp): Bump to the latest master of llama.cpp

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(patches): Update all patches for latest bump

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama): Always run sync.sh from the right directory

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama/patches): Update llama patches

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama)!: Rough sync with llama.cpp submodule

There are a number of changes that will need to be propagated to llama.go
before any of this works!

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama/patches): Add a patch and update for missing ggml-impl.h include

This include is where the ggml_cgraph struct is defined. It is included in
many of the .c files to define the forward declartion in ggml.h. It seems
that with the subset of code included here, the import was somehow lost (or
out-of-order) when building, so adding this include to llama.cpp fixes the
missing definition.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama/sync): Add missing ggml-cpu-impl.h copy-over in sync.sh

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Add missing log.cpp

This was added as part of the logging overhaul done in llama.cpp

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Overhaul use of sampling module for llama.cpp changes

The changes here reflect the changes made in the big llama.cpp sampling PR
https://github.com/ggerganov/llama.cpp/pull/9294

The sampling functionality is now broken into the base interface
(llama_sampler) and the generation implementation (gpt_sampler). The
changes here reflect that. Since the sampling.h/sampling.cpp code uses c++
STL headers, the sampling_ext.[h|cpp] wrapper is maintained to allow go to
access a pure-C interface.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Fix the impl of SampleTokenGreedy for new sampling

I don't think this method is currently used, so it could probably just be
removed so that all sampling goes through the GPT interface, but in the
interest of doing no harm, this should keep the method working as expected.

Branch: IBMGraniteArchitectureSupport

* fix(llama): Remove unused SampleTokenGreedy

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(sync): Remove bash-specific change to sync.sh

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* chore(gofumpt): Format on llama.go to pass linting

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llm): Fix missing <thread> include in ext_server

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Remove TODO about grammar_first

This feature was not used/needed previously so should be fine without
plumbing it through now.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Better naming for sampling wrapper and args

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Fix patch 05 to use new wrapper api and re-sync

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* runner: Flush pending responses before returning

If there are any pending reponses (such as from potential stop
tokens) then we should send them back before ending the sequence.
Otherwise, we can be missing tokens at the end of a response.

Fixes #6707

* fix(llama/sampling): Use gpt_sampler with a forward declaration

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Remove unnecessary patch for gguf impl header

This was caused by an earlier mistake in the embeddings patch that was
dereferencing the pointer instead of using the wrapper API.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llm): Remove use of deprecated --log-disable flag

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
											
										
										
											2024-10-17 18:59:52 +00:00
+								////////////////////////////////////////////////////////////////////////////////////////////////////
 								// CONSTANTS
 								#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
 								static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
 								static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl);
 								#endif
-												Re-introduce the `llama` package (#5034)

* Re-introduce the llama package

This PR brings back the llama package, making it possible to call llama.cpp and
ggml APIs from Go directly via CGo. This has a few advantages:

- C APIs can be called directly from Go without needing to use the previous
  "server" REST API
- On macOS and for CPU builds on Linux and Windows, Ollama can be built without
  a go generate ./... step, making it easy to get up and running to hack on
  parts of Ollama that don't require fast inference
- Faster build times for AVX,AVX2,CUDA and ROCM (a full build of all runners
  takes <5 min on a fast CPU)
- No git submodule making it easier to clone and build from source

This is a big PR, but much of it is vendor code except for:

- llama.go CGo bindings
- example/: a simple example of running inference
- runner/: a subprocess server designed to replace the llm/ext_server package
- Makefile an as minimal as possible Makefile to build the runner package for
  different targets (cpu, avx, avx2, cuda, rocm)

Co-authored-by: Jesse Gross <jesse@ollama.com>
Co-authored-by: Daniel Hiltgen <daniel@ollama.com>

* cache: Clear old KV cache entries when evicting a slot

When forking a cache entry, if no empty slots are available we
evict the least recently used one and copy over the KV entries
from the closest match. However, this copy does not overwrite
existing values but only adds new ones. Therefore, we need to
clear the old slot first.

This change fixes two issues:
 - The KV cache fills up and runs out of space even though we think
   we are managing it correctly
 - Performance gets worse over time as we use new cache entries that
   are not hot in the processor caches

* doc: explain golang objc linker warning (#6830)

* llama: gather transitive dependencies for rocm for dist packaging (#6848)

* Refine go server makefiles to be more DRY (#6924)

This breaks up the monolithic Makefile for the Go based runners into a
set of utility files as well as recursive Makefiles for the runners.
Files starting with the name "Makefile" are buildable, while files that
end with ".make" are utilities to include in other Makefiles.  This
reduces the amount of nearly identical targets and helps set a pattern
for future community contributions for new GPU runner architectures.

When we are ready to switch over to the Go runners, these files should
move to the top of the repo, and we should add targets for the main CLI,
as well as a helper "install" (put all the built binaries on the local
system in a runnable state) and "dist" target (generate the various
tar/zip files for distribution) for local developer use.

* llama: don't create extraneous directories (#6988)

* llama: Exercise the new build in CI (#6989)

Wire up some basic sanity testing in CI for the Go runner.  GPU runners are not covered yet.

* llama: Refine developer docs for Go server (#6842)

This enhances the documentation for development focusing on the new Go
server.  After we complete the transition further doc refinements
can remove the "transition" discussion.

* runner.go: Allocate batches for all sequences during init

We should tell the model that we could have full batches for all
sequences. We already do this when we allocate the batches but it was
missed during initialization.

* llama.go: Don't return nil from Tokenize on zero length input

Potentially receiving nil in a non-error condition is surprising to
most callers - it's better to return an empty slice.

* runner.go: Remove stop tokens from cache

If the last token is EOG then we don't return this and it isn't
present in the cache (because it was never submitted to Decode).
This works well for extending the cache entry with a new sequence.

However, for multi-token stop sequences, we won't return any of the
tokens but all but the last one will be in the cache. This means
when the conversation continues the cache will contain tokens that
don't overlap with the new prompt.

This works (we will pick up the portion where there is overlap) but
it causes unnecessary cache thrashing because we will fork the original
cache entry as it is not a perfect match.

By trimming the cache to the tokens that we actually return this
issue can be avoided.

* runner.go: Simplify flushing of pending tokens

* runner.go: Update TODOs

* runner.go: Don't panic when processing sequences

If there is an error processing a sequence, we should return a
clean HTTP error back to Ollama rather than panicing. This will
make us more resilient to transient failures.

Panics can still occur during startup as there is no way to serve
requests if that fails.

Co-authored-by: jmorganca <jmorganca@gmail.com>

* runner.go: More accurately capture timings

Currently prompt processing time doesn't capture the that it takes
to tokenize the input, only decoding time. We should capture the
full process to more accurately reflect reality. This is especially
true once we start processing images where the initial processing
can take significant time. This is also more consistent with the
existing C++ runner.

* runner.go: Support for vision models

In addition to bringing feature parity with the C++ runner, this also
incorporates several improvements:
 - Cache prompting works with images, avoiding the need to re-decode
   embeddings for every message in a conversation
 - Parallelism is supported, avoiding the need to restrict to one
   sequence at a time. (Though for now Ollama will not schedule
   them while we might need to fall back to the old runner.)

Co-authored-by: jmorganca <jmorganca@gmail.com>

* runner.go: Move Unicode checking code and add tests

* runner.go: Export external cache members

Runner and cache are in the same package so the change doesn't
affect anything but it is more internally consistent.

* runner.go: Image embedding cache

Generating embeddings from images can take significant time (on
my machine between 100ms and 8s depending on the model). Although
we already cache the result of decoding these images, the embeddings
need to be regenerated every time. This is not necessary if we get
the same image over and over again, for example, during a conversation.

This currently uses a very small cache with a very simple algorithm
but it is easy to improve as is warranted.

* llama: catch up on patches

Carry forward solar-pro and cli-unicode patches

* runner.go: Don't re-allocate memory for every batch

We can reuse memory allocated from batch to batch since batch
size is fixed. This both saves the cost of reallocation as well
keeps the cache lines hot.

This results in a roughly 1% performance improvement for token
generation with Nvidia GPUs on Linux.

* runner.go: Default to classic input cache policy

The input cache as part of the go runner implemented a cache
policy that aims to maximize hit rate in both single and multi-
user scenarios. When there is a cache hit, the response is
very fast.

However, performance is actually slower when there is an input
cache miss due to worse GPU VRAM locality. This means that
performance is generally better overall for multi-user scenarios
(better input cache hit rate, locality was relatively poor already).
But worse for single users (input cache hit rate is about the same,
locality is now worse).

This defaults the policy back to the old one to avoid a regression
but keeps the new one available through an environment variable
OLLAMA_MULTIUSER_CACHE. This is left undocumented as the goal is
to improve this in the future to get the best of both worlds
without user configuration.

For inputs that result in cache misses, on Nvidia/Linux this
change improves performance by 31% for prompt processing and
13% for token generation.

* runner.go: Increase size of response channel

Generally the CPU can easily keep up with handling reponses that
are generated but there's no reason not to let generation continue
and handle things in larger batches if needed.

* llama: Add CI to verify all vendored changes have patches (#7066)

Make sure we don't accidentally merge changes in the vendored code
that aren't also reflected in the patches.

* llama: adjust clip patch for mingw utf-16 (#7065)

* llama: adjust clip patch for mingw utf-16

* llama: ensure static linking of runtime libs

Avoid runtime dependencies on non-standard libraries

* runner.go: Enable llamafile (all platforms) and BLAS (Mac OS)

These are two features that are shown on llama.cpp's system info
that are currently different between the two runners. On my test
systems the performance difference is very small to negligible
but it is probably still good to equalize the features.

* llm: Don't add BOS/EOS for tokenize requests

This is consistent with what server.cpp currently does. It affects
things like token processing counts for embedding requests.

* runner.go: Don't cache prompts for embeddings

Our integration with server.cpp implicitly disables prompt caching
because it is not part of the JSON object being parsed, this makes
the Go runner behavior similarly.

Prompt caching has been seen to affect the results of text completions
on certain hardware. The results are not wrong either way but they
are non-deterministic. However, embeddings seem to be affected even
on hardware that does not show this behavior for completions. For
now, it is best to maintain consistency with the existing behavior.

* runner.go: Adjust debug log levels

Add system info printed at startup and quiet down noisier logging.

* llama: fix compiler flag differences (#7082)

Adjust the flags for the new Go server to more closely match the
generate flow

* llama: refine developer docs (#7121)

* llama: doc and example clean up (#7122)

* llama: doc and example clean up

* llama: Move new dockerfile into llama dir

Temporary home until we fully transition to the Go server

* llama: runner doc cleanup

* llama.go: Add description for Tokenize error case

---------

Co-authored-by: Jesse Gross <jesse@ollama.com>
Co-authored-by: Daniel Hiltgen <daniel@ollama.com>
Co-authored-by: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
											
										
										
											2024-10-08 15:53:54 +00:00
+								////////////////////////////////////////////////////////////////////////////////////////////////////
 								// FLOATING POINT MATRIX MULTIPLICATION
 								template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
 								class tinyBLAS {
 								  public:
 								    tinyBLAS(int64_t k,
 								             const TA *A, int64_t lda,
 								             const TB *B, int64_t ldb,
 								             TC *C, int64_t ldc,
 								             int ith, int nth)
 								        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
 								    }
 								    void matmul(int64_t m, int64_t n) {
 								        mnpack(0, m, 0, n);
 								    }
 								  private:
 								    NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
 								        int64_t mc, nc, mp, np;
 								        switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
 								#if VECTOR_REGISTERS == 32
 								        case 0x55:
 								            mc = 5;
 								            nc = 5;
 								            gemm<5, 5>(m0, m, n0, n);
 								            break;
 								        case 0x45:
 								            mc = 4;
 								            nc = 5;
 								            gemm<4, 5>(m0, m, n0, n);
 								            break;
 								        case 0x54:
 								            mc = 5;
 								            nc = 4;
 								            gemm<5, 4>(m0, m, n0, n);
 								            break;
 								        case 0x44:
 								            mc = 4;
 								            nc = 4;
 								            gemm<4, 4>(m0, m, n0, n);
 								            break;
 								        case 0x53:
 								            mc = 5;
 								            nc = 3;
 								            gemm<5, 3>(m0, m, n0, n);
 								            break;
 								        case 0x35:
 								            mc = 3;
 								            nc = 5;
 								            gemm<3, 5>(m0, m, n0, n);
 								            break;
 								        case 0x43:
 								            mc = 4;
 								            nc = 3;
 								            gemm<4, 3>(m0, m, n0, n);
 								            break;
 								#else
 								        case 0x55:
 								        case 0x54:
 								        case 0x53:
 								        case 0x45:
 								        case 0x44:
 								        case 0x43:
 								            mc = 4;
 								            nc = 3;
 								            gemm<4, 3>(m0, m, n0, n);
 								            break;
 								        case 0x35:
 								#endif
 								        case 0x34:
 								            mc = 3;
 								            nc = 4;
 								            gemm<3, 4>(m0, m, n0, n);
 								            break;
 								        case 0x52:
 								            mc = 5;
 								            nc = 2;
 								            gemm<5, 2>(m0, m, n0, n);
 								            break;
 								        case 0x33:
 								            mc = 3;
 								            nc = 3;
 								            gemm<3, 3>(m0, m, n0, n);
 								            break;
 								        case 0x25:
 								            mc = 2;
 								            nc = 5;
 								            gemm<2, 5>(m0, m, n0, n);
 								            break;
 								        case 0x42:
 								            mc = 4;
 								            nc = 2;
 								            gemm<4, 2>(m0, m, n0, n);
 								            break;
 								        case 0x24:
 								            mc = 2;
 								            nc = 4;
 								            gemm<2, 4>(m0, m, n0, n);
 								            break;
 								        case 0x32:
 								            mc = 3;
 								            nc = 2;
 								            gemm<3, 2>(m0, m, n0, n);
 								            break;
 								        case 0x23:
 								            mc = 2;
 								            nc = 3;
 								            gemm<2, 3>(m0, m, n0, n);
 								            break;
 								        case 0x51:
 								            mc = 5;
 								            nc = 1;
 								            gemm<5, 1>(m0, m, n0, n);
 								            break;
 								        case 0x41:
 								            mc = 4;
 								            nc = 1;
 								            gemm<4, 1>(m0, m, n0, n);
 								            break;
 								        case 0x22:
 								            mc = 2;
 								            nc = 2;
 								            gemm<2, 2>(m0, m, n0, n);
 								            break;
 								        case 0x15:
 								            mc = 1;
 								            nc = 5;
 								            gemm<1, 5>(m0, m, n0, n);
 								            break;
 								        case 0x14:
 								            mc = 1;
 								            nc = 4;
 								            gemm<1, 4>(m0, m, n0, n);
 								            break;
 								        case 0x31:
 								            mc = 3;
 								            nc = 1;
 								            gemm<3, 1>(m0, m, n0, n);
 								            break;
 								        case 0x13:
 								            mc = 1;
 								            nc = 3;
 								            gemm<1, 3>(m0, m, n0, n);
 								            break;
 								        case 0x21:
 								            mc = 2;
 								            nc = 1;
 								            gemm<2, 1>(m0, m, n0, n);
 								            break;
 								        case 0x12:
 								            mc = 1;
 								            nc = 2;
 								            gemm<1, 2>(m0, m, n0, n);
 								            break;
 								        case 0x11:
 								            mc = 1;
 								            nc = 1;
 								            gemm<1, 1>(m0, m, n0, n);
 								            break;
 								        default:
 								            return;
 								        }
 								        mp = m0 + (m - m0) / mc * mc;
 								        np = n0 + (n - n0) / nc * nc;
 								        mnpack(mp, m, n0, np);
 								        mnpack(m0, m, np, n);
 								    }
 								    template <int RM, int RN>
 								    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
 								        int64_t ytiles = (m - m0) / RM;
 								        int64_t xtiles = (n - n0) / RN;
 								        int64_t tiles = xtiles * ytiles;
 								        int64_t duty = (tiles + nth - 1) / nth;
 								        int64_t start = duty * ith;
 								        int64_t end = start + duty;
 								        if (end > tiles)
 								            end = tiles;
 								        for (int64_t job = start; job < end; ++job) {
 								            int64_t ii = m0 + job / xtiles * RM;
 								            int64_t jj = n0 + job % xtiles * RN;
 								            D Cv[RN][RM] = {};
 								            for (int64_t l = 0; l < k; l += KN)
 								                for (int64_t j = 0; j < RN; ++j)
 								                    for (int64_t i = 0; i < RM; ++i)
 								                        Cv[j][i] = madd(load<V>(A + lda * (ii + i) + l),
 								                                        load<V>(B + ldb * (jj + j) + l),
 								                                        Cv[j][i]);
 								            for (int64_t j = 0; j < RN; ++j)
 								                for (int64_t i = 0; i < RM; ++i)
 								                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
 								        }
 								    }
 								    const TA *const A;
 								    const TB *const B;
 								    TC *const C;
 								    const int64_t k;
 								    const int64_t lda;
 								    const int64_t ldb;
 								    const int64_t ldc;
 								    const int ith;
 								    const int nth;
 								};
 								//////////////////////////////////////////////////////////////////////////////////////////
 								// QUANT ZERO MATRIX MULTIPLICATION
 								#if defined(__ARM_FEATURE_DOTPROD)
 								template <typename TA>
 								class tinyBLAS_Q0_ARM {
 								  public:
 								    tinyBLAS_Q0_ARM(int64_t k,
 								                    const TA *A, int64_t lda,
 								                    const block_q8_0 *B, int64_t ldb,
 								                    float *C, int64_t ldc,
 								                    int ith, int nth)
 								        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
 								    }
 								    void matmul(int64_t m, int64_t n) {
 								        mnpack(0, m, 0, n);
 								    }
 								  private:
 								    NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
 								        int64_t mc, nc, mp, np;
 								        switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3ll)) {
 								        case 0x33:
 								            mc = 3;
 								            nc = 3;
 								            gemm<3, 3>(m0, m, n0, n);
 								            break;
 								        case 0x32:
 								            mc = 3;
 								            nc = 2;
 								            gemm<3, 2>(m0, m, n0, n);
 								            break;
 								        case 0x23:
 								            mc = 2;
 								            nc = 3;
 								            gemm<2, 3>(m0, m, n0, n);
 								            break;
 								        case 0x22:
 								            mc = 2;
 								            nc = 2;
 								            gemm<2, 2>(m0, m, n0, n);
 								            break;
 								        case 0x31:
 								            mc = 3;
 								            nc = 1;
 								            gemm<3, 1>(m0, m, n0, n);
 								            break;
 								        case 0x13:
 								            mc = 1;
 								            nc = 3;
 								            gemm<1, 3>(m0, m, n0, n);
 								            break;
 								        case 0x21:
 								            mc = 2;
 								            nc = 1;
 								            gemm<2, 1>(m0, m, n0, n);
 								            break;
 								        case 0x12:
 								            mc = 1;
 								            nc = 2;
 								            gemm<1, 2>(m0, m, n0, n);
 								            break;
 								        case 0x11:
 								            mc = 1;
 								            nc = 1;
 								            gemm<1, 1>(m0, m, n0, n);
 								            break;
 								        default:
 								            return;
 								        }
 								        mp = m0 + (m - m0) / mc * mc;
 								        np = n0 + (n - n0) / nc * nc;
 								        mnpack(mp, m, n0, np);
 								        mnpack(m0, m, np, n);
 								    }
 								    template <int RM, int RN>
 								    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
 								        int64_t ytiles = (m - m0) / RM;
 								        int64_t xtiles = (n - n0) / RN;
 								        int64_t tiles = xtiles * ytiles;
 								        int64_t duty = (tiles + nth - 1) / nth;
 								        int64_t start = duty * ith;
 								        int64_t end = start + duty;
 								        if (end > tiles)
 								            end = tiles;
 								        for (int64_t job = start; job < end; ++job) {
 								            int64_t ii = m0 + job / xtiles * RM;
 								            int64_t jj = n0 + job % xtiles * RN;
 								            float32x4_t Cv[RN][RM] = {};
 								            for (int64_t l = 0; l < k; ++l)
 								                for (int64_t j = 0; j < RN; ++j)
 								                    for (int64_t i = 0; i < RM; ++i)
 								                        Cv[j][i] = vmlaq_n_f32(Cv[j][i],
 								                                               vcvtq_f32_s32(vdotq_s32(
 								                                                   vdotq_s32(vdupq_n_s32(0),
 								                                                             load_lo(A + lda * (ii + i) + l),
 								                                                             load_lo(B + ldb * (jj + j) + l)),
 								                                                   load_hi(A + lda * (ii + i) + l),
 								                                                   load_hi(B + ldb * (jj + j) + l))),
 								                                               unhalf(A[lda * (ii + i) + l].d) *
 								                                               unhalf(B[ldb * (jj + j) + l].d));
 								            for (int64_t j = 0; j < RN; ++j)
 								                for (int64_t i = 0; i < RM; ++i)
 								                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
 								        }
 								    }
 								    inline int8x16_t load_lo(const block_q8_0 *b) {
 								        return vld1q_s8(b->qs);
 								    }
 								    inline int8x16_t load_hi(const block_q8_0 *b) {
 								        return vld1q_s8(b->qs + 16);
 								    }
 								    inline int8x16_t load_lo(const block_q4_0 *b) {
 								        return vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vld1q_u8(b->qs),
 								                                                     vdupq_n_u8(0x0f))),
 								                        vdupq_n_s8(0x8));
 								    }
 								    inline int8x16_t load_hi(const block_q4_0 *b) {
 								        return vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(vld1q_u8(b->qs), 4)),
 								                        vdupq_n_s8(0x8));
 								    }
 								    const TA *const A;
 								    const block_q8_0 *const B;
 								    float *const C;
 								    const int64_t k;
 								    const int64_t lda;
 								    const int64_t ldb;
 								    const int64_t ldc;
 								    const int ith;
 								    const int nth;
 								};
 								#endif // __ARM_FEATURE_DOTPROD
 								#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
 								template <typename TA, typename TB, typename TC>
 								class tinyBLAS_Q0_AVX {
 								  public:
 								    tinyBLAS_Q0_AVX(int64_t k,
 								                    const TA *A, int64_t lda,
 								                    const TB *B, int64_t ldb,
 								                    TC *C, int64_t ldc,
 								                    int ith, int nth)
 								        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
 								    }
 								    void matmul(int64_t m, int64_t n) {
 								        mnpack(0, m, 0, n);
 								    }
 								  private:
 								    void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
 								        int64_t mc, nc, mp, np;
 								        switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 4)) {
 								#if VECTOR_REGISTERS == 32
 								        case 0x44:
 								            mc = 4;
 								            nc = 4;
 								#if defined(__AVX2__) && defined(__F16C__)
 								            gemm4xN<4>(m0, m, n0, n);
 								#else
 								            gemm<4, 4>(m0, m, n0, n);
 								#endif
 								            break;
 								        case 0x43:
 								            mc = 4;
 								            nc = 3;
 								#if defined(__AVX2__) && defined(__F16C__)
 								            gemm4xN<3>(m0, m, n0, n);
 								#else
 								            gemm<4, 3>(m0, m, n0, n);
 								#endif
 								            break;
 								        case 0x34:
 								            mc = 3;
 								            nc = 4;
 								#if defined(__AVX2__) && defined(__F16C__)
 								            gemmMx4<3>(m0, m, n0, n);
 								#else
 								            gemm<3, 4>(m0, m, n0, n);
 								#endif
 								            break;
 								        case 0x33:
 								            mc = 3;
 								            nc = 3;
 								            gemm<3, 3>(m0, m, n0, n);
 								            break;
 								        case 0x42:
 								            mc = 4;
 								            nc = 2;
 								#if defined(__AVX2__) && defined(__F16C__)
 								            gemm4xN<2>(m0, m, n0, n);
 								#else
 								            gemm<4, 2>(m0, m, n0, n);
 								#endif
 								            break;
 								        case 0x24:
 								            mc = 2;
 								            nc = 4;
 								#if defined(__AVX2__) && defined(__F16C__)
 								            gemmMx4<2>(m0, m, n0, n);
 								#else
 								            gemm<2, 4>(m0, m, n0, n);
 								#endif
 								            break;
 								#else
 								        case 0x44:
 								        case 0x43:
 								        case 0x42:
 								            mc = 4;
 								            nc = 2;
 								#if defined(__AVX2__) && defined(__F16C__)
 								            gemm4xN<2>(m0, m, n0, n);
 								#else
 								            gemm<4, 2>(m0, m, n0, n);
 								#endif
 								            break;
 								        case 0x34:
 								        case 0x24:
 								            mc = 2;
 								            nc = 4;
 								#if defined(__AVX2__) && defined(__F16C__)
 								            gemmMx4<2>(m0, m, n0, n);
 								#else
 								            gemm<2, 4>(m0, m, n0, n);
 								#endif
 								            break;
 								        case 0x33:
 								#endif
 								        case 0x32:
 								            mc = 3;
 								            nc = 2;
 								            gemm<3, 2>(m0, m, n0, n);
 								            break;
 								        case 0x23:
 								            mc = 2;
 								            nc = 3;
 								            gemm<2, 3>(m0, m, n0, n);
 								            break;
 								        case 0x41:
 								            mc = 4;
 								            nc = 1;
 								#if defined(__AVX2__) && defined(__F16C__)
 								            gemm4xN<1>(m0, m, n0, n);
 								#else
 								            gemm<4, 1>(m0, m, n0, n);
 								#endif
 								            break;
 								        case 0x22:
 								            mc = 2;
 								            nc = 2;
 								            gemm<2, 2>(m0, m, n0, n);
 								            break;
 								        case 0x14:
 								            mc = 1;
 								            nc = 4;
 								#if defined(__AVX2__) && defined(__F16C__)
 								            gemmMx4<1>(m0, m, n0, n);
 								#else
 								            gemm<1, 4>(m0, m, n0, n);
 								#endif
 								            break;
 								        case 0x31:
 								            mc = 3;
 								            nc = 1;
 								            gemm<3, 1>(m0, m, n0, n);
 								            break;
 								        case 0x13:
 								            mc = 1;
 								            nc = 3;
 								            gemm<1, 3>(m0, m, n0, n);
 								            break;
 								        case 0x21:
 								            mc = 2;
 								            nc = 1;
 								            gemm<2, 1>(m0, m, n0, n);
 								            break;
 								        case 0x12:
 								            mc = 1;
 								            nc = 2;
 								            gemm<1, 2>(m0, m, n0, n);
 								            break;
 								        case 0x11:
 								            mc = 1;
 								            nc = 1;
 								            gemm<1, 1>(m0, m, n0, n);
 								            break;
 								        default:
 								            return;
 								        }
 								        mp = m0 + (m - m0) / mc * mc;
 								        np = n0 + (n - n0) / nc * nc;
 								        mnpack(mp, m, n0, np);
 								        mnpack(m0, m, np, n);
 								    }
 								#if defined(__AVX2__) && defined(__F16C__)
 								// Templated functions for gemm of dimensions 4xN
 								    template <int RN>
 								    NOINLINE void gemm4xN(int64_t m0, int64_t m, int64_t n0, int64_t n) {
 								        int64_t ytiles = (m - m0) / 4;
 								        int64_t xtiles = (n - n0) / RN;
 								        int64_t tiles = xtiles * ytiles;
 								        int64_t duty = (tiles + nth - 1) / nth;
 								        int64_t start = duty * ith;
 								        int64_t end = start + duty;
 								        if (end > tiles)
 								            end = tiles;
 								        for (int64_t job = start; job < end; ++job) {
 								            int64_t ii = m0 + job / xtiles * 4;
 								            int64_t jj = n0 + job % xtiles * RN;
 								            __m256 Cv[RN][4] = {};
 								            for (int64_t l = 0; l < k; ++l) {
 								                uint64_t a_delta = ((uint64_t)A[lda * (ii + 3) + l].d << 48) | ((uint64_t)A[lda * (ii + 2) + l].d << 32) | ((uint64_t)A[lda * (ii + 1) + l].d << 16) | (A[lda * (ii + 0) + l].d);
 								                // Convert delta values for four blocks to float values
 								                __m128 da = _mm_cvtph_ps(_mm_set_epi64x(0, a_delta));
 								                __m256i avec0 = load(A + lda * (ii + 0) + l);
 								                __m256i avec1 = load(A + lda * (ii + 1) + l);
 								                __m256i avec2 = load(A + lda * (ii + 2) + l);
 								                __m256i avec3 = load(A + lda * (ii + 3) + l);
 								                for (int64_t j = 0; j < RN; ++j) {
 								                        __m128 db = _mm_set1_ps(unhalf(B[ldb * (jj + j) + l].d));
 								                        // Computation of product of delta values for four blocks and replicate it across 256 bit lane
 								                        __m256 dvec =  _mm256_castps128_ps256(_mm_mul_ps(da, db));
 								                        dvec = _mm256_permute2f128_ps(dvec ,dvec, 0);
 								                        // Computation of dot product and multiplication with appropriate delta value products
 								                        Cv[j][0] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
 								                                    updot(_mm256_sign_epi8(avec0, avec0),
 								                                          _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec0)),
 								                                    Cv[j][0]);
 								                        Cv[j][1] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
 								                                    updot(_mm256_sign_epi8(avec1, avec1),
 								                                            _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec1)),
 								                                    Cv[j][1]);
 								                        Cv[j][2] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
 								                                    updot(_mm256_sign_epi8(avec2, avec2),
 								                                            _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec2)),
 								                                    Cv[j][2]);
 								                        Cv[j][3] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
 								                                    updot(_mm256_sign_epi8(avec3, avec3),
 								                                            _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec3)),
 								                                    Cv[j][3]);
 								                }
 								            }
 								            for (int64_t j = 0; j < RN; ++j)
 								                for (int64_t i = 0; i < 4; ++i)
 								                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
 								        }
 								    }
 								    // Templated functions for gemm of dimensions Mx4
 								    template <int RM>
 								    NOINLINE void gemmMx4(int64_t m0, int64_t m, int64_t n0, int64_t n) {
 								        int64_t ytiles = (m - m0) / RM;
 								        int64_t xtiles = (n - n0) / 4;
 								        int64_t tiles = xtiles * ytiles;
 								        int64_t duty = (tiles + nth - 1) / nth;
 								        int64_t start = duty * ith;
 								        int64_t end = start + duty;
 								        if (end > tiles)
 								            end = tiles;
 								        for (int64_t job = start; job < end; ++job) {
 								            int64_t ii = m0 + job / xtiles * RM;
 								            int64_t jj = n0 + job % xtiles * 4;
 								            __m256 Cv[4][RM] = {};
 								            for (int64_t l = 0; l < k; ++l) {
 								                uint64_t b_delta = ((uint64_t)B[ldb * (jj + 3) + l].d << 48) | ((uint64_t)B[ldb * (jj + 2) + l].d << 32) | ((uint64_t)B[ldb * (jj + 1) + l].d << 16) | (B[ldb * (jj + 0) + l].d);
 								                // Convert delta values for four blocks to float values
 								                __m128 db = _mm_cvtph_ps(_mm_set_epi64x(0, b_delta));
 								                __m256i bvec0 = load(B + ldb * (jj + 0) + l);
 								                __m256i bvec1 = load(B + ldb * (jj + 1) + l);
 								                __m256i bvec2 = load(B + ldb * (jj + 2) + l);
 								                __m256i bvec3 = load(B + ldb * (jj + 3) + l);
 								                for (int64_t i = 0; i < RM; ++i) {
 								                    __m128 da = _mm_set1_ps(unhalf((A[lda * (ii + i) + l].d)));
 								                    // Computation of product of delta values for four blocks and replicate it across 256 bit lane
 								                    __m256 dvec =  _mm256_castps128_ps256(_mm_mul_ps(da, db));
 								                    dvec = _mm256_permute2f128_ps(dvec ,dvec, 0);
 								                    // Computation of dot product and multiplication with appropriate delta value products
 								                    Cv[0][i] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
 								                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
 								                                                            load(A + lda * (ii + i) + l)),
 								                                            _mm256_sign_epi8(bvec0, load(A + lda * (ii + i) + l))),
 								                                    Cv[0][i]);
 								                    Cv[1][i] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
 								                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
 								                                                            load(A + lda * (ii + i) + l)),
 								                                            _mm256_sign_epi8(bvec1, load(A + lda * (ii + i) + l))),
 								                                    Cv[1][i]);
 								                    Cv[2][i] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
 								                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
 								                                                            load(A + lda * (ii + i) + l)),
 								                                            _mm256_sign_epi8(bvec2, load(A + lda * (ii + i) + l))),
 								                                    Cv[2][i]);
 								                    Cv[3][i] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
 								                                    updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
 								                                                            load(A + lda * (ii + i) + l)),
 								                                            _mm256_sign_epi8(bvec3, load(A + lda * (ii + i) + l))),
 								                                    Cv[3][i]);
 								                }
 								            }
 								            for (int64_t j = 0; j < 4; ++j)
 								                for (int64_t i = 0; i < RM; ++i)
 								                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
 								        }
 								    }
 								#endif
 								    template <int RM, int RN>
 								    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
 								        int64_t ytiles = (m - m0) / RM;
 								        int64_t xtiles = (n - n0) / RN;
 								        int64_t tiles = xtiles * ytiles;
 								        int64_t duty = (tiles + nth - 1) / nth;
 								        int64_t start = duty * ith;
 								        int64_t end = start + duty;
 								        if (end > tiles)
 								            end = tiles;
 								        for (int64_t job = start; job < end; ++job) {
 								            int64_t ii = m0 + job / xtiles * RM;
 								            int64_t jj = n0 + job % xtiles * RN;
 								            __m256 Cv[RN][RM] = {};
 								            for (int64_t l = 0; l < k; ++l)
 								                for (int64_t j = 0; j < RN; ++j)
 								                    for (int64_t i = 0; i < RM; ++i) {
 								#if defined(__AVX2__)
 								                        __m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
 								                                                              load(A + lda * (ii + i) + l)),
 								                                             _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
 								                                                              load(A + lda * (ii + i) + l)));
 								#else
 								                        __m128i ali0 = load0(A + lda * (ii + i) + l);
 								                        __m128i ali1 = load1(A + lda * (ii + i) + l);
 								                        __m128i blj0 = load0(B + ldb * (jj + j) + l);
 								                        __m128i blj1 = load1(B + ldb * (jj + j) + l);
 								                        __m128i sepAA0 = _mm_sign_epi8(ali0, ali0);
 								                        __m128i sepAA1 = _mm_sign_epi8(ali1, ali1);
 								                        __m128i sepBA0 = _mm_sign_epi8(blj0, ali0);
 								                        __m128i sepBA1 = _mm_sign_epi8(blj1, ali1);
 								                        // updot
 								                        const __m128i oneFill = _mm_set1_epi16(1);
 								                        __m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0);
 								                        __m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1);
 								                        __m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0)));
 								#endif
 								                        Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
 								                                                       unhalf(B[ldb * (jj + j) + l].d)),
 								                                                       udTmp,
 								                                                       Cv[j][i]);
 								                    }
 								            for (int64_t j = 0; j < RN; ++j)
 								                for (int64_t i = 0; i < RM; ++i)
 								                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
 								        }
 								    }
 								    inline __m256i load(const block_q8_0 *b) {
 								        return _mm256_loadu_si256((const __m256i *)b->qs);
 								    }
 								    inline __m128i load0(const block_q8_0 *b) {
 								        return _mm_loadu_si128((const __m128i *)b->qs);
 								    }
 								    inline __m128i load1(const block_q8_0 *b) {
 								        return _mm_loadu_si128(((const __m128i *)b->qs) + 1);
 								    }
 								    inline __m256i load(const block_q4_0 *b) {
 								        return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8));
 								    }
 								    inline __m128i load0(const block_q4_0 *b) {
 								        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
 								        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8));
 								    }
 								    inline __m128i load1(const block_q4_0 *b) {
 								        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
 								        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
 								    }
-												IBM granite/granitemoe architecture support (#6760)

* fix(ext_server): Port llama.cpp sampling refactors to ext_server

This was a fairly large changeset. I closely followed the changes here:
https://github.com/ggerganov/llama.cpp/commit/df270ef74596da8f1178f08991f4c51f18c9ee82

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(server.cpp): Refactor server.cpp logging for llama.cpp overhaul

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Bump llama.cpp to the latest master with `granite` support

This does not yet have granite MoE support, but that can come in a
follow up PR

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(patches): Update all patches (except solar-pro) to work with bumped llama.cpp

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(solar): Update solar patch for llama.cpp bump

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama.cpp): Bump llama.cpp for granitemoe support

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama.cpp): Bump llama.cpp for granitemoe support

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(solar): Update the solar-pro patch for latest llama.cpp bump

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama.cpp): Bump to the latest master of llama.cpp

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(patches): Update all patches for latest bump

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama): Always run sync.sh from the right directory

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama/patches): Update llama patches

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama)!: Rough sync with llama.cpp submodule

There are a number of changes that will need to be propagated to llama.go
before any of this works!

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama/patches): Add a patch and update for missing ggml-impl.h include

This include is where the ggml_cgraph struct is defined. It is included in
many of the .c files to define the forward declartion in ggml.h. It seems
that with the subset of code included here, the import was somehow lost (or
out-of-order) when building, so adding this include to llama.cpp fixes the
missing definition.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama/sync): Add missing ggml-cpu-impl.h copy-over in sync.sh

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Add missing log.cpp

This was added as part of the logging overhaul done in llama.cpp

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Overhaul use of sampling module for llama.cpp changes

The changes here reflect the changes made in the big llama.cpp sampling PR
https://github.com/ggerganov/llama.cpp/pull/9294

The sampling functionality is now broken into the base interface
(llama_sampler) and the generation implementation (gpt_sampler). The
changes here reflect that. Since the sampling.h/sampling.cpp code uses c++
STL headers, the sampling_ext.[h|cpp] wrapper is maintained to allow go to
access a pure-C interface.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Fix the impl of SampleTokenGreedy for new sampling

I don't think this method is currently used, so it could probably just be
removed so that all sampling goes through the GPT interface, but in the
interest of doing no harm, this should keep the method working as expected.

Branch: IBMGraniteArchitectureSupport

* fix(llama): Remove unused SampleTokenGreedy

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(sync): Remove bash-specific change to sync.sh

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* chore(gofumpt): Format on llama.go to pass linting

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llm): Fix missing <thread> include in ext_server

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Remove TODO about grammar_first

This feature was not used/needed previously so should be fine without
plumbing it through now.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Better naming for sampling wrapper and args

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Fix patch 05 to use new wrapper api and re-sync

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* runner: Flush pending responses before returning

If there are any pending reponses (such as from potential stop
tokens) then we should send them back before ending the sequence.
Otherwise, we can be missing tokens at the end of a response.

Fixes #6707

* fix(llama/sampling): Use gpt_sampler with a forward declaration

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Remove unnecessary patch for gguf impl header

This was caused by an earlier mistake in the embeddings patch that was
dereferencing the pointer instead of using the wrapper API.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llm): Remove use of deprecated --log-disable flag

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
											
										
										
											2024-10-17 18:59:52 +00:00
+								    inline __m256i load(const block_iq4_nl *b) {
 								        return MM256_SET_M128I(load1(b), load0(b));
 								    }
 								    inline __m128i load0(const block_iq4_nl *b) {
 								        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
 								        return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x));
 								    }
 								    inline __m128i load1(const block_iq4_nl *b) {
 								        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
 								        return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)));
 								    }
-												Re-introduce the `llama` package (#5034)

* Re-introduce the llama package

This PR brings back the llama package, making it possible to call llama.cpp and
ggml APIs from Go directly via CGo. This has a few advantages:

- C APIs can be called directly from Go without needing to use the previous
  "server" REST API
- On macOS and for CPU builds on Linux and Windows, Ollama can be built without
  a go generate ./... step, making it easy to get up and running to hack on
  parts of Ollama that don't require fast inference
- Faster build times for AVX,AVX2,CUDA and ROCM (a full build of all runners
  takes <5 min on a fast CPU)
- No git submodule making it easier to clone and build from source

This is a big PR, but much of it is vendor code except for:

- llama.go CGo bindings
- example/: a simple example of running inference
- runner/: a subprocess server designed to replace the llm/ext_server package
- Makefile an as minimal as possible Makefile to build the runner package for
  different targets (cpu, avx, avx2, cuda, rocm)

Co-authored-by: Jesse Gross <jesse@ollama.com>
Co-authored-by: Daniel Hiltgen <daniel@ollama.com>

* cache: Clear old KV cache entries when evicting a slot

When forking a cache entry, if no empty slots are available we
evict the least recently used one and copy over the KV entries
from the closest match. However, this copy does not overwrite
existing values but only adds new ones. Therefore, we need to
clear the old slot first.

This change fixes two issues:
 - The KV cache fills up and runs out of space even though we think
   we are managing it correctly
 - Performance gets worse over time as we use new cache entries that
   are not hot in the processor caches

* doc: explain golang objc linker warning (#6830)

* llama: gather transitive dependencies for rocm for dist packaging (#6848)

* Refine go server makefiles to be more DRY (#6924)

This breaks up the monolithic Makefile for the Go based runners into a
set of utility files as well as recursive Makefiles for the runners.
Files starting with the name "Makefile" are buildable, while files that
end with ".make" are utilities to include in other Makefiles.  This
reduces the amount of nearly identical targets and helps set a pattern
for future community contributions for new GPU runner architectures.

When we are ready to switch over to the Go runners, these files should
move to the top of the repo, and we should add targets for the main CLI,
as well as a helper "install" (put all the built binaries on the local
system in a runnable state) and "dist" target (generate the various
tar/zip files for distribution) for local developer use.

* llama: don't create extraneous directories (#6988)

* llama: Exercise the new build in CI (#6989)

Wire up some basic sanity testing in CI for the Go runner.  GPU runners are not covered yet.

* llama: Refine developer docs for Go server (#6842)

This enhances the documentation for development focusing on the new Go
server.  After we complete the transition further doc refinements
can remove the "transition" discussion.

* runner.go: Allocate batches for all sequences during init

We should tell the model that we could have full batches for all
sequences. We already do this when we allocate the batches but it was
missed during initialization.

* llama.go: Don't return nil from Tokenize on zero length input

Potentially receiving nil in a non-error condition is surprising to
most callers - it's better to return an empty slice.

* runner.go: Remove stop tokens from cache

If the last token is EOG then we don't return this and it isn't
present in the cache (because it was never submitted to Decode).
This works well for extending the cache entry with a new sequence.

However, for multi-token stop sequences, we won't return any of the
tokens but all but the last one will be in the cache. This means
when the conversation continues the cache will contain tokens that
don't overlap with the new prompt.

This works (we will pick up the portion where there is overlap) but
it causes unnecessary cache thrashing because we will fork the original
cache entry as it is not a perfect match.

By trimming the cache to the tokens that we actually return this
issue can be avoided.

* runner.go: Simplify flushing of pending tokens

* runner.go: Update TODOs

* runner.go: Don't panic when processing sequences

If there is an error processing a sequence, we should return a
clean HTTP error back to Ollama rather than panicing. This will
make us more resilient to transient failures.

Panics can still occur during startup as there is no way to serve
requests if that fails.

Co-authored-by: jmorganca <jmorganca@gmail.com>

* runner.go: More accurately capture timings

Currently prompt processing time doesn't capture the that it takes
to tokenize the input, only decoding time. We should capture the
full process to more accurately reflect reality. This is especially
true once we start processing images where the initial processing
can take significant time. This is also more consistent with the
existing C++ runner.

* runner.go: Support for vision models

In addition to bringing feature parity with the C++ runner, this also
incorporates several improvements:
 - Cache prompting works with images, avoiding the need to re-decode
   embeddings for every message in a conversation
 - Parallelism is supported, avoiding the need to restrict to one
   sequence at a time. (Though for now Ollama will not schedule
   them while we might need to fall back to the old runner.)

Co-authored-by: jmorganca <jmorganca@gmail.com>

* runner.go: Move Unicode checking code and add tests

* runner.go: Export external cache members

Runner and cache are in the same package so the change doesn't
affect anything but it is more internally consistent.

* runner.go: Image embedding cache

Generating embeddings from images can take significant time (on
my machine between 100ms and 8s depending on the model). Although
we already cache the result of decoding these images, the embeddings
need to be regenerated every time. This is not necessary if we get
the same image over and over again, for example, during a conversation.

This currently uses a very small cache with a very simple algorithm
but it is easy to improve as is warranted.

* llama: catch up on patches

Carry forward solar-pro and cli-unicode patches

* runner.go: Don't re-allocate memory for every batch

We can reuse memory allocated from batch to batch since batch
size is fixed. This both saves the cost of reallocation as well
keeps the cache lines hot.

This results in a roughly 1% performance improvement for token
generation with Nvidia GPUs on Linux.

* runner.go: Default to classic input cache policy

The input cache as part of the go runner implemented a cache
policy that aims to maximize hit rate in both single and multi-
user scenarios. When there is a cache hit, the response is
very fast.

However, performance is actually slower when there is an input
cache miss due to worse GPU VRAM locality. This means that
performance is generally better overall for multi-user scenarios
(better input cache hit rate, locality was relatively poor already).
But worse for single users (input cache hit rate is about the same,
locality is now worse).

This defaults the policy back to the old one to avoid a regression
but keeps the new one available through an environment variable
OLLAMA_MULTIUSER_CACHE. This is left undocumented as the goal is
to improve this in the future to get the best of both worlds
without user configuration.

For inputs that result in cache misses, on Nvidia/Linux this
change improves performance by 31% for prompt processing and
13% for token generation.

* runner.go: Increase size of response channel

Generally the CPU can easily keep up with handling reponses that
are generated but there's no reason not to let generation continue
and handle things in larger batches if needed.

* llama: Add CI to verify all vendored changes have patches (#7066)

Make sure we don't accidentally merge changes in the vendored code
that aren't also reflected in the patches.

* llama: adjust clip patch for mingw utf-16 (#7065)

* llama: adjust clip patch for mingw utf-16

* llama: ensure static linking of runtime libs

Avoid runtime dependencies on non-standard libraries

* runner.go: Enable llamafile (all platforms) and BLAS (Mac OS)

These are two features that are shown on llama.cpp's system info
that are currently different between the two runners. On my test
systems the performance difference is very small to negligible
but it is probably still good to equalize the features.

* llm: Don't add BOS/EOS for tokenize requests

This is consistent with what server.cpp currently does. It affects
things like token processing counts for embedding requests.

* runner.go: Don't cache prompts for embeddings

Our integration with server.cpp implicitly disables prompt caching
because it is not part of the JSON object being parsed, this makes
the Go runner behavior similarly.

Prompt caching has been seen to affect the results of text completions
on certain hardware. The results are not wrong either way but they
are non-deterministic. However, embeddings seem to be affected even
on hardware that does not show this behavior for completions. For
now, it is best to maintain consistency with the existing behavior.

* runner.go: Adjust debug log levels

Add system info printed at startup and quiet down noisier logging.

* llama: fix compiler flag differences (#7082)

Adjust the flags for the new Go server to more closely match the
generate flow

* llama: refine developer docs (#7121)

* llama: doc and example clean up (#7122)

* llama: doc and example clean up

* llama: Move new dockerfile into llama dir

Temporary home until we fully transition to the Go server

* llama: runner doc cleanup

* llama.go: Add description for Tokenize error case

---------

Co-authored-by: Jesse Gross <jesse@ollama.com>
Co-authored-by: Daniel Hiltgen <daniel@ollama.com>
Co-authored-by: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
											
										
										
											2024-10-08 15:53:54 +00:00
+								    inline __m256 updot(__m256i u, __m256i s) {
 								        __m256i res;
 								#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
 								        res = _mm256_dpbusd_epi32(_mm256_setzero_si256(), u, s);
 								#else
 								        res = _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(u, s));
 								#endif
 								        return _mm256_cvtepi32_ps(res);
 								    }
 								    static inline __m256i denibble(const uint8_t *p) {
 								        __m128i x = _mm_loadu_si128((const __m128i *)p);
 								        return _mm256_and_si256(_mm256_set1_epi8(15),
 								                                _mm256_insertf128_si256(_mm256_castsi128_si256(x),
 								                                                        _mm_srli_epi16(x, 4), 1));
 								    }
 								    const TA *const A;
 								    const TB *const B;
 								    TC *const C;
 								    const int64_t k;
 								    const int64_t lda;
 								    const int64_t ldb;
 								    const int64_t ldc;
 								    const int ith;
 								    const int nth;
 								};
 								#endif // __AVX__
 								} // namespace
 								/**
 								 * Performs optimized matrix multiplication on CPU.
 								 *
 								 * This subroutine may compute C = Aᵀ * B with column major ordering.
 								 * Despite its name, this isn't a generalized implementation. Work is
 								 * only performed when a handwritten kernel is written and available.
 								 * Otherwise the caller should fall back to a general matmul routine.
 								 *
 								 * For example, for single-threaded single-precision GEMM you can say
 								 *
 								 *     llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
 								 *                     0, 1,
 								 *                     GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
 								 *
 								 * @param m is rows in `A` and `C`
 								 * @param n is cols in `B` and `C`
 								 * @param k is cols in `A` and rows in `B`
 								 * @param A is first input matrix (always transposed)
 								 * @param lda is row stride of `A`
 								 * @param B is second input matrix (never transposed)
 								 * @param ldb is row stride of `B`
 								 * @param C is input/output array of output matrices
 								 * @param ldc is row stride of `C`
 								 * @param ith is thread id (must be less than `nth`)
 								 * @param nth is number of threads (must be greater than zero)
 								 * @param Atype is GGML data type of `A`
 								 * @param Btype is GGML data type of `B`
 								 * @param Ctype is GGML data type of `C`
 								 * @return true if this function was able to service the matmul request
 								 */
 								bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
 								                     int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {
 								    assert(m >= 0);
 								    assert(n >= 0);
 								    assert(k >= 0);
 								    assert(lda >= k);
 								    assert(ldb >= k);
 								    assert(ldc >= m);
 								    assert(nth > 0);
 								    assert(ith < nth);
-												IBM granite/granitemoe architecture support (#6760)

* fix(ext_server): Port llama.cpp sampling refactors to ext_server

This was a fairly large changeset. I closely followed the changes here:
https://github.com/ggerganov/llama.cpp/commit/df270ef74596da8f1178f08991f4c51f18c9ee82

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(server.cpp): Refactor server.cpp logging for llama.cpp overhaul

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Bump llama.cpp to the latest master with `granite` support

This does not yet have granite MoE support, but that can come in a
follow up PR

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(patches): Update all patches (except solar-pro) to work with bumped llama.cpp

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(solar): Update solar patch for llama.cpp bump

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama.cpp): Bump llama.cpp for granitemoe support

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama.cpp): Bump llama.cpp for granitemoe support

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(solar): Update the solar-pro patch for latest llama.cpp bump

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama.cpp): Bump to the latest master of llama.cpp

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(patches): Update all patches for latest bump

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama): Always run sync.sh from the right directory

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama/patches): Update llama patches

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama)!: Rough sync with llama.cpp submodule

There are a number of changes that will need to be propagated to llama.go
before any of this works!

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama/patches): Add a patch and update for missing ggml-impl.h include

This include is where the ggml_cgraph struct is defined. It is included in
many of the .c files to define the forward declartion in ggml.h. It seems
that with the subset of code included here, the import was somehow lost (or
out-of-order) when building, so adding this include to llama.cpp fixes the
missing definition.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama/sync): Add missing ggml-cpu-impl.h copy-over in sync.sh

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Add missing log.cpp

This was added as part of the logging overhaul done in llama.cpp

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Overhaul use of sampling module for llama.cpp changes

The changes here reflect the changes made in the big llama.cpp sampling PR
https://github.com/ggerganov/llama.cpp/pull/9294

The sampling functionality is now broken into the base interface
(llama_sampler) and the generation implementation (gpt_sampler). The
changes here reflect that. Since the sampling.h/sampling.cpp code uses c++
STL headers, the sampling_ext.[h|cpp] wrapper is maintained to allow go to
access a pure-C interface.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Fix the impl of SampleTokenGreedy for new sampling

I don't think this method is currently used, so it could probably just be
removed so that all sampling goes through the GPT interface, but in the
interest of doing no harm, this should keep the method working as expected.

Branch: IBMGraniteArchitectureSupport

* fix(llama): Remove unused SampleTokenGreedy

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(sync): Remove bash-specific change to sync.sh

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* chore(gofumpt): Format on llama.go to pass linting

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llm): Fix missing <thread> include in ext_server

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Remove TODO about grammar_first

This feature was not used/needed previously so should be fine without
plumbing it through now.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Better naming for sampling wrapper and args

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Fix patch 05 to use new wrapper api and re-sync

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* runner: Flush pending responses before returning

If there are any pending reponses (such as from potential stop
tokens) then we should send them back before ending the sequence.
Otherwise, we can be missing tokens at the end of a response.

Fixes #6707

* fix(llama/sampling): Use gpt_sampler with a forward declaration

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Remove unnecessary patch for gguf impl header

This was caused by an earlier mistake in the embeddings patch that was
dereferencing the pointer instead of using the wrapper API.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llm): Remove use of deprecated --log-disable flag

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
											
										
										
											2024-10-17 18:59:52 +00:00
+								    // only enable sgemm for prompt processing
 								    if (n < 2)
 								        return false;
-												Re-introduce the `llama` package (#5034)

* Re-introduce the llama package

This PR brings back the llama package, making it possible to call llama.cpp and
ggml APIs from Go directly via CGo. This has a few advantages:

- C APIs can be called directly from Go without needing to use the previous
  "server" REST API
- On macOS and for CPU builds on Linux and Windows, Ollama can be built without
  a go generate ./... step, making it easy to get up and running to hack on
  parts of Ollama that don't require fast inference
- Faster build times for AVX,AVX2,CUDA and ROCM (a full build of all runners
  takes <5 min on a fast CPU)
- No git submodule making it easier to clone and build from source

This is a big PR, but much of it is vendor code except for:

- llama.go CGo bindings
- example/: a simple example of running inference
- runner/: a subprocess server designed to replace the llm/ext_server package
- Makefile an as minimal as possible Makefile to build the runner package for
  different targets (cpu, avx, avx2, cuda, rocm)

Co-authored-by: Jesse Gross <jesse@ollama.com>
Co-authored-by: Daniel Hiltgen <daniel@ollama.com>

* cache: Clear old KV cache entries when evicting a slot

When forking a cache entry, if no empty slots are available we
evict the least recently used one and copy over the KV entries
from the closest match. However, this copy does not overwrite
existing values but only adds new ones. Therefore, we need to
clear the old slot first.

This change fixes two issues:
 - The KV cache fills up and runs out of space even though we think
   we are managing it correctly
 - Performance gets worse over time as we use new cache entries that
   are not hot in the processor caches

* doc: explain golang objc linker warning (#6830)

* llama: gather transitive dependencies for rocm for dist packaging (#6848)

* Refine go server makefiles to be more DRY (#6924)

This breaks up the monolithic Makefile for the Go based runners into a
set of utility files as well as recursive Makefiles for the runners.
Files starting with the name "Makefile" are buildable, while files that
end with ".make" are utilities to include in other Makefiles.  This
reduces the amount of nearly identical targets and helps set a pattern
for future community contributions for new GPU runner architectures.

When we are ready to switch over to the Go runners, these files should
move to the top of the repo, and we should add targets for the main CLI,
as well as a helper "install" (put all the built binaries on the local
system in a runnable state) and "dist" target (generate the various
tar/zip files for distribution) for local developer use.

* llama: don't create extraneous directories (#6988)

* llama: Exercise the new build in CI (#6989)

Wire up some basic sanity testing in CI for the Go runner.  GPU runners are not covered yet.

* llama: Refine developer docs for Go server (#6842)

This enhances the documentation for development focusing on the new Go
server.  After we complete the transition further doc refinements
can remove the "transition" discussion.

* runner.go: Allocate batches for all sequences during init

We should tell the model that we could have full batches for all
sequences. We already do this when we allocate the batches but it was
missed during initialization.

* llama.go: Don't return nil from Tokenize on zero length input

Potentially receiving nil in a non-error condition is surprising to
most callers - it's better to return an empty slice.

* runner.go: Remove stop tokens from cache

If the last token is EOG then we don't return this and it isn't
present in the cache (because it was never submitted to Decode).
This works well for extending the cache entry with a new sequence.

However, for multi-token stop sequences, we won't return any of the
tokens but all but the last one will be in the cache. This means
when the conversation continues the cache will contain tokens that
don't overlap with the new prompt.

This works (we will pick up the portion where there is overlap) but
it causes unnecessary cache thrashing because we will fork the original
cache entry as it is not a perfect match.

By trimming the cache to the tokens that we actually return this
issue can be avoided.

* runner.go: Simplify flushing of pending tokens

* runner.go: Update TODOs

* runner.go: Don't panic when processing sequences

If there is an error processing a sequence, we should return a
clean HTTP error back to Ollama rather than panicing. This will
make us more resilient to transient failures.

Panics can still occur during startup as there is no way to serve
requests if that fails.

Co-authored-by: jmorganca <jmorganca@gmail.com>

* runner.go: More accurately capture timings

Currently prompt processing time doesn't capture the that it takes
to tokenize the input, only decoding time. We should capture the
full process to more accurately reflect reality. This is especially
true once we start processing images where the initial processing
can take significant time. This is also more consistent with the
existing C++ runner.

* runner.go: Support for vision models

In addition to bringing feature parity with the C++ runner, this also
incorporates several improvements:
 - Cache prompting works with images, avoiding the need to re-decode
   embeddings for every message in a conversation
 - Parallelism is supported, avoiding the need to restrict to one
   sequence at a time. (Though for now Ollama will not schedule
   them while we might need to fall back to the old runner.)

Co-authored-by: jmorganca <jmorganca@gmail.com>

* runner.go: Move Unicode checking code and add tests

* runner.go: Export external cache members

Runner and cache are in the same package so the change doesn't
affect anything but it is more internally consistent.

* runner.go: Image embedding cache

Generating embeddings from images can take significant time (on
my machine between 100ms and 8s depending on the model). Although
we already cache the result of decoding these images, the embeddings
need to be regenerated every time. This is not necessary if we get
the same image over and over again, for example, during a conversation.

This currently uses a very small cache with a very simple algorithm
but it is easy to improve as is warranted.

* llama: catch up on patches

Carry forward solar-pro and cli-unicode patches

* runner.go: Don't re-allocate memory for every batch

We can reuse memory allocated from batch to batch since batch
size is fixed. This both saves the cost of reallocation as well
keeps the cache lines hot.

This results in a roughly 1% performance improvement for token
generation with Nvidia GPUs on Linux.

* runner.go: Default to classic input cache policy

The input cache as part of the go runner implemented a cache
policy that aims to maximize hit rate in both single and multi-
user scenarios. When there is a cache hit, the response is
very fast.

However, performance is actually slower when there is an input
cache miss due to worse GPU VRAM locality. This means that
performance is generally better overall for multi-user scenarios
(better input cache hit rate, locality was relatively poor already).
But worse for single users (input cache hit rate is about the same,
locality is now worse).

This defaults the policy back to the old one to avoid a regression
but keeps the new one available through an environment variable
OLLAMA_MULTIUSER_CACHE. This is left undocumented as the goal is
to improve this in the future to get the best of both worlds
without user configuration.

For inputs that result in cache misses, on Nvidia/Linux this
change improves performance by 31% for prompt processing and
13% for token generation.

* runner.go: Increase size of response channel

Generally the CPU can easily keep up with handling reponses that
are generated but there's no reason not to let generation continue
and handle things in larger batches if needed.

* llama: Add CI to verify all vendored changes have patches (#7066)

Make sure we don't accidentally merge changes in the vendored code
that aren't also reflected in the patches.

* llama: adjust clip patch for mingw utf-16 (#7065)

* llama: adjust clip patch for mingw utf-16

* llama: ensure static linking of runtime libs

Avoid runtime dependencies on non-standard libraries

* runner.go: Enable llamafile (all platforms) and BLAS (Mac OS)

These are two features that are shown on llama.cpp's system info
that are currently different between the two runners. On my test
systems the performance difference is very small to negligible
but it is probably still good to equalize the features.

* llm: Don't add BOS/EOS for tokenize requests

This is consistent with what server.cpp currently does. It affects
things like token processing counts for embedding requests.

* runner.go: Don't cache prompts for embeddings

Our integration with server.cpp implicitly disables prompt caching
because it is not part of the JSON object being parsed, this makes
the Go runner behavior similarly.

Prompt caching has been seen to affect the results of text completions
on certain hardware. The results are not wrong either way but they
are non-deterministic. However, embeddings seem to be affected even
on hardware that does not show this behavior for completions. For
now, it is best to maintain consistency with the existing behavior.

* runner.go: Adjust debug log levels

Add system info printed at startup and quiet down noisier logging.

* llama: fix compiler flag differences (#7082)

Adjust the flags for the new Go server to more closely match the
generate flow

* llama: refine developer docs (#7121)

* llama: doc and example clean up (#7122)

* llama: doc and example clean up

* llama: Move new dockerfile into llama dir

Temporary home until we fully transition to the Go server

* llama: runner doc cleanup

* llama.go: Add description for Tokenize error case

---------

Co-authored-by: Jesse Gross <jesse@ollama.com>
Co-authored-by: Daniel Hiltgen <daniel@ollama.com>
Co-authored-by: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
											
										
										
											2024-10-08 15:53:54 +00:00
+								    if (Ctype != GGML_TYPE_F32)
 								        return false;
 								    switch (Atype) {
 								    case GGML_TYPE_F32: {
 								        if (Btype != GGML_TYPE_F32)
 								            return false;
 								#if defined(__AVX512F__)
 								        if (k % 16)
 								            return false;
 								        tinyBLAS<16, __m512, __m512, float, float, float> tb{
 								            k, (const float *)A, lda,
 								            (const float *)B, ldb,
 								            (float *)C, ldc,
 								            ith, nth};
 								        tb.matmul(m, n);
 								        return true;
 								#elif defined(__AVX__) || defined(__AVX2__)
 								        if (k % 8)
 								            return false;
 								        tinyBLAS<8, __m256, __m256, float, float, float> tb{
 								            k, (const float *)A, lda,
 								            (const float *)B, ldb,
 								            (float *)C, ldc,
 								            ith, nth};
 								        tb.matmul(m, n);
 								        return true;
 								#elif defined(__ARM_NEON)
 								        if (n < 4)
 								            return false;
 								        if (k % 4)
 								            return false;
 								        tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{
 								            k, (const float *)A, lda,
 								            (const float *)B, ldb,
 								            (float *)C, ldc,
 								            ith, nth};
 								        tb.matmul(m, n);
 								        return true;
 								#else
 								        return false;
 								#endif
 								    }
 								    case GGML_TYPE_F16: {
 								#if defined(__AVX512F__)
 								        if (k % 16)
 								            return false;
 								        if (Btype != GGML_TYPE_F32)
 								            return false;
 								        tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{
 								            k, (const ggml_fp16_t *)A, lda,
 								            (const float *)B, ldb,
 								            (float *)C, ldc,
 								            ith, nth};
 								        tb.matmul(m, n);
 								        return true;
 								#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
 								        if (k % 8)
 								            return false;
 								        if (Btype != GGML_TYPE_F32)
 								            return false;
 								        tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{
 								            k, (const ggml_fp16_t *)A, lda,
 								            (const float *)B, ldb,
 								            (float *)C, ldc,
 								            ith, nth};
 								        tb.matmul(m, n);
 								        return true;
 								#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
 								        if (n < 8)
 								            return false;
 								        if (k % 8)
 								            return false;
 								        if (Btype != GGML_TYPE_F16)
 								            return false;
 								        tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{
 								            k, (const ggml_fp16_t *)A, lda,
 								            (const ggml_fp16_t *)B, ldb,
 								            (float *)C, ldc,
 								            ith, nth};
 								        tb.matmul(m, n);
 								        return true;
 								#elif defined(__ARM_NEON) && !defined(_MSC_VER)
 								        if (k % 4)
 								            return false;
 								        if (Btype != GGML_TYPE_F32)
 								            return false;
 								        tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{
 								            k, (const ggml_fp16_t *)A, lda,
 								            (const float *)B, ldb,
 								            (float *)C, ldc,
 								            ith, nth};
 								        tb.matmul(m, n);
 								        return true;
 								#else
 								        return false;
 								#endif
 								    }
 								    case GGML_TYPE_Q8_0: {
 								        if (Btype != GGML_TYPE_Q8_0)
 								           return false;
 								#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
 								        tinyBLAS_Q0_AVX<block_q8_0, block_q8_0, float> tb{
 								            k, (const block_q8_0 *)A, lda,
 								            (const block_q8_0 *)B, ldb,
 								            (float *)C, ldc,
 								            ith, nth};
 								        tb.matmul(m, n);
 								        return true;
 								#elif defined(__ARM_FEATURE_DOTPROD)
 								        tinyBLAS_Q0_ARM<block_q8_0> tb{
 								            k, (const block_q8_0 *)A, lda,
 								            (const block_q8_0 *)B, ldb,
 								            (float *)C, ldc,
 								            ith, nth};
 								        tb.matmul(m, n);
 								        return true;
 								#else
 								        return false;
 								#endif
 								    }
 								    case GGML_TYPE_Q4_0: {
 								        if (Btype != GGML_TYPE_Q8_0)
 								            return false;
 								#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
 								        tinyBLAS_Q0_AVX<block_q4_0, block_q8_0, float> tb{
 								            k, (const block_q4_0 *)A, lda,
 								            (const block_q8_0 *)B, ldb,
 								            (float *)C, ldc,
 								            ith, nth};
 								        tb.matmul(m, n);
 								        return true;
 								#elif defined(__ARM_FEATURE_DOTPROD)
 								        tinyBLAS_Q0_ARM<block_q4_0> tb{
 								            k, (const block_q4_0 *)A, lda,
 								            (const block_q8_0 *)B, ldb,
 								            (float *)C, ldc,
 								            ith, nth};
 								        tb.matmul(m, n);
 								        return true;
 								#else
 								        return false;
 								#endif
 								    }
-												IBM granite/granitemoe architecture support (#6760)

* fix(ext_server): Port llama.cpp sampling refactors to ext_server

This was a fairly large changeset. I closely followed the changes here:
https://github.com/ggerganov/llama.cpp/commit/df270ef74596da8f1178f08991f4c51f18c9ee82

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(server.cpp): Refactor server.cpp logging for llama.cpp overhaul

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Bump llama.cpp to the latest master with `granite` support

This does not yet have granite MoE support, but that can come in a
follow up PR

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(patches): Update all patches (except solar-pro) to work with bumped llama.cpp

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(solar): Update solar patch for llama.cpp bump

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama.cpp): Bump llama.cpp for granitemoe support

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama.cpp): Bump llama.cpp for granitemoe support

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(solar): Update the solar-pro patch for latest llama.cpp bump

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama.cpp): Bump to the latest master of llama.cpp

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(patches): Update all patches for latest bump

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama): Always run sync.sh from the right directory

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama/patches): Update llama patches

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat(llama)!: Rough sync with llama.cpp submodule

There are a number of changes that will need to be propagated to llama.go
before any of this works!

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama/patches): Add a patch and update for missing ggml-impl.h include

This include is where the ggml_cgraph struct is defined. It is included in
many of the .c files to define the forward declartion in ggml.h. It seems
that with the subset of code included here, the import was somehow lost (or
out-of-order) when building, so adding this include to llama.cpp fixes the
missing definition.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama/sync): Add missing ggml-cpu-impl.h copy-over in sync.sh

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Add missing log.cpp

This was added as part of the logging overhaul done in llama.cpp

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Overhaul use of sampling module for llama.cpp changes

The changes here reflect the changes made in the big llama.cpp sampling PR
https://github.com/ggerganov/llama.cpp/pull/9294

The sampling functionality is now broken into the base interface
(llama_sampler) and the generation implementation (gpt_sampler). The
changes here reflect that. Since the sampling.h/sampling.cpp code uses c++
STL headers, the sampling_ext.[h|cpp] wrapper is maintained to allow go to
access a pure-C interface.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Fix the impl of SampleTokenGreedy for new sampling

I don't think this method is currently used, so it could probably just be
removed so that all sampling goes through the GPT interface, but in the
interest of doing no harm, this should keep the method working as expected.

Branch: IBMGraniteArchitectureSupport

* fix(llama): Remove unused SampleTokenGreedy

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(sync): Remove bash-specific change to sync.sh

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* chore(gofumpt): Format on llama.go to pass linting

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llm): Fix missing <thread> include in ext_server

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Remove TODO about grammar_first

This feature was not used/needed previously so should be fine without
plumbing it through now.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Better naming for sampling wrapper and args

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Fix patch 05 to use new wrapper api and re-sync

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* runner: Flush pending responses before returning

If there are any pending reponses (such as from potential stop
tokens) then we should send them back before ending the sequence.
Otherwise, we can be missing tokens at the end of a response.

Fixes #6707

* fix(llama/sampling): Use gpt_sampler with a forward declaration

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llama): Remove unnecessary patch for gguf impl header

This was caused by an earlier mistake in the embeddings patch that was
dereferencing the pointer instead of using the wrapper API.

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(llm): Remove use of deprecated --log-disable flag

Branch: IBMGraniteArchitectureSupport

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
											
										
										
											2024-10-17 18:59:52 +00:00
+								    case GGML_TYPE_IQ4_NL: {
 								        if (Btype != GGML_TYPE_Q8_0)
 								            return false;
 								#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
 								        tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
 								            k, (const block_iq4_nl *)A, lda,
 								            (const block_q8_0 *)B, ldb,
 								            (float *)C, ldc,
 								            ith, nth};
 								        tb.matmul(m, n);
 								        return true;
 								#else
 								        return false;
 								#endif
 								    }
-												Re-introduce the `llama` package (#5034)

* Re-introduce the llama package

This PR brings back the llama package, making it possible to call llama.cpp and
ggml APIs from Go directly via CGo. This has a few advantages:

- C APIs can be called directly from Go without needing to use the previous
  "server" REST API
- On macOS and for CPU builds on Linux and Windows, Ollama can be built without
  a go generate ./... step, making it easy to get up and running to hack on
  parts of Ollama that don't require fast inference
- Faster build times for AVX,AVX2,CUDA and ROCM (a full build of all runners
  takes <5 min on a fast CPU)
- No git submodule making it easier to clone and build from source

This is a big PR, but much of it is vendor code except for:

- llama.go CGo bindings
- example/: a simple example of running inference
- runner/: a subprocess server designed to replace the llm/ext_server package
- Makefile an as minimal as possible Makefile to build the runner package for
  different targets (cpu, avx, avx2, cuda, rocm)

Co-authored-by: Jesse Gross <jesse@ollama.com>
Co-authored-by: Daniel Hiltgen <daniel@ollama.com>

* cache: Clear old KV cache entries when evicting a slot

When forking a cache entry, if no empty slots are available we
evict the least recently used one and copy over the KV entries
from the closest match. However, this copy does not overwrite
existing values but only adds new ones. Therefore, we need to
clear the old slot first.

This change fixes two issues:
 - The KV cache fills up and runs out of space even though we think
   we are managing it correctly
 - Performance gets worse over time as we use new cache entries that
   are not hot in the processor caches

* doc: explain golang objc linker warning (#6830)

* llama: gather transitive dependencies for rocm for dist packaging (#6848)

* Refine go server makefiles to be more DRY (#6924)

This breaks up the monolithic Makefile for the Go based runners into a
set of utility files as well as recursive Makefiles for the runners.
Files starting with the name "Makefile" are buildable, while files that
end with ".make" are utilities to include in other Makefiles.  This
reduces the amount of nearly identical targets and helps set a pattern
for future community contributions for new GPU runner architectures.

When we are ready to switch over to the Go runners, these files should
move to the top of the repo, and we should add targets for the main CLI,
as well as a helper "install" (put all the built binaries on the local
system in a runnable state) and "dist" target (generate the various
tar/zip files for distribution) for local developer use.

* llama: don't create extraneous directories (#6988)

* llama: Exercise the new build in CI (#6989)

Wire up some basic sanity testing in CI for the Go runner.  GPU runners are not covered yet.

* llama: Refine developer docs for Go server (#6842)

This enhances the documentation for development focusing on the new Go
server.  After we complete the transition further doc refinements
can remove the "transition" discussion.

* runner.go: Allocate batches for all sequences during init

We should tell the model that we could have full batches for all
sequences. We already do this when we allocate the batches but it was
missed during initialization.

* llama.go: Don't return nil from Tokenize on zero length input

Potentially receiving nil in a non-error condition is surprising to
most callers - it's better to return an empty slice.

* runner.go: Remove stop tokens from cache

If the last token is EOG then we don't return this and it isn't
present in the cache (because it was never submitted to Decode).
This works well for extending the cache entry with a new sequence.

However, for multi-token stop sequences, we won't return any of the
tokens but all but the last one will be in the cache. This means
when the conversation continues the cache will contain tokens that
don't overlap with the new prompt.

This works (we will pick up the portion where there is overlap) but
it causes unnecessary cache thrashing because we will fork the original
cache entry as it is not a perfect match.

By trimming the cache to the tokens that we actually return this
issue can be avoided.

* runner.go: Simplify flushing of pending tokens

* runner.go: Update TODOs

* runner.go: Don't panic when processing sequences

If there is an error processing a sequence, we should return a
clean HTTP error back to Ollama rather than panicing. This will
make us more resilient to transient failures.

Panics can still occur during startup as there is no way to serve
requests if that fails.

Co-authored-by: jmorganca <jmorganca@gmail.com>

* runner.go: More accurately capture timings

Currently prompt processing time doesn't capture the that it takes
to tokenize the input, only decoding time. We should capture the
full process to more accurately reflect reality. This is especially
true once we start processing images where the initial processing
can take significant time. This is also more consistent with the
existing C++ runner.

* runner.go: Support for vision models

In addition to bringing feature parity with the C++ runner, this also
incorporates several improvements:
 - Cache prompting works with images, avoiding the need to re-decode
   embeddings for every message in a conversation
 - Parallelism is supported, avoiding the need to restrict to one
   sequence at a time. (Though for now Ollama will not schedule
   them while we might need to fall back to the old runner.)

Co-authored-by: jmorganca <jmorganca@gmail.com>

* runner.go: Move Unicode checking code and add tests

* runner.go: Export external cache members

Runner and cache are in the same package so the change doesn't
affect anything but it is more internally consistent.

* runner.go: Image embedding cache

Generating embeddings from images can take significant time (on
my machine between 100ms and 8s depending on the model). Although
we already cache the result of decoding these images, the embeddings
need to be regenerated every time. This is not necessary if we get
the same image over and over again, for example, during a conversation.

This currently uses a very small cache with a very simple algorithm
but it is easy to improve as is warranted.

* llama: catch up on patches

Carry forward solar-pro and cli-unicode patches

* runner.go: Don't re-allocate memory for every batch

We can reuse memory allocated from batch to batch since batch
size is fixed. This both saves the cost of reallocation as well
keeps the cache lines hot.

This results in a roughly 1% performance improvement for token
generation with Nvidia GPUs on Linux.

* runner.go: Default to classic input cache policy

The input cache as part of the go runner implemented a cache
policy that aims to maximize hit rate in both single and multi-
user scenarios. When there is a cache hit, the response is
very fast.

However, performance is actually slower when there is an input
cache miss due to worse GPU VRAM locality. This means that
performance is generally better overall for multi-user scenarios
(better input cache hit rate, locality was relatively poor already).
But worse for single users (input cache hit rate is about the same,
locality is now worse).

This defaults the policy back to the old one to avoid a regression
but keeps the new one available through an environment variable
OLLAMA_MULTIUSER_CACHE. This is left undocumented as the goal is
to improve this in the future to get the best of both worlds
without user configuration.

For inputs that result in cache misses, on Nvidia/Linux this
change improves performance by 31% for prompt processing and
13% for token generation.

* runner.go: Increase size of response channel

Generally the CPU can easily keep up with handling reponses that
are generated but there's no reason not to let generation continue
and handle things in larger batches if needed.

* llama: Add CI to verify all vendored changes have patches (#7066)

Make sure we don't accidentally merge changes in the vendored code
that aren't also reflected in the patches.

* llama: adjust clip patch for mingw utf-16 (#7065)

* llama: adjust clip patch for mingw utf-16

* llama: ensure static linking of runtime libs

Avoid runtime dependencies on non-standard libraries

* runner.go: Enable llamafile (all platforms) and BLAS (Mac OS)

These are two features that are shown on llama.cpp's system info
that are currently different between the two runners. On my test
systems the performance difference is very small to negligible
but it is probably still good to equalize the features.

* llm: Don't add BOS/EOS for tokenize requests

This is consistent with what server.cpp currently does. It affects
things like token processing counts for embedding requests.

* runner.go: Don't cache prompts for embeddings

Our integration with server.cpp implicitly disables prompt caching
because it is not part of the JSON object being parsed, this makes
the Go runner behavior similarly.

Prompt caching has been seen to affect the results of text completions
on certain hardware. The results are not wrong either way but they
are non-deterministic. However, embeddings seem to be affected even
on hardware that does not show this behavior for completions. For
now, it is best to maintain consistency with the existing behavior.

* runner.go: Adjust debug log levels

Add system info printed at startup and quiet down noisier logging.

* llama: fix compiler flag differences (#7082)

Adjust the flags for the new Go server to more closely match the
generate flow

* llama: refine developer docs (#7121)

* llama: doc and example clean up (#7122)

* llama: doc and example clean up

* llama: Move new dockerfile into llama dir

Temporary home until we fully transition to the Go server

* llama: runner doc cleanup

* llama.go: Add description for Tokenize error case

---------

Co-authored-by: Jesse Gross <jesse@ollama.com>
Co-authored-by: Daniel Hiltgen <daniel@ollama.com>
Co-authored-by: Daniel Hiltgen <dhiltgen@users.noreply.github.com>
											
										
										
											2024-10-08 15:53:54 +00:00
+								    default:
 								        return false;
 								    }
 								    (void)m;
 								    (void)n;
 								    (void)k;
 								    (void)A;
 								    (void)lda;
 								    (void)B;
 								    (void)ldb;
 								    (void)C;
 								    (void)ldc;
 								    (void)ith;
 								    (void)nth;
 								    (void)Atype;
 								    (void)Btype;
 								    (void)Ctype;
 								}