diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh
index 0baf86ff..721a9ae8 100755
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -18,7 +18,7 @@ sign() {
     fi
 }
 
-COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
+COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_OPENMP=off"
 
 case "${GOARCH}" in
 "amd64")
@@ -27,7 +27,7 @@ case "${GOARCH}" in
         # Static build for linking into the Go binary
         init_vars
         CMAKE_TARGETS="--target llama --target ggml"
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+        CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_BLAS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
         BUILD_DIR="../build/darwin/${ARCH}_static"
         echo "Building static library"
         build
@@ -37,7 +37,7 @@ case "${GOARCH}" in
         # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
         #
         init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
         BUILD_DIR="../build/darwin/${ARCH}/cpu"
         echo "Building LCD CPU"
         build
@@ -49,7 +49,7 @@ case "${GOARCH}" in
         # Approximately 400% faster than LCD on same CPU
         #
         init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
         BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
         echo "Building AVX CPU"
         build
@@ -61,7 +61,7 @@ case "${GOARCH}" in
         # Approximately 10% faster than AVX on same CPU
         #
         init_vars
-        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
+        CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_BLAS=off -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
         BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
         echo "Building AVX2 CPU"
         EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
@@ -75,7 +75,7 @@ case "${GOARCH}" in
         # Static build for linking into the Go binary
         init_vars
         CMAKE_TARGETS="--target llama --target ggml"
-        CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+        CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_BLAS=off -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
         BUILD_DIR="../build/darwin/${ARCH}_static"
         echo "Building static library"
         build
diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index a9df6ff8..f9100875 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
         export CUDACXX=$(command -v nvcc)
     fi
 fi
-COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
+COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
@@ -64,7 +64,7 @@ if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ];
     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_OPENMP=off ${CMAKE_DEFS}"
     BUILD_DIR="../build/linux/${ARCH}_static"
     echo "Building static library"
     build
@@ -93,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
     # -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
     # -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
 
-    COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
+    COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_OPENMP=off"
     if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
         #
         # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1
index 7b56aed3..0eb48ffa 100644
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -37,7 +37,8 @@ function init_vars {
     }
     $script:cmakeDefs = @(
         "-DBUILD_SHARED_LIBS=on",
-        "-DLLAMA_NATIVE=off"
+        "-DLLAMA_NATIVE=off",
+        "-DLLAMA_OPENMP=off"
         )
     $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
     $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
@@ -206,7 +207,8 @@ function build_static() {
             "-DLLAMA_AVX2=off",
             "-DLLAMA_AVX512=off",
             "-DLLAMA_F16C=off",
-            "-DLLAMA_FMA=off")
+            "-DLLAMA_FMA=off",
+            "-DLLAMA_OPENMP=off")
         $script:buildDir="../build/windows/${script:ARCH}_static"
         write-host "Building static library"
         build
diff --git a/llm/llama.cpp b/llm/llama.cpp
index 5921b8f0..7c26775a 160000
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@@ -1 +1 @@
-Subproject commit 5921b8f089d3b7bda86aac5a66825df6a6c10603
+Subproject commit 7c26775adb579e92b59c82e8084c07a1d0f75e9c
diff --git a/llm/patches/01-load-progress.diff b/llm/patches/01-load-progress.diff
index acd44d20..be528609 100644
--- a/llm/patches/01-load-progress.diff
+++ b/llm/patches/01-load-progress.diff
@@ -1,8 +1,8 @@
 diff --git a/common/common.cpp b/common/common.cpp
-index ba1ecf0e..cead57cc 100644
+index 73ff0e85..6adb1a92 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -1836,6 +1836,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
+@@ -2447,6 +2447,8 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
      mparams.use_mmap        = params.use_mmap;
      mparams.use_mlock       = params.use_mlock;
      mparams.check_tensors   = params.check_tensors;
@@ -12,20 +12,20 @@ index ba1ecf0e..cead57cc 100644
          mparams.kv_overrides = NULL;
      } else {
 diff --git a/common/common.h b/common/common.h
-index d80344f2..71e84834 100644
+index 58ed72f4..0bb2605e 100644
 --- a/common/common.h
 +++ b/common/common.h
-@@ -174,6 +174,13 @@ struct gpt_params {
-     // multimodal models (see examples/llava)
+@@ -180,6 +180,13 @@ struct gpt_params {
      std::string mmproj = "";        // path to multimodal projector
      std::vector<std::string> image; // path to image file(s)
-+
+ 
 +    // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
 +    // If the provided progress_callback returns true, model loading continues.
 +    // If it returns false, model loading is immediately aborted.
 +    llama_progress_callback progress_callback = NULL;
 +    // context pointer passed to the progress callback
 +    void * progress_callback_user_data;
- };
- 
- void gpt_params_handle_model_default(gpt_params & params);
++
+     // server params
+     int32_t port           = 8080;         // server listens on this network port
+     int32_t timeout_read   = 600;          // http read timeout in seconds
diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff
index 27c8aabc..2a2e7306 100644
--- a/llm/patches/05-default-pretokenizer.diff
+++ b/llm/patches/05-default-pretokenizer.diff
@@ -1,8 +1,8 @@
 diff --git a/llama.cpp b/llama.cpp
-index 40d2ec2c..74f3ee9c 100644
+index 61948751..4b72a293 100644
 --- a/llama.cpp
 +++ b/llama.cpp
-@@ -4642,16 +4642,7 @@ static void llm_load_vocab(
+@@ -4824,16 +4824,7 @@ static void llm_load_vocab(
  
      // for now, only BPE models have pre-tokenizers
      if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
@@ -15,14 +15,14 @@ index 40d2ec2c..74f3ee9c 100644
 -            LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
 -            LLAMA_LOG_WARN("%s:                                             \n", __func__);
 -            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
--        } else if (
-+        if (
-             tokenizer_pre == "default") {
+-        } else if (tokenizer_pre == "default") {
++        if (tokenizer_pre == "default") {
              vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
          } else if (
+             tokenizer_pre == "llama3" ||
-@@ -4703,7 +4694,8 @@ static void llm_load_vocab(
-             tokenizer_pre == "smaug-bpe") {
-             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+@@ -4888,7 +4879,8 @@ static void llm_load_vocab(
+             tokenizer_pre == "poro-chat") {
+             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
          } else {
 -            throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +            LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);