From 811b1f03c8270baa8004aff9e977ed3d18593661 Mon Sep 17 00:00:00 2001
From: Bruce MacDonald
Date: Fri, 24 Nov 2023 13:58:09 -0500
Subject: [PATCH] deprecate ggml

- remove ggml runner
- automatically pull gguf models when ggml detected
- tell users to update to gguf in the case automatic pull fails

Co-Authored-By: Jeffrey Morgan
---
 .dockerignore | 1 -
 .gitmodules | 5 --
 cmd/cmd.go | 24 ++++-
 docs/modelfile.md | 2 +-
 llm/ggml.go | 78 +---------------
 llm/llama.cpp/generate_darwin_amd64.go | 9 --
 llm/llama.cpp/generate_darwin_arm64.go | 9 --
 llm/llama.cpp/generate_linux.go | 12 ---
 llm/llama.cpp/generate_windows.go | 7 --
 llm/llama.cpp/ggml | 1 -
 .../0001-add-detokenize-endpoint.patch | 51 -----------
 .../patches/0002-34B-model-support.patch | 89 ------------------
 ...onization-in-new-matrix-multiplicati.patch | 30 -------
 ...dd-missing-barriers-for-mul-mat-2699.patch | 41 ---------
 ...DA-s-half-type-for-aarch64-1455-2670.patch | 32 -------
 llm/llama.go | 23 +++--
 llm/llm.go | 16 +---
 server/images.go | 35 +++++++-
 server/routes.go | 2 +-
 19 files changed, 74 insertions(+), 393 deletions(-)
 delete mode 160000 llm/llama.cpp/ggml
 delete mode 100644 llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch
 delete mode 100644 llm/llama.cpp/patches/0002-34B-model-support.patch
 delete mode 100644 llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
 delete mode 100644 llm/llama.cpp/patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
 delete mode 100644 llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch

diff --git a/.dockerignore b/.dockerignore
index 150c8f6e..116c58f7 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -3,7 +3,6 @@ ollama
 app
 dist
 scripts
-llm/llama.cpp/ggml
 llm/llama.cpp/gguf
 .env
 .cache
diff --git a/.gitmodules b/.gitmodules
index 49a54fa9..5a65e4df 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,8 +1,3 @@
-[submodule "llm/llama.cpp/ggml"]
-	path = llm/llama.cpp/ggml
-	url = https://github.com/ggerganov/llama.cpp.git
-	ignore = dirty
-	shallow = true
 [submodule "llm/llama.cpp/gguf"]
 	path = llm/llama.cpp/gguf
 	url = https://github.com/ggerganov/llama.cpp.git
diff --git a/cmd/cmd.go b/cmd/cmd.go
index b0ba416b..23d9c120 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -572,10 +572,30 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
 	}
 
 	if err := client.Generate(ctx, &request, fn); err != nil {
-		if errors.Is(err, context.Canceled) {
+		switch {
+		case errors.Is(err, context.Canceled):
 			return nil
+		case strings.Contains(err.Error(), "unsupported model format"):
+			// pull and retry to see if the model has been updated
+			parts := strings.Split(opts.Model, string(os.PathSeparator))
+			if len(parts) == 1 {
+				// this is a library model, log some info
+				fmt.Fprintln(os.Stderr, "This model is no longer compatible with Ollama. Pulling a new version...")
+			}
+			if err := PullHandler(cmd, []string{opts.Model}); err != nil {
+				fmt.Printf("Error: %s\n", err)
+				return fmt.Errorf("unsupported model, please update this model to gguf format") // relay the original error
+			}
+			// retry
+			if err := client.Generate(ctx, &request, fn); err != nil {
+				if errors.Is(err, context.Canceled) {
+					return nil
+				}
+				return err
+			}
+		default:
+			return err
 		}
-		return err
 	}
 	if opts.Prompt != "" {
 		fmt.Println()
diff --git a/docs/modelfile.md b/docs/modelfile.md
index 20113090..80e896eb 100644
--- a/docs/modelfile.md
+++ b/docs/modelfile.md
@@ -188,7 +188,7 @@ SYSTEM """<system message>"""
 
 ### ADAPTER
 
-The `ADAPTER` instruction specifies the LoRA adapter to apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
+The `ADAPTER` instruction specifies the LoRA adapter to apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGUF file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
 ```modelfile
 ADAPTER ./ollama-lora.bin
 ```
diff --git a/llm/ggml.go b/llm/ggml.go
index 8d421e03..f71328e1 100644
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -86,74 +86,6 @@ type container interface {
 	Decode(*readSeekOffset) (model, error)
 }
 
-type containerGGML struct{}
-
-func (c *containerGGML) Name() string {
-	return "ggml"
-}
-
-func (c *containerGGML) Decode(ro *readSeekOffset) (model, error) {
-	// file contents aren't decoded
-	ro.Seek(0, io.SeekEnd)
-	return nil, nil
-}
-
-type containerGGMF struct {
-	version uint32
-}
-
-func (c *containerGGMF) Name() string {
-	return "ggmf"
-}
-
-func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) {
-	var version uint32
-	binary.Read(ro, binary.LittleEndian, &version)
-
-	switch version {
-	case 1:
-	default:
-		return nil, errors.New("invalid version")
-	}
-
-	c.version = version
-
-	// remaining file contents aren't decoded
-	ro.Seek(0, io.SeekEnd)
-
-	return nil, nil
-}
-
-type containerGGJT struct {
-	version uint32
-}
-
-func (c *containerGGJT) Name() string {
-	return "ggjt"
-}
-
-func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) {
-	var version uint32
-	binary.Read(ro, binary.LittleEndian, &version)
-
-	switch version {
-	case 1, 2, 3:
-	default:
-		return nil, errors.New("invalid version")
-	}
-
-	c.version = version
-
-	// different model types may have different layouts for hyperparameters
-	var llama llamaModel
-	binary.Read(ro, binary.LittleEndian, &llama.hyperparameters)
-
-	// remaining file contents aren't decoded
-	ro.Seek(0, io.SeekEnd)
-
-	return &llama, nil
-}
-
 type containerLORA struct {
 	version uint32
 }
@@ -194,6 +126,8 @@ const (
 	FILE_MAGIC_GGUF_BE = 0x47475546
 )
 
+var ErrUnsupportedFormat = errors.New("unsupported model format")
+
 func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
 	ro := readSeekOffset{ReadSeeker: r}
 
@@ -204,12 +138,8 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
 
 	var c container
 	switch magic {
-	case FILE_MAGIC_GGML:
-		c = &containerGGML{}
-	case FILE_MAGIC_GGMF:
-		c = &containerGGMF{}
-	case FILE_MAGIC_GGJT:
-		c = &containerGGJT{}
+	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
+		return nil, ErrUnsupportedFormat
 	case FILE_MAGIC_GGLA:
 		c = &containerLORA{}
 	case FILE_MAGIC_GGUF_LE:
diff --git a/llm/llama.cpp/generate_darwin_amd64.go b/llm/llama.cpp/generate_darwin_amd64.go
index 65a53386..fed45fd9 100644
--- a/llm/llama.cpp/generate_darwin_amd64.go
+++ b/llm/llama.cpp/generate_darwin_amd64.go
@@ -2,15 +2,6 @@ package llm
 
 //go:generate git submodule init
 
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
-//go:generate cmake --build ggml/build/cpu --target server --config Release
-//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner
-
 //go:generate git submodule update --force gguf
 //go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
 //go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_METAL=off -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=on
diff --git a/llm/llama.cpp/generate_darwin_arm64.go b/llm/llama.cpp/generate_darwin_arm64.go
index 81fd8914..0c33bc51 100644
--- a/llm/llama.cpp/generate_darwin_arm64.go
+++ b/llm/llama.cpp/generate_darwin_arm64.go
@@ -2,15 +2,6 @@ package llm
 
 //go:generate git submodule init
 
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
-//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
-//go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
-//go:generate cmake --build ggml/build/metal --target server --config Release
-//go:generate mv ggml/build/metal/bin/server ggml/build/metal/bin/ollama-runner
-
 //go:generate git submodule update --force gguf
 //go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
 //go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
diff --git a/llm/llama.cpp/generate_linux.go b/llm/llama.cpp/generate_linux.go
index ce9e78a5..e67ca21a 100644
--- a/llm/llama.cpp/generate_linux.go
+++ b/llm/llama.cpp/generate_linux.go
@@ -2,15 +2,6 @@ package llm
 
 //go:generate git submodule init
 
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
-//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
-//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
-//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cpu --target server --config Release
-//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner
-
 //go:generate git submodule update --force gguf
 //go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
 //go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
@@ -18,9 +9,6 @@ package llm
 //go:generate cmake --build gguf/build/cpu --target server --config Release
 //go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner
 
-//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cuda --target server --config Release
-//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner
 //go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA_PEER_MAX_BATCH_SIZE=0
 //go:generate cmake --build gguf/build/cuda --target server --config Release
 //go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner
diff --git a/llm/llama.cpp/generate_windows.go b/llm/llama.cpp/generate_windows.go
index 2fb4c39f..6cd9566f 100644
--- a/llm/llama.cpp/generate_windows.go
+++ b/llm/llama.cpp/generate_windows.go
@@ -2,13 +2,6 @@ package llm
 
 //go:generate git submodule init
 
-//go:generate git submodule update --force ggml
-//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
-//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
-//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/cpu --target server --config Release
-//go:generate cmd /c move ggml\build\cpu\bin\Release\server.exe ggml\build\cpu\bin\Release\ollama-runner.exe
-
 //go:generate git submodule update --force gguf
 //go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
 //go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
diff --git a/llm/llama.cpp/ggml b/llm/llama.cpp/ggml
deleted file mode 160000
index 9e232f02..00000000
--- a/llm/llama.cpp/ggml
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 9e232f0234073358e7031c1b8d7aa45020469a3b
diff --git a/llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch b/llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch
deleted file mode 100644
index 34ea0e2d..00000000
--- a/llm/llama.cpp/patches/0001-add-detokenize-endpoint.patch
+++ /dev/null
@@ -1,51 +0,0 @@
-From 032ef7ff2423f5117bb59d42fb71be9cebf0a2de Mon Sep 17 00:00:00 2001
-From: Bruce MacDonald
-Date: Mon, 28 Aug 2023 18:08:12 -0400
-Subject: [PATCH] add detokenize endpoint
-
----
- examples/server/server.cpp | 21 +++++++++++++++++++++
- 1 file changed, 21 insertions(+)
-
-diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 9966045..5014691 100644
---- a/examples/server/server.cpp
-+++ b/examples/server/server.cpp
-@@ -1075,6 +1075,12 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
-         {"tokens", tokens}};
- }
- 
-+static json format_detokenized_response(std::string content)
-+{
-+    return json{
-+        {"content", content}};
-+}
-+
- static void parse_options_completion(const json &body, llama_server_context &llama)
- {
-     gpt_params default_params;
-@@ -1361,6 +1367,21 @@ int main(int argc, char **argv)
-             const json data = format_tokenizer_response(tokens);
-             return res.set_content(data.dump(), "application/json"); });
- 
-+    svr.Post("/detokenize", [&llama](const Request &req, Response &res)
-+              {
-+                  auto lock = llama.lock();
-+
-+                  const json body = json::parse(req.body);
-+                  std::string content;
-+                  if (body.count("tokens") != 0)
-+                  {
-+                      const std::vector<llama_token> tokens = body["tokens"];
-+                      content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
-+                  }
-+
-+                  const json data = format_detokenized_response(content);
-+                  return res.set_content(data.dump(), "application/json"); });
-+
-     svr.Post("/embedding", [&llama](const Request &req, Response &res)
-               {
-                  auto lock = llama.lock();
--- 
-2.39.2 (Apple Git-143)
-
diff --git a/llm/llama.cpp/patches/0002-34B-model-support.patch b/llm/llama.cpp/patches/0002-34B-model-support.patch
deleted file mode 100644
index 275a6cbb..00000000
--- a/llm/llama.cpp/patches/0002-34B-model-support.patch
+++ /dev/null
@@ -1,89 +0,0 @@
-From 6145068a6613c37bb43a7408b5496524bdcfc402 Mon Sep 17 00:00:00 2001
-From: Bruce MacDonald
-Date: Mon, 28 Aug 2023 18:08:53 -0400
-Subject: [PATCH] 34B model support
-
----
- llama.cpp | 10 ++++++++++
- 1 file changed, 10 insertions(+)
-
-diff --git a/llama.cpp b/llama.cpp
-index f2cbe76..62c5cdf 100644
---- a/llama.cpp
-+++ b/llama.cpp
-@@ -79,6 +79,7 @@ enum e_model {
-     MODEL_7B,
-     MODEL_13B,
-     MODEL_30B,
-+    MODEL_34B,
-     MODEL_65B,
-     MODEL_70B,
- };
-@@ -122,6 +123,7 @@ static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
-         { MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
-         { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
-         { MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
-+        { MODEL_34B, ((size_t) n_ctx / 9ull + 160ull) * MB },
-         { MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
-         { MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
-     };
-@@ -135,6 +137,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
-         { MODEL_7B, 160ull * MB },
-         { MODEL_13B, 192ull * MB },
-         { MODEL_30B, 256ull * MB },
-+        { MODEL_34B, 256ull * MB },
-         { MODEL_65B, 384ull * MB }, // guess
-         { MODEL_70B, 304ull * MB },
-     };
-@@ -149,6 +152,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
-         { MODEL_7B, 10ull * MB },
-         { MODEL_13B, 12ull * MB },
-         { MODEL_30B, 16ull * MB },
-+        { MODEL_34B, 16ull * MB },
-         { MODEL_65B, 24ull * MB }, // guess
-         { MODEL_70B, 24ull * MB },
-     };
-@@ -164,6 +168,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
-         { MODEL_7B, 512ull * kB },
-         { MODEL_13B, 640ull * kB },
-         { MODEL_30B, 768ull * kB },
-+        { MODEL_34B, 768ull * kB },
-         { MODEL_65B, 1280ull * kB },
-         { MODEL_70B, 1280ull * kB },
-     };
-@@ -179,6 +184,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
-         { MODEL_7B, 128ull },
-         { MODEL_13B, 160ull },
-         { MODEL_30B, 208ull },
-+        { MODEL_34B, 208ull },
-         { MODEL_65B, 256ull },
-         { MODEL_70B, 256ull },
-     };
-@@ -1027,6 +1033,7 @@ static const char * llama_model_type_name(e_model type) {
-         case MODEL_7B: return "7B";
-         case MODEL_13B: return "13B";
-         case MODEL_30B: return "30B";
-+        case MODEL_34B: return "34B";
-         case MODEL_65B: return "65B";
-         case MODEL_70B: return "70B";
-         default: LLAMA_ASSERT(false);
-@@ -1074,6 +1081,7 @@ static void llama_model_load_internal(
-             case 26: model.type = e_model::MODEL_3B; break;
-             case 32: model.type = e_model::MODEL_7B; break;
-             case 40: model.type = e_model::MODEL_13B; break;
-+            case 48: model.type = e_model::MODEL_34B; break;
-             case 60: model.type = e_model::MODEL_30B; break;
-             case 80: model.type = e_model::MODEL_65B; break;
-             default:
-@@ -1094,6 +1102,8 @@ static void llama_model_load_internal(
-             LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
-             model.type = e_model::MODEL_70B;
-             hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
-+        } else if (model.type == e_model::MODEL_34B && n_gqa == 8) {
-+            hparams.f_ffn_mult = 1.0f; // from the params.json of the 34B model
-         }
- 
-     hparams.rope_freq_base = rope_freq_base;
--- 
-2.39.2 (Apple Git-143)
-
diff --git a/llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch b/llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
deleted file mode 100644
index e5540ab1..00000000
--- a/llm/llama.cpp/patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
+++ /dev/null
@@ -1,30 +0,0 @@
-From dadbed99e65252d79f81101a392d0d6497b86caa Mon Sep 17 00:00:00 2001
-From: Shouzheng Liu
-Date: Mon, 21 Aug 2023 06:59:29 -0400
-Subject: [PATCH] metal : fix synchronization in new matrix multiplication
- kernel (#2686)
-
----
- ggml-metal.metal | 3 ++-
- 1 file changed, 2 insertions(+), 1 deletion(-)
-
-diff --git a/ggml-metal.metal b/ggml-metal.metal
-index 3f31252..88d48f6 100644
---- a/ggml-metal.metal
-+++ b/ggml-metal.metal
-@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
-         threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
-                                       + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
-         for (int i = 0; i < 8; i++) {
-+            threadgroup_barrier(mem_flags::mem_device);
-             simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
-         }
- 
--        threadgroup_barrier(mem_flags::mem_threadgroup);
-+        threadgroup_barrier(mem_flags::mem_device);
-         device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
-         if (sgitg==0) {
-             for (int i = 0; i < n_rows; i++) {
--- 
-2.41.0
-
diff --git a/llm/llama.cpp/patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch b/llm/llama.cpp/patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
deleted file mode 100644
index a2649097..00000000
--- a/llm/llama.cpp/patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
+++ /dev/null
@@ -1,41 +0,0 @@
-From 14b1d7e6f720dee41ce5a826376df738096d9033 Mon Sep 17 00:00:00 2001
-From: Shouzheng Liu
-Date: Tue, 22 Aug 2023 02:18:40 -0400
-Subject: [PATCH] metal : add missing barriers for mul-mat (#2699)
-
----
- ggml-metal.metal | 5 +++--
- 1 file changed, 3 insertions(+), 2 deletions(-)
-
-diff --git a/ggml-metal.metal b/ggml-metal.metal
-index 88d48f6..ce3541f 100644
---- a/ggml-metal.metal
-+++ b/ggml-metal.metal
-@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
-     //load data and store to threadgroup memory
-     half4x4 temp_a;
-     dequantize_func(x, il, temp_a);
-+    threadgroup_barrier(mem_flags::mem_threadgroup);
-     #pragma unroll(16)
-     for (int i = 0; i < 16; i++) {
-         *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
-@@ -1895,14 +1896,14 @@ kernel void kernel_mul_mm(device const uchar * src0,
-         }
-     } else {
-         // block is smaller than 64x32, we should avoid writing data outside of the matrix
-+        threadgroup_barrier(mem_flags::mem_threadgroup);
-         threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
-                                       + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
-         for (int i = 0; i < 8; i++) {
--            threadgroup_barrier(mem_flags::mem_device);
-             simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
-         }
- 
--        threadgroup_barrier(mem_flags::mem_device);
-+        threadgroup_barrier(mem_flags::mem_threadgroup);
-         device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
-         if (sgitg==0) {
-             for (int i = 0; i < n_rows; i++) {
--- 
-2.41.0
-
diff --git a/llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch b/llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
deleted file mode 100644
index 7b67f680..00000000
--- a/llm/llama.cpp/patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
+++ /dev/null
@@ -1,32 +0,0 @@
-From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001
-From: Kylin <56434533+KyL0N@users.noreply.github.com>
-Date: Tue, 22 Aug 2023 15:14:23 +0800
-Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670)
-
-* ggml: support CUDA's half type for aarch64(#1455)
-support CUDA's half type for aarch64 in ggml_fp16_t definition
-
-* ggml: use __CUDACC__ to recognise nvcc compiler
----
- ggml.h | 5 +++--
- 1 file changed, 3 insertions(+), 2 deletions(-)
-
-diff --git a/ggml.h b/ggml.h
-index 544ad2d..0ec7ec5 100644
---- a/ggml.h
-+++ b/ggml.h
-@@ -259,8 +259,9 @@
- extern "C" {
- #endif
- 
--#ifdef __ARM_NEON
--    // we use the built-in 16-bit float type
-+#if defined(__ARM_NEON) && defined(__CUDACC__)
-+    typedef half ggml_fp16_t;
-+#elif defined(__ARM_NEON)
-     typedef __fp16 ggml_fp16_t;
- #else
-     typedef uint16_t ggml_fp16_t;
--- 
-2.39.2 (Apple Git-143)
-
diff --git a/llm/llama.go b/llm/llama.go
index adaa4c57..574c24ff 100644
--- a/llm/llama.go
+++ b/llm/llama.go
@@ -59,13 +59,12 @@ ws ::= ([ \t\n] ws)?
 var llamaCppEmbed embed.FS
 
 type ModelRunner struct {
-	Type        string // "gguf" or "ggml"
 	Path        string // path to the model runner executable
 	Accelerated bool
 }
 
-func chooseRunners(workDir, runnerType string) []ModelRunner {
-	buildPath := path.Join("llama.cpp", runnerType, "build")
+func chooseRunners(workDir string) []ModelRunner {
+	buildPath := path.Join("llama.cpp", "gguf", "build")
 	var runners []ModelRunner
 
 	// set the runners based on the OS
@@ -73,25 +72,25 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
 	switch runtime.GOOS {
 	case "darwin":
 		if runtime.GOARCH == "arm64" {
-			runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
+			runners = []ModelRunner{{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
 		} else {
-			runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
+			runners = []ModelRunner{{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
 		}
 	case "linux":
 		runners = []ModelRunner{
-			{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
-			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
+			{Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
+			{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
 		}
 	case "windows":
 		// TODO: select windows GPU runner here when available
 		runners = []ModelRunner{
-			{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
-			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
+			{Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
+			{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
 		}
 	default:
 		log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
 		runners = []ModelRunner{
-			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
+			{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
 		}
 	}
 
@@ -141,7 +140,7 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
 		}
 	}
 	if !runnerAvailable {
-		log.Fatalf("%s runner not found", runnerType)
+		log.Fatalf("gguf runner not found")
 	}
 
 	// return the runners to try in priority order
@@ -149,7 +148,6 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
 	for _, r := range runners {
 		// clean the ModelRunner paths so that they match the OS we are running on
 		localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
-			Type:        r.Type,
 			Path:        filepath.Clean(path.Join(workDir, r.Path)),
 			Accelerated: r.Accelerated,
 		})
@@ -350,6 +348,7 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
 		"--n-gpu-layers", fmt.Sprintf("%d", numGPU),
 		"--embedding",
+		"--parallel", "2",
 	}
 
 	if opts.MainGPU > 0 {
diff --git a/llm/llm.go b/llm/llm.go
index fc6bbfd7..92bffd76 100644
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -76,16 +76,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 		}
 	}
 
-	switch ggml.Name() {
-	case "gguf":
-		// TODO: gguf will load these options automatically from the model binary
-		opts.NumGQA = 0
-		opts.RopeFrequencyBase = 0.0
-		opts.RopeFrequencyScale = 0.0
-		return newLlama(model, adapters, projectors, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
-	case "ggml", "ggmf", "ggjt", "ggla":
-		return newLlama(model, adapters, projectors, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
-	default:
-		return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
-	}
+	opts.NumGQA = 0
+	opts.RopeFrequencyBase = 0.0
+	opts.RopeFrequencyScale = 0.0
+	return newLlama(model, adapters, projectors, chooseRunners(workDir), ggml.NumLayers(), opts)
 }
diff --git a/server/images.go b/server/images.go
index 9442bd74..8e7b3343 100644
--- a/server/images.go
+++ b/server/images.go
@@ -418,6 +418,27 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 				return err
 			}
 
+			// if the model is not in gguf format, pull the base model to try and get it in gguf format
+			if fromConfig.ModelFormat != "gguf" {
+				fn(api.ProgressResponse{Status: "updating base model"})
+				if err := PullModel(ctx, c.Args, &RegistryOptions{}, fn); err != nil {
+					log.Printf("error pulling model: %v", err)
+				}
+				// Reset the file pointer to the beginning of the file
+				_, err = fromConfigFile.Seek(0, 0)
+				if err != nil {
+					return fmt.Errorf("update from config after pull: %w", err)
+				}
+				if err := json.NewDecoder(fromConfigFile).Decode(&fromConfig); err != nil {
+					return err
+				}
+			}
+
+			// if the model is still not in gguf format, error out
+			if fromConfig.ModelFormat != "gguf" {
+				return fmt.Errorf("%s is not in gguf format, this base model is not compatible with this version of ollama", c.Args)
+			}
+
 			config.SetModelFormat(fromConfig.ModelFormat)
 			config.SetModelFamily(append(fromConfig.ModelFamilies, fromConfig.ModelFamily)...)
 			config.SetModelType(fromConfig.ModelType)
@@ -456,15 +477,21 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
 			defer bin.Close()
 
 			var offset int64
+		CREATE:
 			for {
 				fn(api.ProgressResponse{Status: "creating model layer"})
 
 				bin.Seek(offset, io.SeekStart)
 				ggml, err := llm.DecodeGGML(bin)
-				if errors.Is(err, io.EOF) {
-					break
-				} else if err != nil {
-					return err
+				if err != nil {
+					switch {
+					case errors.Is(err, io.EOF):
+						break CREATE
+					case errors.Is(err, llm.ErrUnsupportedFormat):
+						return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err)
+					default:
+						return err
+					}
 				}
 
 				config.SetModelFormat(ggml.Name())
diff --git a/server/routes.go b/server/routes.go
index b9bd5447..12213606 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -114,7 +114,7 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess
 		// some older models are not compatible with newer versions of llama.cpp
 		// show a generalized compatibility error until there is a better way to
 		// check for model compatibility
-		if strings.Contains(err.Error(), "failed to load model") {
+		if errors.Is(llm.ErrUnsupportedFormat, err) || strings.Contains(err.Error(), "failed to load model") {
 			err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, model.ShortName)
 		}