deprecate ggml
- remove ggml runner - automatically pull gguf models when ggml detected - tell users to update to gguf in the case automatic pull fails Co-Authored-By: Jeffrey Morgan <jmorganca@gmail.com>
This commit is contained in:
parent
ed195f3562
commit
811b1f03c8
19 changed files with 74 additions and 393 deletions
|
@ -3,7 +3,6 @@ ollama
|
|||
app
|
||||
dist
|
||||
scripts
|
||||
llm/llama.cpp/ggml
|
||||
llm/llama.cpp/gguf
|
||||
.env
|
||||
.cache
|
||||
|
|
5
.gitmodules
vendored
5
.gitmodules
vendored
|
@ -1,8 +1,3 @@
|
|||
[submodule "llm/llama.cpp/ggml"]
|
||||
path = llm/llama.cpp/ggml
|
||||
url = https://github.com/ggerganov/llama.cpp.git
|
||||
ignore = dirty
|
||||
shallow = true
|
||||
[submodule "llm/llama.cpp/gguf"]
|
||||
path = llm/llama.cpp/gguf
|
||||
url = https://github.com/ggerganov/llama.cpp.git
|
||||
|
|
24
cmd/cmd.go
24
cmd/cmd.go
|
@ -572,10 +572,30 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
|
|||
}
|
||||
|
||||
if err := client.Generate(ctx, &request, fn); err != nil {
|
||||
if errors.Is(err, context.Canceled) {
|
||||
switch {
|
||||
case errors.Is(err, context.Canceled):
|
||||
return nil
|
||||
case strings.Contains(err.Error(), "unsupported model format"):
|
||||
// pull and retry to see if the model has been updated
|
||||
parts := strings.Split(opts.Model, string(os.PathSeparator))
|
||||
if len(parts) == 1 {
|
||||
// this is a library model, log some info
|
||||
fmt.Fprintln(os.Stderr, "This model is no longer compatible with Ollama. Pulling a new version...")
|
||||
}
|
||||
if err := PullHandler(cmd, []string{opts.Model}); err != nil {
|
||||
fmt.Printf("Error: %s\n", err)
|
||||
return fmt.Errorf("unsupported model, please update this model to gguf format") // relay the original error
|
||||
}
|
||||
// retry
|
||||
if err := client.Generate(ctx, &request, fn); err != nil {
|
||||
if errors.Is(err, context.Canceled) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
default:
|
||||
return err
|
||||
}
|
||||
return err
|
||||
}
|
||||
if opts.Prompt != "" {
|
||||
fmt.Println()
|
||||
|
|
|
@ -188,7 +188,7 @@ SYSTEM """<system message>"""
|
|||
|
||||
### ADAPTER
|
||||
|
||||
The `ADAPTER` instruction specifies the LoRA adapter to apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
|
||||
The `ADAPTER` instruction specifies the LoRA adapter to apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGUF file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
|
||||
|
||||
```modelfile
|
||||
ADAPTER ./ollama-lora.bin
|
||||
|
|
78
llm/ggml.go
78
llm/ggml.go
|
@ -86,74 +86,6 @@ type container interface {
|
|||
Decode(*readSeekOffset) (model, error)
|
||||
}
|
||||
|
||||
type containerGGML struct{}
|
||||
|
||||
func (c *containerGGML) Name() string {
|
||||
return "ggml"
|
||||
}
|
||||
|
||||
func (c *containerGGML) Decode(ro *readSeekOffset) (model, error) {
|
||||
// file contents aren't decoded
|
||||
ro.Seek(0, io.SeekEnd)
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
type containerGGMF struct {
|
||||
version uint32
|
||||
}
|
||||
|
||||
func (c *containerGGMF) Name() string {
|
||||
return "ggmf"
|
||||
}
|
||||
|
||||
func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) {
|
||||
var version uint32
|
||||
binary.Read(ro, binary.LittleEndian, &version)
|
||||
|
||||
switch version {
|
||||
case 1:
|
||||
default:
|
||||
return nil, errors.New("invalid version")
|
||||
}
|
||||
|
||||
c.version = version
|
||||
|
||||
// remaining file contents aren't decoded
|
||||
ro.Seek(0, io.SeekEnd)
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
type containerGGJT struct {
|
||||
version uint32
|
||||
}
|
||||
|
||||
func (c *containerGGJT) Name() string {
|
||||
return "ggjt"
|
||||
}
|
||||
|
||||
func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) {
|
||||
var version uint32
|
||||
binary.Read(ro, binary.LittleEndian, &version)
|
||||
|
||||
switch version {
|
||||
case 1, 2, 3:
|
||||
default:
|
||||
return nil, errors.New("invalid version")
|
||||
}
|
||||
|
||||
c.version = version
|
||||
|
||||
// different model types may have different layouts for hyperparameters
|
||||
var llama llamaModel
|
||||
binary.Read(ro, binary.LittleEndian, &llama.hyperparameters)
|
||||
|
||||
// remaining file contents aren't decoded
|
||||
ro.Seek(0, io.SeekEnd)
|
||||
|
||||
return &llama, nil
|
||||
}
|
||||
|
||||
type containerLORA struct {
|
||||
version uint32
|
||||
}
|
||||
|
@ -194,6 +126,8 @@ const (
|
|||
FILE_MAGIC_GGUF_BE = 0x47475546
|
||||
)
|
||||
|
||||
var ErrUnsupportedFormat = errors.New("unsupported model format")
|
||||
|
||||
func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
|
||||
ro := readSeekOffset{ReadSeeker: r}
|
||||
|
||||
|
@ -204,12 +138,8 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
|
|||
|
||||
var c container
|
||||
switch magic {
|
||||
case FILE_MAGIC_GGML:
|
||||
c = &containerGGML{}
|
||||
case FILE_MAGIC_GGMF:
|
||||
c = &containerGGMF{}
|
||||
case FILE_MAGIC_GGJT:
|
||||
c = &containerGGJT{}
|
||||
case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
|
||||
return nil, ErrUnsupportedFormat
|
||||
case FILE_MAGIC_GGLA:
|
||||
c = &containerLORA{}
|
||||
case FILE_MAGIC_GGUF_LE:
|
||||
|
|
|
@ -2,15 +2,6 @@ package llm
|
|||
|
||||
//go:generate git submodule init
|
||||
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
|
||||
//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||
//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
||||
//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner
|
||||
|
||||
//go:generate git submodule update --force gguf
|
||||
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
|
||||
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_METAL=off -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=on
|
||||
|
|
|
@ -2,15 +2,6 @@ package llm
|
|||
|
||||
//go:generate git submodule init
|
||||
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
|
||||
//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
||||
//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
//go:generate cmake --build ggml/build/metal --target server --config Release
|
||||
//go:generate mv ggml/build/metal/bin/server ggml/build/metal/bin/ollama-runner
|
||||
|
||||
//go:generate git submodule update --force gguf
|
||||
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
|
||||
//go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||
|
|
|
@ -2,15 +2,6 @@ package llm
|
|||
|
||||
//go:generate git submodule init
|
||||
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
|
||||
//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
|
||||
//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
||||
//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner
|
||||
|
||||
//go:generate git submodule update --force gguf
|
||||
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
|
||||
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
|
||||
|
@ -18,9 +9,6 @@ package llm
|
|||
//go:generate cmake --build gguf/build/cpu --target server --config Release
|
||||
//go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner
|
||||
|
||||
//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build ggml/build/cuda --target server --config Release
|
||||
//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner
|
||||
//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA_PEER_MAX_BATCH_SIZE=0
|
||||
//go:generate cmake --build gguf/build/cuda --target server --config Release
|
||||
//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner
|
||||
|
|
|
@ -2,13 +2,6 @@ package llm
|
|||
|
||||
//go:generate git submodule init
|
||||
|
||||
//go:generate git submodule update --force ggml
|
||||
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
|
||||
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
|
||||
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
|
||||
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
||||
//go:generate cmd /c move ggml\build\cpu\bin\Release\server.exe ggml\build\cpu\bin\Release\ollama-runner.exe
|
||||
|
||||
//go:generate git submodule update --force gguf
|
||||
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
|
||||
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
Subproject commit 9e232f0234073358e7031c1b8d7aa45020469a3b
|
|
@ -1,51 +0,0 @@
|
|||
From 032ef7ff2423f5117bb59d42fb71be9cebf0a2de Mon Sep 17 00:00:00 2001
|
||||
From: Bruce MacDonald <brucewmacdonald@gmail.com>
|
||||
Date: Mon, 28 Aug 2023 18:08:12 -0400
|
||||
Subject: [PATCH] add detokenize endpoint
|
||||
|
||||
---
|
||||
examples/server/server.cpp | 21 +++++++++++++++++++++
|
||||
1 file changed, 21 insertions(+)
|
||||
|
||||
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
|
||||
index 9966045..5014691 100644
|
||||
--- a/examples/server/server.cpp
|
||||
+++ b/examples/server/server.cpp
|
||||
@@ -1075,6 +1075,12 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
|
||||
{"tokens", tokens}};
|
||||
}
|
||||
|
||||
+static json format_detokenized_response(std::string content)
|
||||
+{
|
||||
+ return json{
|
||||
+ {"content", content}};
|
||||
+}
|
||||
+
|
||||
static void parse_options_completion(const json &body, llama_server_context &llama)
|
||||
{
|
||||
gpt_params default_params;
|
||||
@@ -1361,6 +1367,21 @@ int main(int argc, char **argv)
|
||||
const json data = format_tokenizer_response(tokens);
|
||||
return res.set_content(data.dump(), "application/json"); });
|
||||
|
||||
+ svr.Post("/detokenize", [&llama](const Request &req, Response &res)
|
||||
+ {
|
||||
+ auto lock = llama.lock();
|
||||
+
|
||||
+ const json body = json::parse(req.body);
|
||||
+ std::string content;
|
||||
+ if (body.count("tokens") != 0)
|
||||
+ {
|
||||
+ const std::vector<llama_token> tokens = body["tokens"];
|
||||
+ content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
|
||||
+ }
|
||||
+
|
||||
+ const json data = format_detokenized_response(content);
|
||||
+ return res.set_content(data.dump(), "application/json"); });
|
||||
+
|
||||
svr.Post("/embedding", [&llama](const Request &req, Response &res)
|
||||
{
|
||||
auto lock = llama.lock();
|
||||
--
|
||||
2.39.2 (Apple Git-143)
|
||||
|
|
@ -1,89 +0,0 @@
|
|||
From 6145068a6613c37bb43a7408b5496524bdcfc402 Mon Sep 17 00:00:00 2001
|
||||
From: Bruce MacDonald <brucewmacdonald@gmail.com>
|
||||
Date: Mon, 28 Aug 2023 18:08:53 -0400
|
||||
Subject: [PATCH] 34B model support
|
||||
|
||||
---
|
||||
llama.cpp | 10 ++++++++++
|
||||
1 file changed, 10 insertions(+)
|
||||
|
||||
diff --git a/llama.cpp b/llama.cpp
|
||||
index f2cbe76..62c5cdf 100644
|
||||
--- a/llama.cpp
|
||||
+++ b/llama.cpp
|
||||
@@ -79,6 +79,7 @@ enum e_model {
|
||||
MODEL_7B,
|
||||
MODEL_13B,
|
||||
MODEL_30B,
|
||||
+ MODEL_34B,
|
||||
MODEL_65B,
|
||||
MODEL_70B,
|
||||
};
|
||||
@@ -122,6 +123,7 @@ static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
|
||||
{ MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
|
||||
{ MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
|
||||
{ MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
|
||||
+ { MODEL_34B, ((size_t) n_ctx / 9ull + 160ull) * MB },
|
||||
{ MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
|
||||
{ MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
|
||||
};
|
||||
@@ -135,6 +137,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
||||
{ MODEL_7B, 160ull * MB },
|
||||
{ MODEL_13B, 192ull * MB },
|
||||
{ MODEL_30B, 256ull * MB },
|
||||
+ { MODEL_34B, 256ull * MB },
|
||||
{ MODEL_65B, 384ull * MB }, // guess
|
||||
{ MODEL_70B, 304ull * MB },
|
||||
};
|
||||
@@ -149,6 +152,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
||||
{ MODEL_7B, 10ull * MB },
|
||||
{ MODEL_13B, 12ull * MB },
|
||||
{ MODEL_30B, 16ull * MB },
|
||||
+ { MODEL_34B, 16ull * MB },
|
||||
{ MODEL_65B, 24ull * MB }, // guess
|
||||
{ MODEL_70B, 24ull * MB },
|
||||
};
|
||||
@@ -164,6 +168,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
|
||||
{ MODEL_7B, 512ull * kB },
|
||||
{ MODEL_13B, 640ull * kB },
|
||||
{ MODEL_30B, 768ull * kB },
|
||||
+ { MODEL_34B, 768ull * kB },
|
||||
{ MODEL_65B, 1280ull * kB },
|
||||
{ MODEL_70B, 1280ull * kB },
|
||||
};
|
||||
@@ -179,6 +184,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
|
||||
{ MODEL_7B, 128ull },
|
||||
{ MODEL_13B, 160ull },
|
||||
{ MODEL_30B, 208ull },
|
||||
+ { MODEL_34B, 208ull },
|
||||
{ MODEL_65B, 256ull },
|
||||
{ MODEL_70B, 256ull },
|
||||
};
|
||||
@@ -1027,6 +1033,7 @@ static const char * llama_model_type_name(e_model type) {
|
||||
case MODEL_7B: return "7B";
|
||||
case MODEL_13B: return "13B";
|
||||
case MODEL_30B: return "30B";
|
||||
+ case MODEL_34B: return "34B";
|
||||
case MODEL_65B: return "65B";
|
||||
case MODEL_70B: return "70B";
|
||||
default: LLAMA_ASSERT(false);
|
||||
@@ -1074,6 +1081,7 @@ static void llama_model_load_internal(
|
||||
case 26: model.type = e_model::MODEL_3B; break;
|
||||
case 32: model.type = e_model::MODEL_7B; break;
|
||||
case 40: model.type = e_model::MODEL_13B; break;
|
||||
+ case 48: model.type = e_model::MODEL_34B; break;
|
||||
case 60: model.type = e_model::MODEL_30B; break;
|
||||
case 80: model.type = e_model::MODEL_65B; break;
|
||||
default:
|
||||
@@ -1094,6 +1102,8 @@ static void llama_model_load_internal(
|
||||
LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
|
||||
model.type = e_model::MODEL_70B;
|
||||
hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
|
||||
+ } else if (model.type == e_model::MODEL_34B && n_gqa == 8) {
|
||||
+ hparams.f_ffn_mult = 1.0f; // from the params.json of the 34B model
|
||||
}
|
||||
|
||||
hparams.rope_freq_base = rope_freq_base;
|
||||
--
|
||||
2.39.2 (Apple Git-143)
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
From dadbed99e65252d79f81101a392d0d6497b86caa Mon Sep 17 00:00:00 2001
|
||||
From: Shouzheng Liu <lshzh.hi@gmail.com>
|
||||
Date: Mon, 21 Aug 2023 06:59:29 -0400
|
||||
Subject: [PATCH] metal : fix synchronization in new matrix multiplication
|
||||
kernel (#2686)
|
||||
|
||||
---
|
||||
ggml-metal.metal | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ggml-metal.metal b/ggml-metal.metal
|
||||
index 3f31252..88d48f6 100644
|
||||
--- a/ggml-metal.metal
|
||||
+++ b/ggml-metal.metal
|
||||
@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
|
||||
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
+ threadgroup_barrier(mem_flags::mem_device);
|
||||
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
|
||||
}
|
||||
|
||||
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
+ threadgroup_barrier(mem_flags::mem_device);
|
||||
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
|
||||
if (sgitg==0) {
|
||||
for (int i = 0; i < n_rows; i++) {
|
||||
--
|
||||
2.41.0
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
From 14b1d7e6f720dee41ce5a826376df738096d9033 Mon Sep 17 00:00:00 2001
|
||||
From: Shouzheng Liu <lshzh.hi@gmail.com>
|
||||
Date: Tue, 22 Aug 2023 02:18:40 -0400
|
||||
Subject: [PATCH] metal : add missing barriers for mul-mat (#2699)
|
||||
|
||||
---
|
||||
ggml-metal.metal | 5 +++--
|
||||
1 file changed, 3 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml-metal.metal b/ggml-metal.metal
|
||||
index 88d48f6..ce3541f 100644
|
||||
--- a/ggml-metal.metal
|
||||
+++ b/ggml-metal.metal
|
||||
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
//load data and store to threadgroup memory
|
||||
half4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
#pragma unroll(16)
|
||||
for (int i = 0; i < 16; i++) {
|
||||
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
|
||||
@@ -1895,14 +1896,14 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
}
|
||||
} else {
|
||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
|
||||
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
- threadgroup_barrier(mem_flags::mem_device);
|
||||
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
|
||||
}
|
||||
|
||||
- threadgroup_barrier(mem_flags::mem_device);
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
|
||||
if (sgitg==0) {
|
||||
for (int i = 0; i < n_rows; i++) {
|
||||
--
|
||||
2.41.0
|
||||
|
|
@ -1,32 +0,0 @@
|
|||
From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001
|
||||
From: Kylin <56434533+KyL0N@users.noreply.github.com>
|
||||
Date: Tue, 22 Aug 2023 15:14:23 +0800
|
||||
Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670)
|
||||
|
||||
* ggml: support CUDA's half type for aarch64(#1455)
|
||||
support CUDA's half type for aarch64 in ggml_fp16_t definition
|
||||
|
||||
* ggml: use __CUDACC__ to recognise nvcc compiler
|
||||
---
|
||||
ggml.h | 5 +++--
|
||||
1 file changed, 3 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml.h b/ggml.h
|
||||
index 544ad2d..0ec7ec5 100644
|
||||
--- a/ggml.h
|
||||
+++ b/ggml.h
|
||||
@@ -259,8 +259,9 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
-#ifdef __ARM_NEON
|
||||
- // we use the built-in 16-bit float type
|
||||
+#if defined(__ARM_NEON) && defined(__CUDACC__)
|
||||
+ typedef half ggml_fp16_t;
|
||||
+#elif defined(__ARM_NEON)
|
||||
typedef __fp16 ggml_fp16_t;
|
||||
#else
|
||||
typedef uint16_t ggml_fp16_t;
|
||||
--
|
||||
2.39.2 (Apple Git-143)
|
||||
|
23
llm/llama.go
23
llm/llama.go
|
@ -59,13 +59,12 @@ ws ::= ([ \t\n] ws)?
|
|||
var llamaCppEmbed embed.FS
|
||||
|
||||
type ModelRunner struct {
|
||||
Type string // "gguf" or "ggml"
|
||||
Path string // path to the model runner executable
|
||||
Accelerated bool
|
||||
}
|
||||
|
||||
func chooseRunners(workDir, runnerType string) []ModelRunner {
|
||||
buildPath := path.Join("llama.cpp", runnerType, "build")
|
||||
func chooseRunners(workDir string) []ModelRunner {
|
||||
buildPath := path.Join("llama.cpp", "gguf", "build")
|
||||
var runners []ModelRunner
|
||||
|
||||
// set the runners based on the OS
|
||||
|
@ -73,25 +72,25 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
|
|||
switch runtime.GOOS {
|
||||
case "darwin":
|
||||
if runtime.GOARCH == "arm64" {
|
||||
runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
|
||||
runners = []ModelRunner{{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
|
||||
} else {
|
||||
runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
|
||||
runners = []ModelRunner{{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
|
||||
}
|
||||
case "linux":
|
||||
runners = []ModelRunner{
|
||||
{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
|
||||
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
|
||||
{Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
|
||||
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
|
||||
}
|
||||
case "windows":
|
||||
// TODO: select windows GPU runner here when available
|
||||
runners = []ModelRunner{
|
||||
{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
|
||||
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
|
||||
{Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
|
||||
{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
|
||||
}
|
||||
default:
|
||||
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
|
||||
runners = []ModelRunner{
|
||||
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
|
||||
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -141,7 +140,7 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
|
|||
}
|
||||
}
|
||||
if !runnerAvailable {
|
||||
log.Fatalf("%s runner not found", runnerType)
|
||||
log.Fatalf("gguf runner not found")
|
||||
}
|
||||
|
||||
// return the runners to try in priority order
|
||||
|
@ -149,7 +148,6 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
|
|||
for _, r := range runners {
|
||||
// clean the ModelRunner paths so that they match the OS we are running on
|
||||
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
|
||||
Type: r.Type,
|
||||
Path: filepath.Clean(path.Join(workDir, r.Path)),
|
||||
Accelerated: r.Accelerated,
|
||||
})
|
||||
|
@ -350,6 +348,7 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
|
|||
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
|
||||
"--n-gpu-layers", fmt.Sprintf("%d", numGPU),
|
||||
"--embedding",
|
||||
"--parallel", "2",
|
||||
}
|
||||
|
||||
if opts.MainGPU > 0 {
|
||||
|
|
16
llm/llm.go
16
llm/llm.go
|
@ -76,16 +76,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
|
|||
}
|
||||
}
|
||||
|
||||
switch ggml.Name() {
|
||||
case "gguf":
|
||||
// TODO: gguf will load these options automatically from the model binary
|
||||
opts.NumGQA = 0
|
||||
opts.RopeFrequencyBase = 0.0
|
||||
opts.RopeFrequencyScale = 0.0
|
||||
return newLlama(model, adapters, projectors, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
|
||||
case "ggml", "ggmf", "ggjt", "ggla":
|
||||
return newLlama(model, adapters, projectors, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
|
||||
default:
|
||||
return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
|
||||
}
|
||||
opts.NumGQA = 0
|
||||
opts.RopeFrequencyBase = 0.0
|
||||
opts.RopeFrequencyScale = 0.0
|
||||
return newLlama(model, adapters, projectors, chooseRunners(workDir), ggml.NumLayers(), opts)
|
||||
}
|
||||
|
|
|
@ -418,6 +418,27 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
|
|||
return err
|
||||
}
|
||||
|
||||
// if the model is not in gguf format, pull the base model to try and get it in gguf format
|
||||
if fromConfig.ModelFormat != "gguf" {
|
||||
fn(api.ProgressResponse{Status: "updating base model"})
|
||||
if err := PullModel(ctx, c.Args, &RegistryOptions{}, fn); err != nil {
|
||||
log.Printf("error pulling model: %v", err)
|
||||
}
|
||||
// Reset the file pointer to the beginning of the file
|
||||
_, err = fromConfigFile.Seek(0, 0)
|
||||
if err != nil {
|
||||
return fmt.Errorf("update from config after pull: %w", err)
|
||||
}
|
||||
if err := json.NewDecoder(fromConfigFile).Decode(&fromConfig); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// if the model is still not in gguf format, error out
|
||||
if fromConfig.ModelFormat != "gguf" {
|
||||
return fmt.Errorf("%s is not in gguf format, this base model is not compatible with this version of ollama", c.Args)
|
||||
}
|
||||
|
||||
config.SetModelFormat(fromConfig.ModelFormat)
|
||||
config.SetModelFamily(append(fromConfig.ModelFamilies, fromConfig.ModelFamily)...)
|
||||
config.SetModelType(fromConfig.ModelType)
|
||||
|
@ -456,15 +477,21 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
|
|||
defer bin.Close()
|
||||
|
||||
var offset int64
|
||||
CREATE:
|
||||
for {
|
||||
fn(api.ProgressResponse{Status: "creating model layer"})
|
||||
|
||||
bin.Seek(offset, io.SeekStart)
|
||||
ggml, err := llm.DecodeGGML(bin)
|
||||
if errors.Is(err, io.EOF) {
|
||||
break
|
||||
} else if err != nil {
|
||||
return err
|
||||
if err != nil {
|
||||
switch {
|
||||
case errors.Is(err, io.EOF):
|
||||
break CREATE
|
||||
case errors.Is(err, llm.ErrUnsupportedFormat):
|
||||
return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err)
|
||||
default:
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
config.SetModelFormat(ggml.Name())
|
||||
|
|
|
@ -114,7 +114,7 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess
|
|||
// some older models are not compatible with newer versions of llama.cpp
|
||||
// show a generalized compatibility error until there is a better way to
|
||||
// check for model compatibility
|
||||
if strings.Contains(err.Error(), "failed to load model") {
|
||||
if errors.Is(llm.ErrUnsupportedFormat, err) || strings.Contains(err.Error(), "failed to load model") {
|
||||
err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, model.ShortName)
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue