deprecate ggml
- remove ggml runner
- automatically pull gguf models when ggml detected
- tell users to update to gguf in the case automatic pull fails

Co-Authored-By: Jeffrey Morgan <jmorganca@gmail.com>
This commit is contained in:
parent
ed195f3562
commit
811b1f03c8
19 changed files with 74 additions and 393 deletions
|
@ -3,7 +3,6 @@ ollama
|
||||||
app
|
app
|
||||||
dist
|
dist
|
||||||
scripts
|
scripts
|
||||||
llm/llama.cpp/ggml
|
|
||||||
llm/llama.cpp/gguf
|
llm/llama.cpp/gguf
|
||||||
.env
|
.env
|
||||||
.cache
|
.cache
|
||||||
|
|
5
.gitmodules
vendored
5
.gitmodules
vendored
|
@ -1,8 +1,3 @@
|
||||||
[submodule "llm/llama.cpp/ggml"]
|
|
||||||
path = llm/llama.cpp/ggml
|
|
||||||
url = https://github.com/ggerganov/llama.cpp.git
|
|
||||||
ignore = dirty
|
|
||||||
shallow = true
|
|
||||||
[submodule "llm/llama.cpp/gguf"]
|
[submodule "llm/llama.cpp/gguf"]
|
||||||
path = llm/llama.cpp/gguf
|
path = llm/llama.cpp/gguf
|
||||||
url = https://github.com/ggerganov/llama.cpp.git
|
url = https://github.com/ggerganov/llama.cpp.git
|
||||||
|
|
24
cmd/cmd.go
24
cmd/cmd.go
|
@ -572,10 +572,30 @@ func generate(cmd *cobra.Command, opts generateOptions) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := client.Generate(ctx, &request, fn); err != nil {
|
if err := client.Generate(ctx, &request, fn); err != nil {
|
||||||
if errors.Is(err, context.Canceled) {
|
switch {
|
||||||
|
case errors.Is(err, context.Canceled):
|
||||||
return nil
|
return nil
|
||||||
|
case strings.Contains(err.Error(), "unsupported model format"):
|
||||||
|
// pull and retry to see if the model has been updated
|
||||||
|
parts := strings.Split(opts.Model, string(os.PathSeparator))
|
||||||
|
if len(parts) == 1 {
|
||||||
|
// this is a library model, log some info
|
||||||
|
fmt.Fprintln(os.Stderr, "This model is no longer compatible with Ollama. Pulling a new version...")
|
||||||
|
}
|
||||||
|
if err := PullHandler(cmd, []string{opts.Model}); err != nil {
|
||||||
|
fmt.Printf("Error: %s\n", err)
|
||||||
|
return fmt.Errorf("unsupported model, please update this model to gguf format") // relay the original error
|
||||||
|
}
|
||||||
|
// retry
|
||||||
|
if err := client.Generate(ctx, &request, fn); err != nil {
|
||||||
|
if errors.Is(err, context.Canceled) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
return err
|
|
||||||
}
|
}
|
||||||
if opts.Prompt != "" {
|
if opts.Prompt != "" {
|
||||||
fmt.Println()
|
fmt.Println()
|
||||||
|
|
|
@ -188,7 +188,7 @@ SYSTEM """<system message>"""
|
||||||
|
|
||||||
### ADAPTER
|
### ADAPTER
|
||||||
|
|
||||||
The `ADAPTER` instruction specifies the LoRA adapter to apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile and the file must be in a GGML file format. The adapter should be tuned from the base model otherwise the behaviour is undefined.
|
The `ADAPTER` instruction specifies the LoRA adapter to apply to the base model. The value of this instruction should be an absolute path or a path relative to the Modelfile, and the file must be in the GGUF file format. The adapter should be tuned from the base model; otherwise the behaviour is undefined.
|
||||||
|
|
||||||
```modelfile
|
```modelfile
|
||||||
ADAPTER ./ollama-lora.bin
|
ADAPTER ./ollama-lora.bin
|
||||||
|
|
78
llm/ggml.go
78
llm/ggml.go
|
@ -86,74 +86,6 @@ type container interface {
|
||||||
Decode(*readSeekOffset) (model, error)
|
Decode(*readSeekOffset) (model, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
type containerGGML struct{}
|
|
||||||
|
|
||||||
func (c *containerGGML) Name() string {
|
|
||||||
return "ggml"
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *containerGGML) Decode(ro *readSeekOffset) (model, error) {
|
|
||||||
// file contents aren't decoded
|
|
||||||
ro.Seek(0, io.SeekEnd)
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
type containerGGMF struct {
|
|
||||||
version uint32
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *containerGGMF) Name() string {
|
|
||||||
return "ggmf"
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *containerGGMF) Decode(ro *readSeekOffset) (model, error) {
|
|
||||||
var version uint32
|
|
||||||
binary.Read(ro, binary.LittleEndian, &version)
|
|
||||||
|
|
||||||
switch version {
|
|
||||||
case 1:
|
|
||||||
default:
|
|
||||||
return nil, errors.New("invalid version")
|
|
||||||
}
|
|
||||||
|
|
||||||
c.version = version
|
|
||||||
|
|
||||||
// remaining file contents aren't decoded
|
|
||||||
ro.Seek(0, io.SeekEnd)
|
|
||||||
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
type containerGGJT struct {
|
|
||||||
version uint32
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *containerGGJT) Name() string {
|
|
||||||
return "ggjt"
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *containerGGJT) Decode(ro *readSeekOffset) (model, error) {
|
|
||||||
var version uint32
|
|
||||||
binary.Read(ro, binary.LittleEndian, &version)
|
|
||||||
|
|
||||||
switch version {
|
|
||||||
case 1, 2, 3:
|
|
||||||
default:
|
|
||||||
return nil, errors.New("invalid version")
|
|
||||||
}
|
|
||||||
|
|
||||||
c.version = version
|
|
||||||
|
|
||||||
// different model types may have different layouts for hyperparameters
|
|
||||||
var llama llamaModel
|
|
||||||
binary.Read(ro, binary.LittleEndian, &llama.hyperparameters)
|
|
||||||
|
|
||||||
// remaining file contents aren't decoded
|
|
||||||
ro.Seek(0, io.SeekEnd)
|
|
||||||
|
|
||||||
return &llama, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
type containerLORA struct {
|
type containerLORA struct {
|
||||||
version uint32
|
version uint32
|
||||||
}
|
}
|
||||||
|
@ -194,6 +126,8 @@ const (
|
||||||
FILE_MAGIC_GGUF_BE = 0x47475546
|
FILE_MAGIC_GGUF_BE = 0x47475546
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// ErrUnsupportedFormat is returned by DecodeGGML when the file's magic number
// identifies a legacy GGML, GGMF, or GGJT container, which is no longer decoded.
var ErrUnsupportedFormat = errors.New("unsupported model format")
|
||||||
|
|
||||||
func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
|
func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
|
||||||
ro := readSeekOffset{ReadSeeker: r}
|
ro := readSeekOffset{ReadSeeker: r}
|
||||||
|
|
||||||
|
@ -204,12 +138,8 @@ func DecodeGGML(r io.ReadSeeker) (*GGML, error) {
|
||||||
|
|
||||||
var c container
|
var c container
|
||||||
switch magic {
|
switch magic {
|
||||||
case FILE_MAGIC_GGML:
|
case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
|
||||||
c = &containerGGML{}
|
return nil, ErrUnsupportedFormat
|
||||||
case FILE_MAGIC_GGMF:
|
|
||||||
c = &containerGGMF{}
|
|
||||||
case FILE_MAGIC_GGJT:
|
|
||||||
c = &containerGGJT{}
|
|
||||||
case FILE_MAGIC_GGLA:
|
case FILE_MAGIC_GGLA:
|
||||||
c = &containerLORA{}
|
c = &containerLORA{}
|
||||||
case FILE_MAGIC_GGUF_LE:
|
case FILE_MAGIC_GGUF_LE:
|
||||||
|
|
|
@ -2,15 +2,6 @@ package llm
|
||||||
|
|
||||||
//go:generate git submodule init
|
//go:generate git submodule init
|
||||||
|
|
||||||
//go:generate git submodule update --force ggml
|
|
||||||
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
|
|
||||||
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
|
|
||||||
//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
|
||||||
//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
|
||||||
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
|
||||||
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
|
||||||
//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner
|
|
||||||
|
|
||||||
//go:generate git submodule update --force gguf
|
//go:generate git submodule update --force gguf
|
||||||
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
|
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
|
||||||
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_METAL=off -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=on
|
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_METAL=off -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=x86_64 -DCMAKE_OSX_ARCHITECTURES=x86_64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=on
|
||||||
|
|
|
@ -2,15 +2,6 @@ package llm
|
||||||
|
|
||||||
//go:generate git submodule init
|
//go:generate git submodule init
|
||||||
|
|
||||||
//go:generate git submodule update --force ggml
|
|
||||||
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
|
|
||||||
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
|
|
||||||
//go:generate git -C ggml apply ../patches/0003-metal-fix-synchronization-in-new-matrix-multiplicati.patch
|
|
||||||
//go:generate git -C ggml apply ../patches/0004-metal-add-missing-barriers-for-mul-mat-2699.patch
|
|
||||||
//go:generate cmake -S ggml -B ggml/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
|
||||||
//go:generate cmake --build ggml/build/metal --target server --config Release
|
|
||||||
//go:generate mv ggml/build/metal/bin/server ggml/build/metal/bin/ollama-runner
|
|
||||||
|
|
||||||
//go:generate git submodule update --force gguf
|
//go:generate git submodule update --force gguf
|
||||||
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
|
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
|
||||||
//go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
//go:generate cmake -S gguf -B gguf/build/metal -DLLAMA_METAL=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DCMAKE_SYSTEM_PROCESSOR=arm64 -DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
|
||||||
|
|
|
@ -2,15 +2,6 @@ package llm
|
||||||
|
|
||||||
//go:generate git submodule init
|
//go:generate git submodule init
|
||||||
|
|
||||||
//go:generate git submodule update --force ggml
|
|
||||||
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
|
|
||||||
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
|
|
||||||
//go:generate git -C ggml apply ../patches/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
|
|
||||||
//go:generate git -C ggml apply ../patches/0001-copy-cuda-runtime-libraries.patch
|
|
||||||
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
|
|
||||||
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
|
||||||
//go:generate mv ggml/build/cpu/bin/server ggml/build/cpu/bin/ollama-runner
|
|
||||||
|
|
||||||
//go:generate git submodule update --force gguf
|
//go:generate git submodule update --force gguf
|
||||||
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
|
//go:generate git -C gguf apply ../patches/0001-copy-cuda-runtime-libraries.patch
|
||||||
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
|
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
|
||||||
|
@ -18,9 +9,6 @@ package llm
|
||||||
//go:generate cmake --build gguf/build/cpu --target server --config Release
|
//go:generate cmake --build gguf/build/cpu --target server --config Release
|
||||||
//go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner
|
//go:generate mv gguf/build/cpu/bin/server gguf/build/cpu/bin/ollama-runner
|
||||||
|
|
||||||
//go:generate cmake -S ggml -B ggml/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
|
|
||||||
//go:generate cmake --build ggml/build/cuda --target server --config Release
|
|
||||||
//go:generate mv ggml/build/cuda/bin/server ggml/build/cuda/bin/ollama-runner
|
|
||||||
//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA_PEER_MAX_BATCH_SIZE=0
|
//go:generate cmake -S gguf -B gguf/build/cuda -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA_PEER_MAX_BATCH_SIZE=0
|
||||||
//go:generate cmake --build gguf/build/cuda --target server --config Release
|
//go:generate cmake --build gguf/build/cuda --target server --config Release
|
||||||
//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner
|
//go:generate mv gguf/build/cuda/bin/server gguf/build/cuda/bin/ollama-runner
|
||||||
|
|
|
@ -2,13 +2,6 @@ package llm
|
||||||
|
|
||||||
//go:generate git submodule init
|
//go:generate git submodule init
|
||||||
|
|
||||||
//go:generate git submodule update --force ggml
|
|
||||||
//go:generate git -C ggml apply ../patches/0001-add-detokenize-endpoint.patch
|
|
||||||
//go:generate git -C ggml apply ../patches/0002-34B-model-support.patch
|
|
||||||
//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
|
|
||||||
//go:generate cmake --build ggml/build/cpu --target server --config Release
|
|
||||||
//go:generate cmd /c move ggml\build\cpu\bin\Release\server.exe ggml\build\cpu\bin\Release\ollama-runner.exe
|
|
||||||
|
|
||||||
//go:generate git submodule update --force gguf
|
//go:generate git submodule update --force gguf
|
||||||
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
|
//go:generate git -C gguf apply ../patches/0001-update-default-log-target.patch
|
||||||
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
|
//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
Subproject commit 9e232f0234073358e7031c1b8d7aa45020469a3b
|
|
|
@ -1,51 +0,0 @@
|
||||||
From 032ef7ff2423f5117bb59d42fb71be9cebf0a2de Mon Sep 17 00:00:00 2001
|
|
||||||
From: Bruce MacDonald <brucewmacdonald@gmail.com>
|
|
||||||
Date: Mon, 28 Aug 2023 18:08:12 -0400
|
|
||||||
Subject: [PATCH] add detokenize endpoint
|
|
||||||
|
|
||||||
---
|
|
||||||
examples/server/server.cpp | 21 +++++++++++++++++++++
|
|
||||||
1 file changed, 21 insertions(+)
|
|
||||||
|
|
||||||
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
|
|
||||||
index 9966045..5014691 100644
|
|
||||||
--- a/examples/server/server.cpp
|
|
||||||
+++ b/examples/server/server.cpp
|
|
||||||
@@ -1075,6 +1075,12 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
|
|
||||||
{"tokens", tokens}};
|
|
||||||
}
|
|
||||||
|
|
||||||
+static json format_detokenized_response(std::string content)
|
|
||||||
+{
|
|
||||||
+ return json{
|
|
||||||
+ {"content", content}};
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
static void parse_options_completion(const json &body, llama_server_context &llama)
|
|
||||||
{
|
|
||||||
gpt_params default_params;
|
|
||||||
@@ -1361,6 +1367,21 @@ int main(int argc, char **argv)
|
|
||||||
const json data = format_tokenizer_response(tokens);
|
|
||||||
return res.set_content(data.dump(), "application/json"); });
|
|
||||||
|
|
||||||
+ svr.Post("/detokenize", [&llama](const Request &req, Response &res)
|
|
||||||
+ {
|
|
||||||
+ auto lock = llama.lock();
|
|
||||||
+
|
|
||||||
+ const json body = json::parse(req.body);
|
|
||||||
+ std::string content;
|
|
||||||
+ if (body.count("tokens") != 0)
|
|
||||||
+ {
|
|
||||||
+ const std::vector<llama_token> tokens = body["tokens"];
|
|
||||||
+ content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ const json data = format_detokenized_response(content);
|
|
||||||
+ return res.set_content(data.dump(), "application/json"); });
|
|
||||||
+
|
|
||||||
svr.Post("/embedding", [&llama](const Request &req, Response &res)
|
|
||||||
{
|
|
||||||
auto lock = llama.lock();
|
|
||||||
--
|
|
||||||
2.39.2 (Apple Git-143)
|
|
||||||
|
|
|
@ -1,89 +0,0 @@
|
||||||
From 6145068a6613c37bb43a7408b5496524bdcfc402 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Bruce MacDonald <brucewmacdonald@gmail.com>
|
|
||||||
Date: Mon, 28 Aug 2023 18:08:53 -0400
|
|
||||||
Subject: [PATCH] 34B model support
|
|
||||||
|
|
||||||
---
|
|
||||||
llama.cpp | 10 ++++++++++
|
|
||||||
1 file changed, 10 insertions(+)
|
|
||||||
|
|
||||||
diff --git a/llama.cpp b/llama.cpp
|
|
||||||
index f2cbe76..62c5cdf 100644
|
|
||||||
--- a/llama.cpp
|
|
||||||
+++ b/llama.cpp
|
|
||||||
@@ -79,6 +79,7 @@ enum e_model {
|
|
||||||
MODEL_7B,
|
|
||||||
MODEL_13B,
|
|
||||||
MODEL_30B,
|
|
||||||
+ MODEL_34B,
|
|
||||||
MODEL_65B,
|
|
||||||
MODEL_70B,
|
|
||||||
};
|
|
||||||
@@ -122,6 +123,7 @@ static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
|
|
||||||
{ MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
|
|
||||||
{ MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
|
|
||||||
{ MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
|
|
||||||
+ { MODEL_34B, ((size_t) n_ctx / 9ull + 160ull) * MB },
|
|
||||||
{ MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
|
|
||||||
{ MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
|
|
||||||
};
|
|
||||||
@@ -135,6 +137,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
|
||||||
{ MODEL_7B, 160ull * MB },
|
|
||||||
{ MODEL_13B, 192ull * MB },
|
|
||||||
{ MODEL_30B, 256ull * MB },
|
|
||||||
+ { MODEL_34B, 256ull * MB },
|
|
||||||
{ MODEL_65B, 384ull * MB }, // guess
|
|
||||||
{ MODEL_70B, 304ull * MB },
|
|
||||||
};
|
|
||||||
@@ -149,6 +152,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
|
||||||
{ MODEL_7B, 10ull * MB },
|
|
||||||
{ MODEL_13B, 12ull * MB },
|
|
||||||
{ MODEL_30B, 16ull * MB },
|
|
||||||
+ { MODEL_34B, 16ull * MB },
|
|
||||||
{ MODEL_65B, 24ull * MB }, // guess
|
|
||||||
{ MODEL_70B, 24ull * MB },
|
|
||||||
};
|
|
||||||
@@ -164,6 +168,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
|
|
||||||
{ MODEL_7B, 512ull * kB },
|
|
||||||
{ MODEL_13B, 640ull * kB },
|
|
||||||
{ MODEL_30B, 768ull * kB },
|
|
||||||
+ { MODEL_34B, 768ull * kB },
|
|
||||||
{ MODEL_65B, 1280ull * kB },
|
|
||||||
{ MODEL_70B, 1280ull * kB },
|
|
||||||
};
|
|
||||||
@@ -179,6 +184,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
|
|
||||||
{ MODEL_7B, 128ull },
|
|
||||||
{ MODEL_13B, 160ull },
|
|
||||||
{ MODEL_30B, 208ull },
|
|
||||||
+ { MODEL_34B, 208ull },
|
|
||||||
{ MODEL_65B, 256ull },
|
|
||||||
{ MODEL_70B, 256ull },
|
|
||||||
};
|
|
||||||
@@ -1027,6 +1033,7 @@ static const char * llama_model_type_name(e_model type) {
|
|
||||||
case MODEL_7B: return "7B";
|
|
||||||
case MODEL_13B: return "13B";
|
|
||||||
case MODEL_30B: return "30B";
|
|
||||||
+ case MODEL_34B: return "34B";
|
|
||||||
case MODEL_65B: return "65B";
|
|
||||||
case MODEL_70B: return "70B";
|
|
||||||
default: LLAMA_ASSERT(false);
|
|
||||||
@@ -1074,6 +1081,7 @@ static void llama_model_load_internal(
|
|
||||||
case 26: model.type = e_model::MODEL_3B; break;
|
|
||||||
case 32: model.type = e_model::MODEL_7B; break;
|
|
||||||
case 40: model.type = e_model::MODEL_13B; break;
|
|
||||||
+ case 48: model.type = e_model::MODEL_34B; break;
|
|
||||||
case 60: model.type = e_model::MODEL_30B; break;
|
|
||||||
case 80: model.type = e_model::MODEL_65B; break;
|
|
||||||
default:
|
|
||||||
@@ -1094,6 +1102,8 @@ static void llama_model_load_internal(
|
|
||||||
LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
|
|
||||||
model.type = e_model::MODEL_70B;
|
|
||||||
hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
|
|
||||||
+ } else if (model.type == e_model::MODEL_34B && n_gqa == 8) {
|
|
||||||
+ hparams.f_ffn_mult = 1.0f; // from the params.json of the 34B model
|
|
||||||
}
|
|
||||||
|
|
||||||
hparams.rope_freq_base = rope_freq_base;
|
|
||||||
--
|
|
||||||
2.39.2 (Apple Git-143)
|
|
||||||
|
|
|
@ -1,30 +0,0 @@
|
||||||
From dadbed99e65252d79f81101a392d0d6497b86caa Mon Sep 17 00:00:00 2001
|
|
||||||
From: Shouzheng Liu <lshzh.hi@gmail.com>
|
|
||||||
Date: Mon, 21 Aug 2023 06:59:29 -0400
|
|
||||||
Subject: [PATCH] metal : fix synchronization in new matrix multiplication
|
|
||||||
kernel (#2686)
|
|
||||||
|
|
||||||
---
|
|
||||||
ggml-metal.metal | 3 ++-
|
|
||||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
|
||||||
|
|
||||||
diff --git a/ggml-metal.metal b/ggml-metal.metal
|
|
||||||
index 3f31252..88d48f6 100644
|
|
||||||
--- a/ggml-metal.metal
|
|
||||||
+++ b/ggml-metal.metal
|
|
||||||
@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
|
||||||
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
|
|
||||||
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
|
|
||||||
for (int i = 0; i < 8; i++) {
|
|
||||||
+ threadgroup_barrier(mem_flags::mem_device);
|
|
||||||
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
|
|
||||||
}
|
|
||||||
|
|
||||||
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
||||||
+ threadgroup_barrier(mem_flags::mem_device);
|
|
||||||
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
|
|
||||||
if (sgitg==0) {
|
|
||||||
for (int i = 0; i < n_rows; i++) {
|
|
||||||
--
|
|
||||||
2.41.0
|
|
||||||
|
|
|
@ -1,41 +0,0 @@
|
||||||
From 14b1d7e6f720dee41ce5a826376df738096d9033 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Shouzheng Liu <lshzh.hi@gmail.com>
|
|
||||||
Date: Tue, 22 Aug 2023 02:18:40 -0400
|
|
||||||
Subject: [PATCH] metal : add missing barriers for mul-mat (#2699)
|
|
||||||
|
|
||||||
---
|
|
||||||
ggml-metal.metal | 5 +++--
|
|
||||||
1 file changed, 3 insertions(+), 2 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/ggml-metal.metal b/ggml-metal.metal
|
|
||||||
index 88d48f6..ce3541f 100644
|
|
||||||
--- a/ggml-metal.metal
|
|
||||||
+++ b/ggml-metal.metal
|
|
||||||
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
|
||||||
//load data and store to threadgroup memory
|
|
||||||
half4x4 temp_a;
|
|
||||||
dequantize_func(x, il, temp_a);
|
|
||||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
||||||
#pragma unroll(16)
|
|
||||||
for (int i = 0; i < 16; i++) {
|
|
||||||
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
|
|
||||||
@@ -1895,14 +1896,14 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
|
||||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
||||||
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
|
|
||||||
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
|
|
||||||
for (int i = 0; i < 8; i++) {
|
|
||||||
- threadgroup_barrier(mem_flags::mem_device);
|
|
||||||
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
|
|
||||||
}
|
|
||||||
|
|
||||||
- threadgroup_barrier(mem_flags::mem_device);
|
|
||||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
||||||
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
|
|
||||||
if (sgitg==0) {
|
|
||||||
for (int i = 0; i < n_rows; i++) {
|
|
||||||
--
|
|
||||||
2.41.0
|
|
||||||
|
|
|
@ -1,32 +0,0 @@
|
||||||
From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Kylin <56434533+KyL0N@users.noreply.github.com>
|
|
||||||
Date: Tue, 22 Aug 2023 15:14:23 +0800
|
|
||||||
Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670)
|
|
||||||
|
|
||||||
* ggml: support CUDA's half type for aarch64(#1455)
|
|
||||||
support CUDA's half type for aarch64 in ggml_fp16_t definition
|
|
||||||
|
|
||||||
* ggml: use __CUDACC__ to recognise nvcc compiler
|
|
||||||
---
|
|
||||||
ggml.h | 5 +++--
|
|
||||||
1 file changed, 3 insertions(+), 2 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/ggml.h b/ggml.h
|
|
||||||
index 544ad2d..0ec7ec5 100644
|
|
||||||
--- a/ggml.h
|
|
||||||
+++ b/ggml.h
|
|
||||||
@@ -259,8 +259,9 @@
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
-#ifdef __ARM_NEON
|
|
||||||
- // we use the built-in 16-bit float type
|
|
||||||
+#if defined(__ARM_NEON) && defined(__CUDACC__)
|
|
||||||
+ typedef half ggml_fp16_t;
|
|
||||||
+#elif defined(__ARM_NEON)
|
|
||||||
typedef __fp16 ggml_fp16_t;
|
|
||||||
#else
|
|
||||||
typedef uint16_t ggml_fp16_t;
|
|
||||||
--
|
|
||||||
2.39.2 (Apple Git-143)
|
|
||||||
|
|
23
llm/llama.go
23
llm/llama.go
|
@ -59,13 +59,12 @@ ws ::= ([ \t\n] ws)?
|
||||||
var llamaCppEmbed embed.FS
|
var llamaCppEmbed embed.FS
|
||||||
|
|
||||||
type ModelRunner struct {
|
type ModelRunner struct {
|
||||||
Type string // "gguf" or "ggml"
|
|
||||||
Path string // path to the model runner executable
|
Path string // path to the model runner executable
|
||||||
Accelerated bool
|
Accelerated bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func chooseRunners(workDir, runnerType string) []ModelRunner {
|
func chooseRunners(workDir string) []ModelRunner {
|
||||||
buildPath := path.Join("llama.cpp", runnerType, "build")
|
buildPath := path.Join("llama.cpp", "gguf", "build")
|
||||||
var runners []ModelRunner
|
var runners []ModelRunner
|
||||||
|
|
||||||
// set the runners based on the OS
|
// set the runners based on the OS
|
||||||
|
@ -73,25 +72,25 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
|
||||||
switch runtime.GOOS {
|
switch runtime.GOOS {
|
||||||
case "darwin":
|
case "darwin":
|
||||||
if runtime.GOARCH == "arm64" {
|
if runtime.GOARCH == "arm64" {
|
||||||
runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
|
runners = []ModelRunner{{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
|
||||||
} else {
|
} else {
|
||||||
runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
|
runners = []ModelRunner{{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
|
||||||
}
|
}
|
||||||
case "linux":
|
case "linux":
|
||||||
runners = []ModelRunner{
|
runners = []ModelRunner{
|
||||||
{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
|
{Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
|
||||||
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
|
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
|
||||||
}
|
}
|
||||||
case "windows":
|
case "windows":
|
||||||
// TODO: select windows GPU runner here when available
|
// TODO: select windows GPU runner here when available
|
||||||
runners = []ModelRunner{
|
runners = []ModelRunner{
|
||||||
{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
|
{Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
|
||||||
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
|
{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
|
log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
|
||||||
runners = []ModelRunner{
|
runners = []ModelRunner{
|
||||||
{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
|
{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -141,7 +140,7 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !runnerAvailable {
|
if !runnerAvailable {
|
||||||
log.Fatalf("%s runner not found", runnerType)
|
log.Fatalf("gguf runner not found")
|
||||||
}
|
}
|
||||||
|
|
||||||
// return the runners to try in priority order
|
// return the runners to try in priority order
|
||||||
|
@ -149,7 +148,6 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
|
||||||
for _, r := range runners {
|
for _, r := range runners {
|
||||||
// clean the ModelRunner paths so that they match the OS we are running on
|
// clean the ModelRunner paths so that they match the OS we are running on
|
||||||
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
|
localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
|
||||||
Type: r.Type,
|
|
||||||
Path: filepath.Clean(path.Join(workDir, r.Path)),
|
Path: filepath.Clean(path.Join(workDir, r.Path)),
|
||||||
Accelerated: r.Accelerated,
|
Accelerated: r.Accelerated,
|
||||||
})
|
})
|
||||||
|
@ -350,6 +348,7 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
|
||||||
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
|
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
|
||||||
"--n-gpu-layers", fmt.Sprintf("%d", numGPU),
|
"--n-gpu-layers", fmt.Sprintf("%d", numGPU),
|
||||||
"--embedding",
|
"--embedding",
|
||||||
|
"--parallel", "2",
|
||||||
}
|
}
|
||||||
|
|
||||||
if opts.MainGPU > 0 {
|
if opts.MainGPU > 0 {
|
||||||
|
|
16
llm/llm.go
16
llm/llm.go
|
@ -76,16 +76,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
switch ggml.Name() {
|
opts.NumGQA = 0
|
||||||
case "gguf":
|
opts.RopeFrequencyBase = 0.0
|
||||||
// TODO: gguf will load these options automatically from the model binary
|
opts.RopeFrequencyScale = 0.0
|
||||||
opts.NumGQA = 0
|
return newLlama(model, adapters, projectors, chooseRunners(workDir), ggml.NumLayers(), opts)
|
||||||
opts.RopeFrequencyBase = 0.0
|
|
||||||
opts.RopeFrequencyScale = 0.0
|
|
||||||
return newLlama(model, adapters, projectors, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
|
|
||||||
case "ggml", "ggmf", "ggjt", "ggla":
|
|
||||||
return newLlama(model, adapters, projectors, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
|
|
||||||
default:
|
|
||||||
return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -418,6 +418,27 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// if the model is not in gguf format, pull the base model to try and get it in gguf format
|
||||||
|
if fromConfig.ModelFormat != "gguf" {
|
||||||
|
fn(api.ProgressResponse{Status: "updating base model"})
|
||||||
|
if err := PullModel(ctx, c.Args, &RegistryOptions{}, fn); err != nil {
|
||||||
|
log.Printf("error pulling model: %v", err)
|
||||||
|
}
|
||||||
|
// Reset the file pointer to the beginning of the file
|
||||||
|
_, err = fromConfigFile.Seek(0, 0)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("update from config after pull: %w", err)
|
||||||
|
}
|
||||||
|
if err := json.NewDecoder(fromConfigFile).Decode(&fromConfig); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// if the model is still not in gguf format, error out
|
||||||
|
if fromConfig.ModelFormat != "gguf" {
|
||||||
|
return fmt.Errorf("%s is not in gguf format, this base model is not compatible with this version of ollama", c.Args)
|
||||||
|
}
|
||||||
|
|
||||||
config.SetModelFormat(fromConfig.ModelFormat)
|
config.SetModelFormat(fromConfig.ModelFormat)
|
||||||
config.SetModelFamily(append(fromConfig.ModelFamilies, fromConfig.ModelFamily)...)
|
config.SetModelFamily(append(fromConfig.ModelFamilies, fromConfig.ModelFamily)...)
|
||||||
config.SetModelType(fromConfig.ModelType)
|
config.SetModelType(fromConfig.ModelType)
|
||||||
|
@ -456,15 +477,21 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars
|
||||||
defer bin.Close()
|
defer bin.Close()
|
||||||
|
|
||||||
var offset int64
|
var offset int64
|
||||||
|
CREATE:
|
||||||
for {
|
for {
|
||||||
fn(api.ProgressResponse{Status: "creating model layer"})
|
fn(api.ProgressResponse{Status: "creating model layer"})
|
||||||
|
|
||||||
bin.Seek(offset, io.SeekStart)
|
bin.Seek(offset, io.SeekStart)
|
||||||
ggml, err := llm.DecodeGGML(bin)
|
ggml, err := llm.DecodeGGML(bin)
|
||||||
if errors.Is(err, io.EOF) {
|
if err != nil {
|
||||||
break
|
switch {
|
||||||
} else if err != nil {
|
case errors.Is(err, io.EOF):
|
||||||
return err
|
break CREATE
|
||||||
|
case errors.Is(err, llm.ErrUnsupportedFormat):
|
||||||
|
return fmt.Errorf("model binary specified in FROM field is not a valid gguf format model, %w", err)
|
||||||
|
default:
|
||||||
|
return err
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
config.SetModelFormat(ggml.Name())
|
config.SetModelFormat(ggml.Name())
|
||||||
|
|
|
@ -114,7 +114,7 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess
|
||||||
// some older models are not compatible with newer versions of llama.cpp
|
// some older models are not compatible with newer versions of llama.cpp
|
||||||
// show a generalized compatibility error until there is a better way to
|
// show a generalized compatibility error until there is a better way to
|
||||||
// check for model compatibility
|
// check for model compatibility
|
||||||
if strings.Contains(err.Error(), "failed to load model") {
|
// errors.Is takes the error being inspected first and the sentinel second, so
// an ErrUnsupportedFormat wrapped with %w is still recognised here.
if errors.Is(err, llm.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
|
||||||
err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, model.ShortName)
|
err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, model.ShortName)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue