Compare commits
1 commit
76c9dc57fd
...
391a633d2f
Author | SHA1 | Date | |
---|---|---|---|
391a633d2f |
31 changed files with 25119 additions and 678 deletions
|
@ -18,7 +18,7 @@ See the [development documentation](./docs/development.md) for instructions on h
|
||||||
|
|
||||||
* New features: new features (e.g. API fields, environment variables) add surface area to Ollama and make it harder to maintain in the long run as they cannot be removed without potentially breaking users in the future.
|
* New features: new features (e.g. API fields, environment variables) add surface area to Ollama and make it harder to maintain in the long run as they cannot be removed without potentially breaking users in the future.
|
||||||
* Refactoring: large code improvements are important, but can be harder or take longer to review and merge.
|
* Refactoring: large code improvements are important, but can be harder or take longer to review and merge.
|
||||||
* Documentation: small updates to fill in or correct missing documentation is helpful, however large documentation additions can be hard to maintain over time.
|
* Documentation: small updates to fill in or correct missing documentation is helpful, however large documentation additions can be hard to maintain over time.
|
||||||
|
|
||||||
### Issues that may not be accepted
|
### Issues that may not be accepted
|
||||||
|
|
||||||
|
|
25
README.md
25
README.md
|
@ -295,23 +295,13 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||||
- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
|
- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
|
||||||
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
|
- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)
|
||||||
- [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
|
- [LLocal.in](https://github.com/kartikm7/llocal) (Easy to use Electron Desktop Client for Ollama)
|
||||||
- [AiLama](https://github.com/zeyoyt/ailama) (A Discord User App that allows you to interact with Ollama anywhere in discord )
|
|
||||||
- [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
|
- [Ollama with Google Mesop](https://github.com/rapidarchitect/ollama_mesop/) (Mesop Chat Client implementation with Ollama)
|
||||||
- [Painting Droid](https://github.com/mateuszmigas/painting-droid) (Painting app with AI integrations)
|
|
||||||
- [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
|
- [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
|
||||||
- [AI Studio](https://github.com/MindWorkAI/AI-Studio)
|
- [AI Studio](https://github.com/MindWorkAI/AI-Studio)
|
||||||
- [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)
|
- [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)
|
||||||
- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
|
- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
|
||||||
- [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
|
- [BoltAI for Mac](https://boltai.com) (AI Chat Client for Mac)
|
||||||
- [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
|
- [Harbor](https://github.com/av/harbor) (Containerized LLM Toolkit with Ollama as default backend)
|
||||||
- [Go-CREW](https://www.jonathanhecl.com/go-crew/) (Powerful Offline RAG in Golang)
|
|
||||||
- [PartCAD](https://github.com/openvmp/partcad/) (CAD model generation with OpenSCAD and CadQuery)
|
|
||||||
- [Ollama4j Web UI](https://github.com/ollama4j/ollama4j-web-ui) - Java-based Web UI for Ollama built with Vaadin, Spring Boot and Ollama4j
|
|
||||||
- [PyOllaMx](https://github.com/kspviswa/pyOllaMx) - macOS application capable of chatting with both Ollama and Apple MLX models.
|
|
||||||
- [Claude Dev](https://github.com/saoudrizwan/claude-dev) - VSCode extension for multi-file/whole-repo coding
|
|
||||||
- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
|
|
||||||
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
|
|
||||||
- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
|
|
||||||
|
|
||||||
### Terminal
|
### Terminal
|
||||||
|
|
||||||
|
@ -337,9 +327,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||||
- [gollama](https://github.com/sammcj/gollama)
|
- [gollama](https://github.com/sammcj/gollama)
|
||||||
- [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
|
- [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
|
||||||
|
|
||||||
### Apple Vision Pro
|
|
||||||
- [Enchanted](https://github.com/AugustDev/enchanted)
|
|
||||||
|
|
||||||
### Database
|
### Database
|
||||||
|
|
||||||
- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
|
- [MindsDB](https://github.com/mindsdb/mindsdb/blob/staging/mindsdb/integrations/handlers/ollama_handler/README.md) (Connects Ollama models with nearly 200 data platforms and apps)
|
||||||
|
@ -348,7 +335,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||||
### Package managers
|
### Package managers
|
||||||
|
|
||||||
- [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
|
- [Pacman](https://archlinux.org/packages/extra/x86_64/ollama/)
|
||||||
- [Gentoo](https://github.com/gentoo/guru/tree/master/app-misc/ollama)
|
|
||||||
- [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
|
- [Helm Chart](https://artifacthub.io/packages/helm/ollama-helm/ollama)
|
||||||
- [Guix channel](https://codeberg.org/tusharhero/ollama-guix)
|
- [Guix channel](https://codeberg.org/tusharhero/ollama-guix)
|
||||||
- [Nix package](https://search.nixos.org/packages?channel=24.05&show=ollama&from=0&size=50&sort=relevance&type=packages&query=ollama)
|
- [Nix package](https://search.nixos.org/packages?channel=24.05&show=ollama&from=0&size=50&sort=relevance&type=packages&query=ollama)
|
||||||
|
@ -363,12 +349,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||||
- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
|
- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
|
||||||
- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
|
- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html)
|
||||||
- [LiteLLM](https://github.com/BerriAI/litellm)
|
- [LiteLLM](https://github.com/BerriAI/litellm)
|
||||||
- [OllamaFarm for Go](https://github.com/presbrey/ollamafarm)
|
|
||||||
- [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
|
- [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp)
|
||||||
- [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
|
- [Ollama for Ruby](https://github.com/gbaptista/ollama-ai)
|
||||||
- [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
|
- [Ollama-rs for Rust](https://github.com/pepperoni21/ollama-rs)
|
||||||
- [Ollama-hpp for C++](https://github.com/jmont-dev/ollama-hpp)
|
- [Ollama-hpp for C++](https://github.com/jmont-dev/ollama-hpp)
|
||||||
- [Ollama4j for Java](https://github.com/ollama4j/ollama4j)
|
- [Ollama4j for Java](https://github.com/amithkoujalgi/ollama4j)
|
||||||
- [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
|
- [ModelFusion Typescript Library](https://modelfusion.dev/integration/model-provider/ollama)
|
||||||
- [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit)
|
- [OllamaKit for Swift](https://github.com/kevinhermawan/OllamaKit)
|
||||||
- [Ollama for Dart](https://github.com/breitburg/dart-ollama)
|
- [Ollama for Dart](https://github.com/breitburg/dart-ollama)
|
||||||
|
@ -385,16 +370,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||||
- [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
|
- [Portkey](https://portkey.ai/docs/welcome/integration-guides/ollama)
|
||||||
- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
|
- [PromptingTools.jl](https://github.com/svilupp/PromptingTools.jl) with an [example](https://svilupp.github.io/PromptingTools.jl/dev/examples/working_with_ollama)
|
||||||
- [LlamaScript](https://github.com/Project-Llama/llamascript)
|
- [LlamaScript](https://github.com/Project-Llama/llamascript)
|
||||||
- [Gollm](https://docs.gollm.co/examples/ollama-example)
|
|
||||||
- [Ollamaclient for Golang](https://github.com/xyproto/ollamaclient)
|
|
||||||
- [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun)
|
|
||||||
- [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)
|
|
||||||
|
|
||||||
### Mobile
|
### Mobile
|
||||||
|
|
||||||
- [Enchanted](https://github.com/AugustDev/enchanted)
|
- [Enchanted](https://github.com/AugustDev/enchanted)
|
||||||
- [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
|
- [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
|
||||||
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
|
|
||||||
|
|
||||||
### Extensions & Plugins
|
### Extensions & Plugins
|
||||||
|
|
||||||
|
@ -419,14 +399,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||||
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
|
- [twinny](https://github.com/rjmacarthy/twinny) (Copilot and Copilot chat alternative using Ollama)
|
||||||
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face)
|
- [Wingman-AI](https://github.com/RussellCanfield/wingman-ai) (Copilot code and chat alternative using Ollama and Hugging Face)
|
||||||
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
|
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
|
||||||
- [Plasmoid Ollama Control](https://github.com/imoize/plasmoid-ollamacontrol) (KDE Plasma extension that allows you to quickly manage/control Ollama model)
|
|
||||||
- [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
|
- [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
|
||||||
- [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
|
- [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
|
||||||
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
|
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
|
||||||
- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
|
- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) Chat/moderation bot written in python. Uses Ollama to create personalities.
|
||||||
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
|
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
|
||||||
- [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
|
|
||||||
- [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
|
|
||||||
|
|
||||||
### Supported backends
|
### Supported backends
|
||||||
|
|
||||||
|
|
|
@ -1421,8 +1421,6 @@ func NewCLI() *cobra.Command {
|
||||||
envVars["OLLAMA_TMPDIR"],
|
envVars["OLLAMA_TMPDIR"],
|
||||||
envVars["OLLAMA_FLASH_ATTENTION"],
|
envVars["OLLAMA_FLASH_ATTENTION"],
|
||||||
envVars["OLLAMA_LLM_LIBRARY"],
|
envVars["OLLAMA_LLM_LIBRARY"],
|
||||||
envVars["OLLAMA_GPU_OVERHEAD"],
|
|
||||||
envVars["OLLAMA_LOAD_TIMEOUT"],
|
|
||||||
})
|
})
|
||||||
default:
|
default:
|
||||||
appendEnvDocs(cmd, envs)
|
appendEnvDocs(cmd, envs)
|
||||||
|
|
|
@ -34,20 +34,10 @@ func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *gemma2Model) Replacements() []string {
|
func (p *gemma2Model) Replacements() []string {
|
||||||
return []string{
|
return append(
|
||||||
"model.embed_tokens", "token_embd",
|
p.gemmaModel.Replacements(),
|
||||||
"model.norm", "output_norm",
|
|
||||||
"model.layers", "blk",
|
|
||||||
"input_layernorm", "attn_norm",
|
|
||||||
"self_attn.q_proj", "attn_q",
|
|
||||||
"self_attn.k_proj", "attn_k",
|
|
||||||
"self_attn.v_proj", "attn_v",
|
|
||||||
"self_attn.o_proj", "attn_output",
|
|
||||||
"mlp.gate_proj", "ffn_gate",
|
|
||||||
"mlp.down_proj", "ffn_down",
|
|
||||||
"mlp.up_proj", "ffn_up",
|
|
||||||
"post_attention_layernorm", "post_attention_norm",
|
"post_attention_layernorm", "post_attention_norm",
|
||||||
"pre_feedforward_layernorm", "ffn_norm",
|
"pre_feedforward_layernorm", "ffn_norm",
|
||||||
"post_feedforward_layernorm", "post_ffw_norm",
|
"post_feedforward_layernorm", "post_ffw_norm",
|
||||||
}
|
)
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,7 +15,6 @@ import (
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"slices"
|
"slices"
|
||||||
"strings"
|
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"golang.org/x/exp/maps"
|
"golang.org/x/exp/maps"
|
||||||
|
@ -23,12 +22,6 @@ import (
|
||||||
"github.com/ollama/ollama/llm"
|
"github.com/ollama/ollama/llm"
|
||||||
)
|
)
|
||||||
|
|
||||||
type tensorData struct {
|
|
||||||
Offsets []int `json:"data_offsets"`
|
|
||||||
Type string `json:"dtype"`
|
|
||||||
Shape []int `json:"shape"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
|
func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
|
||||||
t.Helper()
|
t.Helper()
|
||||||
|
|
||||||
|
@ -103,7 +96,6 @@ func TestConvertModel(t *testing.T) {
|
||||||
"Mistral-7B-Instruct-v0.2",
|
"Mistral-7B-Instruct-v0.2",
|
||||||
"Mixtral-8x7B-Instruct-v0.1",
|
"Mixtral-8x7B-Instruct-v0.1",
|
||||||
"gemma-2b-it",
|
"gemma-2b-it",
|
||||||
"gemma-2-2b-it",
|
|
||||||
// microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8
|
// microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8
|
||||||
"Phi-3-mini-128k-instruct",
|
"Phi-3-mini-128k-instruct",
|
||||||
"all-MiniLM-L6-v2",
|
"all-MiniLM-L6-v2",
|
||||||
|
@ -148,36 +140,6 @@ func TestConvertModel(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestConvertInvalidTensorNames(t *testing.T) {
|
|
||||||
f, err := os.CreateTemp(t.TempDir(), "testmodel")
|
|
||||||
if err != nil {
|
|
||||||
t.Fatal(err)
|
|
||||||
}
|
|
||||||
defer f.Close()
|
|
||||||
|
|
||||||
tempDir := t.TempDir()
|
|
||||||
|
|
||||||
td := map[string]*tensorData{}
|
|
||||||
offset := 4096
|
|
||||||
|
|
||||||
td["model.layers.0.self_attn.q_proj.weight"] = &tensorData{
|
|
||||||
Offsets: []int{0, offset},
|
|
||||||
Type: "F32",
|
|
||||||
Shape: []int{4096, 4096},
|
|
||||||
}
|
|
||||||
td["blk.0.attn_q.weight"] = &tensorData{
|
|
||||||
Offsets: []int{offset, offset * 2},
|
|
||||||
Type: "F32",
|
|
||||||
Shape: []int{4096, 4096},
|
|
||||||
}
|
|
||||||
generateSafetensorTestData(t, tempDir, td)
|
|
||||||
|
|
||||||
err = ConvertModel(os.DirFS(tempDir), f)
|
|
||||||
if err == nil || !strings.HasPrefix(err.Error(), "duplicate tensor name") {
|
|
||||||
t.Errorf("expected error but didn't get one")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestConvertInvalidDatatype(t *testing.T) {
|
func TestConvertInvalidDatatype(t *testing.T) {
|
||||||
f, err := os.CreateTemp(t.TempDir(), "testmodel")
|
f, err := os.CreateTemp(t.TempDir(), "testmodel")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -186,10 +148,23 @@ func TestConvertInvalidDatatype(t *testing.T) {
|
||||||
defer f.Close()
|
defer f.Close()
|
||||||
|
|
||||||
tempDir := t.TempDir()
|
tempDir := t.TempDir()
|
||||||
|
generateSafetensorTestData(t, tempDir)
|
||||||
|
|
||||||
td := map[string]*tensorData{}
|
err = ConvertModel(os.DirFS(tempDir), f)
|
||||||
|
if err == nil || err.Error() != "unsupported safetensors model" {
|
||||||
|
t.Errorf("expected error but didn't get one")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func generateSafetensorTestData(t *testing.T, tempDir string) {
|
||||||
|
type tensorData struct {
|
||||||
|
Offsets []int `json:"data_offsets"`
|
||||||
|
Type string `json:"dtype"`
|
||||||
|
Shape []int `json:"shape"`
|
||||||
|
}
|
||||||
offset := 4096 * 14336
|
offset := 4096 * 14336
|
||||||
|
|
||||||
|
td := map[string]*tensorData{}
|
||||||
td["model.layers.0.mlp.down_proj.weight"] = &tensorData{
|
td["model.layers.0.mlp.down_proj.weight"] = &tensorData{
|
||||||
Offsets: []int{0, offset},
|
Offsets: []int{0, offset},
|
||||||
Type: "I8",
|
Type: "I8",
|
||||||
|
@ -200,16 +175,8 @@ func TestConvertInvalidDatatype(t *testing.T) {
|
||||||
Type: "U8",
|
Type: "U8",
|
||||||
Shape: []int{},
|
Shape: []int{},
|
||||||
}
|
}
|
||||||
generateSafetensorTestData(t, tempDir, td)
|
|
||||||
|
|
||||||
err = ConvertModel(os.DirFS(tempDir), f)
|
data, err := json.Marshal(td)
|
||||||
if err == nil || err.Error() != "unsupported safetensors model" {
|
|
||||||
t.Errorf("expected error but didn't get one")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func generateSafetensorTestData(t *testing.T, tempDir string, tensorData map[string]*tensorData) {
|
|
||||||
data, err := json.Marshal(tensorData)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
@ -355,6 +322,11 @@ func TestConvertAdapter(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func generateLoraTestData(t *testing.T, tempDir string) {
|
func generateLoraTestData(t *testing.T, tempDir string) {
|
||||||
|
type tensorData struct {
|
||||||
|
Offsets []int `json:"data_offsets"`
|
||||||
|
Type string `json:"dtype"`
|
||||||
|
Shape []int `json:"shape"`
|
||||||
|
}
|
||||||
offset := 4096 * 8 * 4
|
offset := 4096 * 8 * 4
|
||||||
|
|
||||||
td := map[string]*tensorData{"__metadata__": nil}
|
td := map[string]*tensorData{"__metadata__": nil}
|
||||||
|
|
|
@ -49,19 +49,12 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
|
||||||
keys := maps.Keys(headers)
|
keys := maps.Keys(headers)
|
||||||
slices.Sort(keys)
|
slices.Sort(keys)
|
||||||
|
|
||||||
names := make(map[string]struct{}, len(keys))
|
|
||||||
|
|
||||||
for _, key := range keys {
|
for _, key := range keys {
|
||||||
if value := headers[key]; value.Type != "" {
|
if value := headers[key]; value.Type != "" {
|
||||||
// bitsandbytes quantized models are unsupported
|
// bitsandbytes quantized models are unsupported
|
||||||
if len(value.Shape) == 0 {
|
if len(value.Shape) == 0 {
|
||||||
return nil, errors.New("unsupported safetensors model")
|
return nil, errors.New("unsupported safetensors model")
|
||||||
}
|
}
|
||||||
ggufName := replacer.Replace(key)
|
|
||||||
if _, ok := names[ggufName]; ok {
|
|
||||||
return nil, fmt.Errorf("duplicate tensor name '%s' was found for this model", ggufName)
|
|
||||||
}
|
|
||||||
names[ggufName] = struct{}{}
|
|
||||||
ts = append(ts, safetensor{
|
ts = append(ts, safetensor{
|
||||||
fs: fsys,
|
fs: fsys,
|
||||||
path: p,
|
path: p,
|
||||||
|
@ -69,7 +62,7 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
|
||||||
offset: safetensorsPad(n, value.Offsets[0]),
|
offset: safetensorsPad(n, value.Offsets[0]),
|
||||||
size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]),
|
size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]),
|
||||||
tensorBase: &tensorBase{
|
tensorBase: &tensorBase{
|
||||||
name: ggufName,
|
name: replacer.Replace(key),
|
||||||
shape: value.Shape,
|
shape: value.Shape,
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
|
312
convert/testdata/gemma-2-2b-it.json
vendored
312
convert/testdata/gemma-2-2b-it.json
vendored
|
@ -1,312 +0,0 @@
|
||||||
{
|
|
||||||
"general.architecture": "gemma2",
|
|
||||||
"general.file_type": "1",
|
|
||||||
"general.quantization_version": "2",
|
|
||||||
"gemma2.block_count": "26",
|
|
||||||
"gemma2.context_length": "8192",
|
|
||||||
"gemma2.embedding_length": "2304",
|
|
||||||
"gemma2.feed_forward_length": "9216",
|
|
||||||
"gemma2.attention.head_count": "8",
|
|
||||||
"gemma2.attention.head_count_kv": "4",
|
|
||||||
"gemma2.attention.key_length": "256",
|
|
||||||
"gemma2.attention.value_length": "256",
|
|
||||||
"gemma2.attention.layer_norm_rms_epsilon": "1e-06",
|
|
||||||
"tokenizer.ggml.model": "llama",
|
|
||||||
"tokenizer.ggml.add_bos_token": "true",
|
|
||||||
"tokenizer.ggml.add_eos_token": "false",
|
|
||||||
"tokenizer.ggml.bos_token_id": "2",
|
|
||||||
"tokenizer.ggml.eos_token_id": "1",
|
|
||||||
"tokenizer.ggml.padding_token_id": "0",
|
|
||||||
"tokenizer.ggml.unknown_token_id": "3",
|
|
||||||
"tokenizer.ggml.scores": "0872465d173867d755d3ee728f882b9dc2057a0bfd596fe1e3d131522f1250d8",
|
|
||||||
"tokenizer.ggml.token_type": "8d40143b3477df77beea4139420335ede458bf5e14102f01b0170197b55da8d8",
|
|
||||||
"tokenizer.ggml.tokens": "c6e66de1841f04de8b8d236d461ab720a4c9b9b5414dc293a09c6e10eab45fda",
|
|
||||||
"token_embd.weight": "64a9d30707e659e2e673656d71f5aef7a9fb9fd83bb9a77558dfc5abbe218a05",
|
|
||||||
"blk.0.attn_k.weight": "d8b4437c5edb3cddf6af9987038e1bb2b191c4f0fce0e160d2abace717f5d5d7",
|
|
||||||
"blk.0.attn_norm.weight": "1eb73e3f7aa8e502f6ca31cd19efbb8e4fd9a89692e13e48ac8205545a7fa7e8",
|
|
||||||
"blk.0.attn_output.weight": "39e7b78e57d356a22dd89ce1c4d7163b970712ba756545e1703f97866cd2192e",
|
|
||||||
"blk.0.attn_q.weight": "795058e23b6109febd9d55c89e1eebe6af0714ec8c56fd86a160876a6135ffe8",
|
|
||||||
"blk.0.attn_v.weight": "0cd6e583d1887c020472e961bbb113fe5a0d23ae2f1c2c876fc366cdb7692b52",
|
|
||||||
"blk.0.ffn_down.weight": "51eb4d962189e945a84e94e0dc1aad3f8f90cc1a11e18029670afcd0ea0acb1b",
|
|
||||||
"blk.0.ffn_gate.weight": "9811a29b8ad48432925897ab21dfcb13c5cbd372aeccbbefca9b7866883b4ce3",
|
|
||||||
"blk.0.ffn_norm.weight": "92cbf4652ef503c1de5b10f2be00b3fcf00100980cb3baa8f3013a8d8bf3d851",
|
|
||||||
"blk.0.ffn_up.weight": "af87de21746879483ed1b374cdd76b19ba11ca2b6dbb1beba98efdf3be3e8077",
|
|
||||||
"blk.0.post_attention_norm.weight": "32e135f1f258ffe407018899e39af1725d59d66d60022b9a21575ba160e0357a",
|
|
||||||
"blk.0.post_ffw_norm.weight": "ba286f5ac11b07fbc986173708c66f1920427be5a6d108af38fa0a837c1c8eb6",
|
|
||||||
"blk.1.attn_k.weight": "51584435552051f7fade76beca582b3f7190cf7fc07adcf527c2774d4b1c3901",
|
|
||||||
"blk.1.attn_norm.weight": "6833104c7fbf35a7e799ae56c262b97fffa14789642aee14381b25acd21ed80a",
|
|
||||||
"blk.1.attn_output.weight": "14c39481369087bf292ac9a3ab2ef166f9fe376a9f90c246653213ef264febdc",
|
|
||||||
"blk.1.attn_q.weight": "443f64ae2229f857c69d6bebb7800b685786cb77884c3ae19d4286aeed081325",
|
|
||||||
"blk.1.attn_v.weight": "0df482de2038f1e4c8a7733ac0ddb69ad90759dab5968b942af0155588de4c4a",
|
|
||||||
"blk.1.ffn_down.weight": "66f30763a8bbbcaea609a0087ed75fadb5e771c06378dd2cea94cf17e492e8cf",
|
|
||||||
"blk.1.ffn_gate.weight": "a7151bff00a545fa18b2c92dcd2a14572ccf9beb957a6c494f1374e8ebe174c9",
|
|
||||||
"blk.1.ffn_norm.weight": "e197d71ea11b5276bc0167d2663b88089b3ff42b47ba91e85f6c5d95f6306435",
|
|
||||||
"blk.1.ffn_up.weight": "57c182e0b14cccd1350d388f0c616991702e74281db54637451b70f4ccc24f9b",
|
|
||||||
"blk.1.post_attention_norm.weight": "3c56f837168d784c2d8bac247c130bdca6610c095c8da4558c536ccad7605609",
|
|
||||||
"blk.1.post_ffw_norm.weight": "d2a51d320fd01069dd7ccaa7082f16a7faeb671885607d7900b10a89c354d0fa",
|
|
||||||
"blk.2.attn_k.weight": "bc103c818192de7ce36caaf89dc117be4df13fb902e0bd9a23c64edace5df9b6",
|
|
||||||
"blk.2.attn_norm.weight": "0f2503aa126083a5d6ac72481be1ef66c6014705b573682b35bd864e4749a3d5",
|
|
||||||
"blk.2.attn_output.weight": "05fcd4a1226e482f91803a266f72caca887a93e63c2d2ba5611ab3c68d38743a",
|
|
||||||
"blk.2.attn_q.weight": "6a10b5c2fd423d1e4c4fd60fa8c154a0159b6b2501ea79cae2ef19f45a674e5e",
|
|
||||||
"blk.2.attn_v.weight": "3cf891945a1f8ae7cc908a5c6b729ff5b70f4436c5ffdbf245cc0ed4cc19cd1b",
|
|
||||||
"blk.2.ffn_down.weight": "ea204fd04e0d2fc728a9861a459216bbfec629c152004ba625f52cd8837bd51e",
|
|
||||||
"blk.2.ffn_gate.weight": "3a3518729f1b8b64a82b8792f33987db5418fdb094be0263c68f146a5c38de54",
|
|
||||||
"blk.2.ffn_norm.weight": "754ede678b725de41a34b82f0edf7688b5c065be7c0d46df6f7ad9430d986884",
|
|
||||||
"blk.2.ffn_up.weight": "ffdcb88439f5828ffbd9fc844b03ff91637b790b9838097258cc3ae75935720c",
|
|
||||||
"blk.2.post_attention_norm.weight": "4b3f53b7ba26e8c36b2dfda3b7e5fc4b1065257cefdea235fc7df9af130ac2fd",
|
|
||||||
"blk.2.post_ffw_norm.weight": "e550369e26b8485e2b54ad34b34bc98af5494287dcc513c2c39cf1eaa5b89d07",
|
|
||||||
"blk.3.attn_k.weight": "89f24ea450e37d9e95757651a83205c085d81b354ee9489dd6310a391d8409f3",
|
|
||||||
"blk.3.attn_norm.weight": "24e2ea662b7cb822b4ca5cd61bc17f2709f406d990ec3b4a0dac1cc112db45cf",
|
|
||||||
"blk.3.attn_output.weight": "ac4dad69473c6e3fac56669212cadd8c34ecc5973d945972e974d94805334967",
|
|
||||||
"blk.3.attn_q.weight": "b6a9c9a7d4722b9096631c65de62228dfddca6e26edfe6af7fce01e116ef0f4c",
|
|
||||||
"blk.3.attn_v.weight": "f272a960a40093942309bc342a379984cbacec2d7bc64428db3f64e6b1887ed4",
|
|
||||||
"blk.3.ffn_down.weight": "c0188ba50d8228805982029c277fc0e87aa57473b8363037c648f6d006ff828a",
|
|
||||||
"blk.3.ffn_gate.weight": "a04aec1561ee6c0fbb18c3db49dc62fb533619cf697fd548cbf2279761aaec3b",
|
|
||||||
"blk.3.ffn_norm.weight": "bc053837d44087ec05eb5d9458357b2a5be787789b19cdbbdc694b57697f99a6",
|
|
||||||
"blk.3.ffn_up.weight": "b3ce8b274f20796d3b1a7c08ba27a919066f9de89a782faa544c4a8d6bea1382",
|
|
||||||
"blk.3.post_attention_norm.weight": "9c922dee7a7df5667289e2788e60170238239cee2dfdbbd9e435763f9f416718",
|
|
||||||
"blk.3.post_ffw_norm.weight": "b682544ac953ad2e0b49027ed8916f2e9d1aba5d1587bb4127ac703570c7a03a",
|
|
||||||
"blk.4.attn_k.weight": "143b0cbb4b787b95c2b6212374410e32173ccef2adb914908a2f89a7916de512",
|
|
||||||
"blk.4.attn_norm.weight": "5668f60491b780273745192662d02c9a92a4f692b29d16aa0bbc7413fec4f85b",
|
|
||||||
"blk.4.attn_output.weight": "b9f2bdb68be1e0cf66dd19f8fa2afb105910ad2ef394864cb32cea8f8944e0d5",
|
|
||||||
"blk.4.attn_q.weight": "ddcf1343dafbc2dfcd0b8741225af22fe4b54b2becce29240bd01c34265d126c",
|
|
||||||
"blk.4.attn_v.weight": "6dc7074366e7ed52d9f48c594dcc85bef738e096276cb99d28228c89eecc5b9c",
|
|
||||||
"blk.4.ffn_down.weight": "30334ffc59ce343cf2a1b973174acb7722823463adc07e19a99bd0f404bc9906",
|
|
||||||
"blk.4.ffn_gate.weight": "890f7c8af208d63b28db52c4b8c16c2288a382d87ff5a6a6d6b0a5b3bf27e6cd",
|
|
||||||
"blk.4.ffn_norm.weight": "ff0316cc7847221eb86a90c1ab441d4ee61553d410c66414a7755021b3b12448",
|
|
||||||
"blk.4.ffn_up.weight": "6af97d113f91564c636734f215e25ee602d48eb045458f300b3ec7582be0f41d",
|
|
||||||
"blk.4.post_attention_norm.weight": "69438f231e105e68216b078bdeb35a7cdc8b12c4e2845e18ecf4c8d361d6a321",
|
|
||||||
"blk.4.post_ffw_norm.weight": "0fd535da78bcf2b32c95b05b2b83dc49817393765be90d8cc1ed3d56f47b68ec",
|
|
||||||
"blk.5.attn_k.weight": "0166eb3c6d20dcf3d3c169e94caa8dee057535bb525e29f698fb6f8844f18a6c",
|
|
||||||
"blk.5.attn_norm.weight": "a7808f27f164023d5cde2be00fc23cac6c71aa0ddeb60bc23e12411b80087672",
|
|
||||||
"blk.5.attn_output.weight": "8b65b2027a0842b68c5308f91d6a31de9599d794157d77df8418b19f9e0d9334",
|
|
||||||
"blk.5.attn_q.weight": "966bc626ef2c2394d872087a41c126bb1b67d1d5f6de920204ef5e5b16c34003",
|
|
||||||
"blk.5.attn_v.weight": "9a362aef3f4437fbf0ef6e1ba785f3329c3db2960f93fe36547d2795e9c254ea",
|
|
||||||
"blk.5.ffn_down.weight": "63e53541d34197720c06f297aa8142ac6b6eec002c7987b296f26e8b1400f931",
|
|
||||||
"blk.5.ffn_gate.weight": "d9591fdd32f783e0fc26e20d5d587ee8971ac8ae2e4c818c6eac1c125c7c7f37",
|
|
||||||
"blk.5.ffn_norm.weight": "677334cc60ecce3a7f4ab3acda15d359353d7358872f614ad8914e3780e9fc6e",
|
|
||||||
"blk.5.ffn_up.weight": "a63764110e1c655ffbd55af0669b2dfe4cc29d0e198d33a8e5426461b08a85f7",
|
|
||||||
"blk.5.post_attention_norm.weight": "c55499f859b2c0a7f5cabceaae47309a5ad38bc29d0f4a8db81f1357023162a9",
|
|
||||||
"blk.5.post_ffw_norm.weight": "82752754665f842418f3e302cb5f43d1e0504dcd124c4b8ddb77018b2c793837",
|
|
||||||
"blk.6.attn_k.weight": "e20a5f0d6c807273c8d491439566b428497ac02097cf0aa55e33748c28e14be6",
|
|
||||||
"blk.6.attn_norm.weight": "2c6ba42fd3c73d72073ced03a32dd28d70a89ed9bbbc8fea1ba03a7ade951e6c",
|
|
||||||
"blk.6.attn_output.weight": "4de7c5c2f4a133a266e17ed8c14c52959466b54cc7ab9e19f789a33b4850f284",
|
|
||||||
"blk.6.attn_q.weight": "56462d921800e6b8cd2213fef04c4ff16d728905cb2f4c58e966d0a053a3b0ae",
|
|
||||||
"blk.6.attn_v.weight": "b758dcbff769d6240c2245ede1dbc62c4170a67c77458e866312589220fe29af",
|
|
||||||
"blk.6.ffn_down.weight": "582247fb3c2bf687cbe9413fe18d18ad47bef4b65df7d78905e10335c6134764",
|
|
||||||
"blk.6.ffn_gate.weight": "3035444d5286aefb7a6d04e55bc27e1fac7cf895cd5be02319a431b8e047b4ae",
|
|
||||||
"blk.6.ffn_norm.weight": "e582d24c66e01b96faa20ce6adfda3d8583b11e809bff89969927398175e369a",
|
|
||||||
"blk.6.ffn_up.weight": "6f4b7bbfedeacf61a4866ae0616c4ba6c9e856662e8f00ae6aaec7f52c53e7b4",
|
|
||||||
"blk.6.post_attention_norm.weight": "8fe51b50bd677d21586aecab0b565c4bf9fa68ad50bfe366f45e8fea3c657ca8",
|
|
||||||
"blk.6.post_ffw_norm.weight": "81ba3cb4c2bf5c546b86855b7a885d3fafededc67eb3a35cd3598b03c9e26e65",
|
|
||||||
"blk.7.attn_k.weight": "2e044179cdcae0946708c86bfea7aa0391e1f7e2a09b33fca035d384cc3ca758",
|
|
||||||
"blk.7.attn_norm.weight": "94b48c546b046803c60e75a3acb17a356b710735989938021b565f68df9b4985",
|
|
||||||
"blk.7.attn_output.weight": "65709b4ad7a581f4d75793d39d4032a359f6bcc0c3835205242a0b99e5b66824",
|
|
||||||
"blk.7.attn_q.weight": "8ded993c95d1f7caf201ceb6fa035cd6ed6d351b50b999fa9355dfee9486cb5b",
|
|
||||||
"blk.7.attn_v.weight": "c92d5e2d2d48397542bc03bea25bf39154075e66c5bb1ead85188505aa04ae91",
|
|
||||||
"blk.7.ffn_down.weight": "e8ba8fb57208805ef1dc23cd7c86e9a2d1fb7c52c3940d292cd5bb2eb24b3fac",
|
|
||||||
"blk.7.ffn_gate.weight": "f0f06d6a2e06c5ac252083bc61d05c814e6289d3f4e4a87d2f06918254c02c36",
|
|
||||||
"blk.7.ffn_norm.weight": "ebf8ef775f72624148e09d68a4332187a7a5020c521fe0623da1cd3485ad33e0",
|
|
||||||
"blk.7.ffn_up.weight": "a554adc4fc7122c247c77670e169916ba1794c787b5be30a2b36705138f1f746",
|
|
||||||
"blk.7.post_attention_norm.weight": "3aa6bc21d85c3a0c12b964e82b12feaedfdd13130c3cd2229228e24e0967ebdf",
|
|
||||||
"blk.7.post_ffw_norm.weight": "508bc7b19ee8ff08f0007c890133a462fc57c7e72b16ee8f6dd64def264ef876",
|
|
||||||
"blk.8.attn_k.weight": "363c8e74056642fe9e7c2f3f9769d57319cd3fa0a6022810189ab8d894322885",
|
|
||||||
"blk.8.attn_norm.weight": "685b49a1f1acb169f4df0bdd8e3de6943f3033cebad14b898a72000595610d92",
|
|
||||||
"blk.8.attn_output.weight": "7bde571e4efef1c6a6143f0526721dfb59e0a0ea0e1a3616a322b2eb937efa48",
|
|
||||||
"blk.8.attn_q.weight": "fc993dbc1074c28a0e1d85e5ab2f4ea6a9c6c1affe7ee56027000a275daed9b6",
|
|
||||||
"blk.8.attn_v.weight": "281e8791d3aef9b3864f1cb054da0ae0c2fef4ce0a58b1bad8bc136b2fa0f62b",
|
|
||||||
"blk.8.ffn_down.weight": "b1164a2578a7f87ed99c2bbc76c5dfbbbc6a1a803605391acc3f320fc989ffd7",
|
|
||||||
"blk.8.ffn_gate.weight": "6b39a3b3aaaa79aee61416b54d62160b9258042650e61c6b47bc77c2dd17daf3",
|
|
||||||
"blk.8.ffn_norm.weight": "17ea1362c72da27f12bc936500492035bdef3fd8f940cb12b57f37d42ba8ecb1",
|
|
||||||
"blk.8.ffn_up.weight": "bc3a7c47afc440d2bdf8fbe9ddf2c9220467472c60c8b4ded8c0f181470ec96c",
|
|
||||||
"blk.8.post_attention_norm.weight": "5c506204e00411ef9c8b4134d40eedcc19fffe68dd0af7d7cc49dcabf2dfac7e",
|
|
||||||
"blk.8.post_ffw_norm.weight": "002faec235c3678864e2901eed275ce4e9dc229164a91c9cd4c965142ba62305",
|
|
||||||
"blk.9.attn_k.weight": "0bab39d8c237f1b6d0010db40467142625a9e6f2e0e4c49a56c12b41e4e0b1fa",
|
|
||||||
"blk.9.attn_norm.weight": "de5f38e873b17f07aa7598831b89cc1cae2c9bc3eb2e042ee9af059d2563e84e",
|
|
||||||
"blk.9.attn_output.weight": "8a8184702c25a62df9ff309c0c7badc8587208523b2be3e8fa90ce7080573e6f",
|
|
||||||
"blk.9.attn_q.weight": "7c961b2431b09ddf95377acd07201cb91bf13d9cd3ae0f2c25c7d6a0358d9f50",
|
|
||||||
"blk.9.attn_v.weight": "e22d240cb4743067033e659cbf210ebe2ebbab3e1dea6ccbe5eaa982382ca038",
|
|
||||||
"blk.9.ffn_down.weight": "a426f81210f03d6ad53277416e1fdcdf37d8065e4817613edaf6c67a343426be",
|
|
||||||
"blk.9.ffn_gate.weight": "a82eba825cb77b8e64f85ff99ede2fc71bc9b01751eeb17e9e6c246ee12ea62e",
|
|
||||||
"blk.9.ffn_norm.weight": "1a97f9b1302a3a326d534c5c3fed2db6db0ae45fd0edd381a3e4fc1c75d81030",
|
|
||||||
"blk.9.ffn_up.weight": "5f20bac2bbf03bb42adb92fbf99561651e1edda57e0b61935ac7f6c08c0ed7cb",
|
|
||||||
"blk.9.post_attention_norm.weight": "9f9866d13988e1946b1e1c80d9374a92a6e3be33748f8eaed3e126d1e1a4c796",
|
|
||||||
"blk.9.post_ffw_norm.weight": "a6896dbf698db4dbbe5dbf12417d4fd80e9cad0c539c858892ec0aa5b046bb58",
|
|
||||||
"blk.10.attn_k.weight": "ca8446e5d21ecd4e6a70dca8d321be480be4fba94d70cba065205436feb44270",
|
|
||||||
"blk.10.attn_norm.weight": "4f41fe290e8f21f63b82151b6cce94bf7318d121468816b0c58af0ff7c1658ab",
|
|
||||||
"blk.10.attn_output.weight": "c626d2e9681c5c941bbde43dddfae1a8d4986bf2be4470857bc8e8bd7f869044",
|
|
||||||
"blk.10.attn_q.weight": "1e61b210a13a429977325cf15d781ab77d604cfa862f4270329cbd94237d5835",
|
|
||||||
"blk.10.attn_v.weight": "8ff8d3e3f058ec3b35ada1057f2ed59c06494d0e0be6a8dc3ff9edf9f0e1a115",
|
|
||||||
"blk.10.ffn_down.weight": "bcebc04219f8081a5f483e58103c0ddbbbc631a0a54fd6dd9d55778e041f70ee",
|
|
||||||
"blk.10.ffn_gate.weight": "7a23a1e620ef871384ddf9611ccdcfb893fbf013cc203ac8e72f745420f1eea0",
|
|
||||||
"blk.10.ffn_norm.weight": "e3a375e43c349a1c6c66c22328e513cc1af3137fe839e43dc8e9be2f65914fd7",
|
|
||||||
"blk.10.ffn_up.weight": "5d182e7c94369194fca5f19cbbe668a999911e57f3d363bc7fb6088428700cb9",
|
|
||||||
"blk.10.post_attention_norm.weight": "b841c6308296e8984f3c5f549c6e3a242f4b3e19141e1f54cc08de9c46759c09",
|
|
||||||
"blk.10.post_ffw_norm.weight": "9d66fa05b5c940208f634f5053d809094c99a2a10a1d1e8847c8281fbd99fb49",
|
|
||||||
"blk.11.attn_k.weight": "14adf24ebb2bb17b336ca81cec3e690fd854782f4440ca6c66cc1d7e7bf1c850",
|
|
||||||
"blk.11.attn_norm.weight": "2d2213f311f50414702b5b34f22aafb9d9a0b6787243e7578562583dc40ad195",
|
|
||||||
"blk.11.attn_output.weight": "de1f14cc2a7fff00cf11b229f0576999205f17b9536e97abc9d6de3cc79a7884",
|
|
||||||
"blk.11.attn_q.weight": "2bcc5c147524003109ece0be08b89ac8b25baa71416ffa76573c6c052ffc6eea",
|
|
||||||
"blk.11.attn_v.weight": "2e6ab8573070c22dc1e0d7aebe4d52123226dacf7822dcce06fadbb38fb036a4",
|
|
||||||
"blk.11.ffn_down.weight": "1b86902f4e36868421e5228b9445051f8290b292df22a6d1af836dcecc1f25c3",
|
|
||||||
"blk.11.ffn_gate.weight": "e756e8081bd0a16aea4a9ef5076ad102113524f7a3d50a3a77aaa7f7938b63e8",
|
|
||||||
"blk.11.ffn_norm.weight": "6913887267be227cf9d1991a3dd8db2e7e74bb9b5fbdfcb9ac954fd7d7b95b3b",
|
|
||||||
"blk.11.ffn_up.weight": "619a3ac0609ebdf42c3fb2b6e4b1db48df79e6dd8418d7ab8f1bbff13d8a6a50",
|
|
||||||
"blk.11.post_attention_norm.weight": "e4b4ba92cef7b6a78407e8ab1b0307d47dac6c3df7b6817e28038317ff662d7e",
|
|
||||||
"blk.11.post_ffw_norm.weight": "40aceeec58cb855f0c158c9cc217168fcd5d0e735567d587217b1d78df17bc5f",
|
|
||||||
"blk.12.attn_k.weight": "c54c5a4d4892522022d1aa2204cfc624f0b4042caa536e678967316293fe5cb1",
|
|
||||||
"blk.12.attn_norm.weight": "7cd2ef58298569ffdf244d9b390f3917245276c8206e5780af5f96d8c0bbb446",
|
|
||||||
"blk.12.attn_output.weight": "85495ef9cc8b3deb21f741bde463ff6493acae2be51f02ecdeef952cbdec3375",
|
|
||||||
"blk.12.attn_q.weight": "d19383f83fd119bfb8c0280c9515705c11d8e7d502019fcf8f49efeef0d106d0",
|
|
||||||
"blk.12.attn_v.weight": "869ac669ba49531d9128892a0e27cef15de508ff40cdf80cc1681dde50d09204",
|
|
||||||
"blk.12.ffn_down.weight": "578f39f8f9fc2f09138afc884a952d7cc3a9a31de4216acd10e88e19e0b75f8c",
|
|
||||||
"blk.12.ffn_gate.weight": "e29a0186bc6c4a0720246306e922d3a83f777dadcf4ac80bad468287031cc8b5",
|
|
||||||
"blk.12.ffn_norm.weight": "e1ee95c6584b5cb57fcf1db8ce2bcc03aff91eb389238c094a61c00dde93d1f2",
|
|
||||||
"blk.12.ffn_up.weight": "2a826f06d7cdfb3edc6ae250ff44363ef77a2a9cdf96313e23a331b99ebfa17d",
|
|
||||||
"blk.12.post_attention_norm.weight": "4bafc7699b948d5cbc0d3e09b418b06c6abc4651a61ada9609d9a2f21c7e5607",
|
|
||||||
"blk.12.post_ffw_norm.weight": "bbb8c34a7176bb1a49f9fe2bacca0bd26b673d52c0835b2e90fa11f2962f077f",
|
|
||||||
"blk.13.attn_k.weight": "ffeefccfe8255d1b694382012ff4134eee5fec9d9491c8d0ff0a13832d1a37e8",
|
|
||||||
"blk.13.attn_norm.weight": "35713726529e3887c4135a88e86e8a4d7270ba5b9f2d1ab462622fbf40a7cdce",
|
|
||||||
"blk.13.attn_output.weight": "0d60b7c5cd71190a9ef4b873b0f516be15447c32d83914db2794b14592b0b460",
|
|
||||||
"blk.13.attn_q.weight": "8296069e65bef794cefc61257fc65789b3cb22955e30f3df129205e5041b2222",
|
|
||||||
"blk.13.attn_v.weight": "ca0f4ab9d16a748fc643a5c0c7a19826a811bf2a4e7316a8c935d4bf0ce8abc6",
|
|
||||||
"blk.13.ffn_down.weight": "d5514e0c8e7b3ed1cbcc1605eb5be1733b6ab3514cf8a0508fc72f7d05ed8bcb",
|
|
||||||
"blk.13.ffn_gate.weight": "8108e517a82e08a3aefbbd267bfa50a1668f92a76273280ce8a6bc1f6dd61521",
|
|
||||||
"blk.13.ffn_norm.weight": "5fcb6132d2134bf1f835b904a99820fa501dbc57d2224129f7098bf3cabc1d36",
|
|
||||||
"blk.13.ffn_up.weight": "6d744b7cd390a3cae3aa350dd379b81246acd056a2259996b6aaadece8465ccc",
|
|
||||||
"blk.13.post_attention_norm.weight": "e08b14698912509790e9575b8676971fbb0a4d82d719367e3756c0d0c4ab8cc0",
|
|
||||||
"blk.13.post_ffw_norm.weight": "2b196e4450fc5f1e7367b2cf7fe33a15fe919fbcdd861d11002346f16e980535",
|
|
||||||
"blk.14.attn_k.weight": "120e5f48d7268dfd9ab5f4bc9cc57a7cec63ea9635f56b80d435eb22936e9483",
|
|
||||||
"blk.14.attn_norm.weight": "146367bcce4db72cc894419a2e0145a6f533507dd68e4739c10ee480308c401f",
|
|
||||||
"blk.14.attn_output.weight": "720fa0165e756876c5cb6ad9e2780dd910390933f3f8849e5add5da04266650b",
|
|
||||||
"blk.14.attn_q.weight": "f5183466f56219ca1aca52d8b82c2d966a4198fea40fdd6b39f4d8b06ca2a6dd",
|
|
||||||
"blk.14.attn_v.weight": "24f8ea3d5512cd37c43c8329cb0da0c90d1895aef763ac2dcee3fe5157ec50a2",
|
|
||||||
"blk.14.ffn_down.weight": "e29960965b384ae5ab3d898a4dbaa8fddd28fa0e477ac28bcac49dec12a5ac67",
|
|
||||||
"blk.14.ffn_gate.weight": "6d0d6a74bfe9692e8f8eedff0fc34fc4fa1c8687794f35f2e2b033ab2d7510b8",
|
|
||||||
"blk.14.ffn_norm.weight": "f7036c1a9a71e046c9d2af16e9218fda5dbb0f7241ab44747abed1f0f9d602ca",
|
|
||||||
"blk.14.ffn_up.weight": "7d69ea1424007ffc9c12247dd0308c616e93ac02a59ec341cfa48f92d6ce3b10",
|
|
||||||
"blk.14.post_attention_norm.weight": "65b9712834d9445d4236bec362f3fb795c20d60c541b3dc6dbb7914d9b493e41",
|
|
||||||
"blk.14.post_ffw_norm.weight": "9c6a8da2e4e437d5cfdf3b9097e9f8b64bf07946a048badec20f4d374613f38f",
|
|
||||||
"blk.15.attn_k.weight": "864bc618303a0e4ee67fb1d5e751de61e936cd51e96669dd86f8cd08f2305045",
|
|
||||||
"blk.15.attn_norm.weight": "f9f4187da6eeadc2fc5921d8fe669741697d16c13d71e4aaeb73b82f50dc577e",
|
|
||||||
"blk.15.attn_output.weight": "ce2419a0b097036b2a31f2f4ad731d5814bcc2ef4c511786e24471e5eefd273b",
|
|
||||||
"blk.15.attn_q.weight": "9539db5a970d11ebe99722d1e13fcd635e250033630811efe583d2f97778e4a9",
|
|
||||||
"blk.15.attn_v.weight": "1c834b48ccd88adaeabb7d8bcb6be0bcd6d5ac1354ce88fc28f19a1a96b81ab3",
|
|
||||||
"blk.15.ffn_down.weight": "bc1f97a65dde6fa2c1e5397afb612266944b343f2eaa868b635ddd25829f8a42",
|
|
||||||
"blk.15.ffn_gate.weight": "1b14529d57056b79037f6cb5008132e62cc35992353b38dda59572274623103b",
|
|
||||||
"blk.15.ffn_norm.weight": "9af77458de9ee55c66f93865759f9c2c398557f94f3fa8fa6af30543d7339cde",
|
|
||||||
"blk.15.ffn_up.weight": "41d524a26b61a9595816b4fd53cf57ef50a702e4ef32933ff6136dca9136a267",
|
|
||||||
"blk.15.post_attention_norm.weight": "c60a03cd0e63a7db5c80015e58e9b97ba2208caa19f66a6fef5c4447eca900ce",
|
|
||||||
"blk.15.post_ffw_norm.weight": "34f7f9f96769215bbc3d17084df091864aef96a6645b7d0b3b7d9bd92f1a4b0b",
|
|
||||||
"blk.16.attn_k.weight": "7e27240d9f3a8c6cf0f4a980113d43234f514eadc3e3e1792b86efb29ffb1a6d",
|
|
||||||
"blk.16.attn_norm.weight": "af798acc0899282a30448edec48223b3e8efda177090273e612d8eca5e377301",
|
|
||||||
"blk.16.attn_output.weight": "79df39a3709d3d53e84146291e0944a7a653d06705293d9ccb5648dceadb432c",
|
|
||||||
"blk.16.attn_q.weight": "db58a1c3b83ad294804e5fd7321005719e200659173466df5a52a182b80b7165",
|
|
||||||
"blk.16.attn_v.weight": "2af6d48cbaeb225b5c1a704f76abd89c8ab1521417695b112b4dcc2cbd39b74d",
|
|
||||||
"blk.16.ffn_down.weight": "fc1c813eb5e7da3d6194569d6cb21602fc6eff2dc8e1b0eb753f2d5df148189c",
|
|
||||||
"blk.16.ffn_gate.weight": "7a80bcbc42464bd55df4814a6edbd7b5c153e0428323bbe49de55e2d2add33e7",
|
|
||||||
"blk.16.ffn_norm.weight": "2041685ee926d30f3f2ae4ec35b5688f1cd834167a6359a7d4057eac804c58b2",
|
|
||||||
"blk.16.ffn_up.weight": "8da4b718973ac1d43b928829bc45e062fd101984d6c98dd825bd7c5d08ebfbe3",
|
|
||||||
"blk.16.post_attention_norm.weight": "975c48fe680a6167438a106140a8872eee7765191f152d80e3b8ddf47693e095",
|
|
||||||
"blk.16.post_ffw_norm.weight": "4de2d4d483acfe4fc77860ea929025df2f4e15c10729413f36a18c94eaa6d689",
|
|
||||||
"blk.17.attn_k.weight": "f937e61f0af8c4cd98ee742648eb60e02e579683e21d421071295a3b70aebaad",
|
|
||||||
"blk.17.attn_norm.weight": "c3270583ed28b7e423f5b170c59113234f258169b93a867d9274f4c10b7cb115",
|
|
||||||
"blk.17.attn_output.weight": "b8c1150e81e685e539a5dcf2c19047a24eba2b281fabe166674b1d71ef4612ea",
|
|
||||||
"blk.17.attn_q.weight": "c255100ae2011e7dc7e3bf3bc3ccd96d859fbb98581cae993d7b82c1ba8e8b39",
|
|
||||||
"blk.17.attn_v.weight": "5830bb0a555984c6485348067f70b5d22ae337c011aa9248dac2ff4c95944551",
|
|
||||||
"blk.17.ffn_down.weight": "8ff9a7cccaa3776434a9d895aae4fb5c36c736bf2ec98784226b4c234940fbb0",
|
|
||||||
"blk.17.ffn_gate.weight": "1b52876739712831c272911533da206f407b46034a1a4ae8a88c1f96b6bd5747",
|
|
||||||
"blk.17.ffn_norm.weight": "d0e16ba5e87c91b545334e022058c7d03849665c3b1a6298771b656531366b66",
|
|
||||||
"blk.17.ffn_up.weight": "4dd6211d01dbebbe21052708eddc242b082a58b5f18ed16479e17987c1d3432e",
|
|
||||||
"blk.17.post_attention_norm.weight": "6f49c775c7417dade77ba8268a0f8441c1e5ec28b5d7e4dc5ed07a04d04600c8",
|
|
||||||
"blk.17.post_ffw_norm.weight": "b91a0bb2e6679e9c9be06ad323adae441d00a3d673efb19d7c4954be2aa84b27",
|
|
||||||
"blk.18.attn_k.weight": "22b565ace1b4da8b33865a58625be1d90beea9891f29686a69fa9cf7c93217db",
|
|
||||||
"blk.18.attn_norm.weight": "3e0160d7063c8753de65d2356a66648e47d921efdc5c917efb8209892120f8db",
|
|
||||||
"blk.18.attn_output.weight": "e3180f0bb4ca90b31e9b08158db38e332de62dfbaefe34aa94cc316409331e09",
|
|
||||||
"blk.18.attn_q.weight": "f3a5a83614c3ba7ea41cdd5b1b0819a241ee2a951a381ce4a9e001d3f700ed8f",
|
|
||||||
"blk.18.attn_v.weight": "f3350a5984fb951fc738adcf78147e6d812ff1c576670c460cafc99c253c1654",
|
|
||||||
"blk.18.ffn_down.weight": "9e9d09b13a33525e14bdaee6efc65c551ac7cf7680e534b940ab122a3a7c1ac9",
|
|
||||||
"blk.18.ffn_gate.weight": "ebaec8b4b578a2e8d815baac12f1675c208f80c68074d5a18288a2e1a60680ee",
|
|
||||||
"blk.18.ffn_norm.weight": "33e7687c53a242f2f8dc7093a491c97b18d4a5a8c14d183f02bd586a770f05aa",
|
|
||||||
"blk.18.ffn_up.weight": "78a1816662378ce56cc870e705174492781897b3afd2d4d97a51f10f2f2987c1",
|
|
||||||
"blk.18.post_attention_norm.weight": "a58dde3f12df3e94cbc27d87c8ea86f89af8a388a506446ff6758f05399b05fc",
|
|
||||||
"blk.18.post_ffw_norm.weight": "cebf90cc143577d483cca27b032dfd82031ee59bdf17c0e2cf60a0a3ad5bf996",
|
|
||||||
"blk.19.attn_k.weight": "4683375d0599ac9e2232196aae1e90af13a14cae26e865465de5c8e257bb2055",
|
|
||||||
"blk.19.attn_norm.weight": "f3eba936bfb1814bbcb0a1d62739eb66daac839df8c9c836fe0e94860df88525",
|
|
||||||
"blk.19.attn_output.weight": "51c0f01d38a9dcfe9bdbc4643576fab164c1d9e4b7168b7695c0ee55e6965667",
|
|
||||||
"blk.19.attn_q.weight": "28d15b69b8416f2e7ddc88fe381cb1e2ef2ad705fb1c268139ba96498cc74848",
|
|
||||||
"blk.19.attn_v.weight": "6860f1cd720638e63a981fa2c0b4db900129826bcb9823c9ddf9fb8b1b9f3383",
|
|
||||||
"blk.19.ffn_down.weight": "bc7f2d7827ee01c2dd41401c7b3b1700ad3a4ff620e8bb734f92630d342dcc7f",
|
|
||||||
"blk.19.ffn_gate.weight": "54d03ef69ba373fc410fbca8f1e34a565d58e4296d9a035ff7e48340b9c848e7",
|
|
||||||
"blk.19.ffn_norm.weight": "9178fc796a340ee6e8128ca74c0cb6203d1adbed6927af4e5ac7863da57affc7",
|
|
||||||
"blk.19.ffn_up.weight": "a77bd708026c6e83ad5c79c223278e74621bcf74a9641c7818d96b595daaad20",
|
|
||||||
"blk.19.post_attention_norm.weight": "ae94aa26f4c411bf9496a6fd4a6df64ee589ee1ae9a04b531d45acc95721e582",
|
|
||||||
"blk.19.post_ffw_norm.weight": "9ad210700edeef12133bdcff04bf1c7f62b49f6f4a9ba483c7cdc59857c24a5c",
|
|
||||||
"blk.20.attn_k.weight": "e35bce1e9f4a7a09ef34721f57ea38cfca68c272f52d923fe50af8308f66cfaa",
|
|
||||||
"blk.20.attn_norm.weight": "644800f6926fd34f233795c4dec1151a295d2138ca8cac33e3e48167d26f8b41",
|
|
||||||
"blk.20.attn_output.weight": "8d3758cd236471741e1ad66c0710cb79077dc8c7a3a292d35bc551c0c5abe627",
|
|
||||||
"blk.20.attn_q.weight": "c333b1f0f6f956b5d73891df10b1a0321e55fc31c40d623a24e1f52caa6a998b",
|
|
||||||
"blk.20.attn_v.weight": "8562b418d0c4868a050fb19fa3fcaf50a8cf1c669f537d666c80c7b3a04714e1",
|
|
||||||
"blk.20.ffn_down.weight": "97efb608ac44cc804198faec3ee66eafe56ced6b7ca5359700c6f1df75b7205e",
|
|
||||||
"blk.20.ffn_gate.weight": "5c61151d86f28415c73c73d90ec088c646cbe5c1640197caf58eb501ba7db293",
|
|
||||||
"blk.20.ffn_norm.weight": "24bbe0a701afd4bbeea65b3edde712b3cbb2281043bbc43dbf250582453116ed",
|
|
||||||
"blk.20.ffn_up.weight": "e170cf68e249566aa99eb6f6b265679bf9a5a6b76830ba24e7e130c2515910c4",
|
|
||||||
"blk.20.post_attention_norm.weight": "e092d751cfe20dbf2d348358f3b38397bd83e4ed94d6bbaa6bbaddcd902b2ac4",
|
|
||||||
"blk.20.post_ffw_norm.weight": "219a18a47dcba76e669e4322223a5a9227bd3db1de3fbd3d3cfb22e54a783c5a",
|
|
||||||
"blk.21.attn_k.weight": "c3a095ebddb42c63824f1c98da65263dc88e4d790a26aa1632840b44f5cc7cb1",
|
|
||||||
"blk.21.attn_norm.weight": "ef8bbaded5fbc45ad9cf3985ae02174524e7090fe6362811124f942ef643bec7",
|
|
||||||
"blk.21.attn_output.weight": "668f018aba72baac6252aa3ad58569ddd55ab751a0dd8d7bcc9fb9b6efb4bf53",
|
|
||||||
"blk.21.attn_q.weight": "e759c65663089f3bbbd51847934c185e680c82f1249065d5d487da638e519e6d",
|
|
||||||
"blk.21.attn_v.weight": "2ff57762686cf9ba1f5a6be76503454b97556ce67f4ac98254bd0562231197ba",
|
|
||||||
"blk.21.ffn_down.weight": "3fd106556fb721b1c28ae3f4026bc83eb1b08ed910f2ba5f466c6b5f327d91cb",
|
|
||||||
"blk.21.ffn_gate.weight": "338022d882f4b6619e8054a6fb909696fa3eef3013cf69b65c3cacdfc5b9e42c",
|
|
||||||
"blk.21.ffn_norm.weight": "1e77660c23a3f9653ee721a863d1960f773d87437cabc4dc0a6e17ee3d4e5e44",
|
|
||||||
"blk.21.ffn_up.weight": "7d31b20fbc2e6eba8f350f170069dc36f0cb12f68fbc4206ec5022a74085ebcb",
|
|
||||||
"blk.21.post_attention_norm.weight": "9638bae8d8bdcd7ed68da282979cd84a07c41ff9cabcaea94ebc846a1803db23",
|
|
||||||
"blk.21.post_ffw_norm.weight": "d622ef11115fe0cbe04b727d5a3b6371e7f39bf08c8d5eb9bc6da52e3f3cfb9d",
|
|
||||||
"blk.22.attn_k.weight": "5c321cb29deffbe57de200dd206a62005f1e80acb86c4fd2349dd44c8d3594fd",
|
|
||||||
"blk.22.attn_norm.weight": "198d949705d7170a331d75889d8c7500c3635254dac2cc6aa4dc35d556584536",
|
|
||||||
"blk.22.attn_output.weight": "19805cd5d7025b457e5d41d70db8b3fd63c2dd0e4a94d3ef1704d50ef4e749e8",
|
|
||||||
"blk.22.attn_q.weight": "177836cd583fc87405975ddc21ebfebdaa090a0363799664c72caa3da851ae2c",
|
|
||||||
"blk.22.attn_v.weight": "fea255692483e30d0108f9e4e250eb3ed7dbda8d83f499b06519b8c223ae6096",
|
|
||||||
"blk.22.ffn_down.weight": "00cb8939f03e5817d6d412de8cf2c923c9568d5493e382cec7faf5718fb034eb",
|
|
||||||
"blk.22.ffn_gate.weight": "b0591065b91281b2fbd8a9567f3568d40479f680e1f0a29e27ae213f37642489",
|
|
||||||
"blk.22.ffn_norm.weight": "96b5c5d0737c2ceb8fc869f54adb9e5f46e28cb7b177c40f49fa926b923c00f8",
|
|
||||||
"blk.22.ffn_up.weight": "81f472185b24344ab0594ea8246cc6e200e0dc1cab4943e74fbe4ca19d5a9701",
|
|
||||||
"blk.22.post_attention_norm.weight": "27fa9aa6260aa3071e0391e1a1d49322dcb6e8072315b8a9b7064087108dbd06",
|
|
||||||
"blk.22.post_ffw_norm.weight": "f37e1dcd7f643d9545675ffe9dc527a11eba86eb204989c2f44f636b266d896a",
|
|
||||||
"blk.23.attn_k.weight": "5d82f36658a56c3f94d0bb2d61f65509c966fa6568f81812e0d3e338b380ef8c",
|
|
||||||
"blk.23.attn_norm.weight": "b7983f88d9cad88bc88a528923e6da592ad20e699965b223ebc10840fe1f4fec",
|
|
||||||
"blk.23.attn_output.weight": "59f97f80f430d71606aab0158a195aed29ccd3405e6c0a5c41c809be8eb01898",
|
|
||||||
"blk.23.attn_q.weight": "53ac4789fe958919cc02ea4222bcd64c0ea1b4baa54304bff46635bdf42f7490",
|
|
||||||
"blk.23.attn_v.weight": "ec8abe09b9e84dbb52c7a068094657c6d3c62fe551ba8d7c3a3f23da622e9756",
|
|
||||||
"blk.23.ffn_down.weight": "3cf547eccb1b82aa64f208cee9682d7f558ca84e0aead7d9d3d1420d90f3d992",
|
|
||||||
"blk.23.ffn_gate.weight": "366aa2486d911ba81eb519119e13807deacf7e9908bc1975a2a63e00d6b10124",
|
|
||||||
"blk.23.ffn_norm.weight": "6d1d4a4af34bb7dc090ac87d6457d398c3e0fb68bd2e2b60b099dc318b6cfac3",
|
|
||||||
"blk.23.ffn_up.weight": "53f76692e253f5d2420b3f200c731b9f3b7a83e379920b4a067c729b4674aa4d",
|
|
||||||
"blk.23.post_attention_norm.weight": "7c952fa0efa76b3f048c8c4c9e8dcb5e3724d231327eda6423a34d3f3d3367de",
|
|
||||||
"blk.23.post_ffw_norm.weight": "7ab188cfe61f0a91b40309a0ab6bfa99f19d0ff2a37b6ac10e5f0c7f44eb5270",
|
|
||||||
"blk.24.attn_k.weight": "225798792f9bfdd10eff0505ebe61e0aad0209c17b431f6044ee7968ffe8c198",
|
|
||||||
"blk.24.attn_norm.weight": "635e3c1ebf5219bbebfc40ef164bc32d2b726ef595a94da64ac524ae878e2915",
|
|
||||||
"blk.24.attn_output.weight": "482f5bb2db8d9ed22b253d9a3296333b239efe698e5992e5d77e7e12dc2a5cf5",
|
|
||||||
"blk.24.attn_q.weight": "43805bbccddb65d58fffc4be9b5c374d4e1df1395ec1e1ffb4bcff03e98d5adb",
|
|
||||||
"blk.24.attn_v.weight": "fa741af54b4a3b1775d32f59134756090c5df2e7345a12a2d8db94fe289667a7",
|
|
||||||
"blk.24.ffn_down.weight": "83c6351e3162626b276f524a57836144625c2556dbe321b57cbd8fd486a68fab",
|
|
||||||
"blk.24.ffn_gate.weight": "fbe66be0d84d12cea5176cc7eaef64382ffc7324cd9d6266a3342dc43442f2ac",
|
|
||||||
"blk.24.ffn_norm.weight": "77c1445a8639ad24938bdf0280233eea2362d47391421833dfa72ec756dfc1e8",
|
|
||||||
"blk.24.ffn_up.weight": "78235ac729ee23c1cf1ae543751e3af32776d8808cee6e529c2a625a1f027654",
|
|
||||||
"blk.24.post_attention_norm.weight": "161f71b6d07628d43e4ae51a4c9088ec6ca2db123a17986a14505d83fdd04dad",
|
|
||||||
"blk.24.post_ffw_norm.weight": "cf1ba692aa683368b02ac413e69b2521b98c69a5274eacbb54165b53bf38a8b2",
|
|
||||||
"blk.25.attn_k.weight": "057a56bd8c8d2b41608d1f71faa3052902152ddf85e47669ad950c1c3e77c33f",
|
|
||||||
"blk.25.attn_norm.weight": "b7179fe02c334da556ddcf6c1b502245639a728c4cbba8b552d8e1df4565ee9d",
|
|
||||||
"blk.25.attn_output.weight": "4fed8b05b08a0ff75ffd022701bbeb52f17b23d09332a1ddcba737244bd0d3b0",
|
|
||||||
"blk.25.attn_q.weight": "c52e99f5d38bf7538d6106a0bbf38ac6dc6296bca9a3f849afa384ea67b4af01",
|
|
||||||
"blk.25.attn_v.weight": "c49c23d8e1cfa6a8eb971eb69942204890c6d7d830dc8774c84b108a80598912",
|
|
||||||
"blk.25.ffn_down.weight": "c08d4dc8412b19fdc870c164b83c341b236ec6fe7bb4a9bcfe0dc100faa20286",
|
|
||||||
"blk.25.ffn_gate.weight": "1a4cb3f36735d59181721471452807903006539e5e1b5ceb4f72d1d7ae134127",
|
|
||||||
"blk.25.ffn_norm.weight": "8fd6bd0dcec5198761525a36992a57c9ec5e9da60a22092839a84ae8c4e87f26",
|
|
||||||
"blk.25.ffn_up.weight": "3a00f39bdd5f31dc5e3b281d2002e1ac4f2475d49a0ac1d7720a25b377dcd04a",
|
|
||||||
"blk.25.post_attention_norm.weight": "e5f31a648612c859b6d21c9ee426e87a86cb1973dfdd86276c767371d9cef5ad",
|
|
||||||
"blk.25.post_ffw_norm.weight": "553c3bd774922c99c2384380a142d019881d30dbf0fe3bf9430dabfb3f6cbd33",
|
|
||||||
"output_norm.weight": "49445c4585ab0a8135717a0bdb1cda4a062a030177d0119561d91542aec5744b"
|
|
||||||
}
|
|
|
@ -194,8 +194,6 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e
|
||||||
|
|
||||||
If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory.
|
If a different directory needs to be used, set the environment variable `OLLAMA_MODELS` to the chosen directory.
|
||||||
|
|
||||||
> Note: on Linux using the standard installer, the `ollama` user needs read and write access to the specified directory. To assign the directory to the `ollama` user run `sudo chown -R ollama:ollama <directory>`.
|
|
||||||
|
|
||||||
Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
|
Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
|
||||||
|
|
||||||
## How can I use Ollama in Visual Studio Code?
|
## How can I use Ollama in Visual Studio Code?
|
||||||
|
|
|
@ -10,7 +10,7 @@ Check your compute compatibility to see if your card is supported:
|
||||||
| 9.0 | NVIDIA | `H100` |
|
| 9.0 | NVIDIA | `H100` |
|
||||||
| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060` |
|
| 8.9 | GeForce RTX 40xx | `RTX 4090` `RTX 4080 SUPER` `RTX 4080` `RTX 4070 Ti SUPER` `RTX 4070 Ti` `RTX 4070 SUPER` `RTX 4070` `RTX 4060 Ti` `RTX 4060` |
|
||||||
| | NVIDIA Professional | `L4` `L40` `RTX 6000` |
|
| | NVIDIA Professional | `L4` `L40` `RTX 6000` |
|
||||||
| 8.6 | GeForce RTX 30xx | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` `RTX 3050 Ti` `RTX 3050` |
|
| 8.6 | GeForce RTX 30xx | `RTX 3090 Ti` `RTX 3090` `RTX 3080 Ti` `RTX 3080` `RTX 3070 Ti` `RTX 3070` `RTX 3060 Ti` `RTX 3060` |
|
||||||
| | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2` |
|
| | NVIDIA Professional | `A40` `RTX A6000` `RTX A5000` `RTX A4000` `RTX A3000` `RTX A2000` `A10` `A16` `A2` |
|
||||||
| 8.0 | NVIDIA | `A100` `A30` |
|
| 8.0 | NVIDIA | `A100` `A30` |
|
||||||
| 7.5 | GeForce GTX/RTX | `GTX 1650 Ti` `TITAN RTX` `RTX 2080 Ti` `RTX 2080` `RTX 2070` `RTX 2060` |
|
| 7.5 | GeForce GTX/RTX | `GTX 1650 Ti` `TITAN RTX` `RTX 2080 Ti` `RTX 2080` `RTX 2070` `RTX 2060` |
|
||||||
|
|
112
docs/linux.md
112
docs/linux.md
|
@ -1,59 +1,44 @@
|
||||||
# Linux
|
# Ollama on Linux
|
||||||
|
|
||||||
## Install
|
## Install
|
||||||
|
|
||||||
To install Ollama, run the following command:
|
Install Ollama by running this one-liner:
|
||||||
|
|
||||||
```shell
|
>
|
||||||
|
|
||||||
|
```bash
|
||||||
curl -fsSL https://ollama.com/install.sh | sh
|
curl -fsSL https://ollama.com/install.sh | sh
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## AMD Radeon GPU support
|
||||||
|
|
||||||
|
While AMD has contributed the `amdgpu` driver upstream to the official linux
|
||||||
|
kernel source, the version is older and may not support all ROCm features. We
|
||||||
|
recommend you install the latest driver from
|
||||||
|
https://www.amd.com/en/support/linux-drivers for best support of your Radeon
|
||||||
|
GPU.
|
||||||
|
|
||||||
## Manual install
|
## Manual install
|
||||||
|
|
||||||
Download and extract the package:
|
### Download `ollama`
|
||||||
|
|
||||||
```shell
|
Download and extract the Linux package:
|
||||||
curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
|
|
||||||
sudo tar -C /usr -xzf ollama-linux-amd64.tgz
|
```bash
|
||||||
|
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
|
||||||
```
|
```
|
||||||
|
|
||||||
Start Ollama:
|
If you have an AMD GPU, also download and extract the ROCm package into the same location:
|
||||||
|
```bash
|
||||||
```shell
|
curl -fsSL https://ollama.com/download/ollama-linux-amd64-rocm.tgz | sudo tar zx -C /usr
|
||||||
ollama serve
|
|
||||||
```
|
|
||||||
|
|
||||||
In another terminal, verify that Ollama is running:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
ollama -v
|
|
||||||
```
|
|
||||||
|
|
||||||
### AMD GPU install
|
|
||||||
|
|
||||||
If you have an AMD GPU, also download and extract the additional ROCm package:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
curl -L https://ollama.com/download/ollama-linux-amd64-rocm.tgz -o ollama-linux-amd64-rocm.tgz
|
|
||||||
sudo tar -C /usr -xzf ollama-linux-amd64-rocm.tgz
|
|
||||||
```
|
|
||||||
|
|
||||||
### ARM64 install
|
|
||||||
|
|
||||||
Download and extract the ARM64-specific package:
|
|
||||||
|
|
||||||
```shell
|
|
||||||
curl -L https://ollama.com/download/ollama-linux-arm64.tgz -o ollama-linux-arm64.tgz
|
|
||||||
sudo tar -C /usr -xzf ollama-linux-arm64.tgz
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Adding Ollama as a startup service (recommended)
|
### Adding Ollama as a startup service (recommended)
|
||||||
|
|
||||||
Create a user and group for Ollama:
|
Create a user for Ollama:
|
||||||
|
|
||||||
```shell
|
```bash
|
||||||
sudo useradd -r -s /bin/false -U -m -d /usr/share/ollama ollama
|
sudo useradd -r -s /bin/false -m -d /usr/share/ollama ollama
|
||||||
sudo usermod -a -G ollama $(whoami)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Create a service file in `/etc/systemd/system/ollama.service`:
|
Create a service file in `/etc/systemd/system/ollama.service`:
|
||||||
|
@ -69,7 +54,6 @@ User=ollama
|
||||||
Group=ollama
|
Group=ollama
|
||||||
Restart=always
|
Restart=always
|
||||||
RestartSec=3
|
RestartSec=3
|
||||||
Environment="PATH=$PATH"
|
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=default.target
|
WantedBy=default.target
|
||||||
|
@ -77,54 +61,46 @@ WantedBy=default.target
|
||||||
|
|
||||||
Then start the service:
|
Then start the service:
|
||||||
|
|
||||||
```shell
|
```bash
|
||||||
sudo systemctl daemon-reload
|
sudo systemctl daemon-reload
|
||||||
sudo systemctl enable ollama
|
sudo systemctl enable ollama
|
||||||
```
|
```
|
||||||
|
|
||||||
### Install CUDA drivers (optional)
|
### Install CUDA drivers (optional – for Nvidia GPUs)
|
||||||
|
|
||||||
[Download and install](https://developer.nvidia.com/cuda-downloads) CUDA.
|
[Download and install](https://developer.nvidia.com/cuda-downloads) CUDA.
|
||||||
|
|
||||||
Verify that the drivers are installed by running the following command, which should print details about your GPU:
|
Verify that the drivers are installed by running the following command, which should print details about your GPU:
|
||||||
|
|
||||||
```shell
|
```bash
|
||||||
nvidia-smi
|
nvidia-smi
|
||||||
```
|
```
|
||||||
|
|
||||||
### Install AMD ROCm drivers (optional)
|
### Install ROCm (optional - for Radeon GPUs)
|
||||||
|
[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html)
|
||||||
|
|
||||||
[Download and Install](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html) ROCm v6.
|
Make sure to install ROCm v6
|
||||||
|
|
||||||
### Start Ollama
|
### Start Ollama
|
||||||
|
|
||||||
Start Ollama and verify it is running:
|
Start Ollama using `systemd`:
|
||||||
|
|
||||||
```shell
|
```bash
|
||||||
sudo systemctl start ollama
|
sudo systemctl start ollama
|
||||||
sudo systemctl status ollama
|
|
||||||
```
|
```
|
||||||
|
|
||||||
> [!NOTE]
|
## Update
|
||||||
> While AMD has contributed the `amdgpu` driver upstream to the official linux
|
|
||||||
> kernel source, the version is older and may not support all ROCm features. We
|
|
||||||
> recommend you install the latest driver from
|
|
||||||
> https://www.amd.com/en/support/linux-drivers for best support of your Radeon
|
|
||||||
> GPU.
|
|
||||||
|
|
||||||
## Updating
|
Update Ollama by running the install script again:
|
||||||
|
|
||||||
Update Ollama by running the install script again:
|
```bash
|
||||||
|
|
||||||
```shell
|
|
||||||
curl -fsSL https://ollama.com/install.sh | sh
|
curl -fsSL https://ollama.com/install.sh | sh
|
||||||
```
|
```
|
||||||
|
|
||||||
Or by re-downloading Ollama:
|
Or by downloading the ollama binary:
|
||||||
|
|
||||||
```shell
|
```bash
|
||||||
curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
|
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
|
||||||
sudo tar -C /usr -xzf ollama-linux-amd64.tgz
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Installing specific versions
|
## Installing specific versions
|
||||||
|
@ -133,15 +109,15 @@ Use `OLLAMA_VERSION` environment variable with the install script to install a s
|
||||||
|
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
```shell
|
```
|
||||||
curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.3.9 sh
|
curl -fsSL https://ollama.com/install.sh | OLLAMA_VERSION=0.1.32 sh
|
||||||
```
|
```
|
||||||
|
|
||||||
## Viewing logs
|
## Viewing logs
|
||||||
|
|
||||||
To view logs of Ollama running as a startup service, run:
|
To view logs of Ollama running as a startup service, run:
|
||||||
|
|
||||||
```shell
|
```bash
|
||||||
journalctl -e -u ollama
|
journalctl -e -u ollama
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -149,7 +125,7 @@ journalctl -e -u ollama
|
||||||
|
|
||||||
Remove the ollama service:
|
Remove the ollama service:
|
||||||
|
|
||||||
```shell
|
```bash
|
||||||
sudo systemctl stop ollama
|
sudo systemctl stop ollama
|
||||||
sudo systemctl disable ollama
|
sudo systemctl disable ollama
|
||||||
sudo rm /etc/systemd/system/ollama.service
|
sudo rm /etc/systemd/system/ollama.service
|
||||||
|
@ -157,13 +133,13 @@ sudo rm /etc/systemd/system/ollama.service
|
||||||
|
|
||||||
Remove the ollama binary from your bin directory (either `/usr/local/bin`, `/usr/bin`, or `/bin`):
|
Remove the ollama binary from your bin directory (either `/usr/local/bin`, `/usr/bin`, or `/bin`):
|
||||||
|
|
||||||
```shell
|
```bash
|
||||||
sudo rm $(which ollama)
|
sudo rm $(which ollama)
|
||||||
```
|
```
|
||||||
|
|
||||||
Remove the downloaded models and Ollama service user and group:
|
Remove the downloaded models and Ollama service user and group:
|
||||||
|
|
||||||
```shell
|
```bash
|
||||||
sudo rm -r /usr/share/ollama
|
sudo rm -r /usr/share/ollama
|
||||||
sudo userdel ollama
|
sudo userdel ollama
|
||||||
sudo groupdel ollama
|
sudo groupdel ollama
|
||||||
|
|
|
@ -48,9 +48,6 @@ the explorer window by hitting `<cmd>+R` and type in:
|
||||||
- `explorer %HOMEPATH%\.ollama` contains models and configuration
|
- `explorer %HOMEPATH%\.ollama` contains models and configuration
|
||||||
- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
|
- `explorer %TEMP%` contains temporary executable files in one or more `ollama*` directories
|
||||||
|
|
||||||
## Uninstall
|
|
||||||
|
|
||||||
The Ollama Windows installer registers an Uninstaller application. Under `Add or remove programs` in Windows Settings, you can uninstall Ollama.
|
|
||||||
|
|
||||||
## Standalone CLI
|
## Standalone CLI
|
||||||
|
|
||||||
|
|
|
@ -112,26 +112,6 @@ func KeepAlive() (keepAlive time.Duration) {
|
||||||
return keepAlive
|
return keepAlive
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadTimeout returns the duration for stall detection during model loads. LoadTimeout can be configured via the OLLAMA_LOAD_TIMEOUT environment variable.
|
|
||||||
// Zero or Negative values are treated as infinite.
|
|
||||||
// Default is 5 minutes.
|
|
||||||
func LoadTimeout() (loadTimeout time.Duration) {
|
|
||||||
loadTimeout = 5 * time.Minute
|
|
||||||
if s := Var("OLLAMA_LOAD_TIMEOUT"); s != "" {
|
|
||||||
if d, err := time.ParseDuration(s); err == nil {
|
|
||||||
loadTimeout = d
|
|
||||||
} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
|
|
||||||
loadTimeout = time.Duration(n) * time.Second
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if loadTimeout <= 0 {
|
|
||||||
return time.Duration(math.MaxInt64)
|
|
||||||
}
|
|
||||||
|
|
||||||
return loadTimeout
|
|
||||||
}
|
|
||||||
|
|
||||||
func Bool(k string) func() bool {
|
func Bool(k string) func() bool {
|
||||||
return func() bool {
|
return func() bool {
|
||||||
if s := Var(k); s != "" {
|
if s := Var(k); s != "" {
|
||||||
|
@ -251,23 +231,6 @@ var (
|
||||||
MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
|
MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
|
||||||
)
|
)
|
||||||
|
|
||||||
func Uint64(key string, defaultValue uint64) func() uint64 {
|
|
||||||
return func() uint64 {
|
|
||||||
if s := Var(key); s != "" {
|
|
||||||
if n, err := strconv.ParseUint(s, 10, 64); err != nil {
|
|
||||||
slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
|
|
||||||
} else {
|
|
||||||
return n
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return defaultValue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set aside VRAM per GPU
|
|
||||||
var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
|
|
||||||
|
|
||||||
type EnvVar struct {
|
type EnvVar struct {
|
||||||
Name string
|
Name string
|
||||||
Value any
|
Value any
|
||||||
|
@ -278,11 +241,9 @@ func AsMap() map[string]EnvVar {
|
||||||
ret := map[string]EnvVar{
|
ret := map[string]EnvVar{
|
||||||
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
|
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
|
||||||
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
|
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
|
||||||
"OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
|
|
||||||
"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
|
"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
|
||||||
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
|
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
|
||||||
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
|
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
|
||||||
"OLLAMA_LOAD_TIMEOUT": {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"},
|
|
||||||
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
|
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
|
||||||
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
|
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
|
||||||
"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},
|
"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},
|
||||||
|
|
|
@ -215,40 +215,6 @@ func TestKeepAlive(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestLoadTimeout(t *testing.T) {
|
|
||||||
defaultTimeout := 5 * time.Minute
|
|
||||||
cases := map[string]time.Duration{
|
|
||||||
"": defaultTimeout,
|
|
||||||
"1s": time.Second,
|
|
||||||
"1m": time.Minute,
|
|
||||||
"1h": time.Hour,
|
|
||||||
"5m0s": defaultTimeout,
|
|
||||||
"1h2m3s": 1*time.Hour + 2*time.Minute + 3*time.Second,
|
|
||||||
"0": time.Duration(math.MaxInt64),
|
|
||||||
"60": 60 * time.Second,
|
|
||||||
"120": 2 * time.Minute,
|
|
||||||
"3600": time.Hour,
|
|
||||||
"-0": time.Duration(math.MaxInt64),
|
|
||||||
"-1": time.Duration(math.MaxInt64),
|
|
||||||
"-1m": time.Duration(math.MaxInt64),
|
|
||||||
// invalid values
|
|
||||||
" ": defaultTimeout,
|
|
||||||
"???": defaultTimeout,
|
|
||||||
"1d": defaultTimeout,
|
|
||||||
"1y": defaultTimeout,
|
|
||||||
"1w": defaultTimeout,
|
|
||||||
}
|
|
||||||
|
|
||||||
for tt, expect := range cases {
|
|
||||||
t.Run(tt, func(t *testing.T) {
|
|
||||||
t.Setenv("OLLAMA_LOAD_TIMEOUT", tt)
|
|
||||||
if actual := LoadTimeout(); actual != expect {
|
|
||||||
t.Errorf("%s: expected %s, got %s", tt, expect, actual)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestVar(t *testing.T) {
|
func TestVar(t *testing.T) {
|
||||||
cases := map[string]string{
|
cases := map[string]string{
|
||||||
"value": "value",
|
"value": "value",
|
||||||
|
|
|
@ -57,7 +57,7 @@ func cudaVariant(gpuInfo CudaGPUInfo) string {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 || (gpuInfo.DriverMajor == 12 && gpuInfo.DriverMinor == 0) {
|
if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 {
|
||||||
return "v11"
|
return "v11"
|
||||||
}
|
}
|
||||||
return "v12"
|
return "v12"
|
||||||
|
|
2
llm/ext_server/CMakeLists.txt
vendored
2
llm/ext_server/CMakeLists.txt
vendored
|
@ -2,7 +2,7 @@ set(TARGET ollama_llama_server)
|
||||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
||||||
set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
|
set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
|
||||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||||
add_executable(${TARGET} server.cpp utils.hpp httplib.h)
|
add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_compile_definitions(${TARGET} PRIVATE
|
target_compile_definitions(${TARGET} PRIVATE
|
||||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||||
|
|
24596
llm/ext_server/json.hpp
vendored
Normal file
24596
llm/ext_server/json.hpp
vendored
Normal file
File diff suppressed because it is too large
Load diff
25
llm/ext_server/server.cpp
vendored
25
llm/ext_server/server.cpp
vendored
|
@ -262,7 +262,7 @@ struct server_slot {
|
||||||
char buffer[512];
|
char buffer[512];
|
||||||
double t_token = t_prompt_processing / n_prompt_tokens_processed;
|
double t_token = t_prompt_processing / n_prompt_tokens_processed;
|
||||||
double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
|
double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
|
||||||
snprintf(buffer, sizeof(buffer), "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
|
sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
|
||||||
t_prompt_processing, n_prompt_tokens_processed,
|
t_prompt_processing, n_prompt_tokens_processed,
|
||||||
t_token, n_tokens_second);
|
t_token, n_tokens_second);
|
||||||
LOG_DEBUG(buffer, {
|
LOG_DEBUG(buffer, {
|
||||||
|
@ -276,7 +276,7 @@ struct server_slot {
|
||||||
|
|
||||||
t_token = t_token_generation / n_decoded;
|
t_token = t_token_generation / n_decoded;
|
||||||
n_tokens_second = 1e3 / t_token_generation * n_decoded;
|
n_tokens_second = 1e3 / t_token_generation * n_decoded;
|
||||||
snprintf(buffer, sizeof(buffer), "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
|
sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
|
||||||
t_token_generation, n_decoded,
|
t_token_generation, n_decoded,
|
||||||
t_token, n_tokens_second);
|
t_token, n_tokens_second);
|
||||||
LOG_DEBUG(buffer, {
|
LOG_DEBUG(buffer, {
|
||||||
|
@ -288,7 +288,7 @@ struct server_slot {
|
||||||
{"n_tokens_second", n_tokens_second},
|
{"n_tokens_second", n_tokens_second},
|
||||||
});
|
});
|
||||||
|
|
||||||
snprintf(buffer, sizeof(buffer), " total time = %10.2f ms", t_prompt_processing + t_token_generation);
|
sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
|
||||||
LOG_DEBUG(buffer, {
|
LOG_DEBUG(buffer, {
|
||||||
{"slot_id", id},
|
{"slot_id", id},
|
||||||
{"task_id", task_id},
|
{"task_id", task_id},
|
||||||
|
@ -425,7 +425,7 @@ struct llama_server_context
|
||||||
|
|
||||||
n_ctx = llama_n_ctx(ctx);
|
n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
add_bos_token = llama_add_bos_token(model);
|
add_bos_token = llama_should_add_bos_token(model);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -1031,7 +1031,7 @@ struct llama_server_context
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
|
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
|
||||||
LOG_TEE("Error processing the given image");
|
LOG_TEE("Error processing the given image");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -2014,7 +2014,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||||
printf("options:\n");
|
printf("options:\n");
|
||||||
printf(" -h, --help show this help message and exit\n");
|
printf(" -h, --help show this help message and exit\n");
|
||||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.cpuparams.n_threads);
|
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||||
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||||
printf(" --threads-http N number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
|
printf(" --threads-http N number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
|
||||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||||
|
@ -2287,7 +2287,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.cpuparams.n_threads = std::stoi(argv[i]);
|
params.n_threads = std::stoi(argv[i]);
|
||||||
}
|
}
|
||||||
else if (arg == "--grp-attn-n" || arg == "-gan")
|
else if (arg == "--grp-attn-n" || arg == "-gan")
|
||||||
{
|
{
|
||||||
|
@ -2315,7 +2315,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.cpuparams_batch.n_threads = std::stoi(argv[i]);
|
params.n_threads_batch = std::stoi(argv[i]);
|
||||||
}
|
}
|
||||||
else if (arg == "--threads-http")
|
else if (arg == "--threads-http")
|
||||||
{
|
{
|
||||||
|
@ -2626,11 +2626,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
|
||||||
params.kv_overrides.back().key[0] = 0;
|
params.kv_overrides.back().key[0] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
postprocess_cpu_params(params.cpuparams, nullptr);
|
|
||||||
postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams);
|
|
||||||
postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams);
|
|
||||||
postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch);
|
|
||||||
|
|
||||||
if (invalid_param)
|
if (invalid_param)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||||
|
@ -2780,8 +2775,8 @@ int main(int argc, char **argv) {
|
||||||
{"commit", LLAMA_COMMIT}});
|
{"commit", LLAMA_COMMIT}});
|
||||||
|
|
||||||
LOG_INFO("system info", {
|
LOG_INFO("system info", {
|
||||||
{"n_threads", params.cpuparams.n_threads},
|
{"n_threads", params.n_threads},
|
||||||
{"n_threads_batch", params.cpuparams_batch.n_threads},
|
{"n_threads_batch", params.n_threads_batch},
|
||||||
{"total_threads", std::thread::hardware_concurrency()},
|
{"total_threads", std::thread::hardware_concurrency()},
|
||||||
{"system_info", llama_print_system_info()},
|
{"system_info", llama_print_system_info()},
|
||||||
});
|
});
|
||||||
|
|
|
@ -19,7 +19,7 @@ sign() {
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
|
COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
|
||||||
|
|
||||||
case "${GOARCH}" in
|
case "${GOARCH}" in
|
||||||
"amd64")
|
"amd64")
|
||||||
|
|
|
@ -360,13 +360,11 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
||||||
|
|
||||||
switch llm.KV().Architecture() {
|
switch llm.KV().Architecture() {
|
||||||
case "llama":
|
case "llama":
|
||||||
fullOffload = max(
|
fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
|
||||||
4*batch*(1+4*embedding+context*(1+heads)),
|
|
||||||
4*batch*(embedding+vocab),
|
|
||||||
)
|
|
||||||
|
|
||||||
partialOffload = 4 * batch * embedding
|
partialOffload = 4 * batch * embedding
|
||||||
partialOffload += max(
|
partialOffload += max(
|
||||||
|
// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
|
||||||
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
|
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
|
||||||
4*batch*(embedding+vocab)+embedding*vocab*105/128,
|
4*batch*(embedding+vocab)+embedding*vocab*105/128,
|
||||||
)
|
)
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
Subproject commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177
|
Subproject commit 1e6f6554aa11fa10160a5fda689e736c3c34169f
|
|
@ -7,7 +7,6 @@ import (
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
"github.com/ollama/ollama/envconfig"
|
|
||||||
"github.com/ollama/ollama/format"
|
"github.com/ollama/ollama/format"
|
||||||
"github.com/ollama/ollama/gpu"
|
"github.com/ollama/ollama/gpu"
|
||||||
)
|
)
|
||||||
|
@ -95,7 +94,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
// Overflow that didn't fit into the GPU
|
// Overflow that didn't fit into the GPU
|
||||||
var overflow uint64
|
var overflow uint64
|
||||||
|
|
||||||
overhead := envconfig.GpuOverhead()
|
|
||||||
availableList := make([]string, len(gpus))
|
availableList := make([]string, len(gpus))
|
||||||
for i, gpu := range gpus {
|
for i, gpu := range gpus {
|
||||||
availableList[i] = format.HumanBytes2(gpu.FreeMemory)
|
availableList[i] = format.HumanBytes2(gpu.FreeMemory)
|
||||||
|
@ -166,22 +164,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
gzo = gpuZeroOverhead
|
gzo = gpuZeroOverhead
|
||||||
}
|
}
|
||||||
// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
|
// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
|
||||||
if (gpus[i].FreeMemory - overhead) < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
|
if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
|
||||||
slog.Debug("gpu has too little memory to allocate any layers",
|
slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
|
||||||
"id", gpus[i].ID,
|
|
||||||
"library", gpus[i].Library,
|
|
||||||
"variant", gpus[i].Variant,
|
|
||||||
"compute", gpus[i].Compute,
|
|
||||||
"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
|
|
||||||
"name", gpus[i].Name,
|
|
||||||
"total", format.HumanBytes2(gpus[i].TotalMemory),
|
|
||||||
"available", format.HumanBytes2(gpus[i].FreeMemory),
|
|
||||||
"minimum_memory", gpus[i].MinimumMemory,
|
|
||||||
"layer_size", format.HumanBytes2(layerSize),
|
|
||||||
"gpu_zer_overhead", format.HumanBytes2(gzo),
|
|
||||||
"partial_offload", format.HumanBytes2(graphPartialOffload),
|
|
||||||
"full_offload", format.HumanBytes2(graphFullOffload),
|
|
||||||
)
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
|
gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
|
||||||
|
@ -212,7 +196,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
for j := len(gpusWithSpace); j > 0; j-- {
|
for j := len(gpusWithSpace); j > 0; j-- {
|
||||||
g := gpusWithSpace[i%j]
|
g := gpusWithSpace[i%j]
|
||||||
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
||||||
if (g.g.FreeMemory - overhead) > used+layerSize {
|
if g.g.FreeMemory > used+layerSize {
|
||||||
gpuAllocations[g.i] += layerSize
|
gpuAllocations[g.i] += layerSize
|
||||||
layerCounts[g.i]++
|
layerCounts[g.i]++
|
||||||
layerCount++
|
layerCount++
|
||||||
|
@ -235,7 +219,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
for j := len(gpusWithSpace); j > 0; j-- {
|
for j := len(gpusWithSpace); j > 0; j-- {
|
||||||
g := gpusWithSpace[layerCount%j]
|
g := gpusWithSpace[layerCount%j]
|
||||||
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
|
||||||
if (g.g.FreeMemory - overhead) > used+memoryLayerOutput {
|
if g.g.FreeMemory > used+memoryLayerOutput {
|
||||||
gpuAllocations[g.i] += memoryLayerOutput
|
gpuAllocations[g.i] += memoryLayerOutput
|
||||||
layerCounts[g.i]++
|
layerCounts[g.i]++
|
||||||
layerCount++
|
layerCount++
|
||||||
|
@ -322,7 +306,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m MemoryEstimate) log() {
|
func (m MemoryEstimate) log() {
|
||||||
overhead := envconfig.GpuOverhead()
|
|
||||||
slog.Info(
|
slog.Info(
|
||||||
"offload to "+m.inferenceLibrary,
|
"offload to "+m.inferenceLibrary,
|
||||||
slog.Group(
|
slog.Group(
|
||||||
|
@ -340,7 +323,6 @@ func (m MemoryEstimate) log() {
|
||||||
"memory",
|
"memory",
|
||||||
// memory available by GPU for offloading
|
// memory available by GPU for offloading
|
||||||
"available", m.availableList,
|
"available", m.availableList,
|
||||||
"gpu_overhead", format.HumanBytes2(overhead),
|
|
||||||
slog.Group(
|
slog.Group(
|
||||||
"required",
|
"required",
|
||||||
// memory required for full offloading
|
// memory required for full offloading
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||||
index 88355971..dd7d41ed 100644
|
index a207451f..2ddf431d 100644
|
||||||
--- a/src/llama.cpp
|
--- a/src/llama.cpp
|
||||||
+++ b/src/llama.cpp
|
+++ b/src/llama.cpp
|
||||||
@@ -6083,16 +6083,7 @@ static void llm_load_vocab(
|
@@ -5347,16 +5347,7 @@ static void llm_load_vocab(
|
||||||
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
|
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
|
||||||
vocab.tokenizer_add_space_prefix = false;
|
vocab.tokenizer_add_space_prefix = false;
|
||||||
vocab.tokenizer_clean_spaces = true;
|
vocab.tokenizer_clean_spaces = true;
|
||||||
|
@ -20,9 +20,9 @@ index 88355971..dd7d41ed 100644
|
||||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||||
} else if (
|
} else if (
|
||||||
tokenizer_pre == "llama3" ||
|
tokenizer_pre == "llama3" ||
|
||||||
@@ -6188,7 +6179,8 @@ static void llm_load_vocab(
|
@@ -5443,7 +5434,8 @@ static void llm_load_vocab(
|
||||||
tokenizer_pre == "exaone") {
|
tokenizer_pre == "codeshell") {
|
||||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
|
||||||
} else {
|
} else {
|
||||||
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||||
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
|
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
|
||||||
|
|
|
@ -1,36 +1,37 @@
|
||||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||||
index 88355971..d7db689b 100644
|
index 1fe2b9f7..a43312a7 100644
|
||||||
--- a/src/llama.cpp
|
--- a/src/llama.cpp
|
||||||
+++ b/src/llama.cpp
|
+++ b/src/llama.cpp
|
||||||
@@ -15906,7 +15906,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
|
@@ -13689,7 +13689,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
|
||||||
const auto n_embd = hparams.n_embd;
|
const auto n_embd = hparams.n_embd;
|
||||||
|
|
||||||
// TODO: use a per-batch flag for logits presence instead
|
// TODO: use a per-batch flag for logits presence instead
|
||||||
- const bool has_logits = !cparams.embeddings;
|
- const bool has_logits = !cparams.embeddings;
|
||||||
+ const bool has_logits = cparams.causal_attn;
|
+ const bool has_logits = cparams.causal_attn;
|
||||||
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
|
const bool has_embd = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
|
||||||
|
|
||||||
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
|
const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
|
||||||
@@ -16175,20 +16175,23 @@ static int llama_decode_internal(
|
@@ -13959,17 +13959,25 @@ static int llama_decode_internal(
|
||||||
// no output
|
// no output
|
||||||
res = nullptr;
|
res = nullptr;
|
||||||
embd = nullptr;
|
embd = nullptr;
|
||||||
- } else if (cparams.embeddings) {
|
- } else if (cparams.embeddings) {
|
||||||
- res = nullptr; // do not extract logits for embedding case
|
- res = nullptr; // do not extract logits for embedding case
|
||||||
- embd = nullptr;
|
- embd = gf->nodes[gf->n_nodes - 1];
|
||||||
|
- if (strcmp(embd->name, "result_embd_pooled") != 0) {
|
||||||
|
- embd = gf->nodes[gf->n_nodes - 2];
|
||||||
+ }
|
+ }
|
||||||
+
|
+
|
||||||
+ if (cparams.embeddings) {
|
+ if (cparams.embeddings) {
|
||||||
for (int i = gf->n_nodes - 1; i >= 0; --i) {
|
+ for (int i = gf->n_nodes - 1; i >= 0; --i) {
|
||||||
- if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
|
|
||||||
- embd = gf->nodes[i];
|
|
||||||
+ embd = gf->nodes[i];
|
+ embd = gf->nodes[i];
|
||||||
+ if (strcmp(embd->name, "result_embd_pooled") == 0) {
|
+ if (strcmp(embd->name, "result_embd_pooled") == 0) {
|
||||||
break;
|
+ break;
|
||||||
|
+ }
|
||||||
}
|
}
|
||||||
}
|
GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
|
||||||
- GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
|
- } else {
|
||||||
} else {
|
+ } else {
|
||||||
embd = nullptr; // do not extract embeddings when not needed
|
embd = nullptr; // do not extract embeddings when not needed
|
||||||
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
|
GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
|
||||||
}
|
}
|
||||||
|
@ -38,6 +39,7 @@ index 88355971..d7db689b 100644
|
||||||
+ if (!cparams.causal_attn) {
|
+ if (!cparams.causal_attn) {
|
||||||
+ res = nullptr; // do not extract logits when not needed
|
+ res = nullptr; // do not extract logits when not needed
|
||||||
+ }
|
+ }
|
||||||
|
+
|
||||||
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
||||||
|
|
||||||
ggml_backend_sched_alloc_graph(lctx.sched, gf);
|
ggml_backend_sched_alloc_graph(lctx.sched, gf);
|
||||||
|
|
350
llm/patches/09-lora.diff
Normal file
350
llm/patches/09-lora.diff
Normal file
|
@ -0,0 +1,350 @@
|
||||||
|
diff --git a/common/common.cpp b/common/common.cpp
|
||||||
|
index 2e8374d5..70d0afde 100644
|
||||||
|
--- a/common/common.cpp
|
||||||
|
+++ b/common/common.cpp
|
||||||
|
@@ -2110,9 +2110,21 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
|
||||||
|
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
|
||||||
|
if (loaded_la.adapter == nullptr) {
|
||||||
|
fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
||||||
|
- llama_free(lctx);
|
||||||
|
- llama_free_model(model);
|
||||||
|
- return iparams;
|
||||||
|
+
|
||||||
|
+ // if that fails, try loading as ggla for compatibility
|
||||||
|
+ int err = llama_model_apply_lora_from_file(model,
|
||||||
|
+ la.path.c_str(),
|
||||||
|
+ la.scale,
|
||||||
|
+ nullptr,
|
||||||
|
+ params.n_threads);
|
||||||
|
+ if (err != 0) {
|
||||||
|
+ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
||||||
|
+ llama_free(lctx);
|
||||||
|
+ llama_free_model(model);
|
||||||
|
+ return iparams;
|
||||||
|
+ } else {
|
||||||
|
+ break;
|
||||||
|
+ }
|
||||||
|
}
|
||||||
|
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
|
||||||
|
}
|
||||||
|
diff --git a/include/llama.h b/include/llama.h
|
||||||
|
index 93fd77ca..b0fb37a6 100644
|
||||||
|
--- a/include/llama.h
|
||||||
|
+++ b/include/llama.h
|
||||||
|
@@ -1160,6 +1160,20 @@ extern "C" {
|
||||||
|
|
||||||
|
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
|
||||||
|
|
||||||
|
+ // Apply a LoRA adapter to a loaded model
|
||||||
|
+ // path_base_model is the path to a higher quality model to use as a base for
|
||||||
|
+ // the layers modified by the adapter. Can be NULL to use the current loaded model.
|
||||||
|
+ // The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
||||||
|
+ // will be applied on top of the previous one
|
||||||
|
+ // Returns 0 on success
|
||||||
|
+ LLAMA_API int32_t llama_model_apply_lora_from_file(
|
||||||
|
+ const struct llama_model * model,
|
||||||
|
+ const char * path_lora,
|
||||||
|
+ float scale,
|
||||||
|
+ const char * path_base_model,
|
||||||
|
+ int32_t n_threads);
|
||||||
|
+
|
||||||
|
+
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||||
|
index 80a0dd0f..9d7b0e17 100644
|
||||||
|
--- a/src/llama.cpp
|
||||||
|
+++ b/src/llama.cpp
|
||||||
|
@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text,
|
||||||
|
fputs(text, stderr);
|
||||||
|
fflush(stderr);
|
||||||
|
}
|
||||||
|
+
|
||||||
|
+static int llama_apply_lora_from_file_internal(
|
||||||
|
+ const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
|
||||||
|
+) {
|
||||||
|
+ LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
||||||
|
+
|
||||||
|
+ const int64_t t_start_lora_us = ggml_time_us();
|
||||||
|
+
|
||||||
|
+ llama_file fin(path_lora, "rb");
|
||||||
|
+
|
||||||
|
+ // verify magic and version
|
||||||
|
+ {
|
||||||
|
+ uint32_t magic = fin.read_u32();
|
||||||
|
+ if (magic != LLAMA_FILE_MAGIC_GGLA) {
|
||||||
|
+ LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
|
||||||
|
+ return 1;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ uint32_t format_version = fin.read_u32();
|
||||||
|
+ if (format_version != 1) {
|
||||||
|
+ LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
|
||||||
|
+ return 1;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ int32_t lora_r = fin.read_u32();
|
||||||
|
+ int32_t lora_alpha = fin.read_u32();
|
||||||
|
+ float scaling = scale * (float)lora_alpha / (float)lora_r;
|
||||||
|
+
|
||||||
|
+ LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
||||||
|
+
|
||||||
|
+ // load base model
|
||||||
|
+ std::unique_ptr<llama_model_loader> ml;
|
||||||
|
+ if (path_base_model) {
|
||||||
|
+ LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
||||||
|
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
|
||||||
|
+ ml->init_mappings(/*prefetch*/ false); // no prefetching
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ struct tensor_meta {
|
||||||
|
+ std::string name;
|
||||||
|
+ ggml_type type;
|
||||||
|
+ int32_t ne[2];
|
||||||
|
+ size_t offset;
|
||||||
|
+ };
|
||||||
|
+ std::map<std::string, tensor_meta> tensor_meta_map;
|
||||||
|
+
|
||||||
|
+ // load all tensor meta
|
||||||
|
+ while (true) {
|
||||||
|
+ if (fin.tell() == fin.size) {
|
||||||
|
+ // eof
|
||||||
|
+ break;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ int32_t n_dims;
|
||||||
|
+ int32_t name_len;
|
||||||
|
+ int32_t ftype;
|
||||||
|
+
|
||||||
|
+ fin.read_raw(&n_dims, sizeof(n_dims));
|
||||||
|
+ fin.read_raw(&name_len, sizeof(name_len));
|
||||||
|
+ fin.read_raw(&ftype, sizeof(ftype));
|
||||||
|
+
|
||||||
|
+ if (n_dims != 1 && n_dims != 2) {
|
||||||
|
+ LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
||||||
|
+ return 1;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ int32_t ne[2] = { 1, 1 };
|
||||||
|
+ for (int i = 0; i < n_dims; ++i) {
|
||||||
|
+ fin.read_raw(&ne[i], sizeof(ne[i]));
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ std::string name;
|
||||||
|
+ {
|
||||||
|
+ GGML_ASSERT(name_len < GGML_MAX_NAME);
|
||||||
|
+ char buf[GGML_MAX_NAME];
|
||||||
|
+ fin.read_raw(buf, name_len);
|
||||||
|
+ name = std::string(buf, name_len);
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ // check for lora suffix
|
||||||
|
+ std::string lora_suffix;
|
||||||
|
+ if (name.length() > 6) {
|
||||||
|
+ lora_suffix = name.substr(name.length() - 6);
|
||||||
|
+ }
|
||||||
|
+ if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
|
||||||
|
+ LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
|
||||||
|
+ return 1;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ // tensor type
|
||||||
|
+ ggml_type wtype;
|
||||||
|
+ switch (ftype) {
|
||||||
|
+ case 0: wtype = GGML_TYPE_F32; break;
|
||||||
|
+ case 1: wtype = GGML_TYPE_F16; break;
|
||||||
|
+ default:
|
||||||
|
+ {
|
||||||
|
+ LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
|
||||||
|
+ __func__, ftype);
|
||||||
|
+ return 1;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ // data offset
|
||||||
|
+ size_t offset = fin.tell();
|
||||||
|
+ offset = (offset + 31) & -32;
|
||||||
|
+
|
||||||
|
+ // skip tensor data
|
||||||
|
+ fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
|
||||||
|
+
|
||||||
|
+ tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ bool warned = false;
|
||||||
|
+ int n_tensors = 0;
|
||||||
|
+
|
||||||
|
+ // apply
|
||||||
|
+ ggml_backend_t backend_cpu = ggml_backend_cpu_init();
|
||||||
|
+ if (backend_cpu == nullptr) {
|
||||||
|
+ LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
|
||||||
|
+ return 1;
|
||||||
|
+ }
|
||||||
|
+ ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
|
||||||
|
+
|
||||||
|
+ std::vector<no_init<uint8_t>> read_buf;
|
||||||
|
+ for (const auto & it : model.tensors_by_name) {
|
||||||
|
+ const std::string & base_name = it.first;
|
||||||
|
+ ggml_tensor * model_t = it.second;
|
||||||
|
+
|
||||||
|
+ if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
|
||||||
|
+ tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
|
||||||
|
+ continue;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
|
||||||
|
+ tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
|
||||||
|
+
|
||||||
|
+ ggml_init_params lora_init_params = {
|
||||||
|
+ /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
|
||||||
|
+ /* .mem_buffer */ nullptr,
|
||||||
|
+ /* .no_alloc */ true,
|
||||||
|
+ };
|
||||||
|
+ ggml_context * lora_ctx = ggml_init(lora_init_params);
|
||||||
|
+ if (lora_ctx == nullptr) {
|
||||||
|
+ LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
|
||||||
|
+ ggml_backend_free(backend_cpu);
|
||||||
|
+ return 1;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ // create tensors
|
||||||
|
+ ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
|
||||||
|
+ ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
|
||||||
|
+ ggml_set_name(loraA, metaA.name.c_str());
|
||||||
|
+ ggml_set_name(loraB, metaB.name.c_str());
|
||||||
|
+
|
||||||
|
+ ggml_tensor * base_t;
|
||||||
|
+ if (ml) {
|
||||||
|
+ if (!ml->get_tensor_meta(base_name.c_str())) {
|
||||||
|
+ LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
||||||
|
+ return 1;
|
||||||
|
+ }
|
||||||
|
+ base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
|
||||||
|
+ } else {
|
||||||
|
+ base_t = ggml_dup_tensor(lora_ctx, model_t);
|
||||||
|
+ }
|
||||||
|
+ ggml_set_name(base_t, base_name.c_str());
|
||||||
|
+
|
||||||
|
+ // allocate in backend buffer
|
||||||
|
+ ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
|
||||||
|
+ if (lora_buf == nullptr) {
|
||||||
|
+ LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
|
||||||
|
+ return 1;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ // load tensor data
|
||||||
|
+ auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
|
||||||
|
+ read_buf.resize(ggml_nbytes(tensor));
|
||||||
|
+ fin.seek(tensor_meta.offset, SEEK_SET);
|
||||||
|
+ fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
|
||||||
|
+ ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
|
||||||
|
+ };
|
||||||
|
+ load_tensor(metaA, loraA);
|
||||||
|
+ load_tensor(metaB, loraB);
|
||||||
|
+
|
||||||
|
+ // load base model tensor data
|
||||||
|
+ if (ml) {
|
||||||
|
+ ml->load_data_for(base_t);
|
||||||
|
+ } else {
|
||||||
|
+ ggml_backend_tensor_copy(model_t, base_t);
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ if (ggml_is_quantized(base_t->type) && !warned) {
|
||||||
|
+ LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
|
||||||
|
+ "use a f16 or f32 base model with --lora-base\n", __func__);
|
||||||
|
+ warned = true;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
|
||||||
|
+ LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
|
||||||
|
+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
|
||||||
|
+ ggml_free(lora_ctx);
|
||||||
|
+ ggml_backend_buffer_free(lora_buf);
|
||||||
|
+ ggml_backend_free(backend_cpu);
|
||||||
|
+ return 1;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ auto build_lora_graph = [&]() {
|
||||||
|
+ // w = w + BA*s
|
||||||
|
+ ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
||||||
|
+ ggml_set_name(BA, "BA");
|
||||||
|
+
|
||||||
|
+ if (scaling != 1.0f) {
|
||||||
|
+ BA = ggml_scale(lora_ctx, BA, scaling);
|
||||||
|
+ ggml_set_name(BA, "BA_scaled");
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ ggml_tensor * r;
|
||||||
|
+ r = ggml_add_inplace(lora_ctx, base_t, BA);
|
||||||
|
+ ggml_set_name(r, "r_add");
|
||||||
|
+
|
||||||
|
+ if (base_t->type != model_t->type) {
|
||||||
|
+ // convert the result to the model type
|
||||||
|
+ r = ggml_cast(lora_ctx, r, model_t->type);
|
||||||
|
+ ggml_set_name(r, "r_cast");
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ return r;
|
||||||
|
+ };
|
||||||
|
+
|
||||||
|
+ ggml_cgraph * gf = ggml_new_graph(lora_ctx);
|
||||||
|
+ ggml_tensor * r = build_lora_graph();
|
||||||
|
+ ggml_build_forward_expand(gf, r);
|
||||||
|
+
|
||||||
|
+ ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
|
||||||
|
+ if (graph_buf == nullptr) {
|
||||||
|
+ LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
|
||||||
|
+ ggml_free(lora_ctx);
|
||||||
|
+ ggml_backend_buffer_free(lora_buf);
|
||||||
|
+ ggml_backend_free(backend_cpu);
|
||||||
|
+ return 1;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ ggml_backend_graph_compute(backend_cpu, gf);
|
||||||
|
+
|
||||||
|
+ ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
|
||||||
|
+
|
||||||
|
+#if 0
|
||||||
|
+ // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
|
||||||
|
+ //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
|
||||||
|
+
|
||||||
|
+ // sched compute
|
||||||
|
+ ggml_build_forward_expand(gf, build_graph());
|
||||||
|
+ ggml_backend_sched_init_measure(sched, gf);
|
||||||
|
+
|
||||||
|
+ // create the graph again, since the previous one was destroyed by the measure
|
||||||
|
+ ggml_graph_clear(gf);
|
||||||
|
+ ggml_build_forward_expand(gf, build_graph());
|
||||||
|
+ ggml_backend_sched_graph_compute(sched, gf);
|
||||||
|
+ ggml_backend_sched_free(sched);
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+ ggml_backend_buffer_free(lora_buf);
|
||||||
|
+ ggml_backend_buffer_free(graph_buf);
|
||||||
|
+ ggml_free(lora_ctx);
|
||||||
|
+
|
||||||
|
+ n_tensors++;
|
||||||
|
+ if (n_tensors % 4 == 0) {
|
||||||
|
+ LLAMA_LOG_INFO(".");
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ ggml_backend_free(backend_cpu);
|
||||||
|
+
|
||||||
|
+ const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
||||||
|
+ LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
|
||||||
|
+
|
||||||
|
+ return 0;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
|
||||||
|
+ try {
|
||||||
|
+ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
|
||||||
|
+ } catch (const std::exception & err) {
|
||||||
|
+ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
||||||
|
+ return 1;
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
\ No newline at end of file
|
43
llm/patches/11-phi3-sliding-window.diff
Normal file
43
llm/patches/11-phi3-sliding-window.diff
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
From 6eedae4cf2fcc8015dac79cb3f28f61fcabacab2 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Michael Yang <mxyng@pm.me>
|
||||||
|
Date: Wed, 31 Jul 2024 14:57:04 -0700
|
||||||
|
Subject: [PATCH] phi3 sliding window
|
||||||
|
|
||||||
|
---
|
||||||
|
src/llama.cpp | 6 +++---
|
||||||
|
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||||
|
index a207451f..f2872d4e 100644
|
||||||
|
--- a/src/llama.cpp
|
||||||
|
+++ b/src/llama.cpp
|
||||||
|
@@ -4893,7 +4893,7 @@ static void llm_load_hparams(
|
||||||
|
} break;
|
||||||
|
case LLM_ARCH_PHI3:
|
||||||
|
{
|
||||||
|
- ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||||
|
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||||
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
|
|
||||||
|
switch (hparams.n_layer) {
|
||||||
|
@@ -10762,7 +10762,7 @@ struct llm_build_context {
|
||||||
|
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||||
|
|
||||||
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||||
|
- struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
|
||||||
|
+ struct ggml_tensor * KQ_mask = hparams.n_swa > 0 ? build_inp_KQ_mask_swa() : build_inp_KQ_mask();
|
||||||
|
|
||||||
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
auto residual = inpL;
|
||||||
|
@@ -10820,7 +10820,7 @@ struct llm_build_context {
|
||||||
|
|
||||||
|
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
||||||
|
model.layers[il].wo, model.layers[il].bo,
|
||||||
|
- Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
||||||
|
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (il == n_layer - 1) {
|
||||||
|
--
|
||||||
|
2.45.2
|
||||||
|
|
|
@ -98,7 +98,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||||
systemTotalMemory = systemMemInfo.TotalMemory
|
systemTotalMemory = systemMemInfo.TotalMemory
|
||||||
systemFreeMemory = systemMemInfo.FreeMemory
|
systemFreeMemory = systemMemInfo.FreeMemory
|
||||||
systemSwapFreeMemory = systemMemInfo.FreeSwap
|
systemSwapFreeMemory = systemMemInfo.FreeSwap
|
||||||
slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
|
slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
|
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
|
||||||
|
@ -584,7 +584,8 @@ func (s *llmServer) Ping(ctx context.Context) error {
|
||||||
|
|
||||||
func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
|
func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
|
||||||
start := time.Now()
|
start := time.Now()
|
||||||
stallDuration := envconfig.LoadTimeout() // If no progress happens
|
stallDuration := 5 * time.Minute // If no progress happens
|
||||||
|
finalLoadDuration := 5 * time.Minute // After we hit 100%, give the runner more time to come online
|
||||||
stallTimer := time.Now().Add(stallDuration) // give up if we stall
|
stallTimer := time.Now().Add(stallDuration) // give up if we stall
|
||||||
|
|
||||||
slog.Info("waiting for llama runner to start responding")
|
slog.Info("waiting for llama runner to start responding")
|
||||||
|
@ -636,7 +637,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
|
||||||
stallTimer = time.Now().Add(stallDuration)
|
stallTimer = time.Now().Add(stallDuration)
|
||||||
} else if !fullyLoaded && int(s.loadProgress*100.0) >= 100 {
|
} else if !fullyLoaded && int(s.loadProgress*100.0) >= 100 {
|
||||||
slog.Debug("model load completed, waiting for server to become available", "status", status.ToString())
|
slog.Debug("model load completed, waiting for server to become available", "status", status.ToString())
|
||||||
stallTimer = time.Now().Add(stallDuration)
|
stallTimer = time.Now().Add(finalLoadDuration)
|
||||||
fullyLoaded = true
|
fullyLoaded = true
|
||||||
}
|
}
|
||||||
time.Sleep(time.Millisecond * 250)
|
time.Sleep(time.Millisecond * 250)
|
||||||
|
|
|
@ -79,7 +79,7 @@ type ChatCompletionRequest struct {
|
||||||
Stop any `json:"stop"`
|
Stop any `json:"stop"`
|
||||||
Temperature *float64 `json:"temperature"`
|
Temperature *float64 `json:"temperature"`
|
||||||
FrequencyPenalty *float64 `json:"frequency_penalty"`
|
FrequencyPenalty *float64 `json:"frequency_penalty"`
|
||||||
PresencePenalty *float64 `json:"presence_penalty"`
|
PresencePenalty *float64 `json:"presence_penalty_penalty"`
|
||||||
TopP *float64 `json:"top_p"`
|
TopP *float64 `json:"top_p"`
|
||||||
ResponseFormat *ResponseFormat `json:"response_format"`
|
ResponseFormat *ResponseFormat `json:"response_format"`
|
||||||
Tools []api.Tool `json:"tools"`
|
Tools []api.Tool `json:"tools"`
|
||||||
|
@ -452,7 +452,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if r.Temperature != nil {
|
if r.Temperature != nil {
|
||||||
options["temperature"] = *r.Temperature
|
options["temperature"] = *r.Temperature * 2.0
|
||||||
} else {
|
} else {
|
||||||
options["temperature"] = 1.0
|
options["temperature"] = 1.0
|
||||||
}
|
}
|
||||||
|
@ -462,11 +462,11 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if r.FrequencyPenalty != nil {
|
if r.FrequencyPenalty != nil {
|
||||||
options["frequency_penalty"] = *r.FrequencyPenalty
|
options["frequency_penalty"] = *r.FrequencyPenalty * 2.0
|
||||||
}
|
}
|
||||||
|
|
||||||
if r.PresencePenalty != nil {
|
if r.PresencePenalty != nil {
|
||||||
options["presence_penalty"] = *r.PresencePenalty
|
options["presence_penalty"] = *r.PresencePenalty * 2.0
|
||||||
}
|
}
|
||||||
|
|
||||||
if r.TopP != nil {
|
if r.TopP != nil {
|
||||||
|
@ -513,7 +513,7 @@ func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if r.Temperature != nil {
|
if r.Temperature != nil {
|
||||||
options["temperature"] = *r.Temperature
|
options["temperature"] = *r.Temperature * 2.0
|
||||||
} else {
|
} else {
|
||||||
options["temperature"] = 1.0
|
options["temperature"] = 1.0
|
||||||
}
|
}
|
||||||
|
@ -522,9 +522,9 @@ func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
|
||||||
options["seed"] = *r.Seed
|
options["seed"] = *r.Seed
|
||||||
}
|
}
|
||||||
|
|
||||||
options["frequency_penalty"] = r.FrequencyPenalty
|
options["frequency_penalty"] = r.FrequencyPenalty * 2.0
|
||||||
|
|
||||||
options["presence_penalty"] = r.PresencePenalty
|
options["presence_penalty"] = r.PresencePenalty * 2.0
|
||||||
|
|
||||||
if r.TopP != 0.0 {
|
if r.TopP != 0.0 {
|
||||||
options["top_p"] = r.TopP
|
options["top_p"] = r.TopP
|
||||||
|
|
|
@ -22,10 +22,7 @@ const (
|
||||||
image = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=`
|
image = `iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=`
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var False = false
|
||||||
False = false
|
|
||||||
True = true
|
|
||||||
)
|
|
||||||
|
|
||||||
func captureRequestMiddleware(capturedRequest any) gin.HandlerFunc {
|
func captureRequestMiddleware(capturedRequest any) gin.HandlerFunc {
|
||||||
return func(c *gin.Context) {
|
return func(c *gin.Context) {
|
||||||
|
@ -73,44 +70,6 @@ func TestChatMiddleware(t *testing.T) {
|
||||||
Stream: &False,
|
Stream: &False,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
|
||||||
name: "chat handler with options",
|
|
||||||
body: `{
|
|
||||||
"model": "test-model",
|
|
||||||
"messages": [
|
|
||||||
{"role": "user", "content": "Hello"}
|
|
||||||
],
|
|
||||||
"stream": true,
|
|
||||||
"max_tokens": 999,
|
|
||||||
"seed": 123,
|
|
||||||
"stop": ["\n", "stop"],
|
|
||||||
"temperature": 3.0,
|
|
||||||
"frequency_penalty": 4.0,
|
|
||||||
"presence_penalty": 5.0,
|
|
||||||
"top_p": 6.0,
|
|
||||||
"response_format": {"type": "json_object"}
|
|
||||||
}`,
|
|
||||||
req: api.ChatRequest{
|
|
||||||
Model: "test-model",
|
|
||||||
Messages: []api.Message{
|
|
||||||
{
|
|
||||||
Role: "user",
|
|
||||||
Content: "Hello",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
Options: map[string]any{
|
|
||||||
"num_predict": 999.0, // float because JSON doesn't distinguish between float and int
|
|
||||||
"seed": 123.0,
|
|
||||||
"stop": []any{"\n", "stop"},
|
|
||||||
"temperature": 3.0,
|
|
||||||
"frequency_penalty": 4.0,
|
|
||||||
"presence_penalty": 5.0,
|
|
||||||
"top_p": 6.0,
|
|
||||||
},
|
|
||||||
Format: "json",
|
|
||||||
Stream: &True,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
name: "chat handler with image content",
|
name: "chat handler with image content",
|
||||||
body: `{
|
body: `{
|
||||||
|
@ -227,8 +186,6 @@ func TestChatMiddleware(t *testing.T) {
|
||||||
req, _ := http.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(tc.body))
|
req, _ := http.NewRequest(http.MethodPost, "/api/chat", strings.NewReader(tc.body))
|
||||||
req.Header.Set("Content-Type", "application/json")
|
req.Header.Set("Content-Type", "application/json")
|
||||||
|
|
||||||
defer func() { capturedRequest = nil }()
|
|
||||||
|
|
||||||
resp := httptest.NewRecorder()
|
resp := httptest.NewRecorder()
|
||||||
router.ServeHTTP(resp, req)
|
router.ServeHTTP(resp, req)
|
||||||
|
|
||||||
|
@ -245,6 +202,7 @@ func TestChatMiddleware(t *testing.T) {
|
||||||
if !reflect.DeepEqual(tc.err, errResp) {
|
if !reflect.DeepEqual(tc.err, errResp) {
|
||||||
t.Fatal("errors did not match")
|
t.Fatal("errors did not match")
|
||||||
}
|
}
|
||||||
|
capturedRequest = nil
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -275,7 +233,7 @@ func TestCompletionsMiddleware(t *testing.T) {
|
||||||
Options: map[string]any{
|
Options: map[string]any{
|
||||||
"frequency_penalty": 0.0,
|
"frequency_penalty": 0.0,
|
||||||
"presence_penalty": 0.0,
|
"presence_penalty": 0.0,
|
||||||
"temperature": 0.8,
|
"temperature": 1.6,
|
||||||
"top_p": 1.0,
|
"top_p": 1.0,
|
||||||
"stop": []any{"\n", "stop"},
|
"stop": []any{"\n", "stop"},
|
||||||
},
|
},
|
||||||
|
|
|
@ -38,7 +38,7 @@ IS_WSL2=false
|
||||||
KERN=$(uname -r)
|
KERN=$(uname -r)
|
||||||
case "$KERN" in
|
case "$KERN" in
|
||||||
*icrosoft*WSL2 | *icrosoft*wsl2) IS_WSL2=true;;
|
*icrosoft*WSL2 | *icrosoft*wsl2) IS_WSL2=true;;
|
||||||
*icrosoft) error "Microsoft WSL1 is not currently supported. Please use WSL2 with 'wsl --set-version <distro> 2'" ;;
|
*icrosoft) error "Microsoft WSL1 is not currently supported. Please upgrade to WSL2 with 'wsl --set-version <distro> 2'" ;;
|
||||||
*) ;;
|
*) ;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
@ -356,12 +356,12 @@ if ! lsmod | grep -q nvidia || ! lsmod | grep -q nvidia_uvm; then
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# make sure the NVIDIA modules are loaded on boot with nvidia-persistenced
|
# make sure the NVIDIA modules are loaded on boot with nvidia-persistenced
|
||||||
if available nvidia-persistenced; then
|
if command -v nvidia-persistenced > /dev/null 2>&1; then
|
||||||
$SUDO touch /etc/modules-load.d/nvidia.conf
|
$SUDO touch /etc/modules-load.d/nvidia.conf
|
||||||
MODULES="nvidia nvidia-uvm"
|
MODULES="nvidia nvidia-uvm"
|
||||||
for MODULE in $MODULES; do
|
for MODULE in $MODULES; do
|
||||||
if ! grep -qxF "$MODULE" /etc/modules-load.d/nvidia.conf; then
|
if ! grep -qxF "$MODULE" /etc/modules-load.d/nvidia.conf; then
|
||||||
echo "$MODULE" | $SUDO tee -a /etc/modules-load.d/nvidia.conf > /dev/null
|
echo "$MODULE" | sudo tee -a /etc/modules-load.d/nvidia.conf > /dev/null
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
|
|
|
@ -256,7 +256,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
if resp.StatusCode != http.StatusTemporaryRedirect && resp.StatusCode != http.StatusOK {
|
if resp.StatusCode != http.StatusTemporaryRedirect {
|
||||||
return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
|
return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
|
||||||
}
|
}
|
||||||
return resp.Location()
|
return resp.Location()
|
||||||
|
|
Loading…
Reference in a new issue