Compare commits


No commits in common. "0c61920bc960283307643a8980ff06468bbf657f" and "99dfb67553f05361dc68144c801424d86c5bb423" have entirely different histories.

59 changed files with 471 additions and 1647 deletions


@@ -187,13 +187,6 @@ jobs:
  generate-windows-cuda:
    environment: release
    runs-on: windows
-    strategy:
-      matrix:
-        cuda:
-          - version: "11"
-            url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe'
-          - version: "12"
-            url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe'
    env:
      KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
    steps:
@@ -227,11 +220,11 @@ jobs:
        with:
          go-version-file: go.mod
          cache: true
-      - name: 'Install CUDA ${{ matrix.cuda.version }}'
+      - name: 'Install CUDA'
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading CUDA Installer"
-          Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
+          Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
          write-host "Installing CUDA"
          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
          write-host "Completed CUDA"
@@ -263,16 +256,15 @@ jobs:
          cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\"
      - uses: actions/upload-artifact@v4
        with:
-          name: generate-windows-cuda-${{ matrix.cuda.version }}
+          name: generate-windows-cuda
          path: |
            llm/build/**/bin/*
            dist/windows-amd64/**
      - uses: actions/upload-artifact@v4
        with:
-          name: windows-cuda-deps-${{ matrix.cuda.version }}
+          name: windows-cuda-deps
          path: dist/deps/*

  # Import the prior generation steps and build the final windows assets
  build-windows:
    environment: release
@@ -322,16 +314,10 @@ jobs:
          name: generate-windows-cpu
      - uses: actions/download-artifact@v4
        with:
-          name: generate-windows-cuda-11
+          name: generate-windows-cuda
      - uses: actions/download-artifact@v4
        with:
-          name: generate-windows-cuda-12
-      - uses: actions/download-artifact@v4
-        with:
-          name: windows-cuda-deps-11
-      - uses: actions/download-artifact@v4
-        with:
-          name: windows-cuda-deps-12
+          name: windows-cuda-deps
      - uses: actions/download-artifact@v4
        with:
          name: windows-rocm-deps
@@ -377,6 +363,7 @@ jobs:
      - run: |
          ./scripts/build_linux.sh
          ./scripts/build_docker.sh
+          mv dist/deps/* dist/
      - uses: actions/upload-artifact@v4
        with:
          name: dist-linux-amd64
@@ -472,10 +459,7 @@ jobs:
          merge-multiple: true
      - run: |
          ls -lh dist/
-          (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt)
-          mv sha256sum.txt dist/
-          mv dist/linux-???64 .
-          mv dist/linux-amd64-rocm .
+          (cd dist; sha256sum * > sha256sum.txt)
          cat dist/sha256sum.txt
      - name: Create or update Release
        run: |


@@ -58,4 +58,4 @@ ENV OLLAMA_HOST="0.0.0.0:8080"
EXPOSE 8080

CMD ["supervisord", "-c", "/app/supervisord.conf"]


@@ -87,11 +87,20 @@ DialogFontSize=12
[Files]
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
-Source: "..\ollama.exe"; DestDir: "{app}\bin"; Flags: ignoreversion 64bit
+Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs
+Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
-Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs
+#if DirExists("..\dist\windows-amd64\cuda")
+Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
+#endif
+#if DirExists("..\dist\windows-amd64\oneapi")
+Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
+#endif
+#if DirExists("..\dist\windows-amd64\rocm")
+Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
+#endif

[Icons]
Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
@@ -99,7 +108,7 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen
Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"

[Run]
-Filename: "{cmd}"; Parameters: "/C set PATH={app}\bin;%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden
+Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden

[UninstallRun]
; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden
@@ -134,8 +143,8 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi
[Registry]
Root: HKCU; Subkey: "Environment"; \
-    ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}\bin"; \
-    Check: NeedsAddPath('{app}\bin')
+    ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \
+    Check: NeedsAddPath('{app}')

[Code]


@@ -11,7 +11,6 @@ import (
	"path/filepath"
	"sort"
	"sync"
-	"syscall"
	"unsafe"

	"golang.org/x/sys/windows"
@@ -434,12 +433,7 @@ func (t *winTray) setIcon(src string) error {
	t.muNID.Lock()
	defer t.muNID.Unlock()
	t.nid.Icon = h
-	t.nid.Flags |= NIF_ICON | NIF_TIP
-	if toolTipUTF16, err := syscall.UTF16FromString(commontray.ToolTip); err == nil {
-		copy(t.nid.Tip[:], toolTipUTF16)
-	} else {
-		return err
-	}
+	t.nid.Flags |= NIF_ICON
	t.nid.Size = uint32(unsafe.Sizeof(*t.nid))

	return t.nid.modify()


@@ -61,7 +61,6 @@ const (
	MIIM_SUBMENU        = 0x00000004
	MIM_APPLYTOSUBMENUS = 0x80000000
	NIF_ICON            = 0x00000002
-	NIF_TIP             = 0x00000004
	NIF_INFO            = 0x00000010
	NIF_MESSAGE         = 0x00000001
	SW_HIDE             = 0


@@ -204,12 +204,6 @@ func tempZipFiles(path string) (string, error) {
		// safetensors files might be unresolved git lfs references; skip if they are
		// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
		files = append(files, st...)
-	} else if st, _ := glob(filepath.Join(path, "adapters.safetensors"), "application/octet-stream"); len(st) > 0 {
-		// covers adapters.safetensors
-		files = append(files, st...)
-	} else if st, _ := glob(filepath.Join(path, "adapter_model.safetensors"), "application/octet-stream"); len(st) > 0 {
-		// covers adapter_model.safetensors
-		files = append(files, st...)
	} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 {
		// pytorch files might also be unresolved git lfs references; skip if they are
		// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
@@ -229,14 +223,6 @@ func tempZipFiles(path string) (string, error) {
	}
	files = append(files, js...)

-	// bert models require a nested config.json
-	// TODO(mxyng): merge this with the glob above
-	js, err = glob(filepath.Join(path, "**/*.json"), "text/plain")
-	if err != nil {
-		return "", err
-	}
-	files = append(files, js...)
-
	if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
		// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
		// tokenizer.model might be a unresolved git lfs reference; error if it is
@@ -266,11 +252,6 @@ func tempZipFiles(path string) (string, error) {
			return "", err
		}

-		zfi.Name, err = filepath.Rel(path, file)
-		if err != nil {
-			return "", err
-		}
-
		zf, err := zipfile.CreateHeader(zfi)
		if err != nil {
			return "", err


@ -7,27 +7,16 @@ import (
"io" "io"
"io/fs" "io/fs"
"log/slog" "log/slog"
"strings"
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
) )
-type ModelParameters struct {
+type Parameters struct {
Architectures []string `json:"architectures"` Architectures []string `json:"architectures"`
VocabSize uint32 `json:"vocab_size"` VocabSize uint32 `json:"vocab_size"`
} }
-type AdapterParameters struct {
-	Alpha      uint32 `json:"lora_alpha"`
-	LoraLayers uint32 `json:"lora_layers"`
-	LoraParameters struct {
-		Rank  uint32  `json:"rank"`
-		Alpha float32 `json:"alpha"`
-		Scale float32 `json:"scale"`
-	} `json:"lora_parameters"`
-}
-func (ModelParameters) KV(t *Tokenizer) llm.KV {
+func (Parameters) KV(t *Tokenizer) llm.KV {
kv := llm.KV{ kv := llm.KV{
"general.file_type": uint32(1), "general.file_type": uint32(1),
"general.quantization_version": uint32(2), "general.quantization_version": uint32(2),
@ -54,119 +43,40 @@ func (ModelParameters) KV(t *Tokenizer) llm.KV {
return kv return kv
} }
-func (p AdapterParameters) KV() llm.KV {
-	var alpha float32
-	if p.LoraParameters.Alpha == 0 {
-		alpha = float32(p.Alpha)
-	} else {
-		alpha = p.LoraParameters.Alpha
-	}
-	kv := llm.KV{
-		"adapter.lora.alpha": alpha,
-		"adapter.type":       "lora",
-		"general.file_type":  uint32(1),
-		"general.type":       "adapter",
-		"general.version":    "v0.2",
-	}
-	return kv
-}
-func (ModelParameters) specialTokenTypes() []string {
+func (Parameters) specialTokenTypes() []string {
return []string{ return []string{
"bos", "eos", "unk", "sep", "pad", "cls", "mask", "bos", "eos", "unk", "sep", "pad", "cls", "mask",
} }
} }
-func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+func (Parameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
return llm.WriteGGUF(ws, kv, ts) return llm.WriteGGUF(ws, kv, ts)
} }
-func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
-	return llm.WriteGGUF(ws, kv, ts)
-}
-type ModelConverter interface {
+type Converter interface {
// KV maps parameters to LLM key-values // KV maps parameters to LLM key-values
KV(*Tokenizer) llm.KV KV(*Tokenizer) llm.KV
// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here. // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
Tensors([]Tensor) []llm.Tensor Tensors([]Tensor) []llm.Tensor
-	// Replacements returns a list of string pairs to replace in tensor names.
-	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
-	Replacements() []string
+	// tensorName returns the LLM tensor name for a specific input name
+	tensorName(string) string
// specialTokenTypes returns any special token types the model uses // specialTokenTypes returns any special token types the model uses
specialTokenTypes() []string specialTokenTypes() []string
// writeFile writes the model to the provided io.WriteSeeker
writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
} }
type moreParser interface {
parseMore(fs.FS) error
}
type AdapterConverter interface {
// KV maps parameters to LLM key-values
KV(llm.KV) llm.KV
// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
Tensors([]Tensor) []llm.Tensor
// Replacements returns a list of string pairs to replace in tensor names.
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
Replacements() []string
writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
}
func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
bts, err := fs.ReadFile(fsys, "adapter_config.json")
if err != nil {
return err
}
var p AdapterParameters
if err := json.Unmarshal(bts, &p); err != nil {
return err
}
arch, ok := baseKV["general.architecture"]
if !ok {
return errors.New("architecture not set for the base model")
}
var conv AdapterConverter
switch arch {
case "llama":
conv = &llamaAdapter{}
case "gemma2":
conv = &gemma2Adapter{}
default:
return errors.New("unsupported architecture")
}
ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
if err != nil {
return err
}
if err := json.Unmarshal(bts, conv); err != nil {
return err
}
return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
}
// Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
// and files it finds in the input path. // and files it finds in the input path.
// Supported input model formats include safetensors. // Supported input model formats include safetensors.
// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model. // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
-func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
+func Convert(fsys fs.FS, ws io.WriteSeeker) error {
bts, err := fs.ReadFile(fsys, "config.json") bts, err := fs.ReadFile(fsys, "config.json")
if err != nil { if err != nil {
return err return err
} }
-	var p ModelParameters
+	var p Parameters
if err := json.Unmarshal(bts, &p); err != nil { if err := json.Unmarshal(bts, &p); err != nil {
return err return err
} }
@ -175,20 +85,16 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
return errors.New("unknown architecture") return errors.New("unknown architecture")
} }
-	var conv ModelConverter
+	var conv Converter
	switch p.Architectures[0] {
	case "LlamaForCausalLM", "MistralForCausalLM":
-		conv = &llamaModel{}
+		conv = &llama{}
	case "MixtralForCausalLM":
-		conv = &mixtralModel{}
+		conv = &mixtral{}
	case "GemmaForCausalLM":
-		conv = &gemmaModel{}
+		conv = &gemma{}
-	case "Gemma2ForCausalLM":
-		conv = &gemma2Model{}
	case "Phi3ForCausalLM":
-		conv = &phi3Model{}
+		conv = &phi3{}
case "BertModel":
conv = &bertModel{}
default: default:
return errors.New("unsupported architecture") return errors.New("unsupported architecture")
} }
@ -197,12 +103,6 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
return err return err
} }
if t, ok := conv.(moreParser); ok {
if err := t.parseMore(fsys); err != nil {
return err
}
}
t, err := parseTokenizer(fsys, conv.specialTokenTypes()) t, err := parseTokenizer(fsys, conv.specialTokenTypes())
if err != nil { if err != nil {
return err return err
@ -219,7 +119,7 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens)) slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
} }
-	ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
+	ts, err := parseTensors(fsys)
if err != nil { if err != nil {
return err return err
} }
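For orientation, a minimal usage sketch of the conversion entry point as it appears on the right-hand side of this diff (Convert; the left-hand side names it ConvertModel). The import path and file paths are assumptions for illustration, not taken from the repository:

package main

import (
	"log"
	"os"

	"github.com/ollama/ollama/convert"
)

func main() {
	// GGUF output file the converter writes into (placeholder path).
	out, err := os.Create("model.gguf")
	if err != nil {
		log.Fatal(err)
	}
	defer out.Close()

	// Convert reads config.json, the tokenizer files and the model weights
	// from the directory and writes a single GGUF stream to out.
	if err := convert.Convert(os.DirFS("path/to/hf-model"), out); err != nil {
		log.Fatal(err)
	}
}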


@ -1,174 +0,0 @@
package convert
import (
"cmp"
"encoding/json"
"io/fs"
"path/filepath"
"slices"
"strings"
"github.com/ollama/ollama/llm"
)
type bertModel struct {
ModelParameters
NLayers uint32 `json:"n_layers"`
NumHiddenLayers uint32 `json:"num_hidden_layers"`
NLayer uint32 `json:"n_layer"`
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
NCtx uint32 `json:"n_ctx"`
HiddenSize uint32 `json:"hidden_size"`
NEmbd uint32 `json:"n_embd"`
IntermediateSize uint32 `json:"intermediate_size"`
NInner uint32 `json:"n_inner"`
NumAttentionHeads uint32 `json:"num_attention_heads"`
NHead uint32 `json:"n_head"`
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
LayerNormEPS float32 `json:"layer_norm_eps"`
LayerNormEpsilon float32 `json:"layer_norm_epsilon"`
NormEpsilon float32 `json:"norm_epsilon"`
PoolingType uint32
}
var (
_ ModelConverter = (*bertModel)(nil)
_ moreParser = (*bertModel)(nil)
)
func (p *bertModel) parseMore(fsys fs.FS) error {
bts, err := fs.ReadFile(fsys, "modules.json")
if err != nil {
return err
}
var modules []struct {
Type string `json:"type"`
Path string `json:"path"`
}
if err := json.Unmarshal(bts, &modules); err != nil {
return err
}
var pooling string
for _, m := range modules {
if m.Type == "sentence_transformers.models.Pooling" {
pooling = m.Path
break
}
}
if pooling != "" {
bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json"))
if err != nil {
return err
}
var pc struct {
PoolingModeCLSToken bool `json:"pooling_mode_cls_token"`
PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"`
}
if err := json.Unmarshal(bts, &pc); err != nil {
return err
}
if pc.PoolingModeMeanTokens {
p.PoolingType = 1
} else if pc.PoolingModeCLSToken {
p.PoolingType = 2
}
}
return nil
}
func (p *bertModel) KV(t *Tokenizer) llm.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "bert"
kv["bert.attention.causal"] = false
kv["bert.pooling_type"] = p.PoolingType
kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
if contextLength := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); contextLength > 0 {
kv["bert.context_length"] = contextLength
}
if embeddingLength := cmp.Or(p.HiddenSize, p.NEmbd); embeddingLength > 0 {
kv["bert.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
}
if feedForwardLength := cmp.Or(p.IntermediateSize, p.NInner); feedForwardLength > 0 {
kv["bert.feed_forward_length"] = cmp.Or(p.IntermediateSize, p.NInner)
}
if headCount := cmp.Or(p.NumAttentionHeads, p.NHead); headCount > 0 {
kv["bert.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead)
}
if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); layerNormEpsilon > 0 {
kv["bert.attention.layer_norm_epsilon"] = layerNormEpsilon
}
kv["tokenizer.ggml.model"] = "bert"
kv["tokenizer.ggml.token_type_count"] = uint32(2)
// convert to phantom space tokens
for i, e := range t.Tokens {
if strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]") {
// noop
} else if strings.HasPrefix(e, "##") {
t.Tokens[i] = e[2:]
} else {
t.Tokens[i] = "\u2581" + e
}
}
kv["tokenizer.ggml.tokens"] = t.Tokens
return kv
}
func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
for _, t := range ts {
if slices.Contains([]string{
"embeddings.position_ids",
"pooler.dense.weight",
"pooler.dense.bias",
}, t.Name()) {
continue
}
out = append(out, llm.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
}
return out
}
func (bertModel) Replacements() []string {
return []string{
"encoder.layer", "blk",
"encoder.layers", "blk",
"embeddings.word_embeddings", "token_embd",
"embeddings.token_type_embeddings", "token_types",
"embeddings.LayerNorm", "token_embd_norm",
"embeddings.position_embeddings", "position_embd",
"attention.self.query", "attn_q",
"attention.self.key", "attn_k",
"attention.self.value", "attn_v",
"attention.output.dense", "attn_output",
"attention.output.LayerNorm", "attn_output_norm",
"intermediate.dense", "ffn_up",
"output.dense", "ffn_down",
"output.LayerNorm", "layer_output_norm",
}
}


@ -9,8 +9,8 @@ import (
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
) )
-type gemmaModel struct {
-	ModelParameters
+type gemma struct {
+	Parameters
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
HiddenSize uint32 `json:"hidden_size"` HiddenSize uint32 `json:"hidden_size"`
HiddenLayers uint32 `json:"num_hidden_layers"` HiddenLayers uint32 `json:"num_hidden_layers"`
@ -21,11 +21,12 @@ type gemmaModel struct {
HeadDim uint32 `json:"head_dim"` HeadDim uint32 `json:"head_dim"`
} }
-var _ ModelConverter = (*gemmaModel)(nil)
+var _ Converter = (*gemma)(nil)

-func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
-	kv := p.ModelParameters.KV(t)
+func (p *gemma) KV(t *Tokenizer) llm.KV {
+	kv := p.Parameters.KV(t)
	kv["general.architecture"] = "gemma"
+	kv["general.name"] = "gemma"
kv["gemma.context_length"] = p.MaxPositionEmbeddings kv["gemma.context_length"] = p.MaxPositionEmbeddings
kv["gemma.embedding_length"] = p.HiddenSize kv["gemma.embedding_length"] = p.HiddenSize
kv["gemma.block_count"] = p.HiddenLayers kv["gemma.block_count"] = p.HiddenLayers
@ -42,15 +43,16 @@ func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
return kv return kv
} }
-func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
+func (p *gemma) Tensors(ts []Tensor) []llm.Tensor {
	var out []llm.Tensor
	for _, t := range ts {
-		if strings.HasSuffix(t.Name(), "_norm.weight") {
+		name := p.tensorName(t.Name())
+		if strings.HasSuffix(name, "_norm.weight") {
			t.SetRepacker(p.addOne)
		}
		out = append(out, llm.Tensor{
-			Name:     t.Name(),
+			Name:     name,
Kind: t.Kind(), Kind: t.Kind(),
Shape: t.Shape(), Shape: t.Shape(),
WriterTo: t, WriterTo: t,
@ -60,8 +62,8 @@ func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
return out return out
} }
-func (p *gemmaModel) Replacements() []string {
-	return []string{
+func (p *gemma) tensorName(n string) string {
+	return strings.NewReplacer(
"model.embed_tokens", "token_embd", "model.embed_tokens", "token_embd",
"model.norm", "output_norm", "model.norm", "output_norm",
"model.layers", "blk", "model.layers", "blk",
@ -74,10 +76,11 @@ func (p *gemmaModel) Replacements() []string {
"mlp.down_proj", "ffn_down", "mlp.down_proj", "ffn_down",
"mlp.up_proj", "ffn_up", "mlp.up_proj", "ffn_up",
"post_attention_layernorm", "ffn_norm", "post_attention_layernorm", "ffn_norm",
-	}
+		"block_sparse_moe.gate", "ffn_inp",
+	).Replace(n)
}

-func (*gemmaModel) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
+func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data)) n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data))
ones := tensor.Ones(tensor.Float32, int(shape[0])) ones := tensor.Ones(tensor.Float32, int(shape[0]))


@ -1,43 +0,0 @@
package convert
import (
"github.com/ollama/ollama/llm"
)
type gemma2Model struct {
gemmaModel
SlidingWindow uint32 `json:"sliding_window"`
AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"`
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
}
func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
kv := p.ModelParameters.KV(t)
kv["general.architecture"] = "gemma2"
kv["gemma2.context_length"] = p.MaxPositionEmbeddings
kv["gemma2.embedding_length"] = p.HiddenSize
kv["gemma2.block_count"] = p.HiddenLayers
kv["gemma2.feed_forward_length"] = p.IntermediateSize
kv["gemma2.attention.head_count"] = p.NumAttentionHeads
kv["gemma2.attention.head_count_kv"] = p.NumKeyValueHeads
kv["gemma2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
kv["gemma2.attention.key_length"] = p.HeadDim
kv["gemma2.attention.value_length"] = p.HeadDim
kv["gemma2.attention.sliding_window"] = p.SlidingWindow
kv["gemma2.attn_logit_softcapping"] = p.AttentionLogitSoftcap
kv["gemma2.final_logit_softcapping"] = p.FinalLogitSoftcap
kv["tokenizer.ggml.eot_token_id"] = uint32(107)
kv["tokenizer.ggml.middle_token_id"] = uint32(68)
kv["tokenizer.ggml.prefix_token_id"] = uint32(67)
kv["tokenizer.ggml.suffix_token_id"] = uint32(69)
return kv
}
func (p *gemma2Model) Replacements() []string {
return append(
p.gemmaModel.Replacements(),
"post_attention_layernorm", "post_attention_norm",
"pre_feedforward_layernorm", "ffn_norm",
"post_feedforward_layernorm", "post_ffw_norm",
)
}


@ -1,91 +0,0 @@
package convert
import (
"strings"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/llm"
)
type gemma2Adapter struct {
AdapterParameters
}
var _ AdapterConverter = (*gemma2Adapter)(nil)
func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
kv := p.AdapterParameters.KV()
kv["general.architecture"] = "gemma2"
return kv
}
func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
for _, t := range ts {
shape := t.Shape()
if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
(strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) {
shape[0], shape[1] = shape[1], shape[0]
t.SetRepacker(p.repack)
}
out = append(out, llm.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: t.Shape(),
WriterTo: t,
})
}
return out
}
func (p *gemma2Adapter) Replacements() []string {
return []string{
"base_model.model.", "",
"model.layers", "blk",
"self_attn.q_proj", "attn_q",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.o_proj", "attn_output",
"mlp.gate_proj", "ffn_gate",
"mlp.down_proj", "ffn_down",
"mlp.up_proj", "ffn_up",
"lora_A.weight", "weight.lora_a",
"lora_B.weight", "weight.lora_b",
"lora_a", "weight.lora_a",
"lora_b", "weight.lora_b",
}
}
func (p *gemma2Adapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
dims := []int{int(shape[1]), int(shape[0])}
n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
if err := n.T(1, 0); err != nil {
return nil, err
}
if err := n.Reshape(dims...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
ts, err := native.SelectF32(n, 1)
if err != nil {
return nil, err
}
var f32s []float32
for _, t := range ts {
f32s = append(f32s, t...)
}
return f32s, nil
}


@ -3,7 +3,6 @@ package convert
import ( import (
"cmp" "cmp"
"fmt" "fmt"
"math"
"strings" "strings"
"github.com/pdevine/tensor" "github.com/pdevine/tensor"
@ -12,8 +11,8 @@ import (
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
) )
-type llamaModel struct {
-	ModelParameters
+type llama struct {
+	Parameters
NLayers uint32 `json:"n_layers"` NLayers uint32 `json:"n_layers"`
NumHiddenLayers uint32 `json:"num_hidden_layers"` NumHiddenLayers uint32 `json:"num_hidden_layers"`
NLayer uint32 `json:"n_layer"` NLayer uint32 `json:"n_layer"`
@ -28,14 +27,8 @@ type llamaModel struct {
NumKeyValueHeads uint32 `json:"num_key_value_heads"` NumKeyValueHeads uint32 `json:"num_key_value_heads"`
RopeTheta float32 `json:"rope_theta"` RopeTheta float32 `json:"rope_theta"`
RopeScaling struct { RopeScaling struct {
Type string `json:"type"` Type string `json:"type"`
-		RopeType                        string  `json:"rope_type"`
-		Factor                          float32 `json:"factor"`
-		LowFrequencyFactor              float32 `json:"low_freq_factor"`
-		HighFrequencyFactor             float32 `json:"high_freq_factor"`
-		OriginalMaxPositionalEmbeddings uint32  `json:"original_max_positional_embeddings"`
-		factors ropeFactor
+		Factor float32 `json:"factor"`
} `json:"rope_scaling"` } `json:"rope_scaling"`
RMSNormEPS float32 `json:"rms_norm_eps"` RMSNormEPS float32 `json:"rms_norm_eps"`
LayerNormEPS float32 `json:"layer_norm_eps"` LayerNormEPS float32 `json:"layer_norm_eps"`
@ -44,11 +37,12 @@ type llamaModel struct {
HeadDim uint32 `json:"head_dim"` HeadDim uint32 `json:"head_dim"`
} }
-var _ ModelConverter = (*llamaModel)(nil)
+var _ Converter = (*llama)(nil)

-func (p *llamaModel) KV(t *Tokenizer) llm.KV {
-	kv := p.ModelParameters.KV(t)
+func (p *llama) KV(t *Tokenizer) llm.KV {
+	kv := p.Parameters.KV(t)
	kv["general.architecture"] = "llama"
+	kv["general.name"] = "llama"
kv["llama.vocab_size"] = p.VocabSize kv["llama.vocab_size"] = p.VocabSize
kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
@ -77,27 +71,6 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
if p.RopeScaling.Type == "linear" { if p.RopeScaling.Type == "linear" {
kv["llama.rope.scaling.type"] = p.RopeScaling.Type kv["llama.rope.scaling.type"] = p.RopeScaling.Type
kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor
} else if p.RopeScaling.RopeType == "llama3" {
dim := p.HiddenSize / p.NumAttentionHeads
for i := uint32(0); i < dim; i += 2 {
factor := cmp.Or(p.RopeScaling.Factor, 8.0)
factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 1.0)
factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0)
original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192)
lambdaLow := float32(original) / factorLow
lambdaHigh := float32(original) / factorHigh
lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim))
if lambda < float64(lambdaHigh) {
p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0)
} else if lambda > float64(lambdaLow) {
p.RopeScaling.factors = append(p.RopeScaling.factors, factor)
} else {
smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow)
p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth))
}
}
} }
if p.NumKeyValueHeads > 0 { if p.NumKeyValueHeads > 0 {
@ -120,26 +93,17 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
return kv return kv
} }
-func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
+func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor var out []llm.Tensor
if p.RopeScaling.factors != nil {
out = append(out, llm.Tensor{
Name: "rope_freqs.weight",
Kind: 0,
Shape: []uint64{uint64(len(p.RopeScaling.factors))},
WriterTo: p.RopeScaling.factors,
})
}
for _, t := range ts { for _, t := range ts {
if strings.HasSuffix(t.Name(), "attn_q.weight") || name := p.tensorName(t.Name())
strings.HasSuffix(t.Name(), "attn_k.weight") { if strings.HasSuffix(name, "attn_q.weight") ||
strings.HasSuffix(name, "attn_k.weight") {
t.SetRepacker(p.repack) t.SetRepacker(p.repack)
} }
out = append(out, llm.Tensor{ out = append(out, llm.Tensor{
Name: t.Name(), Name: name,
Kind: t.Kind(), Kind: t.Kind(),
Shape: t.Shape(), Shape: t.Shape(),
WriterTo: t, WriterTo: t,
@ -149,8 +113,8 @@ func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
return out return out
} }
-func (p *llamaModel) Replacements() []string {
-	return []string{
+func (p *llama) tensorName(n string) string {
+	return strings.NewReplacer(
"lm_head", "output", "lm_head", "output",
"model.embed_tokens", "token_embd", "model.embed_tokens", "token_embd",
"model.norm", "output_norm", "model.norm", "output_norm",
@ -164,19 +128,21 @@ func (p *llamaModel) Replacements() []string {
"mlp.down_proj", "ffn_down", "mlp.down_proj", "ffn_down",
"mlp.up_proj", "ffn_up", "mlp.up_proj", "ffn_up",
"post_attention_layernorm", "ffn_norm", "post_attention_layernorm", "ffn_norm",
-	}
+		// mixtral
+		"block_sparse_moe.gate", "ffn_gate_inp",
+	).Replace(n)
}

-func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) {
var dims []int var dims []int
for _, dim := range shape { for _, dim := range shape {
dims = append(dims, int(dim)) dims = append(dims, int(dim))
} }
var heads uint32 var heads uint32
if strings.HasSuffix(name, "attn_q.weight") { if strings.HasSuffix(name, "q_proj.weight") {
heads = p.NumAttentionHeads heads = p.NumAttentionHeads
} else if strings.HasSuffix(name, "attn_k.weight") { } else if strings.HasSuffix(name, "k_proj.weight") {
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
} else { } else {
return nil, fmt.Errorf("unknown tensor for repack: %s", name) return nil, fmt.Errorf("unknown tensor for repack: %s", name)
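As an aside on the llama3 rope-scaling branch removed from KV earlier in this file: for each even index i < d it computes a wavelength and a per-frequency factor, restating the removed Go code roughly as

\lambda_i = 2\pi\,\theta^{\,i/d}, \qquad
f_i =
\begin{cases}
1 & \lambda_i < L/\beta_{\mathrm{high}} \\
s & \lambda_i > L/\beta_{\mathrm{low}} \\
\left(\frac{1-m}{s} + m\right)^{-1}, \; m = \frac{L/\lambda_i - \beta_{\mathrm{low}}}{\beta_{\mathrm{high}} - \beta_{\mathrm{low}}} & \text{otherwise}
\end{cases}

where \theta is rope_theta, d = hidden_size / num_attention_heads, L is original_max_positional_embeddings (8192 by default), s is factor (8), \beta_low is low_freq_factor (1) and \beta_high is high_freq_factor (4); the resulting f_i are emitted as the rope_freqs.weight tensor in Tensors.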


@ -1,169 +0,0 @@
package convert
import (
"cmp"
"strings"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/ollama/ollama/llm"
)
type llamaAdapter struct {
AdapterParameters
NumAttentionHeads uint32 `json:"num_attention_heads"`
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
}
var _ AdapterConverter = (*llamaAdapter)(nil)
func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
kv := p.AdapterParameters.KV()
kv["general.architecture"] = "llama"
kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
kv["llama.attention.head_count_kv"] = baseKV["llama.attention.head_count_kv"]
p.NumAttentionHeads = baseKV["llama.attention.head_count"].(uint32)
return kv
}
func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
var out []llm.Tensor
for _, t := range ts {
shape := t.Shape()
if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
(strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) {
shape[0], shape[1] = shape[1], shape[0]
t.SetRepacker(p.repackAndTranspose)
} else {
t.SetRepacker(p.repack)
}
out = append(out, llm.Tensor{
Name: t.Name(),
Kind: t.Kind(),
Shape: shape,
WriterTo: t,
})
}
return out
}
func (p *llamaAdapter) Replacements() []string {
return []string{
"base_model.model.", "",
"model.layers", "blk",
"self_attn.q_proj", "attn_q",
"self_attn.k_proj", "attn_k",
"self_attn.v_proj", "attn_v",
"self_attn.o_proj", "attn_output",
"mlp.gate_proj", "ffn_gate",
"mlp.down_proj", "ffn_down",
"mlp.up_proj", "ffn_up",
"lora_A.weight", "weight.lora_a",
"lora_B.weight", "weight.lora_b",
"lora_a", "weight.lora_a",
"lora_b", "weight.lora_b",
}
}
func (p *llamaAdapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
dims := []int{int(shape[1]), int(shape[0])}
var heads uint32
if strings.HasSuffix(name, "attn_q.weight.lora_a") {
heads = p.NumAttentionHeads
} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
} else {
return data, nil
}
n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
return nil, err
}
if err := n.T(0, 2, 1, 3); err != nil {
return nil, err
}
if err := n.Reshape(dims...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
ts, err := native.SelectF32(n, 1)
if err != nil {
return nil, err
}
var f32s []float32
for _, t := range ts {
f32s = append(f32s, t...)
}
return f32s, nil
}
func (p *llamaAdapter) repackAndTranspose(name string, data []float32, shape []uint64) ([]float32, error) {
dims := []int{int(shape[1]), int(shape[0])}
n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
var heads uint32
if strings.HasSuffix(name, "attn_q.weight.lora_a") {
heads = p.NumAttentionHeads
} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
}
if heads > 0 {
if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
return nil, err
}
if err := n.T(0, 2, 1, 3); err != nil {
return nil, err
}
if err := n.Reshape(dims...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
}
if err := n.T(1, 0); err != nil {
return nil, err
}
if err := n.Reshape(dims...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
ts, err := native.SelectF32(n, 1)
if err != nil {
return nil, err
}
var f32s []float32
for _, t := range ts {
f32s = append(f32s, t...)
}
return f32s, nil
}


@ -9,14 +9,16 @@ import (
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
) )
-type mixtralModel struct {
-	llamaModel
+type mixtral struct {
+	llama
NumLocalExperts uint32 `json:"num_local_experts"` NumLocalExperts uint32 `json:"num_local_experts"`
NumExpertsPerToken uint32 `json:"num_experts_per_tok"` NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
} }
-func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
-	kv := p.llamaModel.KV(t)
+var _ Converter = (*mixtral)(nil)
+
+func (p *mixtral) KV(t *Tokenizer) llm.KV {
+	kv := p.llama.KV(t)
if p.NumLocalExperts > 0 { if p.NumLocalExperts > 0 {
kv["llama.expert_count"] = p.NumLocalExperts kv["llama.expert_count"] = p.NumLocalExperts
@ -29,7 +31,7 @@ func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
return kv return kv
} }
-func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
+func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor {
oldnew := []string{ oldnew := []string{
"model.layers", "blk", "model.layers", "blk",
"w1", "ffn_gate_exps", "w1", "ffn_gate_exps",
@ -67,14 +69,7 @@ func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
}) })
} }
-	return append(out, p.llamaModel.Tensors(ts)...)
+	return append(out, p.llama.Tensors(ts)...)
}
func (p *mixtralModel) Replacements() []string {
return append(
p.llamaModel.Replacements(),
"block_sparse_moe.gate", "ffn_gate_inp",
)
} }
type experts []Tensor type experts []Tensor


@ -11,8 +11,8 @@ import (
"github.com/ollama/ollama/llm" "github.com/ollama/ollama/llm"
) )
-type phi3Model struct {
-	ModelParameters
+type phi3 struct {
+	Parameters
NumHiddenLayers uint32 `json:"num_hidden_layers"` NumHiddenLayers uint32 `json:"num_hidden_layers"`
NLayers uint32 `json:"n_layers"` NLayers uint32 `json:"n_layers"`
HiddenSize uint32 `json:"hidden_size"` HiddenSize uint32 `json:"hidden_size"`
@ -35,11 +35,12 @@ type phi3Model struct {
SlidingWindow uint32 `json:"sliding_window"` SlidingWindow uint32 `json:"sliding_window"`
} }
-var _ ModelConverter = (*phi3Model)(nil)
+var _ Converter = (*phi3)(nil)

-func (p *phi3Model) KV(t *Tokenizer) llm.KV {
-	kv := p.ModelParameters.KV(t)
+func (p *phi3) KV(t *Tokenizer) llm.KV {
+	kv := p.Parameters.KV(t)
	kv["general.architecture"] = "phi3"
+	kv["general.name"] = "phi3"
kv["phi3.context_length"] = p.MaxPositionEmbeddings kv["phi3.context_length"] = p.MaxPositionEmbeddings
kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
kv["phi3.feed_forward_length"] = p.IntermediateSize kv["phi3.feed_forward_length"] = p.IntermediateSize
@ -68,12 +69,13 @@ func (p *phi3Model) KV(t *Tokenizer) llm.KV {
return kv return kv
} }
-func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
+func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
var addRopeFactors sync.Once var addRopeFactors sync.Once
out := make([]llm.Tensor, 0, len(ts)+2) out := make([]llm.Tensor, 0, len(ts)+2)
for _, t := range ts { for _, t := range ts {
if strings.HasPrefix(t.Name(), "blk.0.") { name := p.tensorName(t.Name())
if strings.HasPrefix(name, "blk.0.") {
addRopeFactors.Do(func() { addRopeFactors.Do(func() {
out = append(out, llm.Tensor{ out = append(out, llm.Tensor{
Name: "rope_factors_long.weight", Name: "rope_factors_long.weight",
@ -90,7 +92,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
} }
out = append(out, llm.Tensor{ out = append(out, llm.Tensor{
-			Name:     t.Name(),
+			Name:     name,
Kind: t.Kind(), Kind: t.Kind(),
Shape: t.Shape(), Shape: t.Shape(),
WriterTo: t, WriterTo: t,
@ -100,8 +102,8 @@ func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
return out return out
} }
-func (p *phi3Model) Replacements() []string {
-	return []string{
+func (p *phi3) tensorName(n string) string {
+	return strings.NewReplacer(
"lm_head", "output", "lm_head", "output",
"model.embed_tokens", "token_embd", "model.embed_tokens", "token_embd",
"model.norm", "output_norm", "model.norm", "output_norm",
@ -112,7 +114,7 @@ func (p *phi3Model) Replacements() []string {
"mlp.down_proj", "ffn_down", "mlp.down_proj", "ffn_down",
"mlp.gate_up_proj", "ffn_up", "mlp.gate_up_proj", "ffn_up",
"post_attention_layernorm", "ffn_norm", "post_attention_layernorm", "ffn_norm",
-	}
+	).Replace(n)
} }
type ropeFactor []float32 type ropeFactor []float32


@ -1,9 +1,7 @@
package convert package convert
import ( import (
"bytes"
"crypto/sha256" "crypto/sha256"
"encoding/binary"
"encoding/hex" "encoding/hex"
"encoding/json" "encoding/json"
"flag" "flag"
@ -31,7 +29,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
} }
defer f.Close() defer f.Close()
-	if err := ConvertModel(fsys, f); err != nil {
+	if err := Convert(fsys, f); err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -53,34 +51,6 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
return r, m.KV(), m.Tensors() return r, m.KV(), m.Tensors()
} }
func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors llm.Tensors) map[string]string {
actual := make(map[string]string)
for k, v := range kv {
if s, ok := v.(json.Marshaler); !ok {
actual[k] = fmt.Sprintf("%v", v)
} else {
bts, err := json.Marshal(s)
if err != nil {
t.Fatal(err)
}
actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts))
}
}
for _, tensor := range tensors.Items {
sha256sum := sha256.New()
sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
if _, err := io.Copy(sha256sum, sr); err != nil {
t.Fatal(err)
}
actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil))
}
return actual
}
func TestMain(m *testing.M) { func TestMain(m *testing.M) {
var level slog.Level var level slog.Level
flag.TextVar(&level, "level", slog.LevelInfo, "log level") flag.TextVar(&level, "level", slog.LevelInfo, "log level")
@ -92,14 +62,11 @@ func TestMain(m *testing.M) {
func TestConvertFull(t *testing.T) { func TestConvertFull(t *testing.T) {
cases := []string{ cases := []string{
"Meta-Llama-3-8B-Instruct", "Meta-Llama-3-8B-Instruct",
"Meta-Llama-3.1-8B-Instruct",
"Mistral-7B-Instruct-v0.2", "Mistral-7B-Instruct-v0.2",
"Mixtral-8x7B-Instruct-v0.1", "Mixtral-8x7B-Instruct-v0.1",
"gemma-2b-it", "gemma-2b-it",
// microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8 // microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8
"Phi-3-mini-128k-instruct", "Phi-3-mini-128k-instruct",
"all-MiniLM-L6-v2",
"gemma-2-9b-it",
} }
for i := range cases { for i := range cases {
@ -115,7 +82,29 @@ func TestConvertFull(t *testing.T) {
} }
f, kv, tensors := convertFull(t, os.DirFS(p)) f, kv, tensors := convertFull(t, os.DirFS(p))
-			actual := generateResultsJSON(t, f, kv, tensors)
+			actual := make(map[string]string)
for k, v := range kv {
if s, ok := v.(json.Marshaler); !ok {
actual[k] = fmt.Sprintf("%v", v)
} else {
bts, err := json.Marshal(s)
if err != nil {
t.Fatal(err)
}
actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts))
}
}
for _, tensor := range tensors.Items {
sha256sum := sha256.New()
sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
if _, err := io.Copy(sha256sum, sr); err != nil {
t.Fatal(err)
}
actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil))
}
expectFile, err := os.Open(filepath.Join("testdata", fmt.Sprintf("%s.json", tt))) expectFile, err := os.Open(filepath.Join("testdata", fmt.Sprintf("%s.json", tt)))
if err != nil { if err != nil {
@ -139,209 +128,3 @@ func TestConvertFull(t *testing.T) {
}) })
} }
} }
func TestConvertAdapter(t *testing.T) {
type AdapterCase struct {
Name string
BaseKV map[string]any
Expected map[string]string
}
cases := []AdapterCase{
{
Name: "discollama",
BaseKV: map[string]any{
"general.architecture": "llama",
"llama.attention.head_count": uint32(32),
"llama.attention.head_count_kv": uint32(8),
},
Expected: map[string]string{
"general.architecture": "llama",
"general.file_type": "1",
"general.parameter_count": "106496",
"general.type": "adapter",
"general.version": "v0.2",
"adapter.lora.alpha": "16",
"adapter.type": "lora",
"llama.attention.head_count": "32",
"llama.attention.head_count_kv": "8",
"blk.31.attn_q.weight.lora_a": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
"blk.31.attn_q.weight.lora_b": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
"blk.31.attn_v.weight.lora_a": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
"blk.31.attn_v.weight.lora_b": "071dcafe89df065d6e1c935ecb8fdf6479b3c202eb912e7da938597673ff5857",
},
},
}
for _, c := range cases {
t.Run(c.Name, func(t *testing.T) {
t.Parallel()
f, err := os.CreateTemp(t.TempDir(), "f16")
if err != nil {
t.Fatal(err)
}
defer f.Close()
tempDir := t.TempDir()
generateLoraTestData(t, tempDir)
if err = ConvertAdapter(os.DirFS(tempDir), f, c.BaseKV); err != nil {
t.Fatal(err)
}
r, err := os.Open(f.Name())
if err != nil {
t.Fatal(err)
}
defer r.Close()
m, _, err := llm.DecodeGGML(r, math.MaxInt)
if err != nil {
t.Fatal(err)
}
if _, err := r.Seek(0, io.SeekStart); err != nil {
t.Fatal(err)
}
actual := generateResultsJSON(t, r, m.KV(), m.Tensors())
keys := maps.Keys(c.Expected)
slices.Sort(keys)
for _, k := range keys {
if v, ok := actual[k]; !ok {
t.Errorf("missing %s", k)
} else if v != c.Expected[k] {
t.Errorf("unexpected %s: want %s, got %s", k, c.Expected[k], v)
}
}
})
}
}
func generateLoraTestData(t *testing.T, tempDir string) {
type tensorData struct {
Offsets []int `json:"data_offsets"`
Type string `json:"dtype"`
Shape []int `json:"shape"`
}
offset := 4096 * 8 * 4
td := map[string]*tensorData{"__metadata__": nil}
td["model.layers.31.self_attn.q_proj.lora_a"] = &tensorData{
Offsets: []int{0, offset},
Type: "F32",
Shape: []int{4096, 8},
}
td["model.layers.31.self_attn.q_proj.lora_b"] = &tensorData{
Offsets: []int{offset, offset * 2},
Type: "F32",
Shape: []int{8, 4096},
}
td["model.layers.31.self_attn.v_proj.lora_a"] = &tensorData{
Offsets: []int{offset * 2, offset * 3},
Type: "F32",
Shape: []int{4096, 8},
}
td["model.layers.31.self_attn.v_proj.lora_b"] = &tensorData{
Offsets: []int{offset * 3, offset*3 + 8*1024*4},
Type: "F32",
Shape: []int{8, 1024},
}
data, err := json.Marshal(td)
if err != nil {
t.Fatal(err)
}
var buf bytes.Buffer
l := int64(len(data))
err = binary.Write(&buf, binary.LittleEndian, l)
if err != nil {
t.Fatal(err)
}
_, err = buf.Write(data)
if err != nil {
t.Fatal(err)
}
// write some data for the tensors
ones := make([]float32, 4096*8)
for i := range ones {
ones[i] = float32(1)
}
for range 3 {
err = binary.Write(&buf, binary.LittleEndian, ones)
if err != nil {
t.Fatal(err)
}
}
ones = make([]float32, 1024*8)
for i := range ones {
ones[i] = float32(1)
}
err = binary.Write(&buf, binary.LittleEndian, ones)
if err != nil {
t.Fatal(err)
}
fdata, err := os.Create(filepath.Join(tempDir, "adapters.safetensors"))
if err != nil {
t.Fatal(err)
}
defer fdata.Close()
_, err = fdata.Write(buf.Bytes())
if err != nil {
t.Fatal(err)
}
configData := `
{
"adapter_path": "adapters-test",
"batch_size": 8,
"config": "config-tiny.json",
"data": "../discollama-completion",
"grad_checkpoint": null,
"iters": 1000,
"learning_rate": 1e-05,
"lora_layers": 1,
"lora_parameters": {
"rank": 8,
"alpha": 16,
"dropout": 0.0,
"scale": 2.0
},
"lr_schedule": null,
"max_seq_length": 2048,
"model": "/Users/pdevine/git/Meta-Llama-3-8B-Instruct",
"resume_adapter_file": null,
"save_every": 100,
"seed": 0,
"steps_per_eval": 200,
"steps_per_report": 10,
"test": false,
"test_batches": 500,
"train": true,
"use_dora": false,
"val_batches": 25
}
`
f, err := os.Create(filepath.Join(tempDir, "adapter_config.json"))
if err != nil {
t.Fatal(err)
}
defer f.Close()
_, err = f.WriteString(configData)
if err != nil {
t.Fatal(err)
}
}
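The generateLoraTestData helper above hand-writes a minimal safetensors file: an 8-byte little-endian header length, the JSON header itself, then raw little-endian float32 tensor data. A small sketch of reading such a header back, under those same assumptions (the package and function name are hypothetical, not part of the repository):

package convertutil

import (
	"encoding/binary"
	"io"
)

// readSafetensorsHeader returns the raw JSON header of a safetensors stream,
// assuming the layout written above: an 8-byte little-endian length followed
// by that many bytes of JSON describing dtype, shape and data offsets.
func readSafetensorsHeader(r io.Reader) ([]byte, error) {
	var n uint64
	if err := binary.Read(r, binary.LittleEndian, &n); err != nil {
		return nil, err
	}
	hdr := make([]byte, n)
	if _, err := io.ReadFull(r, hdr); err != nil {
		return nil, err
	}
	return hdr, nil
}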


@ -35,9 +35,7 @@ const (
) )
func (t tensorBase) Kind() uint32 { func (t tensorBase) Kind() uint32 {
if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") || if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") {
t.name == "token_types.weight" {
// these tensors are always F32
return 0 return 0
} }
@ -57,15 +55,13 @@ func (t *tensorBase) SetRepacker(fn repacker) {
type repacker func(string, []float32, []uint64) ([]float32, error) type repacker func(string, []float32, []uint64) ([]float32, error)
-func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
+func parseTensors(fsys fs.FS) ([]Tensor, error) {
patterns := []struct { patterns := []struct {
Pattern string Pattern string
-		Func    func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error)
+		Func    func(fs.FS, ...string) ([]Tensor, error)
}{ }{
{"model-*-of-*.safetensors", parseSafetensors}, {"model-*-of-*.safetensors", parseSafetensors},
{"model.safetensors", parseSafetensors}, {"model.safetensors", parseSafetensors},
{"adapters.safetensors", parseSafetensors},
{"adapter_model.safetensors", parseSafetensors},
{"pytorch_model-*-of-*.bin", parseTorch}, {"pytorch_model-*-of-*.bin", parseTorch},
{"pytorch_model.bin", parseTorch}, {"pytorch_model.bin", parseTorch},
{"consolidated.*.pth", parseTorch}, {"consolidated.*.pth", parseTorch},
@ -78,7 +74,7 @@ func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
} }
if len(matches) > 0 { if len(matches) > 0 {
-			return pattern.Func(fsys, replacer, matches...)
+			return pattern.Func(fsys, matches...)
} }
} }
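As a side note on the Replacements mechanism being dropped here: on the left-hand side, parseTensors receives a strings.Replacer built from a converter's Replacements() pairs and uses it to rewrite raw tensor names as they are parsed. A minimal self-contained sketch, using pairs from the llama Replacements list shown earlier in this diff:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Pairs taken from the llama Replacements() list shown earlier.
	r := strings.NewReplacer(
		"model.layers", "blk",
		"self_attn.q_proj", "attn_q",
		"mlp.down_proj", "ffn_down",
	)

	// A raw safetensors name becomes the GGUF-style name used by the converter.
	fmt.Println(r.Replace("model.layers.0.self_attn.q_proj.weight"))
	// Output: blk.0.attn_q.weight
}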


@ -8,7 +8,6 @@ import (
"io" "io"
"io/fs" "io/fs"
"slices" "slices"
"strings"
"github.com/d4l3k/go-bfloat16" "github.com/d4l3k/go-bfloat16"
"github.com/x448/float16" "github.com/x448/float16"
@ -21,7 +20,7 @@ type safetensorMetadata struct {
Offsets []int64 `json:"data_offsets"` Offsets []int64 `json:"data_offsets"`
} }
-func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
+func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
var ts []Tensor var ts []Tensor
for _, p := range ps { for _, p := range ps {
f, err := fsys.Open(p) f, err := fsys.Open(p)
@ -57,7 +56,7 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
offset: safetensorsPad(n, value.Offsets[0]), offset: safetensorsPad(n, value.Offsets[0]),
size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]), size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]),
tensorBase: &tensorBase{ tensorBase: &tensorBase{
-				name:  replacer.Replace(key),
+				name:  key,
shape: value.Shape, shape: value.Shape,
}, },
}) })


@ -3,13 +3,12 @@ package convert
import ( import (
"io" "io"
"io/fs" "io/fs"
"strings"
"github.com/nlpodyssey/gopickle/pytorch" "github.com/nlpodyssey/gopickle/pytorch"
"github.com/nlpodyssey/gopickle/types" "github.com/nlpodyssey/gopickle/types"
) )
-func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
+func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) {
var ts []Tensor var ts []Tensor
for _, p := range ps { for _, p := range ps {
pt, err := pytorch.Load(p) pt, err := pytorch.Load(p)
@ -28,7 +27,7 @@ func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor,
ts = append(ts, torch{ ts = append(ts, torch{
storage: t.(*pytorch.Tensor).Source, storage: t.(*pytorch.Tensor).Source,
tensorBase: &tensorBase{ tensorBase: &tensorBase{
-				name:  replacer.Replace(k.(string)),
+				name:  k.(string),
shape: shape, shape: shape,
}, },
}) })


@ -1,3 +0,0 @@
{
"rope_freqs.weight": "80fd5efb2f729381785b293a091a268cfeceb0079167f6ece9b07070e662b222"
}


@ -1,124 +0,0 @@
{
"general.architecture": "bert",
"general.file_type": "1",
"general.quantization_version": "2",
"bert.attention.causal": "false",
"bert.attention.head_count": "12",
"bert.attention.layer_norm_epsilon": "1e-12",
"bert.block_count": "6",
"bert.context_length": "512",
"bert.embedding_length": "384",
"bert.feed_forward_length": "1536",
"bert.pooling_type": "1",
"tokenizer.ggml.model": "bert",
"tokenizer.ggml.padding_token_id": "0",
"tokenizer.ggml.unknown_token_id": "100",
"tokenizer.ggml.cls_token_id": "101",
"tokenizer.ggml.seperator_token_id": "102",
"tokenizer.ggml.mask_token_id": "103",
"tokenizer.ggml.token_type_count": "2",
"tokenizer.ggml.scores": "6db964fe67338aca57790481a390121ff3dd643eebe49f7dd308029ad99abb6f",
"tokenizer.ggml.token_type": "98d247c5404b6b18f05f133b92dd56edf6efefefac326794b00d7b351f6c5aa1",
"tokenizer.ggml.tokens": "9efe405e229a45ff9916f54c475d151d2200cd2ab0006f347abfb069cf096c86",
"token_embd.weight": "8c1ee80a9ea4f65aa385ba30112010068af3d209bebc6e149d3d4589c2cd0a5a",
"position_embd.weight": "6c516f0b1c4e2388ab90394dd80ad69e4e4509b890982fc3408108ae66210eb6",
"token_types.weight": "f879f8e422ed211948f28b560d3c5e17aae7993f063b51196a28cf5c0fb3da21",
"token_embd_norm.weight": "75076e095d717aab96f8b6beeee503c27940d9a76f2b891a0e3de72f8a6043e4",
"token_embd_norm.bias": "298735285ffe944e1bf03e5d35c7280326b85cf121bde9874f1af5dc51ab939d",
"blk.0.attn_q.weight": "ab0923ce4c1549175112dcdfcc860fe30137f991e03ea6857fb5993670adaf6c",
"blk.0.attn_q.bias": "a3ec29551dabf976e1d34256b8ab5ab7b758f3ed9742c3cafdbd984d5441df62",
"blk.0.attn_k.weight": "4c1038a6d035c3e9ffed7fa672b614627814752503755fbad0cfb76a41ad71ba",
"blk.0.attn_k.bias": "e0363930eb588d91816aa3d230bb03b6e2551c165117b80b8d60397413819ef9",
"blk.0.attn_v.weight": "425e2e53e3f00ce98d29c3e6a161eb55d3e6ae0d96fdb9f6242d1c4fd6eef4b3",
"blk.0.attn_v.bias": "6579173a1e65ee124fbd0bd53cbdca4225515b4f2c5f18fb1bfd000f5978f9bb",
"blk.0.attn_output.weight": "a6d70a08cd7164de5d12af65d86d657c3db35aaecde778b2b3fda9193c4c9802",
"blk.0.attn_output.bias": "2b8d12c4f9a9c5bfaa29c597839568f6e0525cb41eeaf64ddeb6bd84dfeb9701",
"blk.0.attn_output_norm.weight": "bbe6e502a473228b525aeed26cc31b7db123ad63bdc5a6eebac6ea70b8b51d62",
"blk.0.attn_output_norm.bias": "36eaacaf0007c5c62daea97aab0115390c0682914f78482e37eb76885f4b7a50",
"blk.0.ffn_up.weight": "24654561c76ce387d125759ba843f06b904ef721fcceaeff6ccc62180a48e874",
"blk.0.ffn_up.bias": "fd3f0126aa1d95768fa60eb6f4ab8a2763cfcb7e5405f35b92353031d86f4d34",
"blk.0.ffn_down.weight": "97a829763a6a5bf3329ceb4d39c424ba4787d61653a5b0bbd1f84782e4d4e0ca",
"blk.0.ffn_down.bias": "7aa980c30ae8b4ee7f69df28808dbf5c431f56ccc4a80340f644a0419f16c054",
"blk.0.layer_output_norm.weight": "ef30dad4c2a083ae1ff5039a2a6cda60ecc89bf1e486a6f8c0d15f50589603f8",
"blk.0.layer_output_norm.bias": "8b1b77e67568b1bce43fc476de1b177c53ff688d66beb66995e8eb3dc290da8a",
"blk.1.attn_q.weight": "284331622a1f6f9b87ccee4f652bd66a394ca493c4d93be4d1844e4f6159ad10",
"blk.1.attn_q.bias": "e24ebd4860330e08f6bfdd077a82db0bee33f4c8846cf1db26327a34754c7069",
"blk.1.attn_k.weight": "729dd0d555544b5bd0f7580b3c8b384256b974605f0e7487b95f295aa032997d",
"blk.1.attn_k.bias": "2aa51a828a858f35473f54477583fea54ce2ccc34ea60fbd1d228fbe9bca827f",
"blk.1.attn_v.weight": "6be304671cc311d5ca5c103f2b51467ee800c589bc5b8101e09ff5aed1f68c21",
"blk.1.attn_v.bias": "43bcbab78a8819e07f723bc9e5b737b71e87a7594f15234e882b63e327a64199",
"blk.1.attn_output.weight": "15ec8a1a12b26c9976445308a09f748ab0e4bef0f583d13ab08c3129f8738d73",
"blk.1.attn_output.bias": "dac2146f4baa6ed16f6c0dc7443831fb7ec79bedcceafd80d1a4b628a1bb072d",
"blk.1.attn_output_norm.weight": "d2151eb33bffac536787a4c9a5d2b31c7a80b17c4611877842a3cce2cd6e98d8",
"blk.1.attn_output_norm.bias": "31e1b779716dafb855d2cf5631ee168a0ccf372eb9c6ea6091f66fa97a9b9d2d",
"blk.1.ffn_up.weight": "a57547fc3fc3b77406f5cdcb0c87af9bc184701f175c39c1f35297826fce3cc7",
"blk.1.ffn_up.bias": "123be6d541d086202913c75d878c54d59a749f3af7b58f7ef9eb9e7c62a24c9a",
"blk.1.ffn_down.weight": "cfdb79788377e5cbded8790cd41b9e66c397ecab75474071fcd7cf32d30f9613",
"blk.1.ffn_down.bias": "bcb58315519a573097960891c9ae41cf4c685ab78c3e0e77471471758a7eae88",
"blk.1.layer_output_norm.weight": "819b554271452bfb1d84c2603b90377b2e41a0ac1e3aa8b417ccf9dce63375bd",
"blk.1.layer_output_norm.bias": "47a3433ac27f5ce8947fb38dd491f3706df4ef6adb0ddf74612bf0f54b19e164",
"blk.2.attn_q.weight": "1557a9ea852b1880551f7290e00aded4f35e6c4180fdcbed1b0039bf805f639e",
"blk.2.attn_q.bias": "c3bfe5f3066f655fd36b055530997b59ff33ef013563aaeb3cb8ff07dabd59a9",
"blk.2.attn_k.weight": "cfd08eb69c61ae2f9f14f9b7ff5c5394ca264b1a9f3d48156677f90dd1766289",
"blk.2.attn_k.bias": "9b839bc0e79974a0b3f5d1895972bc6f5c9a1bc16052e1af786e6a530758152d",
"blk.2.attn_v.weight": "02b26b1208480eaeeb00e7b4cf8b690006ca14759357fc44ed4a2a8924ead993",
"blk.2.attn_v.bias": "e7e6f0089fded1659a867ab736c220d9653ea7da6b1b94baf5c8d30a748b63ab",
"blk.2.attn_output.weight": "a1db121c7d33806b349cadd050300a57db49fdc91224fd07c9ac43bf4299dc79",
"blk.2.attn_output.bias": "7675128b6a92555cd955c820311e91e9417d31f48848f45d047b4100c62148b3",
"blk.2.attn_output_norm.weight": "5b4595e0fbcba67a700c4331adf746d2fba3546364a4db5607ae241947bb1a21",
"blk.2.attn_output_norm.bias": "7b8e16826ea30e5a2ba0b02e0095a901775981a296e98819625320e983060d08",
"blk.2.ffn_up.weight": "a0d815d946ac07a65095c4ae4df77b818845e6d97795c7d82f55e689d944db59",
"blk.2.ffn_up.bias": "ce37c0a4174d6bf773ded7bd016ede627ad3bdb8bc99b9992a18dc8e8898f252",
"blk.2.ffn_down.weight": "f6231d2a25426fbd45b9f1160aa484220eb227ceef0348c4a6a6de890606e5ef",
"blk.2.ffn_down.bias": "429e00556e8dc63a785238b309b9d83738500c1ef6d736fe6526ad88ea496d27",
"blk.2.layer_output_norm.weight": "651457a573adf3f7dd9ee5dfe1c8e89389e94443993aab77ec6a0b05aa621e35",
"blk.2.layer_output_norm.bias": "41fbbeda7fd89b0cef5f945ae44011c316982390401d6f75ba8c6d365e185247",
"blk.3.attn_q.weight": "95a43f32949d2cb8d22815bb27a44abfc6665ba96221af817dfe058cb6ca72c6",
"blk.3.attn_q.bias": "f4e34385e75d8108b6b3bd336106e2133a8c9be0cc343dfe5dc48c32a823c7cb",
"blk.3.attn_k.weight": "6b892da6a17d4d3265265a15f695864a31813ee8c8e710ae9bc9e1adbc6c9a18",
"blk.3.attn_k.bias": "40b8067b641a56014cee42548240aa8930820958b1933004892b5f04fbaef39e",
"blk.3.attn_v.weight": "9fcd5922319dd2a461082a5ce040c1dfe65d87d70ca6547dd0b46eeecc3eeb2b",
"blk.3.attn_v.bias": "b528c56212e66931fdbe267ac327a9c2f87cd03baff3ea719e30afe681da15f1",
"blk.3.attn_output.weight": "e3b178c1b03981e75510e0d277af23ea59cc404b5394e61bd32291825719b502",
"blk.3.attn_output.bias": "712c84d39a6a5a9c06a09da8fd9939ba0d5525524a4bba61ea4de09b48f45cae",
"blk.3.attn_output_norm.weight": "d1ffac88e675592ff72f8a617be32b4a381d443b2f8f2645dbe44a1e5745aac0",
"blk.3.attn_output_norm.bias": "ea31a1c73146234c50e0e43f485c458413714867b8e2703af66482f7db2d6c40",
"blk.3.ffn_up.weight": "4ef4f3b9a1ea6ab2ef2eb6e8b008e06a44790d099d97482a05a51e39a29afac0",
"blk.3.ffn_up.bias": "06a4296dda16f452675c51f108079fe7722552d6521c737d97734943818b9a2b",
"blk.3.ffn_down.weight": "f114b2bebe392c7d80433bb880c6730293aa4561b0b0370dcdaf7472daebd847",
"blk.3.ffn_down.bias": "2c8e67831d28a3bf613fc7912ae3259b63d72abcaf4d30efd8800758400158de",
"blk.3.layer_output_norm.weight": "a1dfeb7b5a51dd56447312ca41e2ad2f361a3ea12ddc355127f5f4219fb0a482",
"blk.3.layer_output_norm.bias": "1ed630021b25c6c6fc93fd32988b9907df966d4982a93081f639aac3044618ab",
"blk.4.attn_q.weight": "b5fae4c1f9a5f33a2a2e816ac0c01c25f422e4efdd59ef1ed93da2610e5370fc",
"blk.4.attn_q.bias": "c2e376524ea98ac3b10d9eee19ecb1b1e261fa5149efe0232844c923dfb428fb",
"blk.4.attn_k.weight": "a4632f5ebf9321d9d08f9112a4e5dda2efe5671df4a4e67fee24845f5b14af16",
"blk.4.attn_k.bias": "a9a02ffb8b8b4f6dfe487a7e0341f1d5318c9d2b793a688f34cb1b22fc66ef60",
"blk.4.attn_v.weight": "10ad8deb81d9fa093b1e5c0f24ea82aa7df43e6aca49e260fcbea56eab8cc86a",
"blk.4.attn_v.bias": "7326813e181e021130bd33ac136293fcffccce2d1d8cb59041e5b13a8cceacf6",
"blk.4.attn_output.weight": "c92573088c7437c2b3cda51490e152c27fb19e5468df591eabba5a49d5398d44",
"blk.4.attn_output.bias": "14e10b419e5859af1eb685af5c330aee67048cd704dcead9217840c6f5393222",
"blk.4.attn_output_norm.weight": "02b6831c0e0fb0edbc579a92812a1dd972cb15d14fcd382d4427c5a7b300ac44",
"blk.4.attn_output_norm.bias": "7eed5cd503bb6bb6ceb1bc8b07cc077903a4f14fb8b9d6cdf39644815ecf1374",
"blk.4.ffn_up.weight": "8d0c91d62e74d6431321116a37cf3339e630bd50ba164d3304fc4fe8dd831223",
"blk.4.ffn_up.bias": "d325f07f73c005a273c484c7be8e7abb4d6e8a5c4fd093f5869133b97629d017",
"blk.4.ffn_down.weight": "7ba7bd81143f40537b84f938e403e19f30e4928625eb371de052b9025beb4d21",
"blk.4.ffn_down.bias": "2853d9c2a75288214a4bf4907dc19d04d01926f4913d302b1aa7bdbfcce0f7a1",
"blk.4.layer_output_norm.weight": "a4ed1885fa77b90fed5300c355ef0aa0c876a8c747151d9d790939d464d57d4f",
"blk.4.layer_output_norm.bias": "62142a81e813a9e636333b2b805d6bc3b17c5e7cd4b15adce1ada6bc9a32563c",
"blk.5.attn_q.weight": "afc1dff080a72c3daad01384b1448d476aaf789871017c8ff8e144788887995d",
"blk.5.attn_q.bias": "748a820371c1d4f872c84545b36358d239c35bf6c99e2812c237d88c3292763b",
"blk.5.attn_k.weight": "59e30c1ed8acd2cbb01de5f62e7804015b9ecf98ba157d98cab016344639eda5",
"blk.5.attn_k.bias": "f839520078f9e589496e982e86d0126c7aa14196047339abffcf49a696229f77",
"blk.5.attn_v.weight": "3e21fb874e21b90308e1f46af034a3c32d3eba1628d62ae5f2246d6af5818923",
"blk.5.attn_v.bias": "5cd4852bf95c1444d10d756750f6bf49f842c0b39e9953c7f408bb67c325ac8c",
"blk.5.attn_output.weight": "636ce6a7752895f204b9d01ba0aedd9a294f908b42f372c22a16d9dd590d7471",
"blk.5.attn_output.bias": "82d924d4b0d2b94f2bbff91619216d6967a3541ce9b1531a6a60457a67b5d219",
"blk.5.attn_output_norm.weight": "5e7bd0a8d3396080f3360d7c4700bf094a06216431bd014c4479eef72ecf4271",
"blk.5.attn_output_norm.bias": "66c6de5edda5466d029c6753780be81ccd4218bf8bc00680000e0f06856ab712",
"blk.5.ffn_up.weight": "5bbf6e7ea380e216e33f8bee06d25f2265359d3876a300e92bc6e41d48e33430",
"blk.5.ffn_up.bias": "9d795388bb36fb33ad3a37fea3ccb4937838e02800a608fb47d363cd06b47370",
"blk.5.ffn_down.weight": "2fd628974e7f075479dd227b46fbd48ae8d3ca34d735b36f391ac06410730368",
"blk.5.ffn_down.bias": "cd213ba9eaa75fa541648097fbe9c96e58077e6c3ad6ad2fb1f21f8350f44291",
"blk.5.layer_output_norm.weight": "159a9df41d15b7022d136f86a2a2631c4635f9816e957472217077b522bcf52a",
"blk.5.layer_output_norm.bias": "24c1f27ffd1eb4e5be7e3a2909943e6f0980635d761fa1efdd0c19645da23766"
}

View file

@ -1,6 +0,0 @@
{
"general.architecture": "gemma2",
"gemma2.attention.sliding_window": "4096",
"gemma2.attn_logit_softcapping": "50",
"gemma2.final_logit_softcapping": "30"
}

View file

@@ -1,6 +1,7 @@
package convert

import (
+	"cmp"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
@@ -10,8 +11,6 @@ import (
	"log/slog"
	"os"
	"slices"
-
-	"golang.org/x/exp/maps"
)

const (
@@ -185,32 +184,32 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
		return nil, err
	}

-	tokens := make(map[int]token, len(t.Model.Vocab))
+	var tokens []token
	for k, v := range t.Model.Vocab {
-		tokens[v] = token{
+		tokens = append(tokens, token{
			ID:      v,
			Content: k,
-		}
+		})
	}

-	for _, token := range t.AddedTokens {
-		token.UserDefined = true
-		tokens[token.ID] = token
+	for _, t := range t.AddedTokens {
+		t.UserDefined = true
+		tokens = append(tokens, t)
	}

-	keys := maps.Keys(tokens)
-	slices.Sort(keys)
+	slices.SortFunc(tokens, func(i, j token) int {
+		return cmp.Compare(i.ID, j.ID)
+	})

	v := Vocabulary{Model: "gpt2"}
-	for _, k := range keys {
-		token := tokens[k]
-		v.Tokens = append(v.Tokens, token.Content)
-		v.Scores = append(v.Scores, float32(token.ID))
+	for _, t := range tokens {
+		v.Tokens = append(v.Tokens, t.Content)
+		v.Scores = append(v.Scores, float32(t.ID))
		switch {
-		case token.Special:
+		case t.Special:
			v.Types = append(v.Types, tokenTypeControl)
-		case token.UserDefined:
+		case t.UserDefined:
			v.Types = append(v.Types, tokenTypeUserDefined)
		default:
			v.Types = append(v.Types, tokenTypeNormal)

View file

@@ -15,11 +15,6 @@ import (
)

func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
-	ast, err := parseAdditionalSpecialTokens(fsys)
-	if err != nil {
-		return nil, err
-	}
-
	bts, err := fs.ReadFile(fsys, "tokenizer.model")
	if err != nil {
		return nil, err
@@ -42,12 +37,7 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
			sentencepiece.ModelProto_SentencePiece_BYTE:
			v.Types = append(v.Types, int32(t))
		default:
-			tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL)
-			if slices.Contains(ast, piece.GetPiece()) {
-				tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL)
-			}
-
-			v.Types = append(v.Types, tt)
+			v.Types = append(v.Types, int32(sentencepiece.ModelProto_SentencePiece_NORMAL))
		}
	}
@@ -91,23 +81,3 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
	return &v, nil
}
-
-func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) {
-	f, err := fsys.Open("special_tokens_map.json")
-	if errors.Is(err, os.ErrNotExist) {
-		return nil, nil
-	} else if err != nil {
-		return nil, err
-	}
-	defer f.Close()
-
-	var m struct {
-		AdditionalSpecialTokens []string `json:"additional_special_tokens"`
-	}
-
-	if err := json.NewDecoder(f).Decode(&m); err != nil {
-		return nil, err
-	}
-
-	return m.AdditionalSpecialTokens, nil
-}

View file

@@ -111,10 +111,7 @@ On Windows, Ollama inherits your user and system environment variables.

## How do I use Ollama behind a proxy?

-Ollama pulls models from the Internet and may require a proxy server to access the models. Use `HTTPS_PROXY` to redirect outbound requests through the proxy. Ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
-
-> [!NOTE]
-> Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server.
+Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variable, ensure it is set where `ollama serve` can access the values. When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
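For the retained (`+`) guidance above, a minimal sketch of making the proxy visible to `ollama serve` from a shell — the proxy URL and port are placeholders, not values taken from this diff:

```bash
# HTTPS_PROXY must be set in the same environment that runs the server.
export HTTPS_PROXY=https://proxy.example.com:3128
ollama serve
```

If the server runs as a system service rather than a foreground shell, the same variable has to be set in that service's environment for it to take effect.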
### How do I use Ollama behind a proxy in Docker?

@@ -279,4 +276,4 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit

## How does Ollama load models on multiple GPUs?

Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.

View file

@@ -20,12 +20,13 @@ GPU.

## Manual install

-### Download `ollama`
+### Download the `ollama` binary

-Download and extract the Linux package:
+Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:

```bash
-curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
+sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
+sudo chmod +x /usr/bin/ollama
```
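As a quick sanity check after the download (assuming `/usr/bin` is on your PATH), the binary should run and report its version:

```bash
# Verify the manually installed binary is executable.
ollama --version
```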
### Adding Ollama as a startup service (recommended)

@@ -95,7 +96,8 @@ curl -fsSL https://ollama.com/install.sh | sh

Or by downloading the ollama binary:

```bash
-curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
+sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
+sudo chmod +x /usr/bin/ollama
```

## Installing specific versions

View file

@@ -174,7 +174,7 @@ func RunnersDir() (p string) {
	defer func() {
		if p == "" {
-			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
+			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
		}
	}()
@@ -190,17 +190,17 @@ func RunnersDir() (p string) {
	}

	var paths []string
-	for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), ".."), cwd} {
+	for _, root := range []string{filepath.Dir(exe), cwd} {
		paths = append(paths,
			root,
-			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
-			filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
+			filepath.Join(root, "windows-"+runtime.GOARCH),
+			filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
		)
	}

	// Try a few variations to improve developer experience when building from source in the local tree
	for _, path := range paths {
-		candidate := filepath.Join(path, "lib", "ollama", "runners")
+		candidate := filepath.Join(path, "ollama_runners")
		if _, err := os.Stat(candidate); err == nil {
			p = candidate
			break

View file

@@ -54,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) {
	// Installer payload location if we're running the installed binary
	exe, err := os.Executable()
	if err == nil {
-		rocmTargetDir := filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
+		rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
		if rocmLibUsable(rocmTargetDir) {
			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
			return rocmTargetDir, nil

View file

@@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) {
	// Installer payload (if we're running from some other location)
	localAppData := os.Getenv("LOCALAPPDATA")
	appDir := filepath.Join(localAppData, "Programs", "Ollama")
-	rocmTargetDir := filepath.Join(appDir, "..", "lib", "ollama")
+	rocmTargetDir := filepath.Join(appDir, "rocm")
	if rocmLibUsable(rocmTargetDir) {
		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
		return rocmTargetDir, nil

View file

@@ -4,17 +4,9 @@ package gpu

import (
	"log/slog"
-	"os"
-	"regexp"
-	"runtime"
-	"strconv"
	"strings"
)

-// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
-// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
-var CudaTegra string = os.Getenv("JETSON_JETPACK")
-
func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
	ids := []string{}
	for _, info := range gpuInfo {
@@ -27,38 +19,3 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
	}
	return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
}
-
-func cudaVariant(gpuInfo CudaGPUInfo) string {
-	if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
-		if CudaTegra != "" {
-			ver := strings.Split(CudaTegra, ".")
-			if len(ver) > 0 {
-				return "jetpack" + ver[0]
-			}
-		} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
-			r := regexp.MustCompile(` R(\d+) `)
-			m := r.FindSubmatch(data)
-			if len(m) != 2 {
-				slog.Info("Unexpected format for /etc/nv_tegra_release. Set JETSON_JETPACK to select version")
-			} else {
-				if l4t, err := strconv.Atoi(string(m[1])); err == nil {
-					// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
-					// https://developer.nvidia.com/embedded/jetpack-archive
-					switch l4t {
-					case 35:
-						return "jetpack5"
-					case 36:
-						return "jetpack6"
-					default:
-						slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
-					}
-				}
-			}
-		}
-	}
-	if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 {
-		return "v11"
-	}
-	return "v12"
-}

View file

@@ -64,6 +64,10 @@ var RocmComputeMin = 9
// TODO find a better way to detect iGPU instead of minimum memory
const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU

+// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
+// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
+var CudaTegra string = os.Getenv("JETSON_JETPACK")
+
// Note: gpuMutex must already be held
func initCudaHandles() *cudaHandles {
	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
@@ -211,7 +215,7 @@ func GetGPUInfo() GpuInfoList {
				GpuInfo: GpuInfo{
					memInfo: mem,
					Library: "cpu",
-					Variant: cpuCapability.String(),
+					Variant: cpuCapability,
					ID:      "0",
				},
			},
@@ -225,7 +229,11 @@ func GetGPUInfo() GpuInfoList {
			return GpuInfoList{cpus[0].GpuInfo}
		}

-		depPath := LibraryDir()
+		// On windows we bundle the nvidia library one level above the runner dir
+		depPath := ""
+		if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
+			depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda")
+		}

		// Load ALL libraries
		cHandles = initCudaHandles()
@@ -261,23 +269,11 @@ func GetGPUInfo() GpuInfoList {
				gpuInfo.FreeMemory = uint64(memInfo.free)
				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
-				gpuInfo.computeMajor = int(memInfo.major)
-				gpuInfo.computeMinor = int(memInfo.minor)
				gpuInfo.MinimumMemory = cudaMinimumMemory
+				gpuInfo.DependencyPath = depPath
+				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
				gpuInfo.DriverMajor = driverMajor
				gpuInfo.DriverMinor = driverMinor
-				variant := cudaVariant(gpuInfo)
-				if depPath != "" {
-					gpuInfo.DependencyPath = depPath
-					// Check for variant specific directory
-					if variant != "" {
-						if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
-							gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant)
-						}
-					}
-				}
-				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-				gpuInfo.Variant = variant

				// query the management library as well so we can record any skew between the two
				// which represents overhead on the GPU we must set aside on subsequent updates
@@ -310,6 +306,13 @@ func GetGPUInfo() GpuInfoList {
	if envconfig.IntelGPU() {
		oHandles = initOneAPIHandles()
		if oHandles != nil && oHandles.oneapi != nil {
+			// On windows we bundle the oneapi library one level above the runner dir
+			depPath = ""
+			if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
+				depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi")
+			}
+
			for d := range oHandles.oneapi.num_drivers {
				if oHandles.oneapi == nil {
					// shouldn't happen
@@ -464,12 +467,10 @@ func GetGPUInfo() GpuInfoList {
func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
	var ldPaths []string
+	var patterns []string
	gpuLibPaths := []string{}
	slog.Debug("Searching for GPU library", "name", baseLibName)

-	// Start with our bundled libraries
-	patterns := []string{filepath.Join(LibraryDir(), baseLibName)}
-
	switch runtime.GOOS {
	case "windows":
		ldPaths = strings.Split(os.Getenv("PATH"), ";")
@@ -478,14 +479,13 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
	default:
		return gpuLibPaths
	}
-
-	// Then with whatever we find in the PATH/LD_LIBRARY_PATH
+	// Start with whatever we find in the PATH/LD_LIBRARY_PATH
	for _, ldPath := range ldPaths {
		d, err := filepath.Abs(ldPath)
		if err != nil {
			continue
		}
-		patterns = append(patterns, filepath.Join(d, baseLibName))
+		patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
	}
	patterns = append(patterns, defaultPatterns...)
	slog.Debug("gpu library search", "globs", patterns)
@@ -641,31 +641,3 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
		return "", ""
	}
}
-
-func LibraryDir() string {
-	// On Windows/linux we bundle the dependencies at the same level as the executable
-	appExe, err := os.Executable()
-	if err != nil {
-		slog.Warn("failed to lookup executable path", "error", err)
-	}
-	cwd, err := os.Getwd()
-	if err != nil {
-		slog.Warn("failed to lookup working directory", "error", err)
-	}
-	// Scan for any of our dependeices, and pick first match
-	for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), ".."), cwd} {
-		libDep := filepath.Join("lib", "ollama")
-		if _, err := os.Stat(filepath.Join(root, libDep)); err == nil {
-			return filepath.Join(root, libDep)
-		}
-		// Developer mode, local build
-		if _, err := os.Stat(filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
-			return filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)
-		}
-		if _, err := os.Stat(filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
-			return filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)
-		}
-	}
-	slog.Warn("unable to locate gpu dependency libraries")
-	return ""
-}

View file

@@ -25,7 +25,7 @@ func GetGPUInfo() GpuInfoList {
	return []GpuInfo{
		{
			Library: "cpu",
-			Variant: GetCPUCapability().String(),
+			Variant: GetCPUCapability(),
			memInfo: mem,
		},
	}
@@ -48,7 +48,7 @@ func GetCPUInfo() GpuInfoList {
	return []GpuInfo{
		{
			Library: "cpu",
-			Variant: GetCPUCapability().String(),
+			Variant: GetCPUCapability(),
			memInfo: mem,
		},
	}

View file

@@ -47,7 +47,7 @@ var (
	CudartMgmtName = "libcudart.so*"
	NvcudaMgmtName = "libcuda.so*"
	NvmlMgmtName   = "" // not currently wired on linux
-	OneapiMgmtName = "libze_intel_gpu.so*"
+	OneapiMgmtName = "libze_intel_gpu.so"
)

func GetCPUMem() (memInfo, error) {

View file

@@ -32,29 +32,4 @@ func TestCPUMemInfo(t *testing.T) {
	}
}
-
-func TestByLibrary(t *testing.T) {
-	type testCase struct {
-		input  []GpuInfo
-		expect int
-	}
-
-	testCases := map[string]*testCase{
-		"empty":                    {input: []GpuInfo{}, expect: 0},
-		"cpu":                      {input: []GpuInfo{{Library: "cpu"}}, expect: 1},
-		"cpu + GPU":                {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2},
-		"cpu + 2 GPU no variant":   {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, expect: 2},
-		"cpu + 2 GPU same variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, expect: 2},
-		"cpu + 2 GPU diff variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, expect: 3},
-	}
-
-	for k, v := range testCases {
-		t.Run(k, func(t *testing.T) {
-			resp := (GpuInfoList)(v.input).ByLibrary()
-			if len(resp) != v.expect {
-				t.Fatalf("expected length %d, got %d => %+v", v.expect, len(resp), resp)
-			}
-		})
-	}
-}

// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected

View file

@@ -19,7 +19,7 @@ type GpuInfo struct {
	Library string `json:"library,omitempty"`

	// Optional variant to select (e.g. versions, cpu feature flags)
-	Variant string `json:"variant"`
+	Variant CPUCapability `json:"variant"`

	// MinimumMemory represents the minimum memory required to use the GPU
	MinimumMemory uint64 `json:"-"`
@@ -53,10 +53,8 @@ type CPUInfo struct {
type CudaGPUInfo struct {
	GpuInfo
	OSOverhead uint64 // Memory overhead between the driver library and management library
	index        int //nolint:unused,nolintlint
-	computeMajor int //nolint:unused,nolintlint
-	computeMinor int //nolint:unused,nolintlint
}
type CudaGPUInfoList []CudaGPUInfo
@@ -83,8 +81,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
	for _, info := range l {
		found := false
		requested := info.Library
-		if info.Variant != CPUCapabilityNone.String() {
-			requested += "_" + info.Variant
+		if info.Variant != CPUCapabilityNone {
+			requested += "_" + info.Variant.String()
		}
		for i, lib := range libs {
			if lib == requested {
@@ -94,7 +92,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
			}
		}
		if !found {
-			libs = append(libs, requested)
+			libs = append(libs, info.Library)
			resp = append(resp, []GpuInfo{info})
		}
	}
@@ -107,7 +105,6 @@ func (l GpuInfoList) LogDetails() {
		slog.Info("inference compute",
			"id", g.ID,
			"library", g.Library,
-			"variant", g.Variant,
			"compute", g.Compute,
			"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
			"name", g.Name,

View file

@@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
		t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
	}

-	if res.PromptEvalCount != 6 {
-		t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 8 {
+		t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
	}
}
@@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
		t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
	}

-	if res.PromptEvalCount != 12 {
-		t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 16 {
+		t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
	}
}

View file

@@ -1,13 +1,12 @@
set(TARGET ollama_llama_server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
-set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
-target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
+target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()

View file

@@ -1429,13 +1429,7 @@ struct llama_server_context
            switch (task.type)
            {
                case TASK_TYPE_COMPLETION: {
-                    server_slot *slot = nullptr;
-                    if (task.embedding_mode) {
-                        // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
-                        slot = slots[0].available() ? &slots[0] : nullptr;
-                    } else {
-                        slot = prefix_slot(task.data["prompt"]);
-                    }
+                    server_slot *slot = prefix_slot(task.data["prompt"]);
                    if (slot == nullptr)
                    {
                        // if no slot is available, we defer this task for processing later

View file

@@ -9,14 +9,11 @@ init_vars() {
        ARCH="arm64"
        ;;
    *)
-        echo "GOARCH must be set"
-        echo "this script is meant to be run from within go generate"
-        exit 1
-        ;;
+        ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
    esac

    LLAMACPP_DIR=../llama.cpp
-    CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on"
+    CMAKE_DEFS=""
    CMAKE_TARGETS="--target ollama_llama_server"
    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
        CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
@@ -30,7 +27,6 @@ init_vars() {
        WHOLE_ARCHIVE="-Wl,-force_load"
        NO_WHOLE_ARCHIVE=""
        GCC_ARCH="-arch ${ARCH}"
-        DIST_BASE=../../dist/darwin-${GOARCH}/
        ;;
    "Linux")
        LIB_EXT="so"
@@ -39,7 +35,6 @@ init_vars() {
        # Cross compiling not supported on linux - Use docker
        GCC_ARCH=""
-        DIST_BASE=../../dist/linux-${GOARCH}/
        ;;
    *)
        ;;
@@ -47,7 +42,6 @@ init_vars() {
    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
        CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
    fi
-    GZIP=$(which pigz 2>/dev/null || echo "gzip")
}

git_module_setup() {
@@ -91,36 +85,26 @@ build() {
compress() {
    echo "Compressing payloads to reduce overall binary size..."
+    pids=""
    rm -rf ${BUILD_DIR}/bin/*.gz
    for f in ${BUILD_DIR}/bin/* ; do
-        ${GZIP} -n --best -f ${f} &
-        compress_pids+=" $!"
+        gzip -n --best -f ${f} &
+        pids+=" $!"
    done
    # check for lib directory
    if [ -d ${BUILD_DIR}/lib ]; then
        for f in ${BUILD_DIR}/lib/* ; do
-            ${GZIP} -n --best -f ${f} &
-            compress_pids+=" $!"
+            gzip -n --best -f ${f} &
+            pids+=" $!"
        done
    fi
    echo
-}
-
-wait_for_compress() {
-    for pid in ${compress_pids}; do
+    for pid in ${pids}; do
        wait $pid
    done
    echo "Finished compression"
}

-install() {
-    echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
-    for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do
-        rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
-        cp -af "${lib}" "${BUILD_DIR}/bin/"
-    done
-}
-
# Keep the local tree clean after we're done with the build
cleanup() {
    (cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt)

View file

@@ -6,7 +6,6 @@
set -ex
set -o pipefail
-compress_pids=""
echo "Starting darwin generate script"
source $(dirname $0)/gen_common.sh
init_vars
@@ -99,5 +98,4 @@ case "${GOARCH}" in
esac

cleanup
-wait_for_compress
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"

View file

@@ -13,7 +13,6 @@
set -ex
set -o pipefail
-compress_pids=""
# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
amdGPUs() {
@@ -52,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
        export CUDACXX=$(command -v nvcc)
    fi
fi
-COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
+COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
@@ -78,11 +77,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
    if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
        init_vars
        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
-        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
+        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
        BUILD_DIR="../build/linux/${ARCH}/cpu"
        echo "Building custom CPU"
        build
-        install
        compress
    else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
@@ -95,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
        # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
        # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake
-        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
+        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
            #
            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
@@ -105,7 +103,6 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
            BUILD_DIR="../build/linux/${ARCH}/cpu"
            echo "Building LCD CPU"
            build
-            install
            compress
        fi
@@ -123,7 +120,6 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
            BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
            echo "Building AVX CPU"
            build
-            install
            compress
        fi
@@ -137,7 +133,6 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
            BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
            echo "Building AVX2 CPU"
            build
-            install
            compress
        fi
    fi
@@ -165,7 +160,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
    echo "CUDA libraries detected - building dynamic CUDA library"
    init_vars
    CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
-    if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then
+    if [ -n "${CUDA_MAJOR}" ]; then
        CUDA_VARIANT=_v${CUDA_MAJOR}
    fi
    if [ "${ARCH}" == "arm64" ]; then
@@ -183,19 +178,29 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
        echo "Building custom CUDA GPU"
    else
-        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
    fi
-    export CUDAFLAGS="-t8"
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
-    export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
-    CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
+    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
    build
-    install
-    echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
-    mkdir -p "${CUDA_DIST_DIR}"
-    for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
-        cp -a "${lib}" "${CUDA_DIST_DIR}"
+    # Carry the CUDA libs as payloads to help reduce dependency burden on users
+    #
+    # TODO - in the future we may shift to packaging these separately and conditionally
+    #        downloading them in the install script.
+    DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )"
+    for lib in libcudart.so libcublas.so libcublasLt.so ; do
+        DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
+        if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
+            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/"
+        elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
+            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/"
+        elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
+            cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/"
+        else
+            cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/"
+        fi
    done

    compress
@@ -213,24 +218,21 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
    CC=icx
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
    BUILD_DIR="../build/linux/${ARCH}/oneapi"
-    ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
-    export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
+    EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
    DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
    build

    # copy oneAPI dependencies
-    mkdir -p "${ONEAPI_DIST_DIR}"
    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do
-        cp -a "${dep}" "${ONEAPI_DIST_DIR}"
+        cp "${dep}" "${BUILD_DIR}/bin/"
    done
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
-    install
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${BUILD_DIR}/bin/"
    compress
fi
@@ -252,7 +254,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
        ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
    fi
    init_vars
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DGGML_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
    if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
        echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
@@ -260,22 +262,23 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
        echo "Building custom ROCM GPU"
    fi
    BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
-    # ROCm dependencies are too large to fit into a unified bundle
-    ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
-    # TODO figure out how to disable runpath (rpath)
-    # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work
-    export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
+    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build

-    # copy the ROCM dependencies
-    mkdir -p "${ROCM_DIST_DIR}"
-    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do
-        cp -a "${dep}"* "${ROCM_DIST_DIR}"
+    # Record the ROCM dependencies
+    rm -f "${BUILD_DIR}/bin/deps.txt"
+    touch "${BUILD_DIR}/bin/deps.txt"
+    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
+        echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt"
    done
-    install
+    # bomb out if for some reason we didn't get a few deps
+    if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then
+        cat "${BUILD_DIR}/bin/deps.txt"
+        echo "ERROR: deps file short"
+        exit 1
+    fi
    compress
fi

cleanup
-wait_for_compress
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"

View file

@@ -35,7 +35,7 @@ function init_vars {
    )
    $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
    $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
-    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners"
+    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
    md "$script:DIST_BASE" -ea 0 > $null
    if ($env:CGO_CFLAGS -contains "-g") {
        $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
@@ -117,7 +117,7 @@ function build {
    if ($cmakeDefs -contains "-G") {
        $extra=@("-j8")
    } else {
-        $extra= @("--", "/maxCpuCount:8")
+        $extra= @("--", "/p:CL_MPcount=8")
    }
    write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
    & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
@@ -261,7 +261,7 @@ function build_cuda() {
    if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
        # Then build cuda as a dynamically loaded library
        $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
-        $script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0]
+        $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
        if ($null -ne $script:CUDA_VERSION) {
            $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
        }
@@ -273,9 +273,9 @@ function build_cuda() {
            "-DGGML_CUDA=ON",
            "-DGGML_AVX=on",
            "-DGGML_AVX2=off",
-            "-DCMAKE_CUDA_FLAGS=-t6",
-            "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}",
-            "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH"
+            "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
+            "-DCMAKE_CUDA_FLAGS=-t8",
+            "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
        )
        if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
            write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
@@ -286,11 +286,12 @@ function build_cuda() {
        sign
        install

-        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
-        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
+        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
    } else {
        write-host "Skipping CUDA generation step"
    }
@@ -324,17 +325,18 @@ function build_oneapi() {
    sign
    install

-    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+    rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
    } else {
        Write-Host "Skipping oneAPI generation step"
    }
@@ -355,7 +357,7 @@ function build_rocm() {
            "-DCMAKE_C_COMPILER=clang.exe",
            "-DCMAKE_CXX_COMPILER=clang++.exe",
            "-DGGML_HIPBLAS=on",
-            "-DGGML_CUDA_NO_PEER_COPY=on",
+            "-DLLAMA_CUDA_NO_PEER_COPY=on",
            "-DHIP_PLATFORM=amd",
            "-DGGML_AVX=on",
            "-DGGML_AVX2=off",
@@ -384,11 +386,12 @@ function build_rocm() {
        sign
        install

-        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null
-        cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
-        cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
+        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
+        cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
+        cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
        # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
-        cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\"
+        cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
    } else {
        write-host "Skipping ROCm generation step"
    }

View file

@@ -43,14 +43,6 @@ func (kv KV) Architecture() string {
	return "unknown"
}

-func (kv KV) Kind() string {
-	if s, ok := kv["general.type"].(string); ok {
-		return s
-	}
-
-	return "unknown"
-}
-
func (kv KV) ParameterCount() uint64 {
	return kv.u64("general.parameter_count")
}

View file

@@ -33,6 +33,7 @@ func TestEstimateGPULayers(t *testing.T) {
	assert.Len(t, tensors, inputLayerCount+1)
	err = WriteGGUF(f, KV{
		"general.architecture": "llama",
+		"general.name": "name",
		"llama.context_length": uint32(32),
		"llama.embedding_length": uint32(4096),
		"llama.block_count": uint32(inputLayerCount),

View file

@ -0,0 +1,60 @@
diff --git a/src/llama.cpp b/src/llama.cpp
index 721b8f4e..cfe7ac40 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8420,14 +8420,14 @@ struct llm_build_context {
}
struct ggml_tensor * build_inp_mean() {
- lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+ lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max);
cb(lctx.inp_mean, "inp_mean", -1);
ggml_set_input(lctx.inp_mean);
return lctx.inp_mean;
}
struct ggml_tensor * build_inp_cls() {
- lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max);
cb(lctx.inp_cls, "inp_cls", -1);
ggml_set_input(lctx.inp_cls);
return lctx.inp_cls;
@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
float * data = (float *) lctx.inp_mean->data;
- memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
+ memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean));
std::vector<uint64_t> sum(n_tokens, 0);
for (int i = 0; i < n_tokens; ++i) {
const llama_seq_id seq_id = batch.seq_id[i][0];
-
- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
-
sum[seq_id] += 1;
}
- std::vector<float> div(n_tokens, 0.0f);
- for (int i = 0; i < n_tokens; ++i) {
+ std::vector<float> div(cparams.n_seq_max, 0.0f);
+ for (uint32_t i = 0; i < cparams.n_seq_max; ++i) {
const uint64_t s = sum[i];
if (s > 0) {
div[i] = 1.0f/float(s);
@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
uint32_t * data = (uint32_t *) lctx.inp_cls->data;
- memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+ memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls));
for (int i = 0; i < n_tokens; ++i) {
const llama_seq_id seq_id = batch.seq_id[i][0];
const llama_pos pos = batch.pos[i];
-
- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
-
if (pos == 0) {
data[seq_id] = i;
}

View file

@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_ // glob workDir for files that start with ollama_
availableServers := getAvailableServers() availableServers := getAvailableServers()
requested := info.Library requested := info.Library
if info.Variant != gpu.CPUCapabilityNone.String() { if info.Variant != gpu.CPUCapabilityNone {
requested += "_" + info.Variant requested += "_" + info.Variant.String()
} }
servers := []string{} servers := []string{}
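Both sides of this hunk compose the requested runner name from the GPU library plus an optional CPU-capability variant; they only differ in whether Variant is a plain string or a typed value. A minimal sketch of the naming scheme, with hypothetical values standing in for the real gpu package types:

package main

import "fmt"

// runnerName joins a library (e.g. "cpu", "cuda", "rocm") with an optional
// variant suffix, producing names like "cpu_avx2".
func runnerName(library, variant string) string {
    if variant == "" {
        return library
    }
    return library + "_" + variant
}

func main() {
    fmt.Println(runnerName("cpu", "avx2")) // cpu_avx2
    fmt.Println(runnerName("cuda", ""))    // cuda
}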

View file

@ -258,7 +258,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--mlock") params = append(params, "--mlock")
} }
if gpu.IsNUMA() && gpus[0].Library == "cpu" { if gpu.IsNUMA() {
numaMode := "distribute" numaMode := "distribute"
if runtime.GOOS == "linux" { if runtime.GOOS == "linux" {
if _, err := exec.LookPath("numactl"); err == nil { if _, err := exec.LookPath("numactl"); err == nil {
@ -306,18 +306,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
if runtime.GOOS == "windows" { if runtime.GOOS == "windows" {
pathEnv = "PATH" pathEnv = "PATH"
} }
// Start with the server directory for the LD_LIBRARY_PATH/PATH // prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
libraryPaths := []string{dir} libraryPaths := []string{dir, filepath.Dir(dir)}
if libraryPath, ok := os.LookupEnv(pathEnv); ok { if libraryPath, ok := os.LookupEnv(pathEnv); ok {
// favor our bundled library dependencies over system libraries // Append our runner directory to the path
// This will favor system libraries over our bundled library dependencies
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...) libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
} }
// Note: we always put the dependency path first // Note: we always put the dependency path first
// since this was the exact version we compiled/linked against // since this was the exact version we verified for AMD GPUs
// and we favor what the user had in their path
if gpus[0].DependencyPath != "" { if gpus[0].DependencyPath != "" {
// assume gpus from the same library have the same dependency path // TODO refine for multi-gpu support
libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...) libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
} }
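This hunk is about the order of the subprocess search path: the runner directory (and, on one side, its parent) comes first, the user's existing LD_LIBRARY_PATH/PATH entries are appended after it, and a GPU dependency path, when present, is pushed to the very front. A minimal sketch of that ordering, assuming hypothetical directory values:

package main

import (
    "fmt"
    "os"
    "path/filepath"
    "runtime"
    "strings"
)

// buildLibraryPath assembles the search path handed to the runner subprocess:
// bundled runner libraries first, then whatever the user already had set, with
// an optional GPU dependency directory forced to the front.
func buildLibraryPath(runnerDir, depPath string) string {
    pathEnv := "LD_LIBRARY_PATH"
    if runtime.GOOS == "windows" {
        pathEnv = "PATH"
    }

    paths := []string{runnerDir}
    if existing, ok := os.LookupEnv(pathEnv); ok {
        paths = append(paths, filepath.SplitList(existing)...)
    }
    if depPath != "" {
        paths = append([]string{depPath}, paths...)
    }
    return strings.Join(paths, string(filepath.ListSeparator))
}

func main() {
    fmt.Println(buildLibraryPath("/tmp/ollama/runners/cuda_v11", "/opt/rocm/lib"))
}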

View file

@ -4,7 +4,6 @@ set -eu
export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'" export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
GZIP=$(which pigz 2>/dev/null || echo "gzip")
BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"} BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""} export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""}
@ -22,16 +21,11 @@ for TARGETARCH in ${BUILD_ARCH}; do
-t builder:$TARGETARCH \ -t builder:$TARGETARCH \
. .
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
rm -rf ./dist/linux-$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/ollama ./dist/ollama-linux-$TARGETARCH
docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist
if echo ${TARGETARCH} | grep "amd64" > /dev/null; then if [ "$TARGETARCH" = "amd64" ]; then
docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/deps/ ./dist/
fi fi
docker rm builder-$TARGETARCH docker rm builder-$TARGETARCH
echo "Compressing final linux bundle..."
rm -f ./dist/ollama-linux-$TARGETARCH.tgz
(cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz )
if [ -d dist/linux-$TARGETARCH-rocm ]; then
(cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz )
fi
done done

View file

@ -7,7 +7,6 @@
$ErrorActionPreference = "Stop" $ErrorActionPreference = "Stop"
function checkEnv() { function checkEnv() {
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
$script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower() $script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower()
Write-host "Building for ${script:TARGET_ARCH}" Write-host "Building for ${script:TARGET_ARCH}"
write-host "Locating required tools and paths" write-host "Locating required tools and paths"
@ -16,23 +15,26 @@ function checkEnv() {
$MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation
$env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0] $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0]
} }
# Locate CUDA versions # Try to find the CUDA dir
# Note: this assumes every version found will be built if ($null -eq $env:NVIDIA_DIR) {
$cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
if ($cudaList.length -eq 0) {
$d=(get-command -ea 'silentlycontinue' nvcc).path $d=(get-command -ea 'silentlycontinue' nvcc).path
if ($null -ne $d) { if ($d -ne $null) {
$script:CUDA_DIRS=@($d| split-path -parent) $script:NVIDIA_DIR=($d| split-path -parent)
} else {
$cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
if ($cudaList.length -gt 0) {
$script:NVIDIA_DIR=$cudaList[0]
}
} }
} else { } else {
$script:CUDA_DIRS=$cudaList $script:NVIDIA_DIR=$env:NVIDIA_DIR
} }
$script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0] $script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0]
$script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}" $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}"
$env:CGO_ENABLED="1" $env:CGO_ENABLED="1"
Write-Output "Checking version" echo "Checking version"
if (!$env:VERSION) { if (!$env:VERSION) {
$data=(git describe --tags --first-parent --abbrev=7 --long --dirty --always) $data=(git describe --tags --first-parent --abbrev=7 --long --dirty --always)
$pattern="v(.+)" $pattern="v(.+)"
@ -69,48 +71,7 @@ function checkEnv() {
function buildOllama() { function buildOllama() {
write-host "Building ollama CLI" write-host "Building ollama CLI"
if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) { if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}" & go generate ./...
# TODO - consider trying to parallelize this with Start-ThreadJob, but env vars can't be used to toggle
# which targets to build
# Start by skipping CUDA to build everything else
pwsh -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... }
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
# Then skip everything else and build all the CUDA variants
foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) {
write-host "Building CUDA ${env:CUDA_LIB_DIR}"
if ($env:CUDA_LIB_DIR.Contains("v12")) {
pwsh -Command {
$env:OLLAMA_SKIP_CUDA_GENERATE=""
$env:OLLAMA_SKIP_STATIC_GENERATE="1"
$env:OLLAMA_SKIP_CPU_GENERATE="1"
$env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
$env:OLLAMA_SKIP_ROCM_GENERATE="1"
$env:CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
$env:OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on"
$env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
$env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
& go generate ./...
}
} else {
pwsh -Command {
$env:OLLAMA_SKIP_CUDA_GENERATE=""
$env:OLLAMA_SKIP_STATIC_GENERATE="1"
$env:OLLAMA_SKIP_CPU_GENERATE="1"
$env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
$env:OLLAMA_SKIP_ROCM_GENERATE="1"
$env:CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
$env:OLLAMA_CUSTOM_CUDA_DEFS=""
$env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
$env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
& go generate ./...
}
}
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
}
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
} else { } else {
write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set" write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
@ -122,8 +83,8 @@ function buildOllama() {
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
} }
New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\bin\ -Force New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\bin\ cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
} }
function buildApp() { function buildApp() {
@ -142,22 +103,22 @@ function buildApp() {
function gatherDependencies() { function gatherDependencies() {
write-host "Gathering runtime dependencies" write-host "Gathering runtime dependencies"
cd "${script:SRC_DIR}" cd "${script:SRC_DIR}"
md "${script:DEPS_DIR}\lib\ollama" -ea 0 > $null md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null
# TODO - this varies based on host build system and MSVC version - drive from dumpbin output # TODO - this varies based on host build system and MSVC version - drive from dumpbin output
# currently works for Win11 + MSVC 2019 + Cuda V11 # currently works for Win11 + MSVC 2019 + Cuda V11
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\lib\ollama\" cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\lib\ollama\" cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\"
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\lib\ollama\" cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\"
foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) { foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) {
cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\lib\ollama\" cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_runners\"
} }
cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\" cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
if ("${env:KEY_CONTAINER}") { if ("${env:KEY_CONTAINER}") {
write-host "about to sign" write-host "about to sign"
foreach ($file in (get-childitem "${script:DEPS_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
write-host "signing $file" write-host "signing $file"
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file

View file

@ -63,36 +63,16 @@ if [ -n "$NEEDS" ]; then
exit 1 exit 1
fi fi
status "Downloading ollama..."
curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
for BINDIR in /usr/local/bin /usr/bin /bin; do for BINDIR in /usr/local/bin /usr/bin /bin; do
echo $PATH | grep -q $BINDIR && break || continue echo $PATH | grep -q $BINDIR && break || continue
done done
OLLAMA_INSTALL_DIR=$(dirname ${BINDIR})
status "Installing ollama to $OLLAMA_INSTALL_DIR" status "Installing ollama to $BINDIR..."
$SUDO install -o0 -g0 -m755 -d $BINDIR $SUDO install -o0 -g0 -m755 -d $BINDIR
$SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR" $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama
if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" >/dev/null ; then
status "Downloading Linux ${ARCH} bundle"
curl --fail --show-error --location --progress-bar \
"https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \
$SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
BUNDLE=1
if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then
status "Making ollama accessible in the PATH in $BINDIR"
$SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
fi
else
status "Downloading Linux ${ARCH} CLI"
curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\
"https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama
BUNDLE=0
if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then
status "Making ollama accessible in the PATH in $BINDIR"
$SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
fi
fi
install_success() { install_success() {
status 'The Ollama API is now available at 127.0.0.1:11434.' status 'The Ollama API is now available at 127.0.0.1:11434.'
@ -198,16 +178,6 @@ if ! check_gpu lspci nvidia && ! check_gpu lshw nvidia && ! check_gpu lspci amdg
fi fi
if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
if [ $BUNDLE -ne 0 ]; then
status "Downloading Linux ROCm ${ARCH} bundle"
curl --fail --show-error --location --progress-bar \
"https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \
$SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
install_success
status "AMD GPU ready."
exit 0
fi
# Look for pre-existing ROCm v6 before downloading the dependencies # Look for pre-existing ROCm v6 before downloading the dependencies
for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do
if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then

View file

@ -3,7 +3,6 @@
# Script for common Dockerfile dependency installation in redhat linux based images # Script for common Dockerfile dependency installation in redhat linux based images
set -ex set -ex
set -o pipefail
MACHINE=$(uname -m) MACHINE=$(uname -m)
if grep -i "centos" /etc/system-release >/dev/null; then if grep -i "centos" /etc/system-release >/dev/null; then
@ -30,7 +29,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then
dnf install -y rh-git227-git dnf install -y rh-git227-git
ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git
fi fi
dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++
elif grep -i "rocky" /etc/system-release >/dev/null; then elif grep -i "rocky" /etc/system-release >/dev/null; then
# Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC) # Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC)
cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo
@ -44,21 +43,12 @@ gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-rockyofficial
EOF EOF
dnf install -y git \ dnf install -y git \
gcc-toolset-10-gcc-10.2.1-8.2.el8 \ gcc-toolset-10-gcc-10.2.1-8.2.el8 \
gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \ gcc-toolset-10-gcc-c++-10.2.1-8.2.el8
pigz
else else
echo "ERROR Unexpected distro" echo "ERROR Unexpected distro"
exit 1 exit 1
fi fi
if [ "${MACHINE}" = "x86_64" ] ; then
curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /tmp --strip-components 1 && \
mv /tmp/ccache /usr/local/bin/
else
yum -y install epel-release
yum install -y ccache
fi
if [ -n "${CMAKE_VERSION}" ]; then if [ -n "${CMAKE_VERSION}" ]; then
curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1
fi fi

View file

@ -215,20 +215,25 @@ func GetManifest(mp ModelPath) (*Manifest, string, error) {
return nil, "", err return nil, "", err
} }
f, err := os.Open(fp) if _, err = os.Stat(fp); err != nil {
return nil, "", err
}
var manifest *Manifest
bts, err := os.ReadFile(fp)
if err != nil { if err != nil {
return nil, "", err return nil, "", fmt.Errorf("couldn't open file '%s'", fp)
} }
defer f.Close()
sha256sum := sha256.New() shaSum := sha256.Sum256(bts)
shaStr := hex.EncodeToString(shaSum[:])
var manifest Manifest if err := json.Unmarshal(bts, &manifest); err != nil {
if err := json.NewDecoder(io.TeeReader(f, sha256sum)).Decode(&manifest); err != nil {
return nil, "", err return nil, "", err
} }
return &manifest, hex.EncodeToString(sha256sum.Sum(nil)), nil return manifest, shaStr, nil
} }
func GetModel(name string) (*Model, error) { func GetModel(name string) (*Model, error) {
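The two versions of GetManifest compute the same result in different ways: one reads the whole file into memory and hashes the bytes before unmarshalling, the other decodes a stream while io.TeeReader feeds the decoded bytes into a SHA-256 hash. A self-contained sketch of the TeeReader variant, reading from an in-memory manifest instead of a file on disk:

package main

import (
    "crypto/sha256"
    "encoding/hex"
    "encoding/json"
    "fmt"
    "io"
    "strings"
)

// Manifest is a trimmed-down stand-in for the real manifest structure.
type Manifest struct {
    SchemaVersion int    `json:"schemaVersion"`
    MediaType     string `json:"mediaType"`
}

// decodeAndDigest decodes JSON from r while hashing the bytes consumed along
// the way, so the manifest and its digest come from a single pass.
func decodeAndDigest(r io.Reader) (*Manifest, string, error) {
    h := sha256.New()
    var m Manifest
    if err := json.NewDecoder(io.TeeReader(r, h)).Decode(&m); err != nil {
        return nil, "", err
    }
    return &m, hex.EncodeToString(h.Sum(nil)), nil
}

func main() {
    r := strings.NewReader(`{"schemaVersion":2,"mediaType":"application/vnd.docker.distribution.manifest.v2+json"}`)
    m, digest, err := decodeAndDigest(r)
    if err != nil {
        panic(err)
    }
    fmt.Println(m.SchemaVersion, m.MediaType, digest)
}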
@ -369,14 +374,13 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
parameters := make(map[string]any) parameters := make(map[string]any)
var layers []Layer var layers []Layer
var baseLayers []*layerGGML
for _, c := range modelfile.Commands { for _, c := range modelfile.Commands {
mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name) mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)
command := c.Name
switch command { switch c.Name {
case "model", "adapter": case "model", "adapter":
if name := model.ParseName(c.Args); name.IsValid() && command == "model" { var baseLayers []*layerGGML
if name := model.ParseName(c.Args); name.IsValid() {
baseLayers, err = parseFromModel(ctx, name, fn) baseLayers, err = parseFromModel(ctx, name, fn)
if err != nil { if err != nil {
return err return err
@ -410,14 +414,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
} }
defer blob.Close() defer blob.Close()
baseLayers, err = parseFromFile(ctx, command, baseLayers, blob, digest, fn) baseLayers, err = parseFromFile(ctx, blob, digest, fn)
if err != nil { if err != nil {
return err return err
} }
} else if file, err := os.Open(realpath(modelFileDir, c.Args)); err == nil { } else if file, err := os.Open(realpath(modelFileDir, c.Args)); err == nil {
defer file.Close() defer file.Close()
baseLayers, err = parseFromFile(ctx, command, baseLayers, file, "", fn) baseLayers, err = parseFromFile(ctx, file, "", fn)
if err != nil { if err != nil {
return err return err
} }
@ -688,18 +692,43 @@ func CopyModel(src, dst model.Name) error {
return err return err
} }
func deleteUnusedLayers(deleteMap map[string]struct{}) error { func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error {
manifests, err := Manifests() fp, err := GetManifestPath()
if err != nil { if err != nil {
return err return err
} }
for _, manifest := range manifests { walkFunc := func(path string, info os.FileInfo, _ error) error {
if info.IsDir() {
return nil
}
dir, file := filepath.Split(path)
dir = strings.Trim(strings.TrimPrefix(dir, fp), string(os.PathSeparator))
tag := strings.Join([]string{dir, file}, ":")
fmp := ParseModelPath(tag)
// skip the manifest we're trying to delete
if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() {
return nil
}
// save (i.e. delete from the deleteMap) any files used in other manifests
manifest, _, err := GetManifest(fmp)
if err != nil {
return err
}
for _, layer := range manifest.Layers { for _, layer := range manifest.Layers {
delete(deleteMap, layer.Digest) delete(deleteMap, layer.Digest)
} }
delete(deleteMap, manifest.Config.Digest) delete(deleteMap, manifest.Config.Digest)
return nil
}
if err := filepath.Walk(fp, walkFunc); err != nil {
return err
} }
// only delete the files which are still in the deleteMap // only delete the files which are still in the deleteMap
@ -752,7 +781,8 @@ func PruneLayers() error {
slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap))) slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap)))
if err := deleteUnusedLayers(deleteMap); err != nil { err = deleteUnusedLayers(nil, deleteMap)
if err != nil {
slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err)) slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
return nil return nil
} }
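Both shapes of deleteUnusedLayers implement the same prune strategy: start from a set of candidate digests, drop every digest that some manifest still references, and delete whatever remains. A minimal sketch of that set-difference step, using plain strings for digests:

package main

import "fmt"

// manifest is a simplified stand-in holding only the digests a model references.
type manifest struct {
    configDigest string
    layerDigests []string
}

// unreferenced removes every digest that some manifest still uses and returns
// the digests that are safe to delete.
func unreferenced(candidates map[string]struct{}, manifests []manifest) []string {
    for _, m := range manifests {
        for _, d := range m.layerDigests {
            delete(candidates, d)
        }
        delete(candidates, m.configDigest)
    }
    var orphans []string
    for d := range candidates {
        orphans = append(orphans, d)
    }
    return orphans
}

func main() {
    candidates := map[string]struct{}{"sha256:aaa": {}, "sha256:bbb": {}, "sha256:ccc": {}}
    manifests := []manifest{{configDigest: "sha256:ccc", layerDigests: []string{"sha256:aaa"}}}
    fmt.Println(unreferenced(candidates, manifests)) // [sha256:bbb]
}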
@ -847,19 +877,26 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error { func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error {
mp := ParseModelPath(name) mp := ParseModelPath(name)
var manifest *Manifest
var err error
var noprune string
// build deleteMap to prune unused layers // build deleteMap to prune unused layers
deleteMap := make(map[string]struct{}) deleteMap := make(map[string]struct{})
manifest, _, err := GetManifest(mp)
if errors.Is(err, os.ErrNotExist) { if !envconfig.NoPrune() {
// noop manifest, _, err = GetManifest(mp)
} else if err != nil && !errors.Is(err, os.ErrNotExist) { if err != nil && !errors.Is(err, os.ErrNotExist) {
return err return err
} else {
for _, l := range manifest.Layers {
deleteMap[l.Digest] = struct{}{}
} }
if manifest.Config.Digest != "" {
deleteMap[manifest.Config.Digest] = struct{}{} if manifest != nil {
for _, l := range manifest.Layers {
deleteMap[l.Digest] = struct{}{}
}
if manifest.Config.Digest != "" {
deleteMap[manifest.Config.Digest] = struct{}{}
}
} }
} }
@ -938,9 +975,11 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
return err return err
} }
if !envconfig.NoPrune() && len(deleteMap) > 0 { if noprune == "" {
fn(api.ProgressResponse{Status: "removing unused layers"}) fn(api.ProgressResponse{Status: "removing any unused layers"})
if err := deleteUnusedLayers(deleteMap); err != nil { err = deleteUnusedLayers(nil, deleteMap)
if err != nil {
slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)}) fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)})
} }
} }
@ -961,12 +1000,12 @@ func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *registryOptio
} }
defer resp.Body.Close() defer resp.Body.Close()
var m Manifest var m *Manifest
if err := json.NewDecoder(resp.Body).Decode(&m); err != nil { if err := json.NewDecoder(resp.Body).Decode(&m); err != nil {
return nil, err return nil, err
} }
return &m, err return m, err
} }
// GetSHA256Digest returns the SHA256 hash of a given buffer and returns it, and the size of buffer // GetSHA256Digest returns the SHA256 hash of a given buffer and returns it, and the size of buffer

View file

@ -51,9 +51,6 @@ func NewLayer(r io.Reader, mediatype string) (Layer, error) {
if err := os.Rename(temp.Name(), blob); err != nil { if err := os.Rename(temp.Name(), blob); err != nil {
return Layer{}, err return Layer{}, err
} }
if err := os.Chmod(blob, 0o644); err != nil {
return Layer{}, err
}
} }
return Layer{ return Layer{

View file

@ -5,7 +5,6 @@ import (
"encoding/hex" "encoding/hex"
"encoding/json" "encoding/json"
"errors" "errors"
"fmt"
"io" "io"
"log/slog" "log/slog"
"os" "os"
@ -151,16 +150,14 @@ func Manifests() (map[model.Name]*Manifest, error) {
n := model.ParseNameFromFilepath(rel) n := model.ParseNameFromFilepath(rel)
if !n.IsValid() { if !n.IsValid() {
slog.Warn("bad manifest name", "path", rel) slog.Warn("bad manifest name", "path", rel, "error", err)
continue continue
} }
m, err := ParseNamedManifest(n) m, err := ParseNamedManifest(n)
if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) { if err != nil {
slog.Warn("bad manifest", "name", n, "error", err) slog.Warn("bad manifest", "name", n, "error", err)
continue continue
} else if err != nil {
return nil, fmt.Errorf("%s: %w", n, err)
} }
ms[n] = m ms[n] = m
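One side of this hunk treats a corrupt manifest (a JSON syntax error) as something to log and skip, while any other parse failure aborts the walk. A minimal sketch of that errors.As test against *json.SyntaxError:

package main

import (
    "encoding/json"
    "errors"
    "fmt"
)

func parse(data []byte) error {
    var v map[string]any
    return json.Unmarshal(data, &v)
}

func main() {
    for _, blob := range []string{`{"ok":true}`, `{not json}`} {
        err := parse([]byte(blob))
        var syntax *json.SyntaxError
        switch {
        case err == nil:
            fmt.Println("parsed")
        case errors.As(err, &syntax):
            // Corrupt file: log and keep going, mirroring the skip-and-continue branch.
            fmt.Println("skipping corrupt manifest:", err)
        default:
            fmt.Println("fatal:", err)
        }
    }
}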

View file

@ -81,7 +81,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
return layers, nil return layers, nil
} }
func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
fi, err := f.Stat() fi, err := f.Stat()
if err != nil { if err != nil {
return nil, err return nil, err
@ -108,38 +108,16 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML
defer t.Close() defer t.Close()
defer os.Remove(t.Name()) defer os.Remove(t.Name())
var layerType string fn(api.ProgressResponse{Status: "converting model"})
if err := convert.Convert(convert.NewZipReader(r, p, 32<<20), t); err != nil {
switch command { return nil, err
case "adapter":
var baseModel *llm.GGML
for _, l := range baseLayers {
if l.GGML != nil {
baseModel = l.GGML
break
}
}
if baseModel == nil {
return nil, fmt.Errorf("no base model specified for the adapter")
}
if err := convert.ConvertAdapter(convert.NewZipReader(r, p, 32<<20), t, baseModel.KV()); err != nil {
return nil, err
}
layerType = "application/vnd.ollama.image.adapter"
case "model":
if err := convert.ConvertModel(convert.NewZipReader(r, p, 32<<20), t); err != nil {
return nil, err
}
layerType = "application/vnd.ollama.image.model"
} }
if _, err := t.Seek(0, io.SeekStart); err != nil { if _, err := t.Seek(0, io.SeekStart); err != nil {
return nil, err return nil, err
} }
layer, err := NewLayer(t, layerType) layer, err := NewLayer(t, "application/vnd.ollama.image.model")
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -161,7 +139,7 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML
return detectChatTemplate(layers) return detectChatTemplate(layers)
} }
func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
sr := io.NewSectionReader(file, 0, 512) sr := io.NewSectionReader(file, 0, 512)
contentType, err := detectContentType(sr) contentType, err := detectContentType(sr)
if err != nil { if err != nil {
@ -172,7 +150,7 @@ func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML,
case "gguf", "ggla": case "gguf", "ggla":
// noop // noop
case "application/zip": case "application/zip":
return parseFromZipFile(ctx, command, baseLayers, file, digest, fn) return parseFromZipFile(ctx, file, digest, fn)
default: default:
return nil, fmt.Errorf("unsupported content type: %s", contentType) return nil, fmt.Errorf("unsupported content type: %s", contentType)
} }
@ -192,7 +170,7 @@ func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML,
} }
mediatype := "application/vnd.ollama.image.model" mediatype := "application/vnd.ollama.image.model"
if ggml.Name() == "ggla" || ggml.KV().Kind() == "adapter" { if ggml.Name() == "ggla" {
mediatype = "application/vnd.ollama.image.adapter" mediatype = "application/vnd.ollama.image.adapter"
} else if ggml.KV().Architecture() == "clip" { } else if ggml.KV().Architecture() == "clip" {
mediatype = "application/vnd.ollama.image.projector" mediatype = "application/vnd.ollama.image.projector"

View file

@ -153,7 +153,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
t.Fatalf("failed to seek to start: %v", err) t.Fatalf("failed to seek to start: %v", err)
} }
layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, "", func(api.ProgressResponse) {}) layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {})
if err != nil { if err != nil {
t.Fatalf("failed to parse from file: %v", err) t.Fatalf("failed to parse from file: %v", err)
} }
@ -166,7 +166,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
t.Fatalf("failed to seek to start: %v", err) t.Fatalf("failed to seek to start: %v", err)
} }
layers2, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, layers[0].Digest, func(api.ProgressResponse) {}) layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {})
if err != nil { if err != nil {
t.Fatalf("failed to parse from file: %v", err) t.Fatalf("failed to parse from file: %v", err)
} }
@ -206,7 +206,7 @@ func TestParseLayerFromCopy(t *testing.T) {
t.Fatalf("failed to seek to start: %v", err) t.Fatalf("failed to seek to start: %v", err)
} }
layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file2, "", func(api.ProgressResponse) {}) layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {})
if err != nil { if err != nil {
t.Fatalf("failed to parse from file: %v", err) t.Fatalf("failed to parse from file: %v", err)
} }

View file

@ -193,11 +193,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
break break
} }
// Embedding models should always be loaded with parallel=1
if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
numParallel = 1
}
// Evaluate if the model will fit in the available system memory, or if we should unload a model first // Evaluate if the model will fit in the available system memory, or if we should unload a model first
if len(gpus) == 1 && gpus[0].Library == "cpu" { if len(gpus) == 1 && gpus[0].Library == "cpu" {
// simplifying assumption of defaultParallel when in CPU mode // simplifying assumption of defaultParallel when in CPU mode
@ -739,10 +734,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
// If multiple Libraries are detected, pick the Library which loads the most layers for the model // If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
if *numParallel <= 0 { *numParallel = 1
*numParallel = 1
req.opts.NumCtx = req.origNumCtx
}
byLibrary := gpus.ByLibrary() byLibrary := gpus.ByLibrary()
if len(byLibrary) <= 1 { if len(byLibrary) <= 1 {
return gpus return gpus
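The scheduler hunks revolve around when numParallel falls back to 1: one side pins embedding-only models to parallel=1 and only resets an unset value (restoring the request's original context length), while the other resets it unconditionally. A minimal sketch of the guarded reset, with a simplified request type:

package main

import "fmt"

// request is a simplified stand-in carrying only the context-length fields.
type request struct {
    numCtx     int
    origNumCtx int
}

// clampParallel resolves an unset parallelism value to 1 and restores the
// request's original context length, mirroring the guarded branch.
func clampParallel(numParallel *int, req *request) {
    if *numParallel <= 0 {
        *numParallel = 1
        req.numCtx = req.origNumCtx
    }
}

func main() {
    n := 0
    req := &request{numCtx: 8192, origNumCtx: 2048}
    clampParallel(&n, req)
    fmt.Println(n, req.numCtx) // 1 2048
}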

View file

@ -117,6 +117,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
require.NoError(t, llm.WriteGGUF(f, llm.KV{ require.NoError(t, llm.WriteGGUF(f, llm.KV{
"general.architecture": "llama", "general.architecture": "llama",
"general.name": "name",
"llama.context_length": uint32(32), "llama.context_length": uint32(32),
"llama.embedding_length": uint32(4096), "llama.embedding_length": uint32(4096),
"llama.block_count": uint32(1), "llama.block_count": uint32(1),

View file

@ -45,7 +45,7 @@ type blobUpload struct {
} }
const ( const (
numUploadParts = 16 numUploadParts = 64
minUploadPartSize int64 = 100 * format.MegaByte minUploadPartSize int64 = 100 * format.MegaByte
maxUploadPartSize int64 = 1000 * format.MegaByte maxUploadPartSize int64 = 1000 * format.MegaByte
) )