Compare commits

45 commits compared: 99dfb67553 ... 0c61920bc9

Commits (SHA1):
0c61920bc9, 0f92b19bec, 69be940bf6, 9638c24c58, bb362caf88, 0c819e167b, 7a1e1c1caf, 0b03b9c32f, 90ca84172c, 6bd8a4b0a1, 77903ab8b4, e22286c9e1, 107f695929, 4ecc70d3b4, 3546bbd08c, beb49eef65, 5a28b9cf5f, a017cf2fea, 19e5a890f7, f91c9e3709, 2df6905ede, d8be22e47d, 652c273f0e, 88e7705079, f9e31da946, 88bb9e3328, 3b19cdba2a, 927d98a6cd, f6c811b320, 4fe3a556fa, fc3b4cda89, d470ebe78b, c7bcb00319, 74d45f0102, 9fddef3731, 885cf45087, 9352eeb752, 0ad0e738cd, bdc4308afb, d29cd4c2ed, a84c05cf91, e3d7f32af7, 3a75e74e34, 237dccba1e, b3f75fc812

59 changed files with 1647 additions and 471 deletions
.github/workflows/release.yaml (vendored): 32 changes
@@ -187,6 +187,13 @@ jobs:
   generate-windows-cuda:
     environment: release
     runs-on: windows
+    strategy:
+      matrix:
+        cuda:
+          - version: "11"
+            url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe'
+          - version: "12"
+            url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe'
     env:
       KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
     steps:

@@ -220,11 +227,11 @@ jobs:
        with:
          go-version-file: go.mod
          cache: true
-      - name: 'Install CUDA'
+      - name: 'Install CUDA ${{ matrix.cuda.version }}'
        run: |
          $ErrorActionPreference = "Stop"
          write-host "downloading CUDA Installer"
-          Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
+          Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
          write-host "Installing CUDA"
          Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
          write-host "Completed CUDA"

@@ -256,15 +263,16 @@ jobs:
          cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\"
      - uses: actions/upload-artifact@v4
        with:
-          name: generate-windows-cuda
+          name: generate-windows-cuda-${{ matrix.cuda.version }}
          path: |
            llm/build/**/bin/*
            dist/windows-amd64/**
      - uses: actions/upload-artifact@v4
        with:
-          name: windows-cuda-deps
+          name: windows-cuda-deps-${{ matrix.cuda.version }}
          path: dist/deps/*


  # Import the prior generation steps and build the final windows assets
  build-windows:
    environment: release

@@ -314,10 +322,16 @@ jobs:
          name: generate-windows-cpu
      - uses: actions/download-artifact@v4
        with:
-          name: generate-windows-cuda
+          name: generate-windows-cuda-11
      - uses: actions/download-artifact@v4
        with:
-          name: windows-cuda-deps
+          name: generate-windows-cuda-12
+      - uses: actions/download-artifact@v4
+        with:
+          name: windows-cuda-deps-11
+      - uses: actions/download-artifact@v4
+        with:
+          name: windows-cuda-deps-12
      - uses: actions/download-artifact@v4
        with:
          name: windows-rocm-deps

@@ -363,7 +377,6 @@ jobs:
      - run: |
          ./scripts/build_linux.sh
          ./scripts/build_docker.sh
-          mv dist/deps/* dist/
      - uses: actions/upload-artifact@v4
        with:
          name: dist-linux-amd64

@@ -459,7 +472,10 @@ jobs:
          merge-multiple: true
      - run: |
          ls -lh dist/
-          (cd dist; sha256sum * > sha256sum.txt)
+          (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt)
+          mv sha256sum.txt dist/
+          mv dist/linux-???64 .
+          mv dist/linux-amd64-rocm .
          cat dist/sha256sum.txt
      - name: Create or update Release
        run: |
@@ -87,20 +87,11 @@ DialogFontSize=12

[Files]
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
-Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
+Source: "..\ollama.exe"; DestDir: "{app}\bin"; Flags: ignoreversion 64bit
-Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
+Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
-#if DirExists("..\dist\windows-amd64\cuda")
-  Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
-#endif
-#if DirExists("..\dist\windows-amd64\oneapi")
-  Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
-#endif
-#if DirExists("..\dist\windows-amd64\rocm")
-  Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
-#endif
+Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs

[Icons]
Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"

@@ -108,7 +99,7 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen
Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"

[Run]
-Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden
+Filename: "{cmd}"; Parameters: "/C set PATH={app}\bin;%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden

[UninstallRun]
; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden

@@ -143,8 +134,8 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi

[Registry]
Root: HKCU; Subkey: "Environment"; \
-    ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \
-    Check: NeedsAddPath('{app}')
+    ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}\bin"; \
+    Check: NeedsAddPath('{app}\bin')

[Code]
@@ -11,6 +11,7 @@ import (
    "path/filepath"
    "sort"
    "sync"
+   "syscall"
    "unsafe"

    "golang.org/x/sys/windows"

@@ -433,7 +434,12 @@ func (t *winTray) setIcon(src string) error {
    t.muNID.Lock()
    defer t.muNID.Unlock()
    t.nid.Icon = h
-   t.nid.Flags |= NIF_ICON
+   t.nid.Flags |= NIF_ICON | NIF_TIP
+   if toolTipUTF16, err := syscall.UTF16FromString(commontray.ToolTip); err == nil {
+       copy(t.nid.Tip[:], toolTipUTF16)
+   } else {
+       return err
+   }
    t.nid.Size = uint32(unsafe.Sizeof(*t.nid))

    return t.nid.modify()
@@ -61,6 +61,7 @@ const (
    MIIM_SUBMENU        = 0x00000004
    MIM_APPLYTOSUBMENUS = 0x80000000
    NIF_ICON            = 0x00000002
+   NIF_TIP             = 0x00000004
    NIF_INFO            = 0x00000010
    NIF_MESSAGE         = 0x00000001
    SW_HIDE             = 0
cmd/cmd.go: 19 changes
@@ -204,6 +204,12 @@ func tempZipFiles(path string) (string, error) {
        // safetensors files might be unresolved git lfs references; skip if they are
        // covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
        files = append(files, st...)
+   } else if st, _ := glob(filepath.Join(path, "adapters.safetensors"), "application/octet-stream"); len(st) > 0 {
+       // covers adapters.safetensors
+       files = append(files, st...)
+   } else if st, _ := glob(filepath.Join(path, "adapter_model.safetensors"), "application/octet-stream"); len(st) > 0 {
+       // covers adapter_model.safetensors
+       files = append(files, st...)
    } else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 {
        // pytorch files might also be unresolved git lfs references; skip if they are
        // covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin

@@ -223,6 +229,14 @@ func tempZipFiles(path string) (string, error) {
    }
    files = append(files, js...)

+   // bert models require a nested config.json
+   // TODO(mxyng): merge this with the glob above
+   js, err = glob(filepath.Join(path, "**/*.json"), "text/plain")
+   if err != nil {
+       return "", err
+   }
+   files = append(files, js...)
+
    if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
        // add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
        // tokenizer.model might be a unresolved git lfs reference; error if it is

@@ -252,6 +266,11 @@ func tempZipFiles(path string) (string, error) {
            return "", err
        }

+       zfi.Name, err = filepath.Rel(path, file)
+       if err != nil {
+           return "", err
+       }
+
        zf, err := zipfile.CreateHeader(zfi)
        if err != nil {
            return "", err
@@ -7,16 +7,27 @@ import (
    "io"
    "io/fs"
    "log/slog"
+   "strings"

    "github.com/ollama/ollama/llm"
)

-type Parameters struct {
+type ModelParameters struct {
    Architectures []string `json:"architectures"`
    VocabSize     uint32   `json:"vocab_size"`
}

-func (Parameters) KV(t *Tokenizer) llm.KV {
+type AdapterParameters struct {
+   Alpha          uint32 `json:"lora_alpha"`
+   LoraLayers     uint32 `json:"lora_layers"`
+   LoraParameters struct {
+       Rank  uint32  `json:"rank"`
+       Alpha float32 `json:"alpha"`
+       Scale float32 `json:"scale"`
+   } `json:"lora_parameters"`
+}
+
+func (ModelParameters) KV(t *Tokenizer) llm.KV {
    kv := llm.KV{
        "general.file_type":            uint32(1),
        "general.quantization_version": uint32(2),

@@ -43,40 +54,119 @@ func (Parameters) KV(t *Tokenizer) llm.KV {
    return kv
}

-func (Parameters) specialTokenTypes() []string {
+func (p AdapterParameters) KV() llm.KV {
+   var alpha float32
+   if p.LoraParameters.Alpha == 0 {
+       alpha = float32(p.Alpha)
+   } else {
+       alpha = p.LoraParameters.Alpha
+   }
+
+   kv := llm.KV{
+       "adapter.lora.alpha": alpha,
+       "adapter.type":       "lora",
+       "general.file_type":  uint32(1),
+       "general.type":       "adapter",
+       "general.version":    "v0.2",
+   }
+
+   return kv
+}
+
+func (ModelParameters) specialTokenTypes() []string {
    return []string{
        "bos", "eos", "unk", "sep", "pad", "cls", "mask",
    }
}

-func (Parameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
    return llm.WriteGGUF(ws, kv, ts)
}

-type Converter interface {
+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
+   return llm.WriteGGUF(ws, kv, ts)
+}
+
+type ModelConverter interface {
    // KV maps parameters to LLM key-values
    KV(*Tokenizer) llm.KV
    // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
    Tensors([]Tensor) []llm.Tensor
+   // Replacements returns a list of string pairs to replace in tensor names.
+   // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
+   Replacements() []string

-   // tensorName returns the LLM tensor name for a specific input name
-   tensorName(string) string
    // specialTokenTypes returns any special token types the model uses
    specialTokenTypes() []string
+   // writeFile writes the model to the provided io.WriteSeeker
    writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
}

+type moreParser interface {
+   parseMore(fs.FS) error
+}
+
+type AdapterConverter interface {
+   // KV maps parameters to LLM key-values
+   KV(llm.KV) llm.KV
+   // Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
+   Tensors([]Tensor) []llm.Tensor
+   // Replacements returns a list of string pairs to replace in tensor names.
+   // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
+   Replacements() []string
+
+   writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+}
+
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
+   bts, err := fs.ReadFile(fsys, "adapter_config.json")
+   if err != nil {
+       return err
+   }
+
+   var p AdapterParameters
+   if err := json.Unmarshal(bts, &p); err != nil {
+       return err
+   }
+
+   arch, ok := baseKV["general.architecture"]
+   if !ok {
+       return errors.New("architecture not set for the base model")
+   }
+
+   var conv AdapterConverter
+   switch arch {
+   case "llama":
+       conv = &llamaAdapter{}
+   case "gemma2":
+       conv = &gemma2Adapter{}
+   default:
+       return errors.New("unsupported architecture")
+   }
+
+   ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
+   if err != nil {
+       return err
+   }
+
+   if err := json.Unmarshal(bts, conv); err != nil {
+       return err
+   }
+
+   return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
+}
+
// Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
// and files it finds in the input path.
// Supported input model formats include safetensors.
// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
-func Convert(fsys fs.FS, ws io.WriteSeeker) error {
+func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
    bts, err := fs.ReadFile(fsys, "config.json")
    if err != nil {
        return err
    }

-   var p Parameters
+   var p ModelParameters
    if err := json.Unmarshal(bts, &p); err != nil {
        return err
    }

@@ -85,16 +175,20 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
        return errors.New("unknown architecture")
    }

-   var conv Converter
+   var conv ModelConverter
    switch p.Architectures[0] {
    case "LlamaForCausalLM", "MistralForCausalLM":
-       conv = &llama{}
+       conv = &llamaModel{}
    case "MixtralForCausalLM":
-       conv = &mixtral{}
+       conv = &mixtralModel{}
    case "GemmaForCausalLM":
-       conv = &gemma{}
+       conv = &gemmaModel{}
+   case "Gemma2ForCausalLM":
+       conv = &gemma2Model{}
    case "Phi3ForCausalLM":
-       conv = &phi3{}
+       conv = &phi3Model{}
+   case "BertModel":
+       conv = &bertModel{}
    default:
        return errors.New("unsupported architecture")
    }

@@ -103,6 +197,12 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
        return err
    }

+   if t, ok := conv.(moreParser); ok {
+       if err := t.parseMore(fsys); err != nil {
+           return err
+       }
+   }
+
    t, err := parseTokenizer(fsys, conv.specialTokenTypes())
    if err != nil {
        return err

@@ -119,7 +219,7 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error {
        slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
    }

-   ts, err := parseTensors(fsys)
+   ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
    if err != nil {
        return err
    }
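Taken together, the changes above rename the model conversion entry point from Convert to ConvertModel and add a separate ConvertAdapter path for LoRA adapters, with tensor-name rewriting moved into per-architecture Replacements lists. A minimal caller sketch of the model path follows; the import path, file names, and error handling are assumptions for illustration, not part of this diff:

package main

import (
    "log"
    "os"

    "github.com/ollama/ollama/convert"
)

func main() {
    // Destination GGUF file; *os.File satisfies io.WriteSeeker.
    out, err := os.Create("model.gguf")
    if err != nil {
        log.Fatal(err)
    }
    defer out.Close()

    // ConvertModel reads config.json, tokenizer files, and safetensors
    // from the fs.FS rooted at the checkpoint directory (hypothetical path).
    if err := convert.ConvertModel(os.DirFS("Meta-Llama-3-8B-Instruct"), out); err != nil {
        log.Fatal(err)
    }
}

ConvertAdapter follows the same shape but additionally takes the base model's KV map so the adapter inherits head counts and architecture from the model it was trained against.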
convert/convert_bert.go (new file): 174 lines
@@ -0,0 +1,174 @@
package convert

import (
    "cmp"
    "encoding/json"
    "io/fs"
    "path/filepath"
    "slices"
    "strings"

    "github.com/ollama/ollama/llm"
)

type bertModel struct {
    ModelParameters
    NLayers               uint32  `json:"n_layers"`
    NumHiddenLayers       uint32  `json:"num_hidden_layers"`
    NLayer                uint32  `json:"n_layer"`
    MaxPositionEmbeddings uint32  `json:"max_position_embeddings"`
    NCtx                  uint32  `json:"n_ctx"`
    HiddenSize            uint32  `json:"hidden_size"`
    NEmbd                 uint32  `json:"n_embd"`
    IntermediateSize      uint32  `json:"intermediate_size"`
    NInner                uint32  `json:"n_inner"`
    NumAttentionHeads     uint32  `json:"num_attention_heads"`
    NHead                 uint32  `json:"n_head"`
    NumKeyValueHeads      uint32  `json:"num_key_value_heads"`
    LayerNormEPS          float32 `json:"layer_norm_eps"`
    LayerNormEpsilon      float32 `json:"layer_norm_epsilon"`
    NormEpsilon           float32 `json:"norm_epsilon"`

    PoolingType uint32
}

var (
    _ ModelConverter = (*bertModel)(nil)
    _ moreParser     = (*bertModel)(nil)
)

func (p *bertModel) parseMore(fsys fs.FS) error {
    bts, err := fs.ReadFile(fsys, "modules.json")
    if err != nil {
        return err
    }

    var modules []struct {
        Type string `json:"type"`
        Path string `json:"path"`
    }

    if err := json.Unmarshal(bts, &modules); err != nil {
        return err
    }

    var pooling string
    for _, m := range modules {
        if m.Type == "sentence_transformers.models.Pooling" {
            pooling = m.Path
            break
        }
    }

    if pooling != "" {
        bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json"))
        if err != nil {
            return err
        }

        var pc struct {
            PoolingModeCLSToken   bool `json:"pooling_mode_cls_token"`
            PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"`
        }

        if err := json.Unmarshal(bts, &pc); err != nil {
            return err
        }

        if pc.PoolingModeMeanTokens {
            p.PoolingType = 1
        } else if pc.PoolingModeCLSToken {
            p.PoolingType = 2
        }
    }

    return nil
}

func (p *bertModel) KV(t *Tokenizer) llm.KV {
    kv := p.ModelParameters.KV(t)
    kv["general.architecture"] = "bert"
    kv["bert.attention.causal"] = false
    kv["bert.pooling_type"] = p.PoolingType

    kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)

    if contextLength := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); contextLength > 0 {
        kv["bert.context_length"] = contextLength
    }

    if embeddingLength := cmp.Or(p.HiddenSize, p.NEmbd); embeddingLength > 0 {
        kv["bert.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
    }

    if feedForwardLength := cmp.Or(p.IntermediateSize, p.NInner); feedForwardLength > 0 {
        kv["bert.feed_forward_length"] = cmp.Or(p.IntermediateSize, p.NInner)
    }

    if headCount := cmp.Or(p.NumAttentionHeads, p.NHead); headCount > 0 {
        kv["bert.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead)
    }

    if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); layerNormEpsilon > 0 {
        kv["bert.attention.layer_norm_epsilon"] = layerNormEpsilon
    }

    kv["tokenizer.ggml.model"] = "bert"
    kv["tokenizer.ggml.token_type_count"] = uint32(2)

    // convert to phantom space tokens
    for i, e := range t.Tokens {
        if strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]") {
            // noop
        } else if strings.HasPrefix(e, "##") {
            t.Tokens[i] = e[2:]
        } else {
            t.Tokens[i] = "\u2581" + e
        }
    }

    kv["tokenizer.ggml.tokens"] = t.Tokens

    return kv
}

func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
    var out []llm.Tensor
    for _, t := range ts {
        if slices.Contains([]string{
            "embeddings.position_ids",
            "pooler.dense.weight",
            "pooler.dense.bias",
        }, t.Name()) {
            continue
        }

        out = append(out, llm.Tensor{
            Name:     t.Name(),
            Kind:     t.Kind(),
            Shape:    t.Shape(),
            WriterTo: t,
        })
    }

    return out
}

func (bertModel) Replacements() []string {
    return []string{
        "encoder.layer", "blk",
        "encoder.layers", "blk",
        "embeddings.word_embeddings", "token_embd",
        "embeddings.token_type_embeddings", "token_types",
        "embeddings.LayerNorm", "token_embd_norm",
        "embeddings.position_embeddings", "position_embd",
        "attention.self.query", "attn_q",
        "attention.self.key", "attn_k",
        "attention.self.value", "attn_v",
        "attention.output.dense", "attn_output",
        "attention.output.LayerNorm", "attn_output_norm",
        "intermediate.dense", "ffn_up",
        "output.dense", "ffn_down",
        "output.LayerNorm", "layer_output_norm",
    }
}
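A side note on the cmp.Or chains used throughout the new converter code: cmp.Or (Go standard library, 1.22+) returns the first of its arguments that is not the zero value, which is how the converter tolerates checkpoints that spell the same hyperparameter under different JSON keys. A tiny standalone illustration with made-up values:

package main

import (
    "cmp"
    "fmt"
)

func main() {
    // Hypothetical config where only num_hidden_layers is set;
    // n_layers and n_layer are absent and decode to their zero value.
    var nLayers, numHiddenLayers, nLayer uint32 = 0, 12, 0

    // cmp.Or picks the first non-zero candidate, here 12.
    blockCount := cmp.Or(nLayers, numHiddenLayers, nLayer)
    fmt.Println(blockCount) // 12
}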
@@ -9,8 +9,8 @@ import (
    "github.com/ollama/ollama/llm"
)

-type gemma struct {
-   Parameters
+type gemmaModel struct {
+   ModelParameters
    MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
    HiddenSize            uint32 `json:"hidden_size"`
    HiddenLayers          uint32 `json:"num_hidden_layers"`

@@ -21,12 +21,11 @@ type gemma struct {
    HeadDim uint32 `json:"head_dim"`
}

-var _ Converter = (*gemma)(nil)
+var _ ModelConverter = (*gemmaModel)(nil)

-func (p *gemma) KV(t *Tokenizer) llm.KV {
-   kv := p.Parameters.KV(t)
+func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
+   kv := p.ModelParameters.KV(t)
    kv["general.architecture"] = "gemma"
-   kv["general.name"] = "gemma"
    kv["gemma.context_length"] = p.MaxPositionEmbeddings
    kv["gemma.embedding_length"] = p.HiddenSize
    kv["gemma.block_count"] = p.HiddenLayers

@@ -43,16 +42,15 @@ func (p *gemma) KV(t *Tokenizer) llm.KV {
    return kv
}

-func (p *gemma) Tensors(ts []Tensor) []llm.Tensor {
+func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
    var out []llm.Tensor
    for _, t := range ts {
-       name := p.tensorName(t.Name())
-       if strings.HasSuffix(name, "_norm.weight") {
+       if strings.HasSuffix(t.Name(), "_norm.weight") {
            t.SetRepacker(p.addOne)
        }

        out = append(out, llm.Tensor{
-           Name:     name,
+           Name:     t.Name(),
            Kind:     t.Kind(),
            Shape:    t.Shape(),
            WriterTo: t,

@@ -62,8 +60,8 @@ func (p *gemma) Tensors(ts []Tensor) []llm.Tensor {
    return out
}

-func (p *gemma) tensorName(n string) string {
-   return strings.NewReplacer(
+func (p *gemmaModel) Replacements() []string {
+   return []string{
        "model.embed_tokens", "token_embd",
        "model.norm", "output_norm",
        "model.layers", "blk",

@@ -76,11 +74,10 @@ func (p *gemma) tensorName(n string) string {
        "mlp.down_proj", "ffn_down",
        "mlp.up_proj", "ffn_up",
        "post_attention_layernorm", "ffn_norm",
-       "block_sparse_moe.gate", "ffn_inp",
-   ).Replace(n)
+   }
}

-func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
+func (*gemmaModel) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
    n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data))
    ones := tensor.Ones(tensor.Float32, int(shape[0]))
convert/convert_gemma2.go (new file): 43 lines
@@ -0,0 +1,43 @@
package convert

import (
    "github.com/ollama/ollama/llm"
)

type gemma2Model struct {
    gemmaModel
    SlidingWindow         uint32  `json:"sliding_window"`
    AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"`
    FinalLogitSoftcap     float32 `json:"final_logit_softcapping"`
}

func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
    kv := p.ModelParameters.KV(t)
    kv["general.architecture"] = "gemma2"
    kv["gemma2.context_length"] = p.MaxPositionEmbeddings
    kv["gemma2.embedding_length"] = p.HiddenSize
    kv["gemma2.block_count"] = p.HiddenLayers
    kv["gemma2.feed_forward_length"] = p.IntermediateSize
    kv["gemma2.attention.head_count"] = p.NumAttentionHeads
    kv["gemma2.attention.head_count_kv"] = p.NumKeyValueHeads
    kv["gemma2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
    kv["gemma2.attention.key_length"] = p.HeadDim
    kv["gemma2.attention.value_length"] = p.HeadDim
    kv["gemma2.attention.sliding_window"] = p.SlidingWindow
    kv["gemma2.attn_logit_softcapping"] = p.AttentionLogitSoftcap
    kv["gemma2.final_logit_softcapping"] = p.FinalLogitSoftcap
    kv["tokenizer.ggml.eot_token_id"] = uint32(107)
    kv["tokenizer.ggml.middle_token_id"] = uint32(68)
    kv["tokenizer.ggml.prefix_token_id"] = uint32(67)
    kv["tokenizer.ggml.suffix_token_id"] = uint32(69)
    return kv
}

func (p *gemma2Model) Replacements() []string {
    return append(
        p.gemmaModel.Replacements(),
        "post_attention_layernorm", "post_attention_norm",
        "pre_feedforward_layernorm", "ffn_norm",
        "post_feedforward_layernorm", "post_ffw_norm",
    )
}
convert/convert_gemma2_adapter.go (new file): 91 lines
@@ -0,0 +1,91 @@
package convert

import (
    "strings"

    "github.com/pdevine/tensor"
    "github.com/pdevine/tensor/native"

    "github.com/ollama/ollama/llm"
)

type gemma2Adapter struct {
    AdapterParameters
}

var _ AdapterConverter = (*gemma2Adapter)(nil)

func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
    kv := p.AdapterParameters.KV()
    kv["general.architecture"] = "gemma2"
    return kv
}

func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
    var out []llm.Tensor
    for _, t := range ts {
        shape := t.Shape()
        if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
            (strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) {
            shape[0], shape[1] = shape[1], shape[0]
            t.SetRepacker(p.repack)
        }

        out = append(out, llm.Tensor{
            Name:     t.Name(),
            Kind:     t.Kind(),
            Shape:    t.Shape(),
            WriterTo: t,
        })
    }

    return out
}

func (p *gemma2Adapter) Replacements() []string {
    return []string{
        "base_model.model.", "",
        "model.layers", "blk",
        "self_attn.q_proj", "attn_q",
        "self_attn.k_proj", "attn_k",
        "self_attn.v_proj", "attn_v",
        "self_attn.o_proj", "attn_output",
        "mlp.gate_proj", "ffn_gate",
        "mlp.down_proj", "ffn_down",
        "mlp.up_proj", "ffn_up",
        "lora_A.weight", "weight.lora_a",
        "lora_B.weight", "weight.lora_b",
        "lora_a", "weight.lora_a",
        "lora_b", "weight.lora_b",
    }
}

func (p *gemma2Adapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
    dims := []int{int(shape[1]), int(shape[0])}

    n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))

    if err := n.T(1, 0); err != nil {
        return nil, err
    }

    if err := n.Reshape(dims...); err != nil {
        return nil, err
    }

    if err := n.Transpose(); err != nil {
        return nil, err
    }

    ts, err := native.SelectF32(n, 1)
    if err != nil {
        return nil, err
    }

    var f32s []float32
    for _, t := range ts {
        f32s = append(f32s, t...)
    }

    return f32s, nil
}
@@ -3,6 +3,7 @@ package convert
import (
    "cmp"
    "fmt"
+   "math"
    "strings"

    "github.com/pdevine/tensor"

@@ -11,8 +12,8 @@ import (
    "github.com/ollama/ollama/llm"
)

-type llama struct {
-   Parameters
+type llamaModel struct {
+   ModelParameters
    NLayers         uint32 `json:"n_layers"`
    NumHiddenLayers uint32 `json:"num_hidden_layers"`
    NLayer          uint32 `json:"n_layer"`

@@ -28,7 +29,13 @@ type llama struct {
    RopeTheta   float32 `json:"rope_theta"`
    RopeScaling struct {
        Type   string  `json:"type"`
+       RopeType string `json:"rope_type"`
        Factor float32 `json:"factor"`
+       LowFrequencyFactor              float32 `json:"low_freq_factor"`
+       HighFrequencyFactor             float32 `json:"high_freq_factor"`
+       OriginalMaxPositionalEmbeddings uint32  `json:"original_max_positional_embeddings"`
+
+       factors ropeFactor
    } `json:"rope_scaling"`
    RMSNormEPS   float32 `json:"rms_norm_eps"`
    LayerNormEPS float32 `json:"layer_norm_eps"`

@@ -37,12 +44,11 @@ type llama struct {
    HeadDim uint32 `json:"head_dim"`
}

-var _ Converter = (*llama)(nil)
+var _ ModelConverter = (*llamaModel)(nil)

-func (p *llama) KV(t *Tokenizer) llm.KV {
-   kv := p.Parameters.KV(t)
+func (p *llamaModel) KV(t *Tokenizer) llm.KV {
+   kv := p.ModelParameters.KV(t)
    kv["general.architecture"] = "llama"
-   kv["general.name"] = "llama"
    kv["llama.vocab_size"] = p.VocabSize

    kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)

@@ -71,6 +77,27 @@ func (p *llama) KV(t *Tokenizer) llm.KV {
    if p.RopeScaling.Type == "linear" {
        kv["llama.rope.scaling.type"] = p.RopeScaling.Type
        kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor
+   } else if p.RopeScaling.RopeType == "llama3" {
+       dim := p.HiddenSize / p.NumAttentionHeads
+       for i := uint32(0); i < dim; i += 2 {
+           factor := cmp.Or(p.RopeScaling.Factor, 8.0)
+           factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 1.0)
+           factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0)
+
+           original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192)
+           lambdaLow := float32(original) / factorLow
+           lambdaHigh := float32(original) / factorHigh
+
+           lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim))
+           if lambda < float64(lambdaHigh) {
+               p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0)
+           } else if lambda > float64(lambdaLow) {
+               p.RopeScaling.factors = append(p.RopeScaling.factors, factor)
+           } else {
+               smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow)
+               p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth))
+           }
+       }
    }

    if p.NumKeyValueHeads > 0 {

@@ -93,17 +120,26 @@ func (p *llama) KV(t *Tokenizer) llm.KV {
    return kv
}

-func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
+func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
    var out []llm.Tensor
+
+   if p.RopeScaling.factors != nil {
+       out = append(out, llm.Tensor{
+           Name:     "rope_freqs.weight",
+           Kind:     0,
+           Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
+           WriterTo: p.RopeScaling.factors,
+       })
+   }
+
    for _, t := range ts {
-       name := p.tensorName(t.Name())
-       if strings.HasSuffix(name, "attn_q.weight") ||
-           strings.HasSuffix(name, "attn_k.weight") {
+       if strings.HasSuffix(t.Name(), "attn_q.weight") ||
+           strings.HasSuffix(t.Name(), "attn_k.weight") {
            t.SetRepacker(p.repack)
        }

        out = append(out, llm.Tensor{
-           Name:     name,
+           Name:     t.Name(),
            Kind:     t.Kind(),
            Shape:    t.Shape(),
            WriterTo: t,

@@ -113,8 +149,8 @@ func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
    return out
}

-func (p *llama) tensorName(n string) string {
-   return strings.NewReplacer(
+func (p *llamaModel) Replacements() []string {
+   return []string{
        "lm_head", "output",
        "model.embed_tokens", "token_embd",
        "model.norm", "output_norm",

@@ -128,21 +164,19 @@ func (p *llama) tensorName(n string) string {
        "mlp.down_proj", "ffn_down",
        "mlp.up_proj", "ffn_up",
        "post_attention_layernorm", "ffn_norm",
-       // mixtral
-       "block_sparse_moe.gate", "ffn_gate_inp",
-   ).Replace(n)
+   }
}

-func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) {
+func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
    var dims []int
    for _, dim := range shape {
        dims = append(dims, int(dim))
    }

    var heads uint32
-   if strings.HasSuffix(name, "q_proj.weight") {
+   if strings.HasSuffix(name, "attn_q.weight") {
        heads = p.NumAttentionHeads
-   } else if strings.HasSuffix(name, "k_proj.weight") {
+   } else if strings.HasSuffix(name, "attn_k.weight") {
        heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
    } else {
        return nil, fmt.Errorf("unknown tensor for repack: %s", name)
convert/convert_llama_adapter.go (new file): 169 lines
@@ -0,0 +1,169 @@
package convert

import (
    "cmp"
    "strings"

    "github.com/pdevine/tensor"
    "github.com/pdevine/tensor/native"

    "github.com/ollama/ollama/llm"
)

type llamaAdapter struct {
    AdapterParameters
    NumAttentionHeads uint32 `json:"num_attention_heads"`
    NumKeyValueHeads  uint32 `json:"num_key_value_heads"`
}

var _ AdapterConverter = (*llamaAdapter)(nil)

func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
    kv := p.AdapterParameters.KV()
    kv["general.architecture"] = "llama"
    kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
    kv["llama.attention.head_count_kv"] = baseKV["llama.attention.head_count_kv"]

    p.NumAttentionHeads = baseKV["llama.attention.head_count"].(uint32)

    return kv
}

func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
    var out []llm.Tensor
    for _, t := range ts {
        shape := t.Shape()
        if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
            (strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) {
            shape[0], shape[1] = shape[1], shape[0]
            t.SetRepacker(p.repackAndTranspose)
        } else {
            t.SetRepacker(p.repack)
        }

        out = append(out, llm.Tensor{
            Name:     t.Name(),
            Kind:     t.Kind(),
            Shape:    shape,
            WriterTo: t,
        })
    }

    return out
}

func (p *llamaAdapter) Replacements() []string {
    return []string{
        "base_model.model.", "",
        "model.layers", "blk",
        "self_attn.q_proj", "attn_q",
        "self_attn.k_proj", "attn_k",
        "self_attn.v_proj", "attn_v",
        "self_attn.o_proj", "attn_output",
        "mlp.gate_proj", "ffn_gate",
        "mlp.down_proj", "ffn_down",
        "mlp.up_proj", "ffn_up",
        "lora_A.weight", "weight.lora_a",
        "lora_B.weight", "weight.lora_b",
        "lora_a", "weight.lora_a",
        "lora_b", "weight.lora_b",
    }
}

func (p *llamaAdapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
    dims := []int{int(shape[1]), int(shape[0])}

    var heads uint32
    if strings.HasSuffix(name, "attn_q.weight.lora_a") {
        heads = p.NumAttentionHeads
    } else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
        heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
    } else {
        return data, nil
    }

    n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))

    if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
        return nil, err
    }

    if err := n.T(0, 2, 1, 3); err != nil {
        return nil, err
    }

    if err := n.Reshape(dims...); err != nil {
        return nil, err
    }

    if err := n.Transpose(); err != nil {
        return nil, err
    }

    ts, err := native.SelectF32(n, 1)
    if err != nil {
        return nil, err
    }

    var f32s []float32
    for _, t := range ts {
        f32s = append(f32s, t...)
    }

    return f32s, nil
}

func (p *llamaAdapter) repackAndTranspose(name string, data []float32, shape []uint64) ([]float32, error) {
    dims := []int{int(shape[1]), int(shape[0])}

    n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))

    var heads uint32
    if strings.HasSuffix(name, "attn_q.weight.lora_a") {
        heads = p.NumAttentionHeads
    } else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
        heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
    }

    if heads > 0 {
        if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
            return nil, err
        }

        if err := n.T(0, 2, 1, 3); err != nil {
            return nil, err
        }

        if err := n.Reshape(dims...); err != nil {
            return nil, err
        }

        if err := n.Transpose(); err != nil {
            return nil, err
        }
    }

    if err := n.T(1, 0); err != nil {
        return nil, err
    }

    if err := n.Reshape(dims...); err != nil {
        return nil, err
    }

    if err := n.Transpose(); err != nil {
        return nil, err
    }

    ts, err := native.SelectF32(n, 1)
    if err != nil {
        return nil, err
    }

    var f32s []float32
    for _, t := range ts {
        f32s = append(f32s, t...)
    }

    return f32s, nil
}
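For readers following the lora_a/lora_b handling in the two adapter converters above: when a tensor arrives transposed relative to the layout the GGUF writer expects, the converter swaps the recorded dimensions and registers a repacker that physically reorders the data. The sketch below shows the underlying row-major transpose on plain float32 slices; it is an illustration with toy data, not the tensor-library-based implementation used in this diff:

package main

import "fmt"

// transpose converts a rows x cols row-major matrix into cols x rows.
func transpose(data []float32, rows, cols int) []float32 {
    out := make([]float32, len(data))
    for r := 0; r < rows; r++ {
        for c := 0; c < cols; c++ {
            out[c*rows+r] = data[r*cols+c]
        }
    }
    return out
}

func main() {
    // A 2x3 "lora_a"-style matrix stored row-major.
    a := []float32{1, 2, 3, 4, 5, 6}
    fmt.Println(transpose(a, 2, 3)) // [1 4 2 5 3 6]
}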
@@ -9,16 +9,14 @@ import (
    "github.com/ollama/ollama/llm"
)

-type mixtral struct {
-   llama
+type mixtralModel struct {
+   llamaModel
    NumLocalExperts    uint32 `json:"num_local_experts"`
    NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
}

-var _ Converter = (*mixtral)(nil)
-
-func (p *mixtral) KV(t *Tokenizer) llm.KV {
-   kv := p.llama.KV(t)
+func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
+   kv := p.llamaModel.KV(t)

    if p.NumLocalExperts > 0 {
        kv["llama.expert_count"] = p.NumLocalExperts

@@ -31,7 +29,7 @@ func (p *mixtral) KV(t *Tokenizer) llm.KV {
    return kv
}

-func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
    oldnew := []string{
        "model.layers", "blk",
        "w1", "ffn_gate_exps",

@@ -69,7 +67,14 @@ func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor {
        })
    }

-   return append(out, p.llama.Tensors(ts)...)
+   return append(out, p.llamaModel.Tensors(ts)...)
+}
+
+func (p *mixtralModel) Replacements() []string {
+   return append(
+       p.llamaModel.Replacements(),
+       "block_sparse_moe.gate", "ffn_gate_inp",
+   )
}

type experts []Tensor
@@ -11,8 +11,8 @@ import (
    "github.com/ollama/ollama/llm"
)

-type phi3 struct {
-   Parameters
+type phi3Model struct {
+   ModelParameters
    NumHiddenLayers uint32 `json:"num_hidden_layers"`
    NLayers         uint32 `json:"n_layers"`
    HiddenSize      uint32 `json:"hidden_size"`

@@ -35,12 +35,11 @@ type phi3 struct {
    SlidingWindow uint32 `json:"sliding_window"`
}

-var _ Converter = (*phi3)(nil)
+var _ ModelConverter = (*phi3Model)(nil)

-func (p *phi3) KV(t *Tokenizer) llm.KV {
-   kv := p.Parameters.KV(t)
+func (p *phi3Model) KV(t *Tokenizer) llm.KV {
+   kv := p.ModelParameters.KV(t)
    kv["general.architecture"] = "phi3"
-   kv["general.name"] = "phi3"
    kv["phi3.context_length"] = p.MaxPositionEmbeddings
    kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
    kv["phi3.feed_forward_length"] = p.IntermediateSize

@@ -69,13 +68,12 @@ func (p *phi3) KV(t *Tokenizer) llm.KV {
    return kv
}

-func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
    var addRopeFactors sync.Once

    out := make([]llm.Tensor, 0, len(ts)+2)
    for _, t := range ts {
-       name := p.tensorName(t.Name())
-       if strings.HasPrefix(name, "blk.0.") {
+       if strings.HasPrefix(t.Name(), "blk.0.") {
            addRopeFactors.Do(func() {
                out = append(out, llm.Tensor{
                    Name: "rope_factors_long.weight",

@@ -92,7 +90,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
        }

        out = append(out, llm.Tensor{
-           Name:     name,
+           Name:     t.Name(),
            Kind:     t.Kind(),
            Shape:    t.Shape(),
            WriterTo: t,

@@ -102,8 +100,8 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
    return out
}

-func (p *phi3) tensorName(n string) string {
-   return strings.NewReplacer(
+func (p *phi3Model) Replacements() []string {
+   return []string{
        "lm_head", "output",
        "model.embed_tokens", "token_embd",
        "model.norm", "output_norm",

@@ -114,7 +112,7 @@ func (p *phi3) tensorName(n string) string {
        "mlp.down_proj", "ffn_down",
        "mlp.gate_up_proj", "ffn_up",
        "post_attention_layernorm", "ffn_norm",
-   ).Replace(n)
+   }
}

type ropeFactor []float32
@ -1,7 +1,9 @@
|
||||||
package convert
|
package convert
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"crypto/sha256"
|
"crypto/sha256"
|
||||||
|
"encoding/binary"
|
||||||
"encoding/hex"
|
"encoding/hex"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"flag"
|
"flag"
|
||||||
|
@ -29,7 +31,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
|
||||||
}
|
}
|
||||||
defer f.Close()
|
defer f.Close()
|
||||||
|
|
||||||
if err := Convert(fsys, f); err != nil {
|
if err := ConvertModel(fsys, f); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -51,37 +53,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
 	return r, m.KV(), m.Tensors()
 }

-func TestMain(m *testing.M) {
-	var level slog.Level
-	flag.TextVar(&level, "level", slog.LevelInfo, "log level")
-	flag.Parse()
-	slog.SetLogLoggerLevel(level)
-	os.Exit(m.Run())
-}
-
-func TestConvertFull(t *testing.T) {
-	cases := []string{
-		"Meta-Llama-3-8B-Instruct",
-		"Mistral-7B-Instruct-v0.2",
-		"Mixtral-8x7B-Instruct-v0.1",
-		"gemma-2b-it",
-		// microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8
-		"Phi-3-mini-128k-instruct",
-	}
-
-	for i := range cases {
-		tt := cases[i]
-		t.Run(tt, func(t *testing.T) {
-			t.Parallel()
-
-			p := filepath.Join("testdata", tt)
-			if testing.Short() {
-				t.Skip("skipping in short mode")
-			} else if _, err := os.Stat(p); err != nil {
-				t.Skipf("%s not found", p)
-			}
-
-			f, kv, tensors := convertFull(t, os.DirFS(p))
+func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors llm.Tensors) map[string]string {
 	actual := make(map[string]string)
 	for k, v := range kv {
 		if s, ok := v.(json.Marshaler); !ok {
@@ -106,6 +78,45 @@ func TestConvertFull(t *testing.T) {
 		actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil))
 	}

+	return actual
+}
+
+func TestMain(m *testing.M) {
+	var level slog.Level
+	flag.TextVar(&level, "level", slog.LevelInfo, "log level")
+	flag.Parse()
+	slog.SetLogLoggerLevel(level)
+	os.Exit(m.Run())
+}
+
+func TestConvertFull(t *testing.T) {
+	cases := []string{
+		"Meta-Llama-3-8B-Instruct",
+		"Meta-Llama-3.1-8B-Instruct",
+		"Mistral-7B-Instruct-v0.2",
+		"Mixtral-8x7B-Instruct-v0.1",
+		"gemma-2b-it",
+		// microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8
+		"Phi-3-mini-128k-instruct",
+		"all-MiniLM-L6-v2",
+		"gemma-2-9b-it",
+	}
+
+	for i := range cases {
+		tt := cases[i]
+		t.Run(tt, func(t *testing.T) {
+			t.Parallel()
+
+			p := filepath.Join("testdata", tt)
+			if testing.Short() {
+				t.Skip("skipping in short mode")
+			} else if _, err := os.Stat(p); err != nil {
+				t.Skipf("%s not found", p)
+			}
+
+			f, kv, tensors := convertFull(t, os.DirFS(p))
+			actual := generateResultsJSON(t, f, kv, tensors)
+
 	expectFile, err := os.Open(filepath.Join("testdata", fmt.Sprintf("%s.json", tt)))
 	if err != nil {
 		t.Fatal(err)
|
@ -128,3 +139,209 @@ func TestConvertFull(t *testing.T) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestConvertAdapter(t *testing.T) {
|
||||||
|
type AdapterCase struct {
|
||||||
|
Name string
|
||||||
|
BaseKV map[string]any
|
||||||
|
Expected map[string]string
|
||||||
|
}
|
||||||
|
|
||||||
|
cases := []AdapterCase{
|
||||||
|
{
|
||||||
|
Name: "discollama",
|
||||||
|
BaseKV: map[string]any{
|
||||||
|
"general.architecture": "llama",
|
||||||
|
"llama.attention.head_count": uint32(32),
|
||||||
|
"llama.attention.head_count_kv": uint32(8),
|
||||||
|
},
|
||||||
|
Expected: map[string]string{
|
||||||
|
"general.architecture": "llama",
|
||||||
|
"general.file_type": "1",
|
||||||
|
"general.parameter_count": "106496",
|
||||||
|
"general.type": "adapter",
|
||||||
|
"general.version": "v0.2",
|
||||||
|
"adapter.lora.alpha": "16",
|
||||||
|
"adapter.type": "lora",
|
||||||
|
"llama.attention.head_count": "32",
|
||||||
|
"llama.attention.head_count_kv": "8",
|
||||||
|
"blk.31.attn_q.weight.lora_a": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
|
||||||
|
"blk.31.attn_q.weight.lora_b": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
|
||||||
|
"blk.31.attn_v.weight.lora_a": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
|
||||||
|
"blk.31.attn_v.weight.lora_b": "071dcafe89df065d6e1c935ecb8fdf6479b3c202eb912e7da938597673ff5857",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, c := range cases {
|
||||||
|
t.Run(c.Name, func(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
f, err := os.CreateTemp(t.TempDir(), "f16")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
tempDir := t.TempDir()
|
||||||
|
generateLoraTestData(t, tempDir)
|
||||||
|
|
||||||
|
if err = ConvertAdapter(os.DirFS(tempDir), f, c.BaseKV); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
r, err := os.Open(f.Name())
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
defer r.Close()
|
||||||
|
|
||||||
|
m, _, err := llm.DecodeGGML(r, math.MaxInt)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, err := r.Seek(0, io.SeekStart); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
actual := generateResultsJSON(t, r, m.KV(), m.Tensors())
|
||||||
|
|
||||||
|
keys := maps.Keys(c.Expected)
|
||||||
|
slices.Sort(keys)
|
||||||
|
for _, k := range keys {
|
||||||
|
if v, ok := actual[k]; !ok {
|
||||||
|
t.Errorf("missing %s", k)
|
||||||
|
} else if v != c.Expected[k] {
|
||||||
|
t.Errorf("unexpected %s: want %s, got %s", k, c.Expected[k], v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func generateLoraTestData(t *testing.T, tempDir string) {
|
||||||
|
type tensorData struct {
|
||||||
|
Offsets []int `json:"data_offsets"`
|
||||||
|
Type string `json:"dtype"`
|
||||||
|
Shape []int `json:"shape"`
|
||||||
|
}
|
||||||
|
offset := 4096 * 8 * 4
|
||||||
|
|
||||||
|
td := map[string]*tensorData{"__metadata__": nil}
|
||||||
|
td["model.layers.31.self_attn.q_proj.lora_a"] = &tensorData{
|
||||||
|
Offsets: []int{0, offset},
|
||||||
|
Type: "F32",
|
||||||
|
Shape: []int{4096, 8},
|
||||||
|
}
|
||||||
|
td["model.layers.31.self_attn.q_proj.lora_b"] = &tensorData{
|
||||||
|
Offsets: []int{offset, offset * 2},
|
||||||
|
Type: "F32",
|
||||||
|
Shape: []int{8, 4096},
|
||||||
|
}
|
||||||
|
td["model.layers.31.self_attn.v_proj.lora_a"] = &tensorData{
|
||||||
|
Offsets: []int{offset * 2, offset * 3},
|
||||||
|
Type: "F32",
|
||||||
|
Shape: []int{4096, 8},
|
||||||
|
}
|
||||||
|
td["model.layers.31.self_attn.v_proj.lora_b"] = &tensorData{
|
||||||
|
Offsets: []int{offset * 3, offset*3 + 8*1024*4},
|
||||||
|
Type: "F32",
|
||||||
|
Shape: []int{8, 1024},
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := json.Marshal(td)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var buf bytes.Buffer
|
||||||
|
|
||||||
|
l := int64(len(data))
|
||||||
|
err = binary.Write(&buf, binary.LittleEndian, l)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err = buf.Write(data)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// write some data for the tensors
|
||||||
|
|
||||||
|
ones := make([]float32, 4096*8)
|
||||||
|
for i := range ones {
|
||||||
|
ones[i] = float32(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
for range 3 {
|
||||||
|
err = binary.Write(&buf, binary.LittleEndian, ones)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ones = make([]float32, 1024*8)
|
||||||
|
for i := range ones {
|
||||||
|
ones[i] = float32(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
err = binary.Write(&buf, binary.LittleEndian, ones)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
fdata, err := os.Create(filepath.Join(tempDir, "adapters.safetensors"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
defer fdata.Close()
|
||||||
|
|
||||||
|
_, err = fdata.Write(buf.Bytes())
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
configData := `
|
||||||
|
{
|
||||||
|
"adapter_path": "adapters-test",
|
||||||
|
"batch_size": 8,
|
||||||
|
"config": "config-tiny.json",
|
||||||
|
"data": "../discollama-completion",
|
||||||
|
"grad_checkpoint": null,
|
||||||
|
"iters": 1000,
|
||||||
|
"learning_rate": 1e-05,
|
||||||
|
"lora_layers": 1,
|
||||||
|
"lora_parameters": {
|
||||||
|
"rank": 8,
|
||||||
|
"alpha": 16,
|
||||||
|
"dropout": 0.0,
|
||||||
|
"scale": 2.0
|
||||||
|
},
|
||||||
|
"lr_schedule": null,
|
||||||
|
"max_seq_length": 2048,
|
||||||
|
"model": "/Users/pdevine/git/Meta-Llama-3-8B-Instruct",
|
||||||
|
"resume_adapter_file": null,
|
||||||
|
"save_every": 100,
|
||||||
|
"seed": 0,
|
||||||
|
"steps_per_eval": 200,
|
||||||
|
"steps_per_report": 10,
|
||||||
|
"test": false,
|
||||||
|
"test_batches": 500,
|
||||||
|
"train": true,
|
||||||
|
"use_dora": false,
|
||||||
|
"val_batches": 25
|
||||||
|
}
|
||||||
|
`
|
||||||
|
f, err := os.Create(filepath.Join(tempDir, "adapter_config.json"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
_, err = f.WriteString(configData)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@@ -35,7 +35,9 @@ const (
 )

 func (t tensorBase) Kind() uint32 {
-	if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") {
+	if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
+		t.name == "token_types.weight" {
+		// these tensors are always F32
 		return 0
 	}

@@ -55,13 +57,15 @@ func (t *tensorBase) SetRepacker(fn repacker) {

 type repacker func(string, []float32, []uint64) ([]float32, error)

-func parseTensors(fsys fs.FS) ([]Tensor, error) {
+func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
 	patterns := []struct {
 		Pattern string
-		Func    func(fs.FS, ...string) ([]Tensor, error)
+		Func    func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error)
 	}{
 		{"model-*-of-*.safetensors", parseSafetensors},
 		{"model.safetensors", parseSafetensors},
+		{"adapters.safetensors", parseSafetensors},
+		{"adapter_model.safetensors", parseSafetensors},
 		{"pytorch_model-*-of-*.bin", parseTorch},
 		{"pytorch_model.bin", parseTorch},
 		{"consolidated.*.pth", parseTorch},
@@ -74,7 +78,7 @@ func parseTensors(fsys fs.FS) ([]Tensor, error) {
 		}

 		if len(matches) > 0 {
-			return pattern.Func(fsys, matches...)
+			return pattern.Func(fsys, replacer, matches...)
 		}
 	}

@@ -8,6 +8,7 @@ import (
 	"io"
 	"io/fs"
 	"slices"
+	"strings"

 	"github.com/d4l3k/go-bfloat16"
 	"github.com/x448/float16"
@@ -20,7 +21,7 @@ type safetensorMetadata struct {
 	Offsets []int64 `json:"data_offsets"`
 }

-func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
+func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
 	var ts []Tensor
 	for _, p := range ps {
 		f, err := fsys.Open(p)
@@ -56,7 +57,7 @@ func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
 				offset: safetensorsPad(n, value.Offsets[0]),
 				size:   safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]),
 				tensorBase: &tensorBase{
-					name:  key,
+					name:  replacer.Replace(key),
 					shape: value.Shape,
 				},
 			})

@@ -3,12 +3,13 @@ package convert
 import (
 	"io"
 	"io/fs"
+	"strings"

 	"github.com/nlpodyssey/gopickle/pytorch"
 	"github.com/nlpodyssey/gopickle/types"
 )

-func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) {
+func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
 	var ts []Tensor
 	for _, p := range ps {
 		pt, err := pytorch.Load(p)
@@ -27,7 +28,7 @@ func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) {
 		ts = append(ts, torch{
 			storage: t.(*pytorch.Tensor).Source,
 			tensorBase: &tensorBase{
-				name:  k.(string),
+				name:  replacer.Replace(k.(string)),
 				shape: shape,
 			},
 		})
3 convert/testdata/Meta-Llama-3.1-8B-Instruct.json vendored Normal file
@@ -0,0 +1,3 @@
+{
+  "rope_freqs.weight": "80fd5efb2f729381785b293a091a268cfeceb0079167f6ece9b07070e662b222"
+}
124
convert/testdata/all-MiniLM-L6-v2.json
vendored
Normal file
124
convert/testdata/all-MiniLM-L6-v2.json
vendored
Normal file
|
@ -0,0 +1,124 @@
|
||||||
|
{
|
||||||
|
"general.architecture": "bert",
|
||||||
|
"general.file_type": "1",
|
||||||
|
"general.quantization_version": "2",
|
||||||
|
"bert.attention.causal": "false",
|
||||||
|
"bert.attention.head_count": "12",
|
||||||
|
"bert.attention.layer_norm_epsilon": "1e-12",
|
||||||
|
"bert.block_count": "6",
|
||||||
|
"bert.context_length": "512",
|
||||||
|
"bert.embedding_length": "384",
|
||||||
|
"bert.feed_forward_length": "1536",
|
||||||
|
"bert.pooling_type": "1",
|
||||||
|
"tokenizer.ggml.model": "bert",
|
||||||
|
"tokenizer.ggml.padding_token_id": "0",
|
||||||
|
"tokenizer.ggml.unknown_token_id": "100",
|
||||||
|
"tokenizer.ggml.cls_token_id": "101",
|
||||||
|
"tokenizer.ggml.seperator_token_id": "102",
|
||||||
|
"tokenizer.ggml.mask_token_id": "103",
|
||||||
|
"tokenizer.ggml.token_type_count": "2",
|
||||||
|
"tokenizer.ggml.scores": "6db964fe67338aca57790481a390121ff3dd643eebe49f7dd308029ad99abb6f",
|
||||||
|
"tokenizer.ggml.token_type": "98d247c5404b6b18f05f133b92dd56edf6efefefac326794b00d7b351f6c5aa1",
|
||||||
|
"tokenizer.ggml.tokens": "9efe405e229a45ff9916f54c475d151d2200cd2ab0006f347abfb069cf096c86",
|
||||||
|
"token_embd.weight": "8c1ee80a9ea4f65aa385ba30112010068af3d209bebc6e149d3d4589c2cd0a5a",
|
||||||
|
"position_embd.weight": "6c516f0b1c4e2388ab90394dd80ad69e4e4509b890982fc3408108ae66210eb6",
|
||||||
|
"token_types.weight": "f879f8e422ed211948f28b560d3c5e17aae7993f063b51196a28cf5c0fb3da21",
|
||||||
|
"token_embd_norm.weight": "75076e095d717aab96f8b6beeee503c27940d9a76f2b891a0e3de72f8a6043e4",
|
||||||
|
"token_embd_norm.bias": "298735285ffe944e1bf03e5d35c7280326b85cf121bde9874f1af5dc51ab939d",
|
||||||
|
"blk.0.attn_q.weight": "ab0923ce4c1549175112dcdfcc860fe30137f991e03ea6857fb5993670adaf6c",
|
||||||
|
"blk.0.attn_q.bias": "a3ec29551dabf976e1d34256b8ab5ab7b758f3ed9742c3cafdbd984d5441df62",
|
||||||
|
"blk.0.attn_k.weight": "4c1038a6d035c3e9ffed7fa672b614627814752503755fbad0cfb76a41ad71ba",
|
||||||
|
"blk.0.attn_k.bias": "e0363930eb588d91816aa3d230bb03b6e2551c165117b80b8d60397413819ef9",
|
||||||
|
"blk.0.attn_v.weight": "425e2e53e3f00ce98d29c3e6a161eb55d3e6ae0d96fdb9f6242d1c4fd6eef4b3",
|
||||||
|
"blk.0.attn_v.bias": "6579173a1e65ee124fbd0bd53cbdca4225515b4f2c5f18fb1bfd000f5978f9bb",
|
||||||
|
"blk.0.attn_output.weight": "a6d70a08cd7164de5d12af65d86d657c3db35aaecde778b2b3fda9193c4c9802",
|
||||||
|
"blk.0.attn_output.bias": "2b8d12c4f9a9c5bfaa29c597839568f6e0525cb41eeaf64ddeb6bd84dfeb9701",
|
||||||
|
"blk.0.attn_output_norm.weight": "bbe6e502a473228b525aeed26cc31b7db123ad63bdc5a6eebac6ea70b8b51d62",
|
||||||
|
"blk.0.attn_output_norm.bias": "36eaacaf0007c5c62daea97aab0115390c0682914f78482e37eb76885f4b7a50",
|
||||||
|
"blk.0.ffn_up.weight": "24654561c76ce387d125759ba843f06b904ef721fcceaeff6ccc62180a48e874",
|
||||||
|
"blk.0.ffn_up.bias": "fd3f0126aa1d95768fa60eb6f4ab8a2763cfcb7e5405f35b92353031d86f4d34",
|
||||||
|
"blk.0.ffn_down.weight": "97a829763a6a5bf3329ceb4d39c424ba4787d61653a5b0bbd1f84782e4d4e0ca",
|
||||||
|
"blk.0.ffn_down.bias": "7aa980c30ae8b4ee7f69df28808dbf5c431f56ccc4a80340f644a0419f16c054",
|
||||||
|
"blk.0.layer_output_norm.weight": "ef30dad4c2a083ae1ff5039a2a6cda60ecc89bf1e486a6f8c0d15f50589603f8",
|
||||||
|
"blk.0.layer_output_norm.bias": "8b1b77e67568b1bce43fc476de1b177c53ff688d66beb66995e8eb3dc290da8a",
|
||||||
|
"blk.1.attn_q.weight": "284331622a1f6f9b87ccee4f652bd66a394ca493c4d93be4d1844e4f6159ad10",
|
||||||
|
"blk.1.attn_q.bias": "e24ebd4860330e08f6bfdd077a82db0bee33f4c8846cf1db26327a34754c7069",
|
||||||
|
"blk.1.attn_k.weight": "729dd0d555544b5bd0f7580b3c8b384256b974605f0e7487b95f295aa032997d",
|
||||||
|
"blk.1.attn_k.bias": "2aa51a828a858f35473f54477583fea54ce2ccc34ea60fbd1d228fbe9bca827f",
|
||||||
|
"blk.1.attn_v.weight": "6be304671cc311d5ca5c103f2b51467ee800c589bc5b8101e09ff5aed1f68c21",
|
||||||
|
"blk.1.attn_v.bias": "43bcbab78a8819e07f723bc9e5b737b71e87a7594f15234e882b63e327a64199",
|
||||||
|
"blk.1.attn_output.weight": "15ec8a1a12b26c9976445308a09f748ab0e4bef0f583d13ab08c3129f8738d73",
|
||||||
|
"blk.1.attn_output.bias": "dac2146f4baa6ed16f6c0dc7443831fb7ec79bedcceafd80d1a4b628a1bb072d",
|
||||||
|
"blk.1.attn_output_norm.weight": "d2151eb33bffac536787a4c9a5d2b31c7a80b17c4611877842a3cce2cd6e98d8",
|
||||||
|
"blk.1.attn_output_norm.bias": "31e1b779716dafb855d2cf5631ee168a0ccf372eb9c6ea6091f66fa97a9b9d2d",
|
||||||
|
"blk.1.ffn_up.weight": "a57547fc3fc3b77406f5cdcb0c87af9bc184701f175c39c1f35297826fce3cc7",
|
||||||
|
"blk.1.ffn_up.bias": "123be6d541d086202913c75d878c54d59a749f3af7b58f7ef9eb9e7c62a24c9a",
|
||||||
|
"blk.1.ffn_down.weight": "cfdb79788377e5cbded8790cd41b9e66c397ecab75474071fcd7cf32d30f9613",
|
||||||
|
"blk.1.ffn_down.bias": "bcb58315519a573097960891c9ae41cf4c685ab78c3e0e77471471758a7eae88",
|
||||||
|
"blk.1.layer_output_norm.weight": "819b554271452bfb1d84c2603b90377b2e41a0ac1e3aa8b417ccf9dce63375bd",
|
||||||
|
"blk.1.layer_output_norm.bias": "47a3433ac27f5ce8947fb38dd491f3706df4ef6adb0ddf74612bf0f54b19e164",
|
||||||
|
"blk.2.attn_q.weight": "1557a9ea852b1880551f7290e00aded4f35e6c4180fdcbed1b0039bf805f639e",
|
||||||
|
"blk.2.attn_q.bias": "c3bfe5f3066f655fd36b055530997b59ff33ef013563aaeb3cb8ff07dabd59a9",
|
||||||
|
"blk.2.attn_k.weight": "cfd08eb69c61ae2f9f14f9b7ff5c5394ca264b1a9f3d48156677f90dd1766289",
|
||||||
|
"blk.2.attn_k.bias": "9b839bc0e79974a0b3f5d1895972bc6f5c9a1bc16052e1af786e6a530758152d",
|
||||||
|
"blk.2.attn_v.weight": "02b26b1208480eaeeb00e7b4cf8b690006ca14759357fc44ed4a2a8924ead993",
|
||||||
|
"blk.2.attn_v.bias": "e7e6f0089fded1659a867ab736c220d9653ea7da6b1b94baf5c8d30a748b63ab",
|
||||||
|
"blk.2.attn_output.weight": "a1db121c7d33806b349cadd050300a57db49fdc91224fd07c9ac43bf4299dc79",
|
||||||
|
"blk.2.attn_output.bias": "7675128b6a92555cd955c820311e91e9417d31f48848f45d047b4100c62148b3",
|
||||||
|
"blk.2.attn_output_norm.weight": "5b4595e0fbcba67a700c4331adf746d2fba3546364a4db5607ae241947bb1a21",
|
||||||
|
"blk.2.attn_output_norm.bias": "7b8e16826ea30e5a2ba0b02e0095a901775981a296e98819625320e983060d08",
|
||||||
|
"blk.2.ffn_up.weight": "a0d815d946ac07a65095c4ae4df77b818845e6d97795c7d82f55e689d944db59",
|
||||||
|
"blk.2.ffn_up.bias": "ce37c0a4174d6bf773ded7bd016ede627ad3bdb8bc99b9992a18dc8e8898f252",
|
||||||
|
"blk.2.ffn_down.weight": "f6231d2a25426fbd45b9f1160aa484220eb227ceef0348c4a6a6de890606e5ef",
|
||||||
|
"blk.2.ffn_down.bias": "429e00556e8dc63a785238b309b9d83738500c1ef6d736fe6526ad88ea496d27",
|
||||||
|
"blk.2.layer_output_norm.weight": "651457a573adf3f7dd9ee5dfe1c8e89389e94443993aab77ec6a0b05aa621e35",
|
||||||
|
"blk.2.layer_output_norm.bias": "41fbbeda7fd89b0cef5f945ae44011c316982390401d6f75ba8c6d365e185247",
|
||||||
|
"blk.3.attn_q.weight": "95a43f32949d2cb8d22815bb27a44abfc6665ba96221af817dfe058cb6ca72c6",
|
||||||
|
"blk.3.attn_q.bias": "f4e34385e75d8108b6b3bd336106e2133a8c9be0cc343dfe5dc48c32a823c7cb",
|
||||||
|
"blk.3.attn_k.weight": "6b892da6a17d4d3265265a15f695864a31813ee8c8e710ae9bc9e1adbc6c9a18",
|
||||||
|
"blk.3.attn_k.bias": "40b8067b641a56014cee42548240aa8930820958b1933004892b5f04fbaef39e",
|
||||||
|
"blk.3.attn_v.weight": "9fcd5922319dd2a461082a5ce040c1dfe65d87d70ca6547dd0b46eeecc3eeb2b",
|
||||||
|
"blk.3.attn_v.bias": "b528c56212e66931fdbe267ac327a9c2f87cd03baff3ea719e30afe681da15f1",
|
||||||
|
"blk.3.attn_output.weight": "e3b178c1b03981e75510e0d277af23ea59cc404b5394e61bd32291825719b502",
|
||||||
|
"blk.3.attn_output.bias": "712c84d39a6a5a9c06a09da8fd9939ba0d5525524a4bba61ea4de09b48f45cae",
|
||||||
|
"blk.3.attn_output_norm.weight": "d1ffac88e675592ff72f8a617be32b4a381d443b2f8f2645dbe44a1e5745aac0",
|
||||||
|
"blk.3.attn_output_norm.bias": "ea31a1c73146234c50e0e43f485c458413714867b8e2703af66482f7db2d6c40",
|
||||||
|
"blk.3.ffn_up.weight": "4ef4f3b9a1ea6ab2ef2eb6e8b008e06a44790d099d97482a05a51e39a29afac0",
|
||||||
|
"blk.3.ffn_up.bias": "06a4296dda16f452675c51f108079fe7722552d6521c737d97734943818b9a2b",
|
||||||
|
"blk.3.ffn_down.weight": "f114b2bebe392c7d80433bb880c6730293aa4561b0b0370dcdaf7472daebd847",
|
||||||
|
"blk.3.ffn_down.bias": "2c8e67831d28a3bf613fc7912ae3259b63d72abcaf4d30efd8800758400158de",
|
||||||
|
"blk.3.layer_output_norm.weight": "a1dfeb7b5a51dd56447312ca41e2ad2f361a3ea12ddc355127f5f4219fb0a482",
|
||||||
|
"blk.3.layer_output_norm.bias": "1ed630021b25c6c6fc93fd32988b9907df966d4982a93081f639aac3044618ab",
|
||||||
|
"blk.4.attn_q.weight": "b5fae4c1f9a5f33a2a2e816ac0c01c25f422e4efdd59ef1ed93da2610e5370fc",
|
||||||
|
"blk.4.attn_q.bias": "c2e376524ea98ac3b10d9eee19ecb1b1e261fa5149efe0232844c923dfb428fb",
|
||||||
|
"blk.4.attn_k.weight": "a4632f5ebf9321d9d08f9112a4e5dda2efe5671df4a4e67fee24845f5b14af16",
|
||||||
|
"blk.4.attn_k.bias": "a9a02ffb8b8b4f6dfe487a7e0341f1d5318c9d2b793a688f34cb1b22fc66ef60",
|
||||||
|
"blk.4.attn_v.weight": "10ad8deb81d9fa093b1e5c0f24ea82aa7df43e6aca49e260fcbea56eab8cc86a",
|
||||||
|
"blk.4.attn_v.bias": "7326813e181e021130bd33ac136293fcffccce2d1d8cb59041e5b13a8cceacf6",
|
||||||
|
"blk.4.attn_output.weight": "c92573088c7437c2b3cda51490e152c27fb19e5468df591eabba5a49d5398d44",
|
||||||
|
"blk.4.attn_output.bias": "14e10b419e5859af1eb685af5c330aee67048cd704dcead9217840c6f5393222",
|
||||||
|
"blk.4.attn_output_norm.weight": "02b6831c0e0fb0edbc579a92812a1dd972cb15d14fcd382d4427c5a7b300ac44",
|
||||||
|
"blk.4.attn_output_norm.bias": "7eed5cd503bb6bb6ceb1bc8b07cc077903a4f14fb8b9d6cdf39644815ecf1374",
|
||||||
|
"blk.4.ffn_up.weight": "8d0c91d62e74d6431321116a37cf3339e630bd50ba164d3304fc4fe8dd831223",
|
||||||
|
"blk.4.ffn_up.bias": "d325f07f73c005a273c484c7be8e7abb4d6e8a5c4fd093f5869133b97629d017",
|
||||||
|
"blk.4.ffn_down.weight": "7ba7bd81143f40537b84f938e403e19f30e4928625eb371de052b9025beb4d21",
|
||||||
|
"blk.4.ffn_down.bias": "2853d9c2a75288214a4bf4907dc19d04d01926f4913d302b1aa7bdbfcce0f7a1",
|
||||||
|
"blk.4.layer_output_norm.weight": "a4ed1885fa77b90fed5300c355ef0aa0c876a8c747151d9d790939d464d57d4f",
|
||||||
|
"blk.4.layer_output_norm.bias": "62142a81e813a9e636333b2b805d6bc3b17c5e7cd4b15adce1ada6bc9a32563c",
|
||||||
|
"blk.5.attn_q.weight": "afc1dff080a72c3daad01384b1448d476aaf789871017c8ff8e144788887995d",
|
||||||
|
"blk.5.attn_q.bias": "748a820371c1d4f872c84545b36358d239c35bf6c99e2812c237d88c3292763b",
|
||||||
|
"blk.5.attn_k.weight": "59e30c1ed8acd2cbb01de5f62e7804015b9ecf98ba157d98cab016344639eda5",
|
||||||
|
"blk.5.attn_k.bias": "f839520078f9e589496e982e86d0126c7aa14196047339abffcf49a696229f77",
|
||||||
|
"blk.5.attn_v.weight": "3e21fb874e21b90308e1f46af034a3c32d3eba1628d62ae5f2246d6af5818923",
|
||||||
|
"blk.5.attn_v.bias": "5cd4852bf95c1444d10d756750f6bf49f842c0b39e9953c7f408bb67c325ac8c",
|
||||||
|
"blk.5.attn_output.weight": "636ce6a7752895f204b9d01ba0aedd9a294f908b42f372c22a16d9dd590d7471",
|
||||||
|
"blk.5.attn_output.bias": "82d924d4b0d2b94f2bbff91619216d6967a3541ce9b1531a6a60457a67b5d219",
|
||||||
|
"blk.5.attn_output_norm.weight": "5e7bd0a8d3396080f3360d7c4700bf094a06216431bd014c4479eef72ecf4271",
|
||||||
|
"blk.5.attn_output_norm.bias": "66c6de5edda5466d029c6753780be81ccd4218bf8bc00680000e0f06856ab712",
|
||||||
|
"blk.5.ffn_up.weight": "5bbf6e7ea380e216e33f8bee06d25f2265359d3876a300e92bc6e41d48e33430",
|
||||||
|
"blk.5.ffn_up.bias": "9d795388bb36fb33ad3a37fea3ccb4937838e02800a608fb47d363cd06b47370",
|
||||||
|
"blk.5.ffn_down.weight": "2fd628974e7f075479dd227b46fbd48ae8d3ca34d735b36f391ac06410730368",
|
||||||
|
"blk.5.ffn_down.bias": "cd213ba9eaa75fa541648097fbe9c96e58077e6c3ad6ad2fb1f21f8350f44291",
|
||||||
|
"blk.5.layer_output_norm.weight": "159a9df41d15b7022d136f86a2a2631c4635f9816e957472217077b522bcf52a",
|
||||||
|
"blk.5.layer_output_norm.bias": "24c1f27ffd1eb4e5be7e3a2909943e6f0980635d761fa1efdd0c19645da23766"
|
||||||
|
}
|
6 convert/testdata/gemma-2-9b-it.json vendored Normal file
@@ -0,0 +1,6 @@
+{
+  "general.architecture": "gemma2",
+  "gemma2.attention.sliding_window": "4096",
+  "gemma2.attn_logit_softcapping": "50",
+  "gemma2.final_logit_softcapping": "30"
+}
@@ -1,7 +1,6 @@
 package convert

 import (
-	"cmp"
 	"crypto/sha256"
 	"encoding/hex"
 	"encoding/json"
@@ -11,6 +10,8 @@ import (
 	"log/slog"
 	"os"
 	"slices"
+
+	"golang.org/x/exp/maps"
 )

 const (
@@ -184,32 +185,32 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
 		return nil, err
 	}

-	var tokens []token
+	tokens := make(map[int]token, len(t.Model.Vocab))
 	for k, v := range t.Model.Vocab {
-		tokens = append(tokens, token{
+		tokens[v] = token{
 			ID:      v,
 			Content: k,
-		})
+		}
 	}

-	for _, t := range t.AddedTokens {
-		t.UserDefined = true
-		tokens = append(tokens, t)
+	for _, token := range t.AddedTokens {
+		token.UserDefined = true
+		tokens[token.ID] = token
 	}

-	slices.SortFunc(tokens, func(i, j token) int {
-		return cmp.Compare(i.ID, j.ID)
-	})
+	keys := maps.Keys(tokens)
+	slices.Sort(keys)

 	v := Vocabulary{Model: "gpt2"}
-	for _, t := range tokens {
-		v.Tokens = append(v.Tokens, t.Content)
-		v.Scores = append(v.Scores, float32(t.ID))
+	for _, k := range keys {
+		token := tokens[k]
+		v.Tokens = append(v.Tokens, token.Content)
+		v.Scores = append(v.Scores, float32(token.ID))

 		switch {
-		case t.Special:
+		case token.Special:
 			v.Types = append(v.Types, tokenTypeControl)
-		case t.UserDefined:
+		case token.UserDefined:
 			v.Types = append(v.Types, tokenTypeUserDefined)
 		default:
 			v.Types = append(v.Types, tokenTypeNormal)
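Replacing the sorted slice with a map keyed by token ID works because the keys are re-sorted before the vocabulary is emitted; Go map iteration order is intentionally randomized, so the sort is what keeps the output deterministic. A minimal, self-contained illustration of the same pattern (values are made up):

```go
// Illustration only: iterate a map in ascending key order using the same
// golang.org/x/exp/maps helper the diff imports.
package main

import (
	"fmt"
	"slices"

	"golang.org/x/exp/maps"
)

func main() {
	tokens := map[int]string{3: "<s>", 1: "hello", 2: "world"}
	keys := maps.Keys(tokens) // order is unspecified
	slices.Sort(keys)         // deterministic: 1, 2, 3
	for _, k := range keys {
		fmt.Println(k, tokens[k])
	}
}
```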
@@ -15,6 +15,11 @@ import (
 )

 func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
+	ast, err := parseAdditionalSpecialTokens(fsys)
+	if err != nil {
+		return nil, err
+	}
+
 	bts, err := fs.ReadFile(fsys, "tokenizer.model")
 	if err != nil {
 		return nil, err
@@ -37,7 +42,12 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
 			sentencepiece.ModelProto_SentencePiece_BYTE:
 			v.Types = append(v.Types, int32(t))
 		default:
-			v.Types = append(v.Types, int32(sentencepiece.ModelProto_SentencePiece_NORMAL))
+			tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL)
+			if slices.Contains(ast, piece.GetPiece()) {
+				tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL)
+			}
+
+			v.Types = append(v.Types, tt)
 		}
 	}

@@ -81,3 +91,23 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {

 	return &v, nil
 }
+
+func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) {
+	f, err := fsys.Open("special_tokens_map.json")
+	if errors.Is(err, os.ErrNotExist) {
+		return nil, nil
+	} else if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	var m struct {
+		AdditionalSpecialTokens []string `json:"additional_special_tokens"`
+	}
+
+	if err := json.NewDecoder(f).Decode(&m); err != nil {
+		return nil, err
+	}
+
+	return m.AdditionalSpecialTokens, nil
+}
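`special_tokens_map.json` is the Hugging Face sidecar file this helper reads; tokens listed under `additional_special_tokens` are then marked as CONTROL pieces. A hedged, in-package test sketch (the file contents and token names are illustrative, not taken from a real model):

```go
// Assumes it lives in the convert package next to the code above and imports
// "testing" and "testing/fstest".
func TestParseAdditionalSpecialTokens(t *testing.T) {
	fsys := fstest.MapFS{
		"special_tokens_map.json": &fstest.MapFile{
			Data: []byte(`{"additional_special_tokens": ["<start_of_turn>", "<end_of_turn>"]}`),
		},
	}

	ast, err := parseAdditionalSpecialTokens(fsys)
	if err != nil {
		t.Fatal(err)
	}
	if len(ast) != 2 || ast[0] != "<start_of_turn>" {
		t.Fatalf("unexpected tokens: %v", ast)
	}
}
```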
@@ -111,7 +111,10 @@ On Windows, Ollama inherits your user and system environment variables.

 ## How do I use Ollama behind a proxy?

-Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variables, ensure it is set where `ollama serve` can access the values. When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
+Ollama pulls models from the Internet and may require a proxy server to access the models. Use `HTTPS_PROXY` to redirect outbound requests through the proxy. Ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
+
+> [!NOTE]
+> Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server.

 ### How do I use Ollama behind a proxy in Docker?

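One reason `HTTPS_PROXY` alone is enough: the Go HTTP client that Ollama is built on resolves proxies from the environment of the server process. A small standard-library sketch of that behaviour (the URL is illustrative; this is not Ollama's client code):

```go
// Prints the proxy Go's default transport would use for an outbound HTTPS
// request, based on HTTPS_PROXY/NO_PROXY in the current environment.
package main

import (
	"fmt"
	"net/http"
)

func main() {
	req, err := http.NewRequest("GET", "https://example.com/models", nil)
	if err != nil {
		panic(err)
	}
	proxyURL, err := http.ProxyFromEnvironment(req)
	if err != nil {
		panic(err)
	}
	fmt.Println("proxy for this request:", proxyURL) // <nil> when no proxy is configured
}
```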
@@ -20,13 +20,12 @@ GPU.

 ## Manual install

-### Download the `ollama` binary
+### Download `ollama`

-Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:
+Download and extract the Linux package:

 ```bash
-sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
-sudo chmod +x /usr/bin/ollama
+curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
 ```

 ### Adding Ollama as a startup service (recommended)

@@ -96,8 +95,7 @@ curl -fsSL https://ollama.com/install.sh | sh
 Or by downloading the ollama binary:

 ```bash
-sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
-sudo chmod +x /usr/bin/ollama
+curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
 ```

 ## Installing specific versions

@@ -174,7 +174,7 @@ func RunnersDir() (p string) {

 	defer func() {
 		if p == "" {
-			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
+			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
 		}
 	}()

@@ -190,17 +190,17 @@ func RunnersDir() (p string) {
 	}

 	var paths []string
-	for _, root := range []string{filepath.Dir(exe), cwd} {
+	for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), ".."), cwd} {
 		paths = append(paths,
 			root,
-			filepath.Join(root, "windows-"+runtime.GOARCH),
-			filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
+			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
+			filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
 		)
 	}

 	// Try a few variations to improve developer experience when building from source in the local tree
 	for _, path := range paths {
-		candidate := filepath.Join(path, "ollama_runners")
+		candidate := filepath.Join(path, "lib", "ollama", "runners")
 		if _, err := os.Stat(candidate); err == nil {
 			p = candidate
 			break
@@ -54,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) {
 	// Installer payload location if we're running the installed binary
 	exe, err := os.Executable()
 	if err == nil {
-		rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
+		rocmTargetDir := filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
 		if rocmLibUsable(rocmTargetDir) {
 			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
 			return rocmTargetDir, nil

@@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) {
 	// Installer payload (if we're running from some other location)
 	localAppData := os.Getenv("LOCALAPPDATA")
 	appDir := filepath.Join(localAppData, "Programs", "Ollama")
-	rocmTargetDir := filepath.Join(appDir, "rocm")
+	rocmTargetDir := filepath.Join(appDir, "..", "lib", "ollama")
 	if rocmLibUsable(rocmTargetDir) {
 		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
 		return rocmTargetDir, nil
@@ -4,9 +4,17 @@ package gpu

 import (
 	"log/slog"
+	"os"
+	"regexp"
+	"runtime"
+	"strconv"
 	"strings"
 )

+// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
+// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
+var CudaTegra string = os.Getenv("JETSON_JETPACK")
+
 func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	ids := []string{}
 	for _, info := range gpuInfo {
@@ -19,3 +27,38 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	}
 	return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
 }
+
+func cudaVariant(gpuInfo CudaGPUInfo) string {
+	if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
+		if CudaTegra != "" {
+			ver := strings.Split(CudaTegra, ".")
+			if len(ver) > 0 {
+				return "jetpack" + ver[0]
+			}
+		} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
+			r := regexp.MustCompile(` R(\d+) `)
+			m := r.FindSubmatch(data)
+			if len(m) != 2 {
+				slog.Info("Unexpected format for /etc/nv_tegra_release. Set JETSON_JETPACK to select version")
+			} else {
+				if l4t, err := strconv.Atoi(string(m[1])); err == nil {
+					// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
+					// https://developer.nvidia.com/embedded/jetpack-archive
+					switch l4t {
+					case 35:
+						return "jetpack5"
+					case 36:
+						return "jetpack6"
+					default:
+						slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
+					}
+				}
+			}
+		}
+	}
+
+	if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 {
+		return "v11"
+	}
+	return "v12"
+}
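For a sense of how the selector above behaves, here is an in-package sketch with hypothetical field values (not a test from this changeset): a pre-Pascal card or a pre-12 driver falls back to the v11 CUDA runtime, anything newer gets v12, and Jetson boards short-circuit to a jetpackN variant via `JETSON_JETPACK`.

```go
// Hypothetical in-package snippet; assumes a non-Jetson host so the
// arm64/linux branch above is skipped, and assumes "fmt" is imported.
func illustrateCudaVariant() {
	older := CudaGPUInfo{computeMajor: 5, GpuInfo: GpuInfo{DriverMajor: 12}}
	newer := CudaGPUInfo{computeMajor: 8, GpuInfo: GpuInfo{DriverMajor: 12}}
	fmt.Println(cudaVariant(older)) // "v11"
	fmt.Println(cudaVariant(newer)) // "v12"
}
```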
72 gpu/gpu.go
@@ -64,10 +64,6 @@ var RocmComputeMin = 9
 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU

-// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
-// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
-var CudaTegra string = os.Getenv("JETSON_JETPACK")
-
 // Note: gpuMutex must already be held
 func initCudaHandles() *cudaHandles {
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
@@ -215,7 +211,7 @@ func GetGPUInfo() GpuInfoList {
 				GpuInfo: GpuInfo{
 					memInfo: mem,
 					Library: "cpu",
-					Variant: cpuCapability,
+					Variant: cpuCapability.String(),
 					ID:      "0",
 				},
 			},
@@ -229,11 +225,7 @@ func GetGPUInfo() GpuInfoList {
 			return GpuInfoList{cpus[0].GpuInfo}
 		}

-		// On windows we bundle the nvidia library one level above the runner dir
-		depPath := ""
-		if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
-			depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda")
-		}
+		depPath := LibraryDir()

 		// Load ALL libraries
 		cHandles = initCudaHandles()
@@ -269,11 +261,23 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.FreeMemory = uint64(memInfo.free)
 				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
+				gpuInfo.computeMajor = int(memInfo.major)
+				gpuInfo.computeMinor = int(memInfo.minor)
 				gpuInfo.MinimumMemory = cudaMinimumMemory
-				gpuInfo.DependencyPath = depPath
-				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 				gpuInfo.DriverMajor = driverMajor
 				gpuInfo.DriverMinor = driverMinor
+				variant := cudaVariant(gpuInfo)
+				if depPath != "" {
+					gpuInfo.DependencyPath = depPath
+					// Check for variant specific directory
+					if variant != "" {
+						if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
+							gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant)
+						}
+					}
+				}
+				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+				gpuInfo.Variant = variant

 				// query the management library as well so we can record any skew between the two
 				// which represents overhead on the GPU we must set aside on subsequent updates
@@ -306,13 +310,6 @@ func GetGPUInfo() GpuInfoList {
 		if envconfig.IntelGPU() {
 			oHandles = initOneAPIHandles()
 			if oHandles != nil && oHandles.oneapi != nil {
-
-				// On windows we bundle the oneapi library one level above the runner dir
-				depPath = ""
-				if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
-					depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi")
-				}
-
 				for d := range oHandles.oneapi.num_drivers {
 					if oHandles.oneapi == nil {
 						// shouldn't happen
@@ -467,10 +464,12 @@ func GetGPUInfo() GpuInfoList {
 func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	var ldPaths []string
-	var patterns []string
 	gpuLibPaths := []string{}
 	slog.Debug("Searching for GPU library", "name", baseLibName)

+	// Start with our bundled libraries
+	patterns := []string{filepath.Join(LibraryDir(), baseLibName)}
+
 	switch runtime.GOOS {
 	case "windows":
 		ldPaths = strings.Split(os.Getenv("PATH"), ";")
@@ -479,13 +478,14 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	default:
 		return gpuLibPaths
 	}
-	// Start with whatever we find in the PATH/LD_LIBRARY_PATH
+
+	// Then with whatever we find in the PATH/LD_LIBRARY_PATH
 	for _, ldPath := range ldPaths {
 		d, err := filepath.Abs(ldPath)
 		if err != nil {
 			continue
 		}
-		patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
+		patterns = append(patterns, filepath.Join(d, baseLibName))
 	}
 	patterns = append(patterns, defaultPatterns...)
 	slog.Debug("gpu library search", "globs", patterns)
@@ -641,3 +641,31 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return "", ""
 	}
 }
+
+func LibraryDir() string {
+	// On Windows/linux we bundle the dependencies at the same level as the executable
+	appExe, err := os.Executable()
+	if err != nil {
+		slog.Warn("failed to lookup executable path", "error", err)
+	}
+	cwd, err := os.Getwd()
+	if err != nil {
+		slog.Warn("failed to lookup working directory", "error", err)
+	}
+	// Scan for any of our dependeices, and pick first match
+	for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), ".."), cwd} {
+		libDep := filepath.Join("lib", "ollama")
+		if _, err := os.Stat(filepath.Join(root, libDep)); err == nil {
+			return filepath.Join(root, libDep)
+		}
+		// Developer mode, local build
+		if _, err := os.Stat(filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
+			return filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)
+		}
+		if _, err := os.Stat(filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
+			return filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)
+		}
+	}
+	slog.Warn("unable to locate gpu dependency libraries")
+	return ""
+}
@@ -25,7 +25,7 @@ func GetGPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: GetCPUCapability(),
+			Variant: GetCPUCapability().String(),
 			memInfo: mem,
 		},
 	}
@@ -48,7 +48,7 @@ func GetCPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: GetCPUCapability(),
+			Variant: GetCPUCapability().String(),
 			memInfo: mem,
 		},
 	}

@@ -47,7 +47,7 @@ var (
 	CudartMgmtName = "libcudart.so*"
 	NvcudaMgmtName = "libcuda.so*"
 	NvmlMgmtName   = "" // not currently wired on linux
-	OneapiMgmtName = "libze_intel_gpu.so"
+	OneapiMgmtName = "libze_intel_gpu.so*"
 )

 func GetCPUMem() (memInfo, error) {

@@ -32,4 +32,29 @@ func TestCPUMemInfo(t *testing.T) {
 	}
 }
+
+func TestByLibrary(t *testing.T) {
+	type testCase struct {
+		input  []GpuInfo
+		expect int
+	}
+
+	testCases := map[string]*testCase{
+		"empty":                    {input: []GpuInfo{}, expect: 0},
+		"cpu":                      {input: []GpuInfo{{Library: "cpu"}}, expect: 1},
+		"cpu + GPU":                {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2},
+		"cpu + 2 GPU no variant":   {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, expect: 2},
+		"cpu + 2 GPU same variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, expect: 2},
+		"cpu + 2 GPU diff variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, expect: 3},
+	}
+
+	for k, v := range testCases {
+		t.Run(k, func(t *testing.T) {
+			resp := (GpuInfoList)(v.input).ByLibrary()
+			if len(resp) != v.expect {
+				t.Fatalf("expected length %d, got %d => %+v", v.expect, len(resp), resp)
+			}
+		})
+	}
+}

 // TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
11 gpu/types.go
@@ -19,7 +19,7 @@ type GpuInfo struct {
 	Library string `json:"library,omitempty"`

 	// Optional variant to select (e.g. versions, cpu feature flags)
-	Variant CPUCapability `json:"variant"`
+	Variant string `json:"variant"`

 	// MinimumMemory represents the minimum memory required to use the GPU
 	MinimumMemory uint64 `json:"-"`
@@ -55,6 +55,8 @@ type CudaGPUInfo struct {
 	GpuInfo
 	OSOverhead uint64 // Memory overhead between the driver library and management library
 	index      int    //nolint:unused,nolintlint
+	computeMajor int  //nolint:unused,nolintlint
+	computeMinor int  //nolint:unused,nolintlint
 }
 type CudaGPUInfoList []CudaGPUInfo

@@ -81,8 +83,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	for _, info := range l {
 		found := false
 		requested := info.Library
-		if info.Variant != CPUCapabilityNone {
-			requested += "_" + info.Variant.String()
+		if info.Variant != CPUCapabilityNone.String() {
+			requested += "_" + info.Variant
 		}
 		for i, lib := range libs {
 			if lib == requested {
@@ -92,7 +94,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 			}
 		}
 		if !found {
-			libs = append(libs, info.Library)
+			libs = append(libs, requested)
 			resp = append(resp, []GpuInfo{info})
 		}
 	}
@@ -105,6 +107,7 @@ func (l GpuInfoList) LogDetails() {
 		slog.Info("inference compute",
 			"id", g.ID,
 			"library", g.Library,
+			"variant", g.Variant,
 			"compute", g.Compute,
 			"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
 			"name", g.Name,
@@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
 	}

-	if res.PromptEvalCount != 8 {
-		t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 6 {
+		t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }

@@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
 	}

-	if res.PromptEvalCount != 16 {
-		t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 12 {
+		t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }

3 llm/ext_server/CMakeLists.txt vendored
@@ -1,12 +1,13 @@
 set(TARGET ollama_llama_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
 if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
8 llm/ext_server/server.cpp vendored
@@ -1429,7 +1429,13 @@ struct llama_server_context
         switch (task.type)
         {
             case TASK_TYPE_COMPLETION: {
-                server_slot *slot = prefix_slot(task.data["prompt"]);
+                server_slot *slot = nullptr;
+                if (task.embedding_mode) {
+                    // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
+                    slot = slots[0].available() ? &slots[0] : nullptr;
+                } else {
+                    slot = prefix_slot(task.data["prompt"]);
+                }
                 if (slot == nullptr)
                 {
                     // if no slot is available, we defer this task for processing later
@@ -9,11 +9,14 @@ init_vars() {
             ARCH="arm64"
             ;;
         *)
-            ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
+            echo "GOARCH must be set"
+            echo "this script is meant to be run from within go generate"
+            exit 1
+            ;;
     esac

     LLAMACPP_DIR=../llama.cpp
-    CMAKE_DEFS=""
+    CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on"
     CMAKE_TARGETS="--target ollama_llama_server"
     if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
         CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
@@ -27,6 +30,7 @@ init_vars() {
         WHOLE_ARCHIVE="-Wl,-force_load"
         NO_WHOLE_ARCHIVE=""
         GCC_ARCH="-arch ${ARCH}"
+        DIST_BASE=../../dist/darwin-${GOARCH}/
         ;;
     "Linux")
         LIB_EXT="so"
@@ -35,6 +39,7 @@ init_vars() {

         # Cross compiling not supported on linux - Use docker
         GCC_ARCH=""
+        DIST_BASE=../../dist/linux-${GOARCH}/
         ;;
     *)
         ;;
@@ -42,6 +47,7 @@ init_vars() {
     if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
         CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
     fi
+    GZIP=$(which pigz 2>/dev/null || echo "gzip")
 }

 git_module_setup() {
@@ -85,26 +91,36 @@ build() {

 compress() {
     echo "Compressing payloads to reduce overall binary size..."
-    pids=""
     rm -rf ${BUILD_DIR}/bin/*.gz
     for f in ${BUILD_DIR}/bin/* ; do
-        gzip -n --best -f ${f} &
-        pids+=" $!"
+        ${GZIP} -n --best -f ${f} &
+        compress_pids+=" $!"
     done
     # check for lib directory
     if [ -d ${BUILD_DIR}/lib ]; then
         for f in ${BUILD_DIR}/lib/* ; do
-            gzip -n --best -f ${f} &
-            pids+=" $!"
+            ${GZIP} -n --best -f ${f} &
+            compress_pids+=" $!"
         done
     fi
     echo
-    for pid in ${pids}; do
+}
+
+wait_for_compress() {
+    for pid in ${compress_pids}; do
         wait $pid
     done
     echo "Finished compression"
 }

+install() {
+    echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
+    for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do
+        rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
+        cp -af "${lib}" "${BUILD_DIR}/bin/"
+    done
+}
+
 # Keep the local tree clean after we're done with the build
 cleanup() {
     (cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt)
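The compress()/wait_for_compress() split above lets each build stage kick off background ${GZIP} jobs and only block once, at the very end of the generate script, so compilation and compression overlap. A rough Go sketch of the same start-now, wait-later pattern (illustrative only, not part of this change; the payload file names are invented):

package main

import (
	"compress/gzip"
	"io"
	"log"
	"os"
	"sync"
)

// compressAll starts one goroutine per payload, mirroring how compress()
// backgrounds the gzip jobs; the returned WaitGroup plays the role of
// wait_for_compress().
func compressAll(paths []string) *sync.WaitGroup {
	var wg sync.WaitGroup
	for _, p := range paths {
		wg.Add(1)
		go func(p string) {
			defer wg.Done()
			if err := gzipFile(p); err != nil {
				log.Printf("compress %s: %v", p, err)
			}
		}(p)
	}
	return &wg
}

func gzipFile(path string) error {
	in, err := os.Open(path)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.Create(path + ".gz")
	if err != nil {
		return err
	}
	defer out.Close()

	zw, err := gzip.NewWriterLevel(out, gzip.BestCompression)
	if err != nil {
		return err
	}
	defer zw.Close()

	_, err = io.Copy(zw, in)
	return err
}

func main() {
	wg := compressAll([]string{"payload1.bin", "payload2.bin"}) // hypothetical payloads
	// ... other build steps could run here, overlapping with compression ...
	wg.Wait() // the equivalent of wait_for_compress at the end of the script
}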
@@ -6,6 +6,7 @@

 set -ex
 set -o pipefail
+compress_pids=""
 echo "Starting darwin generate script"
 source $(dirname $0)/gen_common.sh
 init_vars
@@ -98,4 +99,5 @@ case "${GOARCH}" in
 esac

 cleanup
+wait_for_compress
 echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
@@ -13,6 +13,7 @@

 set -ex
 set -o pipefail
+compress_pids=""

 # See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
 amdGPUs() {
@@ -51,7 +52,7 @@ if [ -z "${CUDACXX}" ]; then
         export CUDACXX=$(command -v nvcc)
     fi
 fi
-COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
+COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
@@ -77,10 +78,11 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
     if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
         init_vars
         echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
-        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
+        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
         BUILD_DIR="../build/linux/${ARCH}/cpu"
         echo "Building custom CPU"
         build
+        install
         compress
     else
         # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
@@ -93,7 +95,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
         # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
         # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake

-        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
+        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
         if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
             #
             # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
@@ -103,6 +105,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             BUILD_DIR="../build/linux/${ARCH}/cpu"
             echo "Building LCD CPU"
             build
+            install
             compress
         fi

@@ -120,6 +123,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
             echo "Building AVX CPU"
             build
+            install
             compress
         fi

@@ -133,6 +137,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
             echo "Building AVX2 CPU"
             build
+            install
             compress
         fi
     fi
@@ -160,7 +165,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
     echo "CUDA libraries detected - building dynamic CUDA library"
     init_vars
     CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
-    if [ -n "${CUDA_MAJOR}" ]; then
+    if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then
         CUDA_VARIANT=_v${CUDA_MAJOR}
     fi
     if [ "${ARCH}" == "arm64" ]; then
@@ -178,29 +183,19 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
         CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
         echo "Building custom CUDA GPU"
     else
-        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
     fi
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
+    export CUDAFLAGS="-t8"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
     BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
-    EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
+    export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
+    CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
     build
-    # Carry the CUDA libs as payloads to help reduce dependency burden on users
-    #
-    # TODO - in the future we may shift to packaging these separately and conditionally
-    # downloading them in the install script.
-    DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )"
-    for lib in libcudart.so libcublas.so libcublasLt.so ; do
-        DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
-        if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
-            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/"
-        elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
-            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/"
-        elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
-            cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/"
-        else
-            cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/"
-        fi
-    done
+    install
+    echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
+    mkdir -p "${CUDA_DIST_DIR}"
+    for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
+        cp -a "${lib}" "${CUDA_DIST_DIR}"
+    done
     compress

@@ -218,21 +213,24 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
     CC=icx
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
     BUILD_DIR="../build/linux/${ARCH}/oneapi"
-    EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
+    ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
+    export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
     DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
     build

     # copy oneAPI dependencies
+    mkdir -p "${ONEAPI_DIST_DIR}"
     for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do
-        cp "${dep}" "${BUILD_DIR}/bin/"
+        cp -a "${dep}" "${ONEAPI_DIST_DIR}"
     done
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${BUILD_DIR}/bin/"
-    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
+    install
     compress
 fi

@@ -254,7 +252,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
         ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
     fi
     init_vars
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DGGML_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
     # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
     if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
         echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
@@ -262,23 +260,22 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
         echo "Building custom ROCM GPU"
     fi
     BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
-    EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
+    # ROCm dependencies are too large to fit into a unified bundle
+    ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
+    # TODO figure out how to disable runpath (rpath)
+    # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work
+    export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
     build

-    # Record the ROCM dependencies
-    rm -f "${BUILD_DIR}/bin/deps.txt"
-    touch "${BUILD_DIR}/bin/deps.txt"
-    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
-        echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt"
+    # copy the ROCM dependencies
+    mkdir -p "${ROCM_DIST_DIR}"
+    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do
+        cp -a "${dep}"* "${ROCM_DIST_DIR}"
     done
-    # bomb out if for some reason we didn't get a few deps
-    if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then
-        cat "${BUILD_DIR}/bin/deps.txt"
-        echo "ERROR: deps file short"
-        exit 1
-    fi
+    install
     compress
 fi

 cleanup
+wait_for_compress
 echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
@@ -35,7 +35,7 @@ function init_vars {
     )
     $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
     $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
-    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
+    $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners"
     md "$script:DIST_BASE" -ea 0 > $null
     if ($env:CGO_CFLAGS -contains "-g") {
         $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
@@ -117,7 +117,7 @@ function build {
     if ($cmakeDefs -contains "-G") {
         $extra=@("-j8")
     } else {
-        $extra= @("--", "/p:CL_MPcount=8")
+        $extra= @("--", "/maxCpuCount:8")
     }
     write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
     & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
@@ -261,7 +261,7 @@ function build_cuda() {
 if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
     # Then build cuda as a dynamically loaded library
     $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
-    $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
+    $script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0]
     if ($null -ne $script:CUDA_VERSION) {
         $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
     }
@@ -273,9 +273,9 @@ function build_cuda() {
             "-DGGML_CUDA=ON",
             "-DGGML_AVX=on",
             "-DGGML_AVX2=off",
-            "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
-            "-DCMAKE_CUDA_FLAGS=-t8",
-            "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
+            "-DCMAKE_CUDA_FLAGS=-t6",
+            "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}",
+            "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH"
         )
         if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
             write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
@@ -286,12 +286,11 @@ function build_cuda() {
         sign
         install

-        rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
-        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
-        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
-        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
-        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
-        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
+        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
+        write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
     } else {
         write-host "Skipping CUDA generation step"
     }
@@ -325,18 +324,17 @@ function build_oneapi() {
         sign
         install

-        rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
-        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-        cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-        cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
-        cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
+        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
+        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
     } else {
         Write-Host "Skipping oneAPI generation step"
     }
@@ -357,7 +355,7 @@ function build_rocm() {
             "-DCMAKE_C_COMPILER=clang.exe",
             "-DCMAKE_CXX_COMPILER=clang++.exe",
             "-DGGML_HIPBLAS=on",
-            "-DLLAMA_CUDA_NO_PEER_COPY=on",
+            "-DGGML_CUDA_NO_PEER_COPY=on",
             "-DHIP_PLATFORM=amd",
             "-DGGML_AVX=on",
             "-DGGML_AVX2=off",
@@ -386,12 +384,11 @@ function build_rocm() {
         sign
         install

-        rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
-        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
-        cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
-        cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
+        md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null
+        cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
+        cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
         # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
-        cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
+        cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\"
     } else {
         write-host "Skipping ROCm generation step"
     }
@@ -43,6 +43,14 @@ func (kv KV) Architecture() string {
 	return "unknown"
 }

+func (kv KV) Kind() string {
+	if s, ok := kv["general.type"].(string); ok {
+		return s
+	}
+
+	return "unknown"
+}
+
 func (kv KV) ParameterCount() uint64 {
 	return kv.u64("general.parameter_count")
 }
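The new KV.Kind() accessor reads the GGUF general.type field so callers can tell what sort of GGUF blob they are handling before deciding how to layer it. A small self-contained sketch of how a caller might branch on it (the "adapter" value and the helper are illustrative assumptions, not part of this change):

package main

// KV mirrors the llm.KV map type for illustration.
type KV map[string]any

func (kv KV) Kind() string {
	if s, ok := kv["general.type"].(string); ok {
		return s
	}
	return "unknown"
}

// classifyGGUF is a hypothetical caller: branch on general.type before
// deciding how to treat the file.
func classifyGGUF(kv KV) string {
	switch kv.Kind() {
	case "adapter":
		return "adapter" // e.g. a LoRA that must be layered on a base model
	default:
		return "model" // plain models, or older GGUFs that never set general.type
	}
}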
@@ -33,7 +33,6 @@ func TestEstimateGPULayers(t *testing.T) {
 	assert.Len(t, tensors, inputLayerCount+1)
 	err = WriteGGUF(f, KV{
 		"general.architecture": "llama",
-		"general.name":         "name",
 		"llama.context_length":   uint32(32),
 		"llama.embedding_length": uint32(4096),
 		"llama.block_count":      uint32(inputLayerCount),
@@ -1,60 +0,0 @@
(the entire 60-line patch file is removed; its former contents follow)
diff --git a/src/llama.cpp b/src/llama.cpp
index 721b8f4e..cfe7ac40 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8420,14 +8420,14 @@ struct llm_build_context {
     }

     struct ggml_tensor * build_inp_mean() {
-        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
+        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max);
         cb(lctx.inp_mean, "inp_mean", -1);
         ggml_set_input(lctx.inp_mean);
         return lctx.inp_mean;
     }

     struct ggml_tensor * build_inp_cls() {
-        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max);
         cb(lctx.inp_cls, "inp_cls", -1);
         ggml_set_input(lctx.inp_cls);
         return lctx.inp_cls;
@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));

         float * data = (float *) lctx.inp_mean->data;
-        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
+        memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean));

         std::vector<uint64_t> sum(n_tokens, 0);
         for (int i = 0; i < n_tokens; ++i) {
             const llama_seq_id seq_id = batch.seq_id[i][0];
-
-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
-
             sum[seq_id] += 1;
         }

-        std::vector<float> div(n_tokens, 0.0f);
-        for (int i = 0; i < n_tokens; ++i) {
+        std::vector<float> div(cparams.n_seq_max, 0.0f);
+        for (uint32_t i = 0; i < cparams.n_seq_max; ++i) {
             const uint64_t s = sum[i];
             if (s > 0) {
                 div[i] = 1.0f/float(s);
@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));

         uint32_t * data = (uint32_t *) lctx.inp_cls->data;
-        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+        memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls));

         for (int i = 0; i < n_tokens; ++i) {
             const llama_seq_id seq_id = batch.seq_id[i][0];
             const llama_pos pos = batch.pos[i];
-
-            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
-
             if (pos == 0) {
                 data[seq_id] = i;
             }
@@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := getAvailableServers()
 	requested := info.Library
-	if info.Variant != gpu.CPUCapabilityNone {
-		requested += "_" + info.Variant.String()
+	if info.Variant != gpu.CPUCapabilityNone.String() {
+		requested += "_" + info.Variant
 	}

 	servers := []string{}
@@ -258,7 +258,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--mlock")
 	}

-	if gpu.IsNUMA() {
+	if gpu.IsNUMA() && gpus[0].Library == "cpu" {
 		numaMode := "distribute"
 		if runtime.GOOS == "linux" {
 			if _, err := exec.LookPath("numactl"); err == nil {
@@ -306,20 +306,18 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	if runtime.GOOS == "windows" {
 		pathEnv = "PATH"
 	}
-	// prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
-	libraryPaths := []string{dir, filepath.Dir(dir)}
+	// Start with the server directory for the LD_LIBRARY_PATH/PATH
+	libraryPaths := []string{dir}

 	if libraryPath, ok := os.LookupEnv(pathEnv); ok {
-		// Append our runner directory to the path
-		// This will favor system libraries over our bundled library dependencies
+		// favor our bundled library dependencies over system libraries
 		libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
 	}

 	// Note: we always put the dependency path first
-	// since this was the exact version we verified for AMD GPUs
-	// and we favor what the user had in their path
+	// since this was the exact version we compiled/linked against
 	if gpus[0].DependencyPath != "" {
-		// TODO refine for multi-gpu support
+		// assume gpus from the same library have the same dependency path
 		libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
 	}
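The reordering above means the runner process now sees the bundled dependency directory first, then the runner directory, then whatever was already in LD_LIBRARY_PATH or PATH. A compact sketch of how that final value could be assembled (hypothetical helper, not the exact code in llm/server.go):

package main

import (
	"path/filepath"
	"strings"
)

// buildLibraryPath mirrors the ordering the change above aims for: the GPU
// dependency directory first, then the runner directory, then the existing
// environment value, joined with the platform's list separator.
func buildLibraryPath(depPath, runnerDir, existing string) string {
	paths := []string{runnerDir}
	if existing != "" {
		paths = append(paths, filepath.SplitList(existing)...)
	}
	if depPath != "" {
		paths = append([]string{depPath}, paths...)
	}
	return strings.Join(paths, string(filepath.ListSeparator))
}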
@@ -4,6 +4,7 @@ set -eu

 export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
 export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
+GZIP=$(which pigz 2>/dev/null || echo "gzip")

 BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
 export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""}
@@ -21,11 +22,16 @@ for TARGETARCH in ${BUILD_ARCH}; do
         -t builder:$TARGETARCH \
         .
     docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
-    docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/ollama ./dist/ollama-linux-$TARGETARCH
-    if [ "$TARGETARCH" = "amd64" ]; then
-        docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/deps/ ./dist/
+    rm -rf ./dist/linux-$TARGETARCH
+    docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist
+    if echo ${TARGETARCH} | grep "amd64" > /dev/null; then
+        docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist
     fi
     docker rm builder-$TARGETARCH
+    echo "Compressing final linux bundle..."
+    rm -f ./dist/ollama-linux-$TARGETARCH.tgz
+    (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz )
+    if [ -d dist/linux-$TARGETARCH-rocm ]; then
+        (cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz )
+    fi
 done
@@ -7,6 +7,7 @@
 $ErrorActionPreference = "Stop"

 function checkEnv() {
+    $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
     $script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower()
     Write-host "Building for ${script:TARGET_ARCH}"
     write-host "Locating required tools and paths"
@@ -15,26 +16,23 @@ function checkEnv() {
         $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation
         $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0]
     }
-    # Try to find the CUDA dir
-    if ($null -eq $env:NVIDIA_DIR) {
-        $d=(get-command -ea 'silentlycontinue' nvcc).path
-        if ($d -ne $null) {
-            $script:NVIDIA_DIR=($d| split-path -parent)
-        } else {
-            $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
-            if ($cudaList.length > 0) {
-                $script:NVIDIA_DIR=$cudaList[0]
-            }
+    # Locate CUDA versions
+    # Note: this assumes every version found will be built
+    $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
+    if ($cudaList.length -eq 0) {
+        $d=(get-command -ea 'silentlycontinue' nvcc).path
+        if ($null -ne $d) {
+            $script:CUDA_DIRS=@($d| split-path -parent)
         }
     } else {
-        $script:NVIDIA_DIR=$env:NVIDIA_DIR
+        $script:CUDA_DIRS=$cudaList
     }

     $script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0]

     $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}"
     $env:CGO_ENABLED="1"
-    echo "Checking version"
+    Write-Output "Checking version"
     if (!$env:VERSION) {
         $data=(git describe --tags --first-parent --abbrev=7 --long --dirty --always)
         $pattern="v(.+)"
@@ -71,7 +69,48 @@ function checkEnv() {
 function buildOllama() {
     write-host "Building ollama CLI"
     if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
-        & go generate ./...
-        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
+
+        # TODO - consider trying to parallelize this with Start-ThreadJob, but env vars can't be used to toggle
+        # which targets to build
+
+        # Start by skipping CUDA to build everything else
+        pwsh -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... }
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+
+        # Then skip everyhting else and build all the CUDA variants
+        foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) {
+            write-host "Building CUDA ${env:CUDA_LIB_DIR}"
+
+            if ($env:CUDA_LIB_DIR.Contains("v12")) {
+                pwsh -Command {
+                    $env:OLLAMA_SKIP_CUDA_GENERATE=""
+                    $env:OLLAMA_SKIP_STATIC_GENERATE="1"
+                    $env:OLLAMA_SKIP_CPU_GENERATE="1"
+                    $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
+                    $env:OLLAMA_SKIP_ROCM_GENERATE="1"
+                    $env:CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
+                    $env:OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on"
+                    $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
+                    $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
+                    & go generate ./...
+                }
+            } else {
+                pwsh -Command {
+                    $env:OLLAMA_SKIP_CUDA_GENERATE=""
+                    $env:OLLAMA_SKIP_STATIC_GENERATE="1"
+                    $env:OLLAMA_SKIP_CPU_GENERATE="1"
+                    $env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
+                    $env:OLLAMA_SKIP_ROCM_GENERATE="1"
+                    $env:CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
+                    $env:OLLAMA_CUSTOM_CUDA_DEFS=""
+                    $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
+                    $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
+                    & go generate ./...
+                }
+            }
+            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+        }
+        if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     } else {
         write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
@@ -83,8 +122,8 @@ function buildOllama() {
             /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     }
-    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
-    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
+    New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\bin\ -Force
+    cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\bin\
 }

 function buildApp() {
@@ -103,22 +142,22 @@ function buildApp() {
 function gatherDependencies() {
     write-host "Gathering runtime dependencies"
     cd "${script:SRC_DIR}"
-    md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null
+    md "${script:DEPS_DIR}\lib\ollama" -ea 0 > $null

     # TODO - this varies based on host build system and MSVC version - drive from dumpbin output
     # currently works for Win11 + MSVC 2019 + Cuda V11
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_runners\"
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\"
-    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\lib\ollama\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\lib\ollama\"
+    cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\lib\ollama\"
     foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) {
-        cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_runners\"
+        cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\lib\ollama\"
     }

     cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
     if ("${env:KEY_CONTAINER}") {
         write-host "about to sign"
-        foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
+        foreach ($file in (get-childitem "${script:DEPS_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
             write-host "signing $file"
             & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
                 /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file
@@ -63,16 +63,36 @@ if [ -n "$NEEDS" ]; then
     exit 1
 fi

-status "Downloading ollama..."
-curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
-
 for BINDIR in /usr/local/bin /usr/bin /bin; do
     echo $PATH | grep -q $BINDIR && break || continue
 done
+OLLAMA_INSTALL_DIR=$(dirname ${BINDIR})

-status "Installing ollama to $BINDIR..."
+status "Installing ollama to $OLLAMA_INSTALL_DIR"
 $SUDO install -o0 -g0 -m755 -d $BINDIR
-$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama
+$SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR"
+if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" >/dev/null ; then
+    status "Downloading Linux ${ARCH} bundle"
+    curl --fail --show-error --location --progress-bar \
+        "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \
+        $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
+    BUNDLE=1
+    if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then
+        status "Making ollama accessible in the PATH in $BINDIR"
+        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
+    fi
+else
+    status "Downloading Linux ${ARCH} CLI"
+    curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\
+        "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
+    $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama
+    BUNDLE=0
+    if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then
+        status "Making ollama accessible in the PATH in $BINDIR"
+        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
+    fi
+fi

 install_success() {
     status 'The Ollama API is now available at 127.0.0.1:11434.'
@@ -178,6 +198,16 @@ if ! check_gpu lspci nvidia && ! check_gpu lshw nvidia && ! check_gpu lspci amdg
 fi

 if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
+    if [ $BUNDLE -ne 0 ]; then
+        status "Downloading Linux ROCm ${ARCH} bundle"
+        curl --fail --show-error --location --progress-bar \
+            "https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \
+            $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
+
+        install_success
+        status "AMD GPU ready."
+        exit 0
+    fi
     # Look for pre-existing ROCm v6 before downloading the dependencies
     for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do
         if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then
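The installer change above probes the new .tgz bundle URL with a HEAD request (curl -I --fail) and only falls back to the old single-binary CLI download when the bundle is not published. A hedged Go sketch of that probe-then-fallback flow (the URL is taken from the script; the helper names are invented):

package main

import (
	"fmt"
	"net/http"
)

// bundleAvailable mimics the script's `curl -I --silent --fail` probe: a
// successful HEAD response means the bundled tarball exists for this release.
func bundleAvailable(url string) bool {
	resp, err := http.Head(url)
	if err != nil {
		return false
	}
	defer resp.Body.Close()
	return resp.StatusCode >= 200 && resp.StatusCode < 300
}

func main() {
	url := "https://ollama.com/download/ollama-linux-amd64.tgz" // example bundle URL
	if bundleAvailable(url) {
		fmt.Println("download and untar the bundle")
	} else {
		fmt.Println("fall back to the single-binary CLI download")
	}
}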
@@ -3,6 +3,7 @@
 # Script for common Dockerfile dependency installation in redhat linux based images

 set -ex
+set -o pipefail
 MACHINE=$(uname -m)

 if grep -i "centos" /etc/system-release >/dev/null; then
@@ -29,7 +30,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then
         dnf install -y rh-git227-git
         ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git
     fi
-    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++
+    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz
 elif grep -i "rocky" /etc/system-release >/dev/null; then
     # Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC)
     cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo
@@ -43,12 +44,21 @@ gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-rockyofficial
 EOF
     dnf install -y git \
         gcc-toolset-10-gcc-10.2.1-8.2.el8 \
-        gcc-toolset-10-gcc-c++-10.2.1-8.2.el8
+        gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \
+        pigz
 else
     echo "ERROR Unexpected distro"
     exit 1
 fi

+if [ "${MACHINE}" = "x86_64" ] ; then
+    curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /tmp --strip-components 1 && \
+        mv /tmp/ccache /usr/local/bin/
+else
+    yum -y install epel-release
+    yum install -y ccache
+fi
+
 if [ -n "${CMAKE_VERSION}" ]; then
     curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1
 fi
@@ -215,25 +215,20 @@ func GetManifest(mp ModelPath) (*Manifest, string, error) {
 		return nil, "", err
 	}

-	if _, err = os.Stat(fp); err != nil {
-		return nil, "", err
-	}
-
-	var manifest *Manifest
-
-	bts, err := os.ReadFile(fp)
+	f, err := os.Open(fp)
 	if err != nil {
-		return nil, "", fmt.Errorf("couldn't open file '%s'", fp)
+		return nil, "", err
 	}
+	defer f.Close()

-	shaSum := sha256.Sum256(bts)
-	shaStr := hex.EncodeToString(shaSum[:])
+	sha256sum := sha256.New()

-	if err := json.Unmarshal(bts, &manifest); err != nil {
+	var manifest Manifest
+	if err := json.NewDecoder(io.TeeReader(f, sha256sum)).Decode(&manifest); err != nil {
 		return nil, "", err
 	}

-	return manifest, shaStr, nil
+	return &manifest, hex.EncodeToString(sha256sum.Sum(nil)), nil
 }

 func GetModel(name string) (*Model, error) {
@@ -374,13 +369,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 	parameters := make(map[string]any)

 	var layers []Layer
+	var baseLayers []*layerGGML
 	for _, c := range modelfile.Commands {
 		mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)
+		command := c.Name

-		switch c.Name {
+		switch command {
 		case "model", "adapter":
-			var baseLayers []*layerGGML
-			if name := model.ParseName(c.Args); name.IsValid() {
+			if name := model.ParseName(c.Args); name.IsValid() && command == "model" {
 				baseLayers, err = parseFromModel(ctx, name, fn)
 				if err != nil {
 					return err
@@ -414,14 +410,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 				}
 				defer blob.Close()

-				baseLayers, err = parseFromFile(ctx, blob, digest, fn)
+				baseLayers, err = parseFromFile(ctx, command, baseLayers, blob, digest, fn)
 				if err != nil {
 					return err
 				}
 			} else if file, err := os.Open(realpath(modelFileDir, c.Args)); err == nil {
 				defer file.Close()

-				baseLayers, err = parseFromFile(ctx, file, "", fn)
+				baseLayers, err = parseFromFile(ctx, command, baseLayers, file, "", fn)
 				if err != nil {
 					return err
 				}
@@ -692,43 +688,18 @@ func CopyModel(src, dst model.Name) error {
 	return err
 }

-func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error {
-	fp, err := GetManifestPath()
-	if err != nil {
-		return err
-	}
-
-	walkFunc := func(path string, info os.FileInfo, _ error) error {
-		if info.IsDir() {
-			return nil
-		}
-
-		dir, file := filepath.Split(path)
-		dir = strings.Trim(strings.TrimPrefix(dir, fp), string(os.PathSeparator))
-		tag := strings.Join([]string{dir, file}, ":")
-		fmp := ParseModelPath(tag)
-
-		// skip the manifest we're trying to delete
-		if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() {
-			return nil
-		}
-
-		// save (i.e. delete from the deleteMap) any files used in other manifests
-		manifest, _, err := GetManifest(fmp)
-		if err != nil {
-			return err
-		}
-
+func deleteUnusedLayers(deleteMap map[string]struct{}) error {
+	manifests, err := Manifests()
+	if err != nil {
+		return err
+	}
+
+	for _, manifest := range manifests {
 		for _, layer := range manifest.Layers {
 			delete(deleteMap, layer.Digest)
 		}

 		delete(deleteMap, manifest.Config.Digest)
-		return nil
-	}
-
-	if err := filepath.Walk(fp, walkFunc); err != nil {
-		return err
 	}

 	// only delete the files which are still in the deleteMap
@@ -781,8 +752,7 @@ func PruneLayers() error {

 	slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap)))

-	err = deleteUnusedLayers(nil, deleteMap)
-	if err != nil {
+	if err := deleteUnusedLayers(deleteMap); err != nil {
 		slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
 		return nil
 	}
@@ -877,20 +847,14 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error {
 	mp := ParseModelPath(name)

-	var manifest *Manifest
-	var err error
-	var noprune string
-
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})
-
-	if !envconfig.NoPrune() {
-		manifest, _, err = GetManifest(mp)
-		if err != nil && !errors.Is(err, os.ErrNotExist) {
-			return err
-		}
-
-		if manifest != nil {
-			for _, l := range manifest.Layers {
-				deleteMap[l.Digest] = struct{}{}
-			}
+	manifest, _, err := GetManifest(mp)
+	if errors.Is(err, os.ErrNotExist) {
+		// noop
+	} else if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return err
+	} else {
+		for _, l := range manifest.Layers {
+			deleteMap[l.Digest] = struct{}{}
+		}
@@ -898,7 +862,6 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 		deleteMap[manifest.Config.Digest] = struct{}{}
 	}
-	}

 	if mp.ProtocolScheme == "http" && !regOpts.Insecure {
 		return errors.New("insecure protocol http")
@@ -975,11 +938,9 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 		return err
 	}

-	if noprune == "" {
-		fn(api.ProgressResponse{Status: "removing any unused layers"})
-		err = deleteUnusedLayers(nil, deleteMap)
-		if err != nil {
-			slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
+	if !envconfig.NoPrune() && len(deleteMap) > 0 {
+		fn(api.ProgressResponse{Status: "removing unused layers"})
+		if err := deleteUnusedLayers(deleteMap); err != nil {
 			fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)})
 		}
 	}
@@ -1000,12 +961,12 @@ func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *registryOptio
 	}
 	defer resp.Body.Close()

-	var m *Manifest
+	var m Manifest
 	if err := json.NewDecoder(resp.Body).Decode(&m); err != nil {
 		return nil, err
 	}

-	return m, err
+	return &m, err
 }

 // GetSHA256Digest returns the SHA256 hash of a given buffer and returns it, and the size of buffer
@ -51,6 +51,9 @@ func NewLayer(r io.Reader, mediatype string) (Layer, error) {
|
||||||
if err := os.Rename(temp.Name(), blob); err != nil {
|
if err := os.Rename(temp.Name(), blob); err != nil {
|
||||||
return Layer{}, err
|
return Layer{}, err
|
||||||
}
|
}
|
||||||
|
if err := os.Chmod(blob, 0o644); err != nil {
|
||||||
|
return Layer{}, err
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return Layer{
|
return Layer{
|
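The new Chmod in NewLayer matters because os.CreateTemp creates files with mode 0600, so a blob renamed into place from that temp file would otherwise stay readable only by the owning user. A small standalone sketch of the same write-to-temp, rename, then relax-permissions sequence; the directory layout and names here are illustrative, not the server's:

package main

import (
	"log"
	"os"
	"path/filepath"
)

func writeBlob(dir, name string, data []byte) error {
	// os.CreateTemp creates the file with mode 0600.
	temp, err := os.CreateTemp(dir, "blob-*.partial")
	if err != nil {
		return err
	}
	defer os.Remove(temp.Name()) // fails harmlessly after a successful rename

	if _, err := temp.Write(data); err != nil {
		temp.Close()
		return err
	}
	if err := temp.Close(); err != nil {
		return err
	}

	blob := filepath.Join(dir, name)
	if err := os.Rename(temp.Name(), blob); err != nil {
		return err
	}

	// Relax the inherited 0600 so other local users can read the blob.
	return os.Chmod(blob, 0o644)
}

func main() {
	if err := writeBlob(os.TempDir(), "sha256-example", []byte("payload")); err != nil {
		log.Fatal(err)
	}
}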
@@ -5,6 +5,7 @@ import (
 "encoding/hex"
 "encoding/json"
 "errors"
+"fmt"
 "io"
 "log/slog"
 "os"

@@ -150,14 +151,16 @@ func Manifests() (map[model.Name]*Manifest, error) {

 n := model.ParseNameFromFilepath(rel)
 if !n.IsValid() {
-slog.Warn("bad manifest name", "path", rel, "error", err)
+slog.Warn("bad manifest name", "path", rel)
 continue
 }

 m, err := ParseNamedManifest(n)
-if err != nil {
+if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
 slog.Warn("bad manifest", "name", n, "error", err)
 continue
+} else if err != nil {
+return nil, fmt.Errorf("%s: %w", n, err)
 }

 ms[n] = m
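The Manifests() change above distinguishes syntactically corrupt manifest JSON (warn and skip) from every other failure (abort the whole listing). A minimal sketch of that errors.As pattern against *json.SyntaxError, with a hypothetical loadManifest helper standing in for ParseNamedManifest:

package main

import (
	"encoding/json"
	"errors"
	"fmt"
	"log/slog"
)

type manifest struct {
	Layers []string `json:"layers"`
}

// loadManifest is a stand-in for a real parser; it just decodes JSON.
func loadManifest(raw []byte) (*manifest, error) {
	var m manifest
	if err := json.Unmarshal(raw, &m); err != nil {
		return nil, err
	}
	return &m, nil
}

func collect(raw map[string][]byte) (map[string]*manifest, error) {
	ms := make(map[string]*manifest)
	for name, data := range raw {
		m, err := loadManifest(data)
		if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
			// corrupt file: warn and keep going
			slog.Warn("bad manifest", "name", name, "error", err)
			continue
		} else if err != nil {
			// anything else is unexpected: surface it to the caller
			return nil, fmt.Errorf("%s: %w", name, err)
		}
		ms[name] = m
	}
	return ms, nil
}

func main() {
	out, err := collect(map[string][]byte{
		"good": []byte(`{"layers":["sha256:aaa"]}`),
		"bad":  []byte(`{not json`),
	})
	fmt.Println(len(out), err) // 1 <nil>
}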
@@ -81,7 +81,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
 return layers, nil
 }

-func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
+func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
 fi, err := f.Stat()
 if err != nil {
 return nil, err

@@ -108,16 +108,38 @@ func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.
 defer t.Close()
 defer os.Remove(t.Name())

-fn(api.ProgressResponse{Status: "converting model"})
-if err := convert.Convert(convert.NewZipReader(r, p, 32<<20), t); err != nil {
+var layerType string
+
+switch command {
+case "adapter":
+var baseModel *llm.GGML
+for _, l := range baseLayers {
+if l.GGML != nil {
+baseModel = l.GGML
+break
+}
+}
+
+if baseModel == nil {
+return nil, fmt.Errorf("no base model specified for the adapter")
+}
+
+if err := convert.ConvertAdapter(convert.NewZipReader(r, p, 32<<20), t, baseModel.KV()); err != nil {
 return nil, err
 }
+layerType = "application/vnd.ollama.image.adapter"
+case "model":
+if err := convert.ConvertModel(convert.NewZipReader(r, p, 32<<20), t); err != nil {
+return nil, err
+}
+layerType = "application/vnd.ollama.image.model"
+}

 if _, err := t.Seek(0, io.SeekStart); err != nil {
 return nil, err
 }

-layer, err := NewLayer(t, "application/vnd.ollama.image.model")
+layer, err := NewLayer(t, layerType)
 if err != nil {
 return nil, err
 }
@@ -139,7 +161,7 @@ func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.
 return detectChatTemplate(layers)
 }

-func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
+func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
 sr := io.NewSectionReader(file, 0, 512)
 contentType, err := detectContentType(sr)
 if err != nil {

@@ -150,7 +172,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
 case "gguf", "ggla":
 // noop
 case "application/zip":
-return parseFromZipFile(ctx, file, digest, fn)
+return parseFromZipFile(ctx, command, baseLayers, file, digest, fn)
 default:
 return nil, fmt.Errorf("unsupported content type: %s", contentType)
 }

@@ -170,7 +192,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap
 }

 mediatype := "application/vnd.ollama.image.model"
-if ggml.Name() == "ggla" {
+if ggml.Name() == "ggla" || ggml.KV().Kind() == "adapter" {
 mediatype = "application/vnd.ollama.image.adapter"
 } else if ggml.KV().Architecture() == "clip" {
 mediatype = "application/vnd.ollama.image.projector"

@@ -153,7 +153,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
 t.Fatalf("failed to seek to start: %v", err)
 }

-layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {})
+layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, "", func(api.ProgressResponse) {})
 if err != nil {
 t.Fatalf("failed to parse from file: %v", err)
 }

@@ -166,7 +166,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
 t.Fatalf("failed to seek to start: %v", err)
 }

-layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {})
+layers2, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, layers[0].Digest, func(api.ProgressResponse) {})
 if err != nil {
 t.Fatalf("failed to parse from file: %v", err)
 }

@@ -206,7 +206,7 @@ func TestParseLayerFromCopy(t *testing.T) {
 t.Fatalf("failed to seek to start: %v", err)
 }

-layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {})
+layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file2, "", func(api.ProgressResponse) {})
 if err != nil {
 t.Fatalf("failed to parse from file: %v", err)
 }

@@ -193,6 +193,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 break
 }

+// Embedding models should always be loaded with parallel=1
+if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
+numParallel = 1
+}
+
 // Evaluate if the model will fit in the available system memory, or if we should unload a model first
 if len(gpus) == 1 && gpus[0].Library == "cpu" {
 // simplifying assumption of defaultParallel when in CPU mode

@@ -734,7 +739,10 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL

 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
 func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+if *numParallel <= 0 {
 *numParallel = 1
+req.opts.NumCtx = req.origNumCtx
+}
 byLibrary := gpus.ByLibrary()
 if len(byLibrary) <= 1 {
 return gpus
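The paired reset of *numParallel and req.opts.NumCtx above suggests that, elsewhere in the scheduler, the effective context size is derived from origNumCtx and the parallel slot count; that scaling is not shown in this diff, so the following is only a toy illustration, under that assumption, of why the two values have to be reset together:

package main

import "fmt"

func main() {
	// Assumption, not shown in this diff: the scheduler sizes the context
	// window as the requested context multiplied by the parallel slot count.
	origNumCtx := 2048
	numParallel := 4
	numCtx := origNumCtx * numParallel
	fmt.Println(numCtx) // 8192

	// Falling back to a single slot without also restoring NumCtx would
	// leave the memory estimate assuming the larger multi-slot context.
	numParallel = 1
	numCtx = origNumCtx * numParallel
	fmt.Println(numCtx) // 2048
}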
@@ -117,7 +117,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est

 require.NoError(t, llm.WriteGGUF(f, llm.KV{
 "general.architecture": "llama",
-"general.name": "name",
 "llama.context_length": uint32(32),
 "llama.embedding_length": uint32(4096),
 "llama.block_count": uint32(1),

@@ -45,7 +45,7 @@ type blobUpload struct {
 }

 const (
-numUploadParts = 64
+numUploadParts = 16
 minUploadPartSize int64 = 100 * format.MegaByte
 maxUploadPartSize int64 = 1000 * format.MegaByte
 )
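For a sense of scale on the constants above: with numUploadParts dropped from 64 to 16 and part sizes clamped between 100 MB and 1000 MB, each blob is uploaded in fewer, larger chunks. The sketch below assumes the part size is simply the blob size spread across numUploadParts and then clamped; the server's actual partitioning code is not part of this diff:

package main

import "fmt"

const (
	numUploadParts          = 16
	minUploadPartSize int64 = 100 * 1000 * 1000  // 100 MB
	maxUploadPartSize int64 = 1000 * 1000 * 1000 // 1000 MB
)

// partSize is an illustrative clamp, not the server's implementation.
func partSize(blobSize int64) int64 {
	size := blobSize / numUploadParts
	if size < minUploadPartSize {
		size = minUploadPartSize
	}
	if size > maxUploadPartSize {
		size = maxUploadPartSize
	}
	return size
}

func main() {
	fmt.Println(partSize(5_000_000_000))  // 5 GB blob  -> ~312 MB parts
	fmt.Println(partSize(500_000_000))    // small blob -> clamped up to 100 MB
	fmt.Println(partSize(64_000_000_000)) // 64 GB blob -> clamped down to 1000 MB
}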