Compare commits
No commits in common. "0c61920bc960283307643a8980ff06468bbf657f" and "99dfb67553f05361dc68144c801424d86c5bb423" have entirely different histories.
0c61920bc9
...
99dfb67553
59 changed files with 471 additions and 1647 deletions
32
.github/workflows/release.yaml
vendored
32
.github/workflows/release.yaml
vendored
|
@ -187,13 +187,6 @@ jobs:
|
|||
generate-windows-cuda:
|
||||
environment: release
|
||||
runs-on: windows
|
||||
strategy:
|
||||
matrix:
|
||||
cuda:
|
||||
- version: "11"
|
||||
url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe'
|
||||
- version: "12"
|
||||
url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe'
|
||||
env:
|
||||
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }}
|
||||
steps:
|
||||
|
@ -227,11 +220,11 @@ jobs:
|
|||
with:
|
||||
go-version-file: go.mod
|
||||
cache: true
|
||||
- name: 'Install CUDA ${{ matrix.cuda.version }}'
|
||||
- name: 'Install CUDA'
|
||||
run: |
|
||||
$ErrorActionPreference = "Stop"
|
||||
write-host "downloading CUDA Installer"
|
||||
Invoke-WebRequest -Uri "${{ matrix.cuda.url }}" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
|
||||
Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe"
|
||||
write-host "Installing CUDA"
|
||||
Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait
|
||||
write-host "Completed CUDA"
|
||||
|
@ -263,16 +256,15 @@ jobs:
|
|||
cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\"
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: generate-windows-cuda-${{ matrix.cuda.version }}
|
||||
name: generate-windows-cuda
|
||||
path: |
|
||||
llm/build/**/bin/*
|
||||
dist/windows-amd64/**
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: windows-cuda-deps-${{ matrix.cuda.version }}
|
||||
name: windows-cuda-deps
|
||||
path: dist/deps/*
|
||||
|
||||
|
||||
# Import the prior generation steps and build the final windows assets
|
||||
build-windows:
|
||||
environment: release
|
||||
|
@ -322,16 +314,10 @@ jobs:
|
|||
name: generate-windows-cpu
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: generate-windows-cuda-11
|
||||
name: generate-windows-cuda
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: generate-windows-cuda-12
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: windows-cuda-deps-11
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: windows-cuda-deps-12
|
||||
name: windows-cuda-deps
|
||||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: windows-rocm-deps
|
||||
|
@ -377,6 +363,7 @@ jobs:
|
|||
- run: |
|
||||
./scripts/build_linux.sh
|
||||
./scripts/build_docker.sh
|
||||
mv dist/deps/* dist/
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: dist-linux-amd64
|
||||
|
@ -472,10 +459,7 @@ jobs:
|
|||
merge-multiple: true
|
||||
- run: |
|
||||
ls -lh dist/
|
||||
(cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt)
|
||||
mv sha256sum.txt dist/
|
||||
mv dist/linux-???64 .
|
||||
mv dist/linux-amd64-rocm .
|
||||
(cd dist; sha256sum * > sha256sum.txt)
|
||||
cat dist/sha256sum.txt
|
||||
- name: Create or update Release
|
||||
run: |
|
||||
|
|
|
@ -58,4 +58,4 @@ ENV OLLAMA_HOST="0.0.0.0:8080"
|
|||
|
||||
EXPOSE 8080
|
||||
|
||||
CMD ["supervisord", "-c", "/app/supervisord.conf"]
|
||||
CMD ["supervisord", "-c", "/app/supervisord.conf"]
|
||||
|
|
|
@ -87,11 +87,20 @@ DialogFontSize=12
|
|||
|
||||
[Files]
|
||||
Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit
|
||||
Source: "..\ollama.exe"; DestDir: "{app}\bin"; Flags: ignoreversion 64bit
|
||||
Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs
|
||||
Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
|
||||
Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
|
||||
Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
|
||||
Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
|
||||
Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs
|
||||
#if DirExists("..\dist\windows-amd64\cuda")
|
||||
Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
|
||||
#endif
|
||||
#if DirExists("..\dist\windows-amd64\oneapi")
|
||||
Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
|
||||
#endif
|
||||
#if DirExists("..\dist\windows-amd64\rocm")
|
||||
Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
|
||||
#endif
|
||||
|
||||
|
||||
[Icons]
|
||||
Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
|
||||
|
@ -99,7 +108,7 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen
|
|||
Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
|
||||
|
||||
[Run]
|
||||
Filename: "{cmd}"; Parameters: "/C set PATH={app}\bin;%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden
|
||||
Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden
|
||||
|
||||
[UninstallRun]
|
||||
; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden
|
||||
|
@ -134,8 +143,8 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi
|
|||
|
||||
[Registry]
|
||||
Root: HKCU; Subkey: "Environment"; \
|
||||
ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}\bin"; \
|
||||
Check: NeedsAddPath('{app}\bin')
|
||||
ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \
|
||||
Check: NeedsAddPath('{app}')
|
||||
|
||||
[Code]
|
||||
|
||||
|
|
|
@ -11,7 +11,6 @@ import (
|
|||
"path/filepath"
|
||||
"sort"
|
||||
"sync"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
|
||||
"golang.org/x/sys/windows"
|
||||
|
@ -434,12 +433,7 @@ func (t *winTray) setIcon(src string) error {
|
|||
t.muNID.Lock()
|
||||
defer t.muNID.Unlock()
|
||||
t.nid.Icon = h
|
||||
t.nid.Flags |= NIF_ICON | NIF_TIP
|
||||
if toolTipUTF16, err := syscall.UTF16FromString(commontray.ToolTip); err == nil {
|
||||
copy(t.nid.Tip[:], toolTipUTF16)
|
||||
} else {
|
||||
return err
|
||||
}
|
||||
t.nid.Flags |= NIF_ICON
|
||||
t.nid.Size = uint32(unsafe.Sizeof(*t.nid))
|
||||
|
||||
return t.nid.modify()
|
||||
|
|
|
@ -61,7 +61,6 @@ const (
|
|||
MIIM_SUBMENU = 0x00000004
|
||||
MIM_APPLYTOSUBMENUS = 0x80000000
|
||||
NIF_ICON = 0x00000002
|
||||
NIF_TIP = 0x00000004
|
||||
NIF_INFO = 0x00000010
|
||||
NIF_MESSAGE = 0x00000001
|
||||
SW_HIDE = 0
|
||||
|
|
19
cmd/cmd.go
19
cmd/cmd.go
|
@ -204,12 +204,6 @@ func tempZipFiles(path string) (string, error) {
|
|||
// safetensors files might be unresolved git lfs references; skip if they are
|
||||
// covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors
|
||||
files = append(files, st...)
|
||||
} else if st, _ := glob(filepath.Join(path, "adapters.safetensors"), "application/octet-stream"); len(st) > 0 {
|
||||
// covers adapters.safetensors
|
||||
files = append(files, st...)
|
||||
} else if st, _ := glob(filepath.Join(path, "adapter_model.safetensors"), "application/octet-stream"); len(st) > 0 {
|
||||
// covers adapter_model.safetensors
|
||||
files = append(files, st...)
|
||||
} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 {
|
||||
// pytorch files might also be unresolved git lfs references; skip if they are
|
||||
// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
|
||||
|
@ -229,14 +223,6 @@ func tempZipFiles(path string) (string, error) {
|
|||
}
|
||||
files = append(files, js...)
|
||||
|
||||
// bert models require a nested config.json
|
||||
// TODO(mxyng): merge this with the glob above
|
||||
js, err = glob(filepath.Join(path, "**/*.json"), "text/plain")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
files = append(files, js...)
|
||||
|
||||
if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 {
|
||||
// add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob
|
||||
// tokenizer.model might be a unresolved git lfs reference; error if it is
|
||||
|
@ -266,11 +252,6 @@ func tempZipFiles(path string) (string, error) {
|
|||
return "", err
|
||||
}
|
||||
|
||||
zfi.Name, err = filepath.Rel(path, file)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
zf, err := zipfile.CreateHeader(zfi)
|
||||
if err != nil {
|
||||
return "", err
|
||||
|
|
|
@ -7,27 +7,16 @@ import (
|
|||
"io"
|
||||
"io/fs"
|
||||
"log/slog"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
type ModelParameters struct {
|
||||
type Parameters struct {
|
||||
Architectures []string `json:"architectures"`
|
||||
VocabSize uint32 `json:"vocab_size"`
|
||||
}
|
||||
|
||||
type AdapterParameters struct {
|
||||
Alpha uint32 `json:"lora_alpha"`
|
||||
LoraLayers uint32 `json:"lora_layers"`
|
||||
LoraParameters struct {
|
||||
Rank uint32 `json:"rank"`
|
||||
Alpha float32 `json:"alpha"`
|
||||
Scale float32 `json:"scale"`
|
||||
} `json:"lora_parameters"`
|
||||
}
|
||||
|
||||
func (ModelParameters) KV(t *Tokenizer) llm.KV {
|
||||
func (Parameters) KV(t *Tokenizer) llm.KV {
|
||||
kv := llm.KV{
|
||||
"general.file_type": uint32(1),
|
||||
"general.quantization_version": uint32(2),
|
||||
|
@ -54,119 +43,40 @@ func (ModelParameters) KV(t *Tokenizer) llm.KV {
|
|||
return kv
|
||||
}
|
||||
|
||||
func (p AdapterParameters) KV() llm.KV {
|
||||
var alpha float32
|
||||
if p.LoraParameters.Alpha == 0 {
|
||||
alpha = float32(p.Alpha)
|
||||
} else {
|
||||
alpha = p.LoraParameters.Alpha
|
||||
}
|
||||
|
||||
kv := llm.KV{
|
||||
"adapter.lora.alpha": alpha,
|
||||
"adapter.type": "lora",
|
||||
"general.file_type": uint32(1),
|
||||
"general.type": "adapter",
|
||||
"general.version": "v0.2",
|
||||
}
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (ModelParameters) specialTokenTypes() []string {
|
||||
func (Parameters) specialTokenTypes() []string {
|
||||
return []string{
|
||||
"bos", "eos", "unk", "sep", "pad", "cls", "mask",
|
||||
}
|
||||
}
|
||||
|
||||
func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
|
||||
func (Parameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
|
||||
return llm.WriteGGUF(ws, kv, ts)
|
||||
}
|
||||
|
||||
func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
|
||||
return llm.WriteGGUF(ws, kv, ts)
|
||||
}
|
||||
|
||||
type ModelConverter interface {
|
||||
type Converter interface {
|
||||
// KV maps parameters to LLM key-values
|
||||
KV(*Tokenizer) llm.KV
|
||||
// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
|
||||
Tensors([]Tensor) []llm.Tensor
|
||||
// Replacements returns a list of string pairs to replace in tensor names.
|
||||
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
|
||||
Replacements() []string
|
||||
|
||||
// tensorName returns the LLM tensor name for a specific input name
|
||||
tensorName(string) string
|
||||
// specialTokenTypes returns any special token types the model uses
|
||||
specialTokenTypes() []string
|
||||
// writeFile writes the model to the provided io.WriteSeeker
|
||||
writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
|
||||
}
|
||||
|
||||
type moreParser interface {
|
||||
parseMore(fs.FS) error
|
||||
}
|
||||
|
||||
type AdapterConverter interface {
|
||||
// KV maps parameters to LLM key-values
|
||||
KV(llm.KV) llm.KV
|
||||
// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
|
||||
Tensors([]Tensor) []llm.Tensor
|
||||
// Replacements returns a list of string pairs to replace in tensor names.
|
||||
// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
|
||||
Replacements() []string
|
||||
|
||||
writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
|
||||
}
|
||||
|
||||
func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
|
||||
bts, err := fs.ReadFile(fsys, "adapter_config.json")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var p AdapterParameters
|
||||
if err := json.Unmarshal(bts, &p); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
arch, ok := baseKV["general.architecture"]
|
||||
if !ok {
|
||||
return errors.New("architecture not set for the base model")
|
||||
}
|
||||
|
||||
var conv AdapterConverter
|
||||
switch arch {
|
||||
case "llama":
|
||||
conv = &llamaAdapter{}
|
||||
case "gemma2":
|
||||
conv = &gemma2Adapter{}
|
||||
default:
|
||||
return errors.New("unsupported architecture")
|
||||
}
|
||||
|
||||
ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(bts, conv); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts))
|
||||
}
|
||||
|
||||
// Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations
|
||||
// and files it finds in the input path.
|
||||
// Supported input model formats include safetensors.
|
||||
// Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model.
|
||||
func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
|
||||
func Convert(fsys fs.FS, ws io.WriteSeeker) error {
|
||||
bts, err := fs.ReadFile(fsys, "config.json")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var p ModelParameters
|
||||
var p Parameters
|
||||
if err := json.Unmarshal(bts, &p); err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -175,20 +85,16 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
|
|||
return errors.New("unknown architecture")
|
||||
}
|
||||
|
||||
var conv ModelConverter
|
||||
var conv Converter
|
||||
switch p.Architectures[0] {
|
||||
case "LlamaForCausalLM", "MistralForCausalLM":
|
||||
conv = &llamaModel{}
|
||||
conv = &llama{}
|
||||
case "MixtralForCausalLM":
|
||||
conv = &mixtralModel{}
|
||||
conv = &mixtral{}
|
||||
case "GemmaForCausalLM":
|
||||
conv = &gemmaModel{}
|
||||
case "Gemma2ForCausalLM":
|
||||
conv = &gemma2Model{}
|
||||
conv = &gemma{}
|
||||
case "Phi3ForCausalLM":
|
||||
conv = &phi3Model{}
|
||||
case "BertModel":
|
||||
conv = &bertModel{}
|
||||
conv = &phi3{}
|
||||
default:
|
||||
return errors.New("unsupported architecture")
|
||||
}
|
||||
|
@ -197,12 +103,6 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
|
|||
return err
|
||||
}
|
||||
|
||||
if t, ok := conv.(moreParser); ok {
|
||||
if err := t.parseMore(fsys); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
t, err := parseTokenizer(fsys, conv.specialTokenTypes())
|
||||
if err != nil {
|
||||
return err
|
||||
|
@ -219,7 +119,7 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
|
|||
slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
|
||||
}
|
||||
|
||||
ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...))
|
||||
ts, err := parseTensors(fsys)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -1,174 +0,0 @@
|
|||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"encoding/json"
|
||||
"io/fs"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
type bertModel struct {
|
||||
ModelParameters
|
||||
NLayers uint32 `json:"n_layers"`
|
||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
NLayer uint32 `json:"n_layer"`
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
NCtx uint32 `json:"n_ctx"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
NEmbd uint32 `json:"n_embd"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
NInner uint32 `json:"n_inner"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NHead uint32 `json:"n_head"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
LayerNormEPS float32 `json:"layer_norm_eps"`
|
||||
LayerNormEpsilon float32 `json:"layer_norm_epsilon"`
|
||||
NormEpsilon float32 `json:"norm_epsilon"`
|
||||
|
||||
PoolingType uint32
|
||||
}
|
||||
|
||||
var (
|
||||
_ ModelConverter = (*bertModel)(nil)
|
||||
_ moreParser = (*bertModel)(nil)
|
||||
)
|
||||
|
||||
func (p *bertModel) parseMore(fsys fs.FS) error {
|
||||
bts, err := fs.ReadFile(fsys, "modules.json")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var modules []struct {
|
||||
Type string `json:"type"`
|
||||
Path string `json:"path"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(bts, &modules); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var pooling string
|
||||
for _, m := range modules {
|
||||
if m.Type == "sentence_transformers.models.Pooling" {
|
||||
pooling = m.Path
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if pooling != "" {
|
||||
bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
var pc struct {
|
||||
PoolingModeCLSToken bool `json:"pooling_mode_cls_token"`
|
||||
PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(bts, &pc); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if pc.PoolingModeMeanTokens {
|
||||
p.PoolingType = 1
|
||||
} else if pc.PoolingModeCLSToken {
|
||||
p.PoolingType = 2
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (p *bertModel) KV(t *Tokenizer) llm.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "bert"
|
||||
kv["bert.attention.causal"] = false
|
||||
kv["bert.pooling_type"] = p.PoolingType
|
||||
|
||||
kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
|
||||
|
||||
if contextLength := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); contextLength > 0 {
|
||||
kv["bert.context_length"] = contextLength
|
||||
}
|
||||
|
||||
if embeddingLength := cmp.Or(p.HiddenSize, p.NEmbd); embeddingLength > 0 {
|
||||
kv["bert.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
|
||||
}
|
||||
|
||||
if feedForwardLength := cmp.Or(p.IntermediateSize, p.NInner); feedForwardLength > 0 {
|
||||
kv["bert.feed_forward_length"] = cmp.Or(p.IntermediateSize, p.NInner)
|
||||
}
|
||||
|
||||
if headCount := cmp.Or(p.NumAttentionHeads, p.NHead); headCount > 0 {
|
||||
kv["bert.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead)
|
||||
}
|
||||
|
||||
if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); layerNormEpsilon > 0 {
|
||||
kv["bert.attention.layer_norm_epsilon"] = layerNormEpsilon
|
||||
}
|
||||
|
||||
kv["tokenizer.ggml.model"] = "bert"
|
||||
kv["tokenizer.ggml.token_type_count"] = uint32(2)
|
||||
|
||||
// convert to phantom space tokens
|
||||
for i, e := range t.Tokens {
|
||||
if strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]") {
|
||||
// noop
|
||||
} else if strings.HasPrefix(e, "##") {
|
||||
t.Tokens[i] = e[2:]
|
||||
} else {
|
||||
t.Tokens[i] = "\u2581" + e
|
||||
}
|
||||
}
|
||||
|
||||
kv["tokenizer.ggml.tokens"] = t.Tokens
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
|
||||
var out []llm.Tensor
|
||||
for _, t := range ts {
|
||||
if slices.Contains([]string{
|
||||
"embeddings.position_ids",
|
||||
"pooler.dense.weight",
|
||||
"pooler.dense.bias",
|
||||
}, t.Name()) {
|
||||
continue
|
||||
}
|
||||
|
||||
out = append(out, llm.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (bertModel) Replacements() []string {
|
||||
return []string{
|
||||
"encoder.layer", "blk",
|
||||
"encoder.layers", "blk",
|
||||
"embeddings.word_embeddings", "token_embd",
|
||||
"embeddings.token_type_embeddings", "token_types",
|
||||
"embeddings.LayerNorm", "token_embd_norm",
|
||||
"embeddings.position_embeddings", "position_embd",
|
||||
"attention.self.query", "attn_q",
|
||||
"attention.self.key", "attn_k",
|
||||
"attention.self.value", "attn_v",
|
||||
"attention.output.dense", "attn_output",
|
||||
"attention.output.LayerNorm", "attn_output_norm",
|
||||
"intermediate.dense", "ffn_up",
|
||||
"output.dense", "ffn_down",
|
||||
"output.LayerNorm", "layer_output_norm",
|
||||
}
|
||||
}
|
|
@ -9,8 +9,8 @@ import (
|
|||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
type gemmaModel struct {
|
||||
ModelParameters
|
||||
type gemma struct {
|
||||
Parameters
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
HiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
|
@ -21,11 +21,12 @@ type gemmaModel struct {
|
|||
HeadDim uint32 `json:"head_dim"`
|
||||
}
|
||||
|
||||
var _ ModelConverter = (*gemmaModel)(nil)
|
||||
var _ Converter = (*gemma)(nil)
|
||||
|
||||
func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
func (p *gemma) KV(t *Tokenizer) llm.KV {
|
||||
kv := p.Parameters.KV(t)
|
||||
kv["general.architecture"] = "gemma"
|
||||
kv["general.name"] = "gemma"
|
||||
kv["gemma.context_length"] = p.MaxPositionEmbeddings
|
||||
kv["gemma.embedding_length"] = p.HiddenSize
|
||||
kv["gemma.block_count"] = p.HiddenLayers
|
||||
|
@ -42,15 +43,16 @@ func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
|
|||
return kv
|
||||
}
|
||||
|
||||
func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
|
||||
func (p *gemma) Tensors(ts []Tensor) []llm.Tensor {
|
||||
var out []llm.Tensor
|
||||
for _, t := range ts {
|
||||
if strings.HasSuffix(t.Name(), "_norm.weight") {
|
||||
name := p.tensorName(t.Name())
|
||||
if strings.HasSuffix(name, "_norm.weight") {
|
||||
t.SetRepacker(p.addOne)
|
||||
}
|
||||
|
||||
out = append(out, llm.Tensor{
|
||||
Name: t.Name(),
|
||||
Name: name,
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
|
@ -60,8 +62,8 @@ func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
|
|||
return out
|
||||
}
|
||||
|
||||
func (p *gemmaModel) Replacements() []string {
|
||||
return []string{
|
||||
func (p *gemma) tensorName(n string) string {
|
||||
return strings.NewReplacer(
|
||||
"model.embed_tokens", "token_embd",
|
||||
"model.norm", "output_norm",
|
||||
"model.layers", "blk",
|
||||
|
@ -74,10 +76,11 @@ func (p *gemmaModel) Replacements() []string {
|
|||
"mlp.down_proj", "ffn_down",
|
||||
"mlp.up_proj", "ffn_up",
|
||||
"post_attention_layernorm", "ffn_norm",
|
||||
}
|
||||
"block_sparse_moe.gate", "ffn_inp",
|
||||
).Replace(n)
|
||||
}
|
||||
|
||||
func (*gemmaModel) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
|
||||
func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) {
|
||||
n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data))
|
||||
ones := tensor.Ones(tensor.Float32, int(shape[0]))
|
||||
|
||||
|
|
|
@ -1,43 +0,0 @@
|
|||
package convert
|
||||
|
||||
import (
|
||||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
type gemma2Model struct {
|
||||
gemmaModel
|
||||
SlidingWindow uint32 `json:"sliding_window"`
|
||||
AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"`
|
||||
FinalLogitSoftcap float32 `json:"final_logit_softcapping"`
|
||||
}
|
||||
|
||||
func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "gemma2"
|
||||
kv["gemma2.context_length"] = p.MaxPositionEmbeddings
|
||||
kv["gemma2.embedding_length"] = p.HiddenSize
|
||||
kv["gemma2.block_count"] = p.HiddenLayers
|
||||
kv["gemma2.feed_forward_length"] = p.IntermediateSize
|
||||
kv["gemma2.attention.head_count"] = p.NumAttentionHeads
|
||||
kv["gemma2.attention.head_count_kv"] = p.NumKeyValueHeads
|
||||
kv["gemma2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS
|
||||
kv["gemma2.attention.key_length"] = p.HeadDim
|
||||
kv["gemma2.attention.value_length"] = p.HeadDim
|
||||
kv["gemma2.attention.sliding_window"] = p.SlidingWindow
|
||||
kv["gemma2.attn_logit_softcapping"] = p.AttentionLogitSoftcap
|
||||
kv["gemma2.final_logit_softcapping"] = p.FinalLogitSoftcap
|
||||
kv["tokenizer.ggml.eot_token_id"] = uint32(107)
|
||||
kv["tokenizer.ggml.middle_token_id"] = uint32(68)
|
||||
kv["tokenizer.ggml.prefix_token_id"] = uint32(67)
|
||||
kv["tokenizer.ggml.suffix_token_id"] = uint32(69)
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *gemma2Model) Replacements() []string {
|
||||
return append(
|
||||
p.gemmaModel.Replacements(),
|
||||
"post_attention_layernorm", "post_attention_norm",
|
||||
"pre_feedforward_layernorm", "ffn_norm",
|
||||
"post_feedforward_layernorm", "post_ffw_norm",
|
||||
)
|
||||
}
|
|
@ -1,91 +0,0 @@
|
|||
package convert
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/pdevine/tensor"
|
||||
"github.com/pdevine/tensor/native"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
type gemma2Adapter struct {
|
||||
AdapterParameters
|
||||
}
|
||||
|
||||
var _ AdapterConverter = (*gemma2Adapter)(nil)
|
||||
|
||||
func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
|
||||
kv := p.AdapterParameters.KV()
|
||||
kv["general.architecture"] = "gemma2"
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
|
||||
var out []llm.Tensor
|
||||
for _, t := range ts {
|
||||
shape := t.Shape()
|
||||
if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
|
||||
(strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) {
|
||||
shape[0], shape[1] = shape[1], shape[0]
|
||||
t.SetRepacker(p.repack)
|
||||
}
|
||||
|
||||
out = append(out, llm.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (p *gemma2Adapter) Replacements() []string {
|
||||
return []string{
|
||||
"base_model.model.", "",
|
||||
"model.layers", "blk",
|
||||
"self_attn.q_proj", "attn_q",
|
||||
"self_attn.k_proj", "attn_k",
|
||||
"self_attn.v_proj", "attn_v",
|
||||
"self_attn.o_proj", "attn_output",
|
||||
"mlp.gate_proj", "ffn_gate",
|
||||
"mlp.down_proj", "ffn_down",
|
||||
"mlp.up_proj", "ffn_up",
|
||||
"lora_A.weight", "weight.lora_a",
|
||||
"lora_B.weight", "weight.lora_b",
|
||||
"lora_a", "weight.lora_a",
|
||||
"lora_b", "weight.lora_b",
|
||||
}
|
||||
}
|
||||
|
||||
func (p *gemma2Adapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
|
||||
dims := []int{int(shape[1]), int(shape[0])}
|
||||
|
||||
n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
||||
|
||||
if err := n.T(1, 0); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := n.Reshape(dims...); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := n.Transpose(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ts, err := native.SelectF32(n, 1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var f32s []float32
|
||||
for _, t := range ts {
|
||||
f32s = append(f32s, t...)
|
||||
}
|
||||
|
||||
return f32s, nil
|
||||
}
|
|
@ -3,7 +3,6 @@ package convert
|
|||
import (
|
||||
"cmp"
|
||||
"fmt"
|
||||
"math"
|
||||
"strings"
|
||||
|
||||
"github.com/pdevine/tensor"
|
||||
|
@ -12,8 +11,8 @@ import (
|
|||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
type llamaModel struct {
|
||||
ModelParameters
|
||||
type llama struct {
|
||||
Parameters
|
||||
NLayers uint32 `json:"n_layers"`
|
||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
NLayer uint32 `json:"n_layer"`
|
||||
|
@ -28,14 +27,8 @@ type llamaModel struct {
|
|||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
RopeScaling struct {
|
||||
Type string `json:"type"`
|
||||
RopeType string `json:"rope_type"`
|
||||
Factor float32 `json:"factor"`
|
||||
LowFrequencyFactor float32 `json:"low_freq_factor"`
|
||||
HighFrequencyFactor float32 `json:"high_freq_factor"`
|
||||
OriginalMaxPositionalEmbeddings uint32 `json:"original_max_positional_embeddings"`
|
||||
|
||||
factors ropeFactor
|
||||
Type string `json:"type"`
|
||||
Factor float32 `json:"factor"`
|
||||
} `json:"rope_scaling"`
|
||||
RMSNormEPS float32 `json:"rms_norm_eps"`
|
||||
LayerNormEPS float32 `json:"layer_norm_eps"`
|
||||
|
@ -44,11 +37,12 @@ type llamaModel struct {
|
|||
HeadDim uint32 `json:"head_dim"`
|
||||
}
|
||||
|
||||
var _ ModelConverter = (*llamaModel)(nil)
|
||||
var _ Converter = (*llama)(nil)
|
||||
|
||||
func (p *llamaModel) KV(t *Tokenizer) llm.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
func (p *llama) KV(t *Tokenizer) llm.KV {
|
||||
kv := p.Parameters.KV(t)
|
||||
kv["general.architecture"] = "llama"
|
||||
kv["general.name"] = "llama"
|
||||
kv["llama.vocab_size"] = p.VocabSize
|
||||
|
||||
kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer)
|
||||
|
@ -77,27 +71,6 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
|
|||
if p.RopeScaling.Type == "linear" {
|
||||
kv["llama.rope.scaling.type"] = p.RopeScaling.Type
|
||||
kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor
|
||||
} else if p.RopeScaling.RopeType == "llama3" {
|
||||
dim := p.HiddenSize / p.NumAttentionHeads
|
||||
for i := uint32(0); i < dim; i += 2 {
|
||||
factor := cmp.Or(p.RopeScaling.Factor, 8.0)
|
||||
factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 1.0)
|
||||
factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0)
|
||||
|
||||
original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192)
|
||||
lambdaLow := float32(original) / factorLow
|
||||
lambdaHigh := float32(original) / factorHigh
|
||||
|
||||
lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim))
|
||||
if lambda < float64(lambdaHigh) {
|
||||
p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0)
|
||||
} else if lambda > float64(lambdaLow) {
|
||||
p.RopeScaling.factors = append(p.RopeScaling.factors, factor)
|
||||
} else {
|
||||
smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow)
|
||||
p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if p.NumKeyValueHeads > 0 {
|
||||
|
@ -120,26 +93,17 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
|
|||
return kv
|
||||
}
|
||||
|
||||
func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
|
||||
func (p *llama) Tensors(ts []Tensor) []llm.Tensor {
|
||||
var out []llm.Tensor
|
||||
|
||||
if p.RopeScaling.factors != nil {
|
||||
out = append(out, llm.Tensor{
|
||||
Name: "rope_freqs.weight",
|
||||
Kind: 0,
|
||||
Shape: []uint64{uint64(len(p.RopeScaling.factors))},
|
||||
WriterTo: p.RopeScaling.factors,
|
||||
})
|
||||
}
|
||||
|
||||
for _, t := range ts {
|
||||
if strings.HasSuffix(t.Name(), "attn_q.weight") ||
|
||||
strings.HasSuffix(t.Name(), "attn_k.weight") {
|
||||
name := p.tensorName(t.Name())
|
||||
if strings.HasSuffix(name, "attn_q.weight") ||
|
||||
strings.HasSuffix(name, "attn_k.weight") {
|
||||
t.SetRepacker(p.repack)
|
||||
}
|
||||
|
||||
out = append(out, llm.Tensor{
|
||||
Name: t.Name(),
|
||||
Name: name,
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
|
@ -149,8 +113,8 @@ func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
|
|||
return out
|
||||
}
|
||||
|
||||
func (p *llamaModel) Replacements() []string {
|
||||
return []string{
|
||||
func (p *llama) tensorName(n string) string {
|
||||
return strings.NewReplacer(
|
||||
"lm_head", "output",
|
||||
"model.embed_tokens", "token_embd",
|
||||
"model.norm", "output_norm",
|
||||
|
@ -164,19 +128,21 @@ func (p *llamaModel) Replacements() []string {
|
|||
"mlp.down_proj", "ffn_down",
|
||||
"mlp.up_proj", "ffn_up",
|
||||
"post_attention_layernorm", "ffn_norm",
|
||||
}
|
||||
// mixtral
|
||||
"block_sparse_moe.gate", "ffn_gate_inp",
|
||||
).Replace(n)
|
||||
}
|
||||
|
||||
func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]float32, error) {
|
||||
func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) {
|
||||
var dims []int
|
||||
for _, dim := range shape {
|
||||
dims = append(dims, int(dim))
|
||||
}
|
||||
|
||||
var heads uint32
|
||||
if strings.HasSuffix(name, "attn_q.weight") {
|
||||
if strings.HasSuffix(name, "q_proj.weight") {
|
||||
heads = p.NumAttentionHeads
|
||||
} else if strings.HasSuffix(name, "attn_k.weight") {
|
||||
} else if strings.HasSuffix(name, "k_proj.weight") {
|
||||
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
|
||||
} else {
|
||||
return nil, fmt.Errorf("unknown tensor for repack: %s", name)
|
||||
|
|
|
@ -1,169 +0,0 @@
|
|||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"strings"
|
||||
|
||||
"github.com/pdevine/tensor"
|
||||
"github.com/pdevine/tensor/native"
|
||||
|
||||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
type llamaAdapter struct {
|
||||
AdapterParameters
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
}
|
||||
|
||||
var _ AdapterConverter = (*llamaAdapter)(nil)
|
||||
|
||||
func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
|
||||
kv := p.AdapterParameters.KV()
|
||||
kv["general.architecture"] = "llama"
|
||||
kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
|
||||
kv["llama.attention.head_count_kv"] = baseKV["llama.attention.head_count_kv"]
|
||||
|
||||
p.NumAttentionHeads = baseKV["llama.attention.head_count"].(uint32)
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
|
||||
var out []llm.Tensor
|
||||
for _, t := range ts {
|
||||
shape := t.Shape()
|
||||
if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
|
||||
(strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) {
|
||||
shape[0], shape[1] = shape[1], shape[0]
|
||||
t.SetRepacker(p.repackAndTranspose)
|
||||
} else {
|
||||
t.SetRepacker(p.repack)
|
||||
}
|
||||
|
||||
out = append(out, llm.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: shape,
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (p *llamaAdapter) Replacements() []string {
|
||||
return []string{
|
||||
"base_model.model.", "",
|
||||
"model.layers", "blk",
|
||||
"self_attn.q_proj", "attn_q",
|
||||
"self_attn.k_proj", "attn_k",
|
||||
"self_attn.v_proj", "attn_v",
|
||||
"self_attn.o_proj", "attn_output",
|
||||
"mlp.gate_proj", "ffn_gate",
|
||||
"mlp.down_proj", "ffn_down",
|
||||
"mlp.up_proj", "ffn_up",
|
||||
"lora_A.weight", "weight.lora_a",
|
||||
"lora_B.weight", "weight.lora_b",
|
||||
"lora_a", "weight.lora_a",
|
||||
"lora_b", "weight.lora_b",
|
||||
}
|
||||
}
|
||||
|
||||
func (p *llamaAdapter) repack(name string, data []float32, shape []uint64) ([]float32, error) {
|
||||
dims := []int{int(shape[1]), int(shape[0])}
|
||||
|
||||
var heads uint32
|
||||
if strings.HasSuffix(name, "attn_q.weight.lora_a") {
|
||||
heads = p.NumAttentionHeads
|
||||
} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
|
||||
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
|
||||
} else {
|
||||
return data, nil
|
||||
}
|
||||
|
||||
n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
||||
|
||||
if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := n.T(0, 2, 1, 3); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := n.Reshape(dims...); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := n.Transpose(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ts, err := native.SelectF32(n, 1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var f32s []float32
|
||||
for _, t := range ts {
|
||||
f32s = append(f32s, t...)
|
||||
}
|
||||
|
||||
return f32s, nil
|
||||
}
|
||||
|
||||
func (p *llamaAdapter) repackAndTranspose(name string, data []float32, shape []uint64) ([]float32, error) {
|
||||
dims := []int{int(shape[1]), int(shape[0])}
|
||||
|
||||
n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
|
||||
|
||||
var heads uint32
|
||||
if strings.HasSuffix(name, "attn_q.weight.lora_a") {
|
||||
heads = p.NumAttentionHeads
|
||||
} else if strings.HasSuffix(name, "attn_k.weight.lora_a") {
|
||||
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
|
||||
}
|
||||
|
||||
if heads > 0 {
|
||||
if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := n.T(0, 2, 1, 3); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := n.Reshape(dims...); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := n.Transpose(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if err := n.T(1, 0); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := n.Reshape(dims...); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := n.Transpose(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
ts, err := native.SelectF32(n, 1)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var f32s []float32
|
||||
for _, t := range ts {
|
||||
f32s = append(f32s, t...)
|
||||
}
|
||||
|
||||
return f32s, nil
|
||||
}
|
|
@ -9,14 +9,16 @@ import (
|
|||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
type mixtralModel struct {
|
||||
llamaModel
|
||||
type mixtral struct {
|
||||
llama
|
||||
NumLocalExperts uint32 `json:"num_local_experts"`
|
||||
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
|
||||
}
|
||||
|
||||
func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
|
||||
kv := p.llamaModel.KV(t)
|
||||
var _ Converter = (*mixtral)(nil)
|
||||
|
||||
func (p *mixtral) KV(t *Tokenizer) llm.KV {
|
||||
kv := p.llama.KV(t)
|
||||
|
||||
if p.NumLocalExperts > 0 {
|
||||
kv["llama.expert_count"] = p.NumLocalExperts
|
||||
|
@ -29,7 +31,7 @@ func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
|
|||
return kv
|
||||
}
|
||||
|
||||
func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
|
||||
func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor {
|
||||
oldnew := []string{
|
||||
"model.layers", "blk",
|
||||
"w1", "ffn_gate_exps",
|
||||
|
@ -67,14 +69,7 @@ func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
|
|||
})
|
||||
}
|
||||
|
||||
return append(out, p.llamaModel.Tensors(ts)...)
|
||||
}
|
||||
|
||||
func (p *mixtralModel) Replacements() []string {
|
||||
return append(
|
||||
p.llamaModel.Replacements(),
|
||||
"block_sparse_moe.gate", "ffn_gate_inp",
|
||||
)
|
||||
return append(out, p.llama.Tensors(ts)...)
|
||||
}
|
||||
|
||||
type experts []Tensor
|
||||
|
|
|
@ -11,8 +11,8 @@ import (
|
|||
"github.com/ollama/ollama/llm"
|
||||
)
|
||||
|
||||
type phi3Model struct {
|
||||
ModelParameters
|
||||
type phi3 struct {
|
||||
Parameters
|
||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
NLayers uint32 `json:"n_layers"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
|
@ -35,11 +35,12 @@ type phi3Model struct {
|
|||
SlidingWindow uint32 `json:"sliding_window"`
|
||||
}
|
||||
|
||||
var _ ModelConverter = (*phi3Model)(nil)
|
||||
var _ Converter = (*phi3)(nil)
|
||||
|
||||
func (p *phi3Model) KV(t *Tokenizer) llm.KV {
|
||||
kv := p.ModelParameters.KV(t)
|
||||
func (p *phi3) KV(t *Tokenizer) llm.KV {
|
||||
kv := p.Parameters.KV(t)
|
||||
kv["general.architecture"] = "phi3"
|
||||
kv["general.name"] = "phi3"
|
||||
kv["phi3.context_length"] = p.MaxPositionEmbeddings
|
||||
kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd)
|
||||
kv["phi3.feed_forward_length"] = p.IntermediateSize
|
||||
|
@ -68,12 +69,13 @@ func (p *phi3Model) KV(t *Tokenizer) llm.KV {
|
|||
return kv
|
||||
}
|
||||
|
||||
func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
|
||||
func (p *phi3) Tensors(ts []Tensor) []llm.Tensor {
|
||||
var addRopeFactors sync.Once
|
||||
|
||||
out := make([]llm.Tensor, 0, len(ts)+2)
|
||||
for _, t := range ts {
|
||||
if strings.HasPrefix(t.Name(), "blk.0.") {
|
||||
name := p.tensorName(t.Name())
|
||||
if strings.HasPrefix(name, "blk.0.") {
|
||||
addRopeFactors.Do(func() {
|
||||
out = append(out, llm.Tensor{
|
||||
Name: "rope_factors_long.weight",
|
||||
|
@ -90,7 +92,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
|
|||
}
|
||||
|
||||
out = append(out, llm.Tensor{
|
||||
Name: t.Name(),
|
||||
Name: name,
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
|
@ -100,8 +102,8 @@ func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
|
|||
return out
|
||||
}
|
||||
|
||||
func (p *phi3Model) Replacements() []string {
|
||||
return []string{
|
||||
func (p *phi3) tensorName(n string) string {
|
||||
return strings.NewReplacer(
|
||||
"lm_head", "output",
|
||||
"model.embed_tokens", "token_embd",
|
||||
"model.norm", "output_norm",
|
||||
|
@ -112,7 +114,7 @@ func (p *phi3Model) Replacements() []string {
|
|||
"mlp.down_proj", "ffn_down",
|
||||
"mlp.gate_up_proj", "ffn_up",
|
||||
"post_attention_layernorm", "ffn_norm",
|
||||
}
|
||||
).Replace(n)
|
||||
}
|
||||
|
||||
type ropeFactor []float32
|
||||
|
|
|
@ -1,9 +1,7 @@
|
|||
package convert
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/sha256"
|
||||
"encoding/binary"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"flag"
|
||||
|
@ -31,7 +29,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
|
|||
}
|
||||
defer f.Close()
|
||||
|
||||
if err := ConvertModel(fsys, f); err != nil {
|
||||
if err := Convert(fsys, f); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
|
@ -53,34 +51,6 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
|
|||
return r, m.KV(), m.Tensors()
|
||||
}
|
||||
|
||||
func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors llm.Tensors) map[string]string {
|
||||
actual := make(map[string]string)
|
||||
for k, v := range kv {
|
||||
if s, ok := v.(json.Marshaler); !ok {
|
||||
actual[k] = fmt.Sprintf("%v", v)
|
||||
} else {
|
||||
bts, err := json.Marshal(s)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts))
|
||||
}
|
||||
}
|
||||
|
||||
for _, tensor := range tensors.Items {
|
||||
sha256sum := sha256.New()
|
||||
sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
|
||||
if _, err := io.Copy(sha256sum, sr); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil))
|
||||
}
|
||||
|
||||
return actual
|
||||
}
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
var level slog.Level
|
||||
flag.TextVar(&level, "level", slog.LevelInfo, "log level")
|
||||
|
@ -92,14 +62,11 @@ func TestMain(m *testing.M) {
|
|||
func TestConvertFull(t *testing.T) {
|
||||
cases := []string{
|
||||
"Meta-Llama-3-8B-Instruct",
|
||||
"Meta-Llama-3.1-8B-Instruct",
|
||||
"Mistral-7B-Instruct-v0.2",
|
||||
"Mixtral-8x7B-Instruct-v0.1",
|
||||
"gemma-2b-it",
|
||||
// microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8
|
||||
"Phi-3-mini-128k-instruct",
|
||||
"all-MiniLM-L6-v2",
|
||||
"gemma-2-9b-it",
|
||||
}
|
||||
|
||||
for i := range cases {
|
||||
|
@ -115,7 +82,29 @@ func TestConvertFull(t *testing.T) {
|
|||
}
|
||||
|
||||
f, kv, tensors := convertFull(t, os.DirFS(p))
|
||||
actual := generateResultsJSON(t, f, kv, tensors)
|
||||
actual := make(map[string]string)
|
||||
for k, v := range kv {
|
||||
if s, ok := v.(json.Marshaler); !ok {
|
||||
actual[k] = fmt.Sprintf("%v", v)
|
||||
} else {
|
||||
bts, err := json.Marshal(s)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts))
|
||||
}
|
||||
}
|
||||
|
||||
for _, tensor := range tensors.Items {
|
||||
sha256sum := sha256.New()
|
||||
sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
|
||||
if _, err := io.Copy(sha256sum, sr); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil))
|
||||
}
|
||||
|
||||
expectFile, err := os.Open(filepath.Join("testdata", fmt.Sprintf("%s.json", tt)))
|
||||
if err != nil {
|
||||
|
@ -139,209 +128,3 @@ func TestConvertFull(t *testing.T) {
|
|||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestConvertAdapter(t *testing.T) {
|
||||
type AdapterCase struct {
|
||||
Name string
|
||||
BaseKV map[string]any
|
||||
Expected map[string]string
|
||||
}
|
||||
|
||||
cases := []AdapterCase{
|
||||
{
|
||||
Name: "discollama",
|
||||
BaseKV: map[string]any{
|
||||
"general.architecture": "llama",
|
||||
"llama.attention.head_count": uint32(32),
|
||||
"llama.attention.head_count_kv": uint32(8),
|
||||
},
|
||||
Expected: map[string]string{
|
||||
"general.architecture": "llama",
|
||||
"general.file_type": "1",
|
||||
"general.parameter_count": "106496",
|
||||
"general.type": "adapter",
|
||||
"general.version": "v0.2",
|
||||
"adapter.lora.alpha": "16",
|
||||
"adapter.type": "lora",
|
||||
"llama.attention.head_count": "32",
|
||||
"llama.attention.head_count_kv": "8",
|
||||
"blk.31.attn_q.weight.lora_a": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
|
||||
"blk.31.attn_q.weight.lora_b": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
|
||||
"blk.31.attn_v.weight.lora_a": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50",
|
||||
"blk.31.attn_v.weight.lora_b": "071dcafe89df065d6e1c935ecb8fdf6479b3c202eb912e7da938597673ff5857",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run(c.Name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
f, err := os.CreateTemp(t.TempDir(), "f16")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
tempDir := t.TempDir()
|
||||
generateLoraTestData(t, tempDir)
|
||||
|
||||
if err = ConvertAdapter(os.DirFS(tempDir), f, c.BaseKV); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
r, err := os.Open(f.Name())
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
m, _, err := llm.DecodeGGML(r, math.MaxInt)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if _, err := r.Seek(0, io.SeekStart); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
actual := generateResultsJSON(t, r, m.KV(), m.Tensors())
|
||||
|
||||
keys := maps.Keys(c.Expected)
|
||||
slices.Sort(keys)
|
||||
for _, k := range keys {
|
||||
if v, ok := actual[k]; !ok {
|
||||
t.Errorf("missing %s", k)
|
||||
} else if v != c.Expected[k] {
|
||||
t.Errorf("unexpected %s: want %s, got %s", k, c.Expected[k], v)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func generateLoraTestData(t *testing.T, tempDir string) {
|
||||
type tensorData struct {
|
||||
Offsets []int `json:"data_offsets"`
|
||||
Type string `json:"dtype"`
|
||||
Shape []int `json:"shape"`
|
||||
}
|
||||
offset := 4096 * 8 * 4
|
||||
|
||||
td := map[string]*tensorData{"__metadata__": nil}
|
||||
td["model.layers.31.self_attn.q_proj.lora_a"] = &tensorData{
|
||||
Offsets: []int{0, offset},
|
||||
Type: "F32",
|
||||
Shape: []int{4096, 8},
|
||||
}
|
||||
td["model.layers.31.self_attn.q_proj.lora_b"] = &tensorData{
|
||||
Offsets: []int{offset, offset * 2},
|
||||
Type: "F32",
|
||||
Shape: []int{8, 4096},
|
||||
}
|
||||
td["model.layers.31.self_attn.v_proj.lora_a"] = &tensorData{
|
||||
Offsets: []int{offset * 2, offset * 3},
|
||||
Type: "F32",
|
||||
Shape: []int{4096, 8},
|
||||
}
|
||||
td["model.layers.31.self_attn.v_proj.lora_b"] = &tensorData{
|
||||
Offsets: []int{offset * 3, offset*3 + 8*1024*4},
|
||||
Type: "F32",
|
||||
Shape: []int{8, 1024},
|
||||
}
|
||||
|
||||
data, err := json.Marshal(td)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
|
||||
l := int64(len(data))
|
||||
err = binary.Write(&buf, binary.LittleEndian, l)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
_, err = buf.Write(data)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// write some data for the tensors
|
||||
|
||||
ones := make([]float32, 4096*8)
|
||||
for i := range ones {
|
||||
ones[i] = float32(1)
|
||||
}
|
||||
|
||||
for range 3 {
|
||||
err = binary.Write(&buf, binary.LittleEndian, ones)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
ones = make([]float32, 1024*8)
|
||||
for i := range ones {
|
||||
ones[i] = float32(1)
|
||||
}
|
||||
|
||||
err = binary.Write(&buf, binary.LittleEndian, ones)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
fdata, err := os.Create(filepath.Join(tempDir, "adapters.safetensors"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer fdata.Close()
|
||||
|
||||
_, err = fdata.Write(buf.Bytes())
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
configData := `
|
||||
{
|
||||
"adapter_path": "adapters-test",
|
||||
"batch_size": 8,
|
||||
"config": "config-tiny.json",
|
||||
"data": "../discollama-completion",
|
||||
"grad_checkpoint": null,
|
||||
"iters": 1000,
|
||||
"learning_rate": 1e-05,
|
||||
"lora_layers": 1,
|
||||
"lora_parameters": {
|
||||
"rank": 8,
|
||||
"alpha": 16,
|
||||
"dropout": 0.0,
|
||||
"scale": 2.0
|
||||
},
|
||||
"lr_schedule": null,
|
||||
"max_seq_length": 2048,
|
||||
"model": "/Users/pdevine/git/Meta-Llama-3-8B-Instruct",
|
||||
"resume_adapter_file": null,
|
||||
"save_every": 100,
|
||||
"seed": 0,
|
||||
"steps_per_eval": 200,
|
||||
"steps_per_report": 10,
|
||||
"test": false,
|
||||
"test_batches": 500,
|
||||
"train": true,
|
||||
"use_dora": false,
|
||||
"val_batches": 25
|
||||
}
|
||||
`
|
||||
f, err := os.Create(filepath.Join(tempDir, "adapter_config.json"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
_, err = f.WriteString(configData)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -35,9 +35,7 @@ const (
|
|||
)
|
||||
|
||||
func (t tensorBase) Kind() uint32 {
|
||||
if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") ||
|
||||
t.name == "token_types.weight" {
|
||||
// these tensors are always F32
|
||||
if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") {
|
||||
return 0
|
||||
}
|
||||
|
||||
|
@ -57,15 +55,13 @@ func (t *tensorBase) SetRepacker(fn repacker) {
|
|||
|
||||
type repacker func(string, []float32, []uint64) ([]float32, error)
|
||||
|
||||
func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
|
||||
func parseTensors(fsys fs.FS) ([]Tensor, error) {
|
||||
patterns := []struct {
|
||||
Pattern string
|
||||
Func func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error)
|
||||
Func func(fs.FS, ...string) ([]Tensor, error)
|
||||
}{
|
||||
{"model-*-of-*.safetensors", parseSafetensors},
|
||||
{"model.safetensors", parseSafetensors},
|
||||
{"adapters.safetensors", parseSafetensors},
|
||||
{"adapter_model.safetensors", parseSafetensors},
|
||||
{"pytorch_model-*-of-*.bin", parseTorch},
|
||||
{"pytorch_model.bin", parseTorch},
|
||||
{"consolidated.*.pth", parseTorch},
|
||||
|
@ -78,7 +74,7 @@ func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) {
|
|||
}
|
||||
|
||||
if len(matches) > 0 {
|
||||
return pattern.Func(fsys, replacer, matches...)
|
||||
return pattern.Func(fsys, matches...)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -8,7 +8,6 @@ import (
|
|||
"io"
|
||||
"io/fs"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/d4l3k/go-bfloat16"
|
||||
"github.com/x448/float16"
|
||||
|
@ -21,7 +20,7 @@ type safetensorMetadata struct {
|
|||
Offsets []int64 `json:"data_offsets"`
|
||||
}
|
||||
|
||||
func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
|
||||
func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) {
|
||||
var ts []Tensor
|
||||
for _, p := range ps {
|
||||
f, err := fsys.Open(p)
|
||||
|
@ -57,7 +56,7 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
|
|||
offset: safetensorsPad(n, value.Offsets[0]),
|
||||
size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]),
|
||||
tensorBase: &tensorBase{
|
||||
name: replacer.Replace(key),
|
||||
name: key,
|
||||
shape: value.Shape,
|
||||
},
|
||||
})
|
||||
|
|
|
@ -3,13 +3,12 @@ package convert
|
|||
import (
|
||||
"io"
|
||||
"io/fs"
|
||||
"strings"
|
||||
|
||||
"github.com/nlpodyssey/gopickle/pytorch"
|
||||
"github.com/nlpodyssey/gopickle/types"
|
||||
)
|
||||
|
||||
func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) {
|
||||
func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) {
|
||||
var ts []Tensor
|
||||
for _, p := range ps {
|
||||
pt, err := pytorch.Load(p)
|
||||
|
@ -28,7 +27,7 @@ func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor,
|
|||
ts = append(ts, torch{
|
||||
storage: t.(*pytorch.Tensor).Source,
|
||||
tensorBase: &tensorBase{
|
||||
name: replacer.Replace(k.(string)),
|
||||
name: k.(string),
|
||||
shape: shape,
|
||||
},
|
||||
})
|
||||
|
|
|
@ -1,3 +0,0 @@
|
|||
{
|
||||
"rope_freqs.weight": "80fd5efb2f729381785b293a091a268cfeceb0079167f6ece9b07070e662b222"
|
||||
}
|
124
convert/testdata/all-MiniLM-L6-v2.json
vendored
124
convert/testdata/all-MiniLM-L6-v2.json
vendored
|
@ -1,124 +0,0 @@
|
|||
{
|
||||
"general.architecture": "bert",
|
||||
"general.file_type": "1",
|
||||
"general.quantization_version": "2",
|
||||
"bert.attention.causal": "false",
|
||||
"bert.attention.head_count": "12",
|
||||
"bert.attention.layer_norm_epsilon": "1e-12",
|
||||
"bert.block_count": "6",
|
||||
"bert.context_length": "512",
|
||||
"bert.embedding_length": "384",
|
||||
"bert.feed_forward_length": "1536",
|
||||
"bert.pooling_type": "1",
|
||||
"tokenizer.ggml.model": "bert",
|
||||
"tokenizer.ggml.padding_token_id": "0",
|
||||
"tokenizer.ggml.unknown_token_id": "100",
|
||||
"tokenizer.ggml.cls_token_id": "101",
|
||||
"tokenizer.ggml.seperator_token_id": "102",
|
||||
"tokenizer.ggml.mask_token_id": "103",
|
||||
"tokenizer.ggml.token_type_count": "2",
|
||||
"tokenizer.ggml.scores": "6db964fe67338aca57790481a390121ff3dd643eebe49f7dd308029ad99abb6f",
|
||||
"tokenizer.ggml.token_type": "98d247c5404b6b18f05f133b92dd56edf6efefefac326794b00d7b351f6c5aa1",
|
||||
"tokenizer.ggml.tokens": "9efe405e229a45ff9916f54c475d151d2200cd2ab0006f347abfb069cf096c86",
|
||||
"token_embd.weight": "8c1ee80a9ea4f65aa385ba30112010068af3d209bebc6e149d3d4589c2cd0a5a",
|
||||
"position_embd.weight": "6c516f0b1c4e2388ab90394dd80ad69e4e4509b890982fc3408108ae66210eb6",
|
||||
"token_types.weight": "f879f8e422ed211948f28b560d3c5e17aae7993f063b51196a28cf5c0fb3da21",
|
||||
"token_embd_norm.weight": "75076e095d717aab96f8b6beeee503c27940d9a76f2b891a0e3de72f8a6043e4",
|
||||
"token_embd_norm.bias": "298735285ffe944e1bf03e5d35c7280326b85cf121bde9874f1af5dc51ab939d",
|
||||
"blk.0.attn_q.weight": "ab0923ce4c1549175112dcdfcc860fe30137f991e03ea6857fb5993670adaf6c",
|
||||
"blk.0.attn_q.bias": "a3ec29551dabf976e1d34256b8ab5ab7b758f3ed9742c3cafdbd984d5441df62",
|
||||
"blk.0.attn_k.weight": "4c1038a6d035c3e9ffed7fa672b614627814752503755fbad0cfb76a41ad71ba",
|
||||
"blk.0.attn_k.bias": "e0363930eb588d91816aa3d230bb03b6e2551c165117b80b8d60397413819ef9",
|
||||
"blk.0.attn_v.weight": "425e2e53e3f00ce98d29c3e6a161eb55d3e6ae0d96fdb9f6242d1c4fd6eef4b3",
|
||||
"blk.0.attn_v.bias": "6579173a1e65ee124fbd0bd53cbdca4225515b4f2c5f18fb1bfd000f5978f9bb",
|
||||
"blk.0.attn_output.weight": "a6d70a08cd7164de5d12af65d86d657c3db35aaecde778b2b3fda9193c4c9802",
|
||||
"blk.0.attn_output.bias": "2b8d12c4f9a9c5bfaa29c597839568f6e0525cb41eeaf64ddeb6bd84dfeb9701",
|
||||
"blk.0.attn_output_norm.weight": "bbe6e502a473228b525aeed26cc31b7db123ad63bdc5a6eebac6ea70b8b51d62",
|
||||
"blk.0.attn_output_norm.bias": "36eaacaf0007c5c62daea97aab0115390c0682914f78482e37eb76885f4b7a50",
|
||||
"blk.0.ffn_up.weight": "24654561c76ce387d125759ba843f06b904ef721fcceaeff6ccc62180a48e874",
|
||||
"blk.0.ffn_up.bias": "fd3f0126aa1d95768fa60eb6f4ab8a2763cfcb7e5405f35b92353031d86f4d34",
|
||||
"blk.0.ffn_down.weight": "97a829763a6a5bf3329ceb4d39c424ba4787d61653a5b0bbd1f84782e4d4e0ca",
|
||||
"blk.0.ffn_down.bias": "7aa980c30ae8b4ee7f69df28808dbf5c431f56ccc4a80340f644a0419f16c054",
|
||||
"blk.0.layer_output_norm.weight": "ef30dad4c2a083ae1ff5039a2a6cda60ecc89bf1e486a6f8c0d15f50589603f8",
|
||||
"blk.0.layer_output_norm.bias": "8b1b77e67568b1bce43fc476de1b177c53ff688d66beb66995e8eb3dc290da8a",
|
||||
"blk.1.attn_q.weight": "284331622a1f6f9b87ccee4f652bd66a394ca493c4d93be4d1844e4f6159ad10",
|
||||
"blk.1.attn_q.bias": "e24ebd4860330e08f6bfdd077a82db0bee33f4c8846cf1db26327a34754c7069",
|
||||
"blk.1.attn_k.weight": "729dd0d555544b5bd0f7580b3c8b384256b974605f0e7487b95f295aa032997d",
|
||||
"blk.1.attn_k.bias": "2aa51a828a858f35473f54477583fea54ce2ccc34ea60fbd1d228fbe9bca827f",
|
||||
"blk.1.attn_v.weight": "6be304671cc311d5ca5c103f2b51467ee800c589bc5b8101e09ff5aed1f68c21",
|
||||
"blk.1.attn_v.bias": "43bcbab78a8819e07f723bc9e5b737b71e87a7594f15234e882b63e327a64199",
|
||||
"blk.1.attn_output.weight": "15ec8a1a12b26c9976445308a09f748ab0e4bef0f583d13ab08c3129f8738d73",
|
||||
"blk.1.attn_output.bias": "dac2146f4baa6ed16f6c0dc7443831fb7ec79bedcceafd80d1a4b628a1bb072d",
|
||||
"blk.1.attn_output_norm.weight": "d2151eb33bffac536787a4c9a5d2b31c7a80b17c4611877842a3cce2cd6e98d8",
|
||||
"blk.1.attn_output_norm.bias": "31e1b779716dafb855d2cf5631ee168a0ccf372eb9c6ea6091f66fa97a9b9d2d",
|
||||
"blk.1.ffn_up.weight": "a57547fc3fc3b77406f5cdcb0c87af9bc184701f175c39c1f35297826fce3cc7",
|
||||
"blk.1.ffn_up.bias": "123be6d541d086202913c75d878c54d59a749f3af7b58f7ef9eb9e7c62a24c9a",
|
||||
"blk.1.ffn_down.weight": "cfdb79788377e5cbded8790cd41b9e66c397ecab75474071fcd7cf32d30f9613",
|
||||
"blk.1.ffn_down.bias": "bcb58315519a573097960891c9ae41cf4c685ab78c3e0e77471471758a7eae88",
|
||||
"blk.1.layer_output_norm.weight": "819b554271452bfb1d84c2603b90377b2e41a0ac1e3aa8b417ccf9dce63375bd",
|
||||
"blk.1.layer_output_norm.bias": "47a3433ac27f5ce8947fb38dd491f3706df4ef6adb0ddf74612bf0f54b19e164",
|
||||
"blk.2.attn_q.weight": "1557a9ea852b1880551f7290e00aded4f35e6c4180fdcbed1b0039bf805f639e",
|
||||
"blk.2.attn_q.bias": "c3bfe5f3066f655fd36b055530997b59ff33ef013563aaeb3cb8ff07dabd59a9",
|
||||
"blk.2.attn_k.weight": "cfd08eb69c61ae2f9f14f9b7ff5c5394ca264b1a9f3d48156677f90dd1766289",
|
||||
"blk.2.attn_k.bias": "9b839bc0e79974a0b3f5d1895972bc6f5c9a1bc16052e1af786e6a530758152d",
|
||||
"blk.2.attn_v.weight": "02b26b1208480eaeeb00e7b4cf8b690006ca14759357fc44ed4a2a8924ead993",
|
||||
"blk.2.attn_v.bias": "e7e6f0089fded1659a867ab736c220d9653ea7da6b1b94baf5c8d30a748b63ab",
|
||||
"blk.2.attn_output.weight": "a1db121c7d33806b349cadd050300a57db49fdc91224fd07c9ac43bf4299dc79",
|
||||
"blk.2.attn_output.bias": "7675128b6a92555cd955c820311e91e9417d31f48848f45d047b4100c62148b3",
|
||||
"blk.2.attn_output_norm.weight": "5b4595e0fbcba67a700c4331adf746d2fba3546364a4db5607ae241947bb1a21",
|
||||
"blk.2.attn_output_norm.bias": "7b8e16826ea30e5a2ba0b02e0095a901775981a296e98819625320e983060d08",
|
||||
"blk.2.ffn_up.weight": "a0d815d946ac07a65095c4ae4df77b818845e6d97795c7d82f55e689d944db59",
|
||||
"blk.2.ffn_up.bias": "ce37c0a4174d6bf773ded7bd016ede627ad3bdb8bc99b9992a18dc8e8898f252",
|
||||
"blk.2.ffn_down.weight": "f6231d2a25426fbd45b9f1160aa484220eb227ceef0348c4a6a6de890606e5ef",
|
||||
"blk.2.ffn_down.bias": "429e00556e8dc63a785238b309b9d83738500c1ef6d736fe6526ad88ea496d27",
|
||||
"blk.2.layer_output_norm.weight": "651457a573adf3f7dd9ee5dfe1c8e89389e94443993aab77ec6a0b05aa621e35",
|
||||
"blk.2.layer_output_norm.bias": "41fbbeda7fd89b0cef5f945ae44011c316982390401d6f75ba8c6d365e185247",
|
||||
"blk.3.attn_q.weight": "95a43f32949d2cb8d22815bb27a44abfc6665ba96221af817dfe058cb6ca72c6",
|
||||
"blk.3.attn_q.bias": "f4e34385e75d8108b6b3bd336106e2133a8c9be0cc343dfe5dc48c32a823c7cb",
|
||||
"blk.3.attn_k.weight": "6b892da6a17d4d3265265a15f695864a31813ee8c8e710ae9bc9e1adbc6c9a18",
|
||||
"blk.3.attn_k.bias": "40b8067b641a56014cee42548240aa8930820958b1933004892b5f04fbaef39e",
|
||||
"blk.3.attn_v.weight": "9fcd5922319dd2a461082a5ce040c1dfe65d87d70ca6547dd0b46eeecc3eeb2b",
|
||||
"blk.3.attn_v.bias": "b528c56212e66931fdbe267ac327a9c2f87cd03baff3ea719e30afe681da15f1",
|
||||
"blk.3.attn_output.weight": "e3b178c1b03981e75510e0d277af23ea59cc404b5394e61bd32291825719b502",
|
||||
"blk.3.attn_output.bias": "712c84d39a6a5a9c06a09da8fd9939ba0d5525524a4bba61ea4de09b48f45cae",
|
||||
"blk.3.attn_output_norm.weight": "d1ffac88e675592ff72f8a617be32b4a381d443b2f8f2645dbe44a1e5745aac0",
|
||||
"blk.3.attn_output_norm.bias": "ea31a1c73146234c50e0e43f485c458413714867b8e2703af66482f7db2d6c40",
|
||||
"blk.3.ffn_up.weight": "4ef4f3b9a1ea6ab2ef2eb6e8b008e06a44790d099d97482a05a51e39a29afac0",
|
||||
"blk.3.ffn_up.bias": "06a4296dda16f452675c51f108079fe7722552d6521c737d97734943818b9a2b",
|
||||
"blk.3.ffn_down.weight": "f114b2bebe392c7d80433bb880c6730293aa4561b0b0370dcdaf7472daebd847",
|
||||
"blk.3.ffn_down.bias": "2c8e67831d28a3bf613fc7912ae3259b63d72abcaf4d30efd8800758400158de",
|
||||
"blk.3.layer_output_norm.weight": "a1dfeb7b5a51dd56447312ca41e2ad2f361a3ea12ddc355127f5f4219fb0a482",
|
||||
"blk.3.layer_output_norm.bias": "1ed630021b25c6c6fc93fd32988b9907df966d4982a93081f639aac3044618ab",
|
||||
"blk.4.attn_q.weight": "b5fae4c1f9a5f33a2a2e816ac0c01c25f422e4efdd59ef1ed93da2610e5370fc",
|
||||
"blk.4.attn_q.bias": "c2e376524ea98ac3b10d9eee19ecb1b1e261fa5149efe0232844c923dfb428fb",
|
||||
"blk.4.attn_k.weight": "a4632f5ebf9321d9d08f9112a4e5dda2efe5671df4a4e67fee24845f5b14af16",
|
||||
"blk.4.attn_k.bias": "a9a02ffb8b8b4f6dfe487a7e0341f1d5318c9d2b793a688f34cb1b22fc66ef60",
|
||||
"blk.4.attn_v.weight": "10ad8deb81d9fa093b1e5c0f24ea82aa7df43e6aca49e260fcbea56eab8cc86a",
|
||||
"blk.4.attn_v.bias": "7326813e181e021130bd33ac136293fcffccce2d1d8cb59041e5b13a8cceacf6",
|
||||
"blk.4.attn_output.weight": "c92573088c7437c2b3cda51490e152c27fb19e5468df591eabba5a49d5398d44",
|
||||
"blk.4.attn_output.bias": "14e10b419e5859af1eb685af5c330aee67048cd704dcead9217840c6f5393222",
|
||||
"blk.4.attn_output_norm.weight": "02b6831c0e0fb0edbc579a92812a1dd972cb15d14fcd382d4427c5a7b300ac44",
|
||||
"blk.4.attn_output_norm.bias": "7eed5cd503bb6bb6ceb1bc8b07cc077903a4f14fb8b9d6cdf39644815ecf1374",
|
||||
"blk.4.ffn_up.weight": "8d0c91d62e74d6431321116a37cf3339e630bd50ba164d3304fc4fe8dd831223",
|
||||
"blk.4.ffn_up.bias": "d325f07f73c005a273c484c7be8e7abb4d6e8a5c4fd093f5869133b97629d017",
|
||||
"blk.4.ffn_down.weight": "7ba7bd81143f40537b84f938e403e19f30e4928625eb371de052b9025beb4d21",
|
||||
"blk.4.ffn_down.bias": "2853d9c2a75288214a4bf4907dc19d04d01926f4913d302b1aa7bdbfcce0f7a1",
|
||||
"blk.4.layer_output_norm.weight": "a4ed1885fa77b90fed5300c355ef0aa0c876a8c747151d9d790939d464d57d4f",
|
||||
"blk.4.layer_output_norm.bias": "62142a81e813a9e636333b2b805d6bc3b17c5e7cd4b15adce1ada6bc9a32563c",
|
||||
"blk.5.attn_q.weight": "afc1dff080a72c3daad01384b1448d476aaf789871017c8ff8e144788887995d",
|
||||
"blk.5.attn_q.bias": "748a820371c1d4f872c84545b36358d239c35bf6c99e2812c237d88c3292763b",
|
||||
"blk.5.attn_k.weight": "59e30c1ed8acd2cbb01de5f62e7804015b9ecf98ba157d98cab016344639eda5",
|
||||
"blk.5.attn_k.bias": "f839520078f9e589496e982e86d0126c7aa14196047339abffcf49a696229f77",
|
||||
"blk.5.attn_v.weight": "3e21fb874e21b90308e1f46af034a3c32d3eba1628d62ae5f2246d6af5818923",
|
||||
"blk.5.attn_v.bias": "5cd4852bf95c1444d10d756750f6bf49f842c0b39e9953c7f408bb67c325ac8c",
|
||||
"blk.5.attn_output.weight": "636ce6a7752895f204b9d01ba0aedd9a294f908b42f372c22a16d9dd590d7471",
|
||||
"blk.5.attn_output.bias": "82d924d4b0d2b94f2bbff91619216d6967a3541ce9b1531a6a60457a67b5d219",
|
||||
"blk.5.attn_output_norm.weight": "5e7bd0a8d3396080f3360d7c4700bf094a06216431bd014c4479eef72ecf4271",
|
||||
"blk.5.attn_output_norm.bias": "66c6de5edda5466d029c6753780be81ccd4218bf8bc00680000e0f06856ab712",
|
||||
"blk.5.ffn_up.weight": "5bbf6e7ea380e216e33f8bee06d25f2265359d3876a300e92bc6e41d48e33430",
|
||||
"blk.5.ffn_up.bias": "9d795388bb36fb33ad3a37fea3ccb4937838e02800a608fb47d363cd06b47370",
|
||||
"blk.5.ffn_down.weight": "2fd628974e7f075479dd227b46fbd48ae8d3ca34d735b36f391ac06410730368",
|
||||
"blk.5.ffn_down.bias": "cd213ba9eaa75fa541648097fbe9c96e58077e6c3ad6ad2fb1f21f8350f44291",
|
||||
"blk.5.layer_output_norm.weight": "159a9df41d15b7022d136f86a2a2631c4635f9816e957472217077b522bcf52a",
|
||||
"blk.5.layer_output_norm.bias": "24c1f27ffd1eb4e5be7e3a2909943e6f0980635d761fa1efdd0c19645da23766"
|
||||
}
|
6
convert/testdata/gemma-2-9b-it.json
vendored
6
convert/testdata/gemma-2-9b-it.json
vendored
|
@ -1,6 +0,0 @@
|
|||
{
|
||||
"general.architecture": "gemma2",
|
||||
"gemma2.attention.sliding_window": "4096",
|
||||
"gemma2.attn_logit_softcapping": "50",
|
||||
"gemma2.final_logit_softcapping": "30"
|
||||
}
|
|
@ -1,6 +1,7 @@
|
|||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
|
@ -10,8 +11,6 @@ import (
|
|||
"log/slog"
|
||||
"os"
|
||||
"slices"
|
||||
|
||||
"golang.org/x/exp/maps"
|
||||
)
|
||||
|
||||
const (
|
||||
|
@ -185,32 +184,32 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
|
|||
return nil, err
|
||||
}
|
||||
|
||||
tokens := make(map[int]token, len(t.Model.Vocab))
|
||||
var tokens []token
|
||||
for k, v := range t.Model.Vocab {
|
||||
tokens[v] = token{
|
||||
tokens = append(tokens, token{
|
||||
ID: v,
|
||||
Content: k,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
for _, token := range t.AddedTokens {
|
||||
token.UserDefined = true
|
||||
tokens[token.ID] = token
|
||||
for _, t := range t.AddedTokens {
|
||||
t.UserDefined = true
|
||||
tokens = append(tokens, t)
|
||||
}
|
||||
|
||||
keys := maps.Keys(tokens)
|
||||
slices.Sort(keys)
|
||||
slices.SortFunc(tokens, func(i, j token) int {
|
||||
return cmp.Compare(i.ID, j.ID)
|
||||
})
|
||||
|
||||
v := Vocabulary{Model: "gpt2"}
|
||||
for _, k := range keys {
|
||||
token := tokens[k]
|
||||
v.Tokens = append(v.Tokens, token.Content)
|
||||
v.Scores = append(v.Scores, float32(token.ID))
|
||||
for _, t := range tokens {
|
||||
v.Tokens = append(v.Tokens, t.Content)
|
||||
v.Scores = append(v.Scores, float32(t.ID))
|
||||
|
||||
switch {
|
||||
case token.Special:
|
||||
case t.Special:
|
||||
v.Types = append(v.Types, tokenTypeControl)
|
||||
case token.UserDefined:
|
||||
case t.UserDefined:
|
||||
v.Types = append(v.Types, tokenTypeUserDefined)
|
||||
default:
|
||||
v.Types = append(v.Types, tokenTypeNormal)
|
||||
|
|
|
@ -15,11 +15,6 @@ import (
|
|||
)
|
||||
|
||||
func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
|
||||
ast, err := parseAdditionalSpecialTokens(fsys)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
bts, err := fs.ReadFile(fsys, "tokenizer.model")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -42,12 +37,7 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
|
|||
sentencepiece.ModelProto_SentencePiece_BYTE:
|
||||
v.Types = append(v.Types, int32(t))
|
||||
default:
|
||||
tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL)
|
||||
if slices.Contains(ast, piece.GetPiece()) {
|
||||
tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL)
|
||||
}
|
||||
|
||||
v.Types = append(v.Types, tt)
|
||||
v.Types = append(v.Types, int32(sentencepiece.ModelProto_SentencePiece_NORMAL))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -91,23 +81,3 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) {
|
|||
|
||||
return &v, nil
|
||||
}
|
||||
|
||||
// parseAdditionalSpecialTokens reads the "additional_special_tokens" string
// array from special_tokens_map.json in fsys.
//
// A missing file is not treated as an error: the function then returns
// (nil, nil). Any other open error, or a JSON decoding error, is returned
// unchanged together with a nil slice.
func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) {
|
||||
f, err := fsys.Open("special_tokens_map.json")
|
||||
// The file is optional, so "does not exist" yields an empty result.
if errors.Is(err, os.ErrNotExist) {
|
||||
return nil, nil
|
||||
} else if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// Only the one key of interest is decoded; other keys in the file are ignored.
var m struct {
|
||||
AdditionalSpecialTokens []string `json:"additional_special_tokens"`
|
||||
}
|
||||
|
||||
if err := json.NewDecoder(f).Decode(&m); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return m.AdditionalSpecialTokens, nil
|
||||
}
|
||||
|
|
|
@ -111,10 +111,7 @@ On Windows, Ollama inherits your user and system environment variables.
|
|||
|
||||
## How do I use Ollama behind a proxy?
|
||||
|
||||
Ollama pulls models from the Internet and may require a proxy server to access the models. Use `HTTPS_PROXY` to redirect outbound requests through the proxy. Ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
|
||||
|
||||
> [!NOTE]
|
||||
> Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server.
|
||||
Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` is configured. When using either variable, ensure it is set where `ollama serve` can access its value. When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform.
|
||||
|
||||
### How do I use Ollama behind a proxy in Docker?
|
||||
|
||||
|
@ -279,4 +276,4 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit
|
|||
|
||||
## How does Ollama load models on multiple GPUs?
|
||||
|
||||
Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
|
||||
Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
|
|
@ -20,12 +20,13 @@ GPU.
|
|||
|
||||
## Manual install
|
||||
|
||||
### Download `ollama`
|
||||
### Download the `ollama` binary
|
||||
|
||||
Download and extract the Linux package:
|
||||
Ollama is distributed as a self-contained binary. Download it to a directory in your PATH:
|
||||
|
||||
```bash
|
||||
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
|
||||
sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
|
||||
sudo chmod +x /usr/bin/ollama
|
||||
```
|
||||
|
||||
### Adding Ollama as a startup service (recommended)
|
||||
|
@ -95,7 +96,8 @@ curl -fsSL https://ollama.com/install.sh | sh
|
|||
Or by downloading the ollama binary:
|
||||
|
||||
```bash
|
||||
curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr
|
||||
sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama
|
||||
sudo chmod +x /usr/bin/ollama
|
||||
```
|
||||
|
||||
## Installing specific versions
|
||||
|
|
|
@ -174,7 +174,7 @@ func RunnersDir() (p string) {
|
|||
|
||||
defer func() {
|
||||
if p == "" {
|
||||
slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
|
||||
slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'")
|
||||
}
|
||||
}()
|
||||
|
||||
|
@ -190,17 +190,17 @@ func RunnersDir() (p string) {
|
|||
}
|
||||
|
||||
var paths []string
|
||||
for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), ".."), cwd} {
|
||||
for _, root := range []string{filepath.Dir(exe), cwd} {
|
||||
paths = append(paths,
|
||||
root,
|
||||
filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
|
||||
filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
|
||||
filepath.Join(root, "windows-"+runtime.GOARCH),
|
||||
filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
|
||||
)
|
||||
}
|
||||
|
||||
// Try a few variations to improve developer experience when building from source in the local tree
|
||||
for _, path := range paths {
|
||||
candidate := filepath.Join(path, "lib", "ollama", "runners")
|
||||
candidate := filepath.Join(path, "ollama_runners")
|
||||
if _, err := os.Stat(candidate); err == nil {
|
||||
p = candidate
|
||||
break
|
||||
|
|
|
@ -54,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) {
|
|||
// Installer payload location if we're running the installed binary
|
||||
exe, err := os.Executable()
|
||||
if err == nil {
|
||||
rocmTargetDir := filepath.Join(filepath.Dir(exe), "..", "lib", "ollama")
|
||||
rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
|
||||
if rocmLibUsable(rocmTargetDir) {
|
||||
slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
|
||||
return rocmTargetDir, nil
|
||||
|
|
|
@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) {
|
|||
// Installer payload (if we're running from some other location)
|
||||
localAppData := os.Getenv("LOCALAPPDATA")
|
||||
appDir := filepath.Join(localAppData, "Programs", "Ollama")
|
||||
rocmTargetDir := filepath.Join(appDir, "..", "lib", "ollama")
|
||||
rocmTargetDir := filepath.Join(appDir, "rocm")
|
||||
if rocmLibUsable(rocmTargetDir) {
|
||||
slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
|
||||
return rocmTargetDir, nil
|
||||
|
|
|
@ -4,17 +4,9 @@ package gpu
|
|||
|
||||
import (
|
||||
"log/slog"
|
||||
"os"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
|
||||
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
|
||||
var CudaTegra string = os.Getenv("JETSON_JETPACK")
|
||||
|
||||
func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
||||
ids := []string{}
|
||||
for _, info := range gpuInfo {
|
||||
|
@ -27,38 +19,3 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
|||
}
|
||||
return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
|
||||
}
|
||||
|
||||
// cudaVariant returns the name of the CUDA runner variant to use for gpuInfo.
//
// On linux/arm64 it first tries to identify a Jetson JetPack release:
//   - if CudaTegra is non-empty, the result is "jetpack" followed by the first
//     dot-separated component of CudaTegra;
//   - otherwise, if /etc/nv_tegra_release is readable, the L4T release number
//     found in it is mapped to "jetpack5" (35) or "jetpack6" (36).
//
// When no JetPack variant is selected (other platforms, unreadable or
// unrecognized release file, unsupported L4T number) the result is "v11" if
// the GPU compute major is below 6 or the driver major is below 12, and "v12"
// otherwise.
func cudaVariant(gpuInfo CudaGPUInfo) string {
|
||||
if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
|
||||
if CudaTegra != "" {
|
||||
ver := strings.Split(CudaTegra, ".")
|
||||
// NOTE(review): strings.Split with a non-empty separator always returns at
// least one element, so this guard is always true on this path.
if len(ver) > 0 {
|
||||
return "jetpack" + ver[0]
|
||||
}
|
||||
} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
|
||||
// Looks for " R<digits> " and captures the digits as the L4T release number.
// NOTE(review): the pattern is compiled on every call.
r := regexp.MustCompile(` R(\d+) `)
|
||||
m := r.FindSubmatch(data)
|
||||
// A successful match yields exactly two entries: the full match and the capture group.
if len(m) != 2 {
|
||||
slog.Info("Unexpected format for /etc/nv_tegra_release. Set JETSON_JETPACK to select version")
|
||||
} else {
|
||||
if l4t, err := strconv.Atoi(string(m[1])); err == nil {
|
||||
// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
|
||||
// https://developer.nvidia.com/embedded/jetpack-archive
|
||||
switch l4t {
|
||||
case 35:
|
||||
return "jetpack5"
|
||||
case 36:
|
||||
return "jetpack6"
|
||||
default:
|
||||
slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: choose between the v11 and v12 variants from compute capability and driver major version.
if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 {
|
||||
return "v11"
|
||||
}
|
||||
return "v12"
|
||||
}
|
||||
|
|
72
gpu/gpu.go
72
gpu/gpu.go
|
@ -64,6 +64,10 @@ var RocmComputeMin = 9
|
|||
// TODO find a better way to detect iGPU instead of minimum memory
|
||||
const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
|
||||
|
||||
// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
|
||||
// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
|
||||
var CudaTegra string = os.Getenv("JETSON_JETPACK")
|
||||
|
||||
// Note: gpuMutex must already be held
|
||||
func initCudaHandles() *cudaHandles {
|
||||
// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
|
||||
|
@ -211,7 +215,7 @@ func GetGPUInfo() GpuInfoList {
|
|||
GpuInfo: GpuInfo{
|
||||
memInfo: mem,
|
||||
Library: "cpu",
|
||||
Variant: cpuCapability.String(),
|
||||
Variant: cpuCapability,
|
||||
ID: "0",
|
||||
},
|
||||
},
|
||||
|
@ -225,7 +229,11 @@ func GetGPUInfo() GpuInfoList {
|
|||
return GpuInfoList{cpus[0].GpuInfo}
|
||||
}
|
||||
|
||||
depPath := LibraryDir()
|
||||
// On windows we bundle the nvidia library one level above the runner dir
|
||||
depPath := ""
|
||||
if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
|
||||
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda")
|
||||
}
|
||||
|
||||
// Load ALL libraries
|
||||
cHandles = initCudaHandles()
|
||||
|
@ -261,23 +269,11 @@ func GetGPUInfo() GpuInfoList {
|
|||
gpuInfo.FreeMemory = uint64(memInfo.free)
|
||||
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
|
||||
gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
|
||||
gpuInfo.computeMajor = int(memInfo.major)
|
||||
gpuInfo.computeMinor = int(memInfo.minor)
|
||||
gpuInfo.MinimumMemory = cudaMinimumMemory
|
||||
gpuInfo.DependencyPath = depPath
|
||||
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
|
||||
gpuInfo.DriverMajor = driverMajor
|
||||
gpuInfo.DriverMinor = driverMinor
|
||||
variant := cudaVariant(gpuInfo)
|
||||
if depPath != "" {
|
||||
gpuInfo.DependencyPath = depPath
|
||||
// Check for variant specific directory
|
||||
if variant != "" {
|
||||
if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
|
||||
gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant)
|
||||
}
|
||||
}
|
||||
}
|
||||
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
|
||||
gpuInfo.Variant = variant
|
||||
|
||||
// query the management library as well so we can record any skew between the two
|
||||
// which represents overhead on the GPU we must set aside on subsequent updates
|
||||
|
@ -310,6 +306,13 @@ func GetGPUInfo() GpuInfoList {
|
|||
if envconfig.IntelGPU() {
|
||||
oHandles = initOneAPIHandles()
|
||||
if oHandles != nil && oHandles.oneapi != nil {
|
||||
|
||||
// On windows we bundle the oneapi library one level above the runner dir
|
||||
depPath = ""
|
||||
if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
|
||||
depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi")
|
||||
}
|
||||
|
||||
for d := range oHandles.oneapi.num_drivers {
|
||||
if oHandles.oneapi == nil {
|
||||
// shouldn't happen
|
||||
|
@ -464,12 +467,10 @@ func GetGPUInfo() GpuInfoList {
|
|||
func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
|
||||
// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
|
||||
var ldPaths []string
|
||||
var patterns []string
|
||||
gpuLibPaths := []string{}
|
||||
slog.Debug("Searching for GPU library", "name", baseLibName)
|
||||
|
||||
// Start with our bundled libraries
|
||||
patterns := []string{filepath.Join(LibraryDir(), baseLibName)}
|
||||
|
||||
switch runtime.GOOS {
|
||||
case "windows":
|
||||
ldPaths = strings.Split(os.Getenv("PATH"), ";")
|
||||
|
@ -478,14 +479,13 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
|
|||
default:
|
||||
return gpuLibPaths
|
||||
}
|
||||
|
||||
// Then with whatever we find in the PATH/LD_LIBRARY_PATH
|
||||
// Start with whatever we find in the PATH/LD_LIBRARY_PATH
|
||||
for _, ldPath := range ldPaths {
|
||||
d, err := filepath.Abs(ldPath)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
patterns = append(patterns, filepath.Join(d, baseLibName))
|
||||
patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
|
||||
}
|
||||
patterns = append(patterns, defaultPatterns...)
|
||||
slog.Debug("gpu library search", "globs", patterns)
|
||||
|
@ -641,31 +641,3 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
|
|||
return "", ""
|
||||
}
|
||||
}
|
||||
|
||||
// LibraryDir returns the directory that holds the bundled GPU dependency
// libraries, or "" (after logging a warning) when none can be found.
//
// Three roots are probed in order: the directory of the running executable,
// its parent directory, and the current working directory. Under each root the
// candidates are, in order:
//
//	<root>/lib/ollama
//	<root>/<GOOS>-<GOARCH>/lib/ollama
//	<root>/dist/<GOOS>-<GOARCH>/lib/ollama
//
// The first candidate for which os.Stat succeeds is returned.
func LibraryDir() string {
|
||||
// On Windows/linux we bundle the dependencies at the same level as the executable
|
||||
// Failures here are only logged; with an empty appExe, filepath.Dir returns ".".
appExe, err := os.Executable()
|
||||
if err != nil {
|
||||
slog.Warn("failed to lookup executable path", "error", err)
|
||||
}
|
||||
cwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
slog.Warn("failed to lookup working directory", "error", err)
|
||||
}
|
||||
// Scan for any of our dependencies, and pick the first match
|
||||
for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), ".."), cwd} {
|
||||
libDep := filepath.Join("lib", "ollama")
|
||||
if _, err := os.Stat(filepath.Join(root, libDep)); err == nil {
|
||||
return filepath.Join(root, libDep)
|
||||
}
|
||||
// Developer mode, local build
|
||||
if _, err := os.Stat(filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
|
||||
return filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)
|
||||
}
|
||||
if _, err := os.Stat(filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
|
||||
return filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)
|
||||
}
|
||||
}
|
||||
slog.Warn("unable to locate gpu dependency libraries")
|
||||
return ""
|
||||
}
|
||||
|
|
|
@ -25,7 +25,7 @@ func GetGPUInfo() GpuInfoList {
|
|||
return []GpuInfo{
|
||||
{
|
||||
Library: "cpu",
|
||||
Variant: GetCPUCapability().String(),
|
||||
Variant: GetCPUCapability(),
|
||||
memInfo: mem,
|
||||
},
|
||||
}
|
||||
|
@ -48,7 +48,7 @@ func GetCPUInfo() GpuInfoList {
|
|||
return []GpuInfo{
|
||||
{
|
||||
Library: "cpu",
|
||||
Variant: GetCPUCapability().String(),
|
||||
Variant: GetCPUCapability(),
|
||||
memInfo: mem,
|
||||
},
|
||||
}
|
||||
|
|
|
@ -47,7 +47,7 @@ var (
|
|||
CudartMgmtName = "libcudart.so*"
|
||||
NvcudaMgmtName = "libcuda.so*"
|
||||
NvmlMgmtName = "" // not currently wired on linux
|
||||
OneapiMgmtName = "libze_intel_gpu.so*"
|
||||
OneapiMgmtName = "libze_intel_gpu.so"
|
||||
)
|
||||
|
||||
func GetCPUMem() (memInfo, error) {
|
||||
|
|
|
@ -32,29 +32,4 @@ func TestCPUMemInfo(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
// TestByLibrary checks how many groups GpuInfoList.ByLibrary returns for a set
// of inputs. Per the table below, entries sharing the same Library and Variant
// are expected to land in one group, while the same Library with different
// Variants is expected to produce separate groups.
func TestByLibrary(t *testing.T) {
|
||||
// testCase pairs an input list with the expected number of resulting groups.
type testCase struct {
|
||||
input []GpuInfo
|
||||
expect int
|
||||
}
|
||||
|
||||
testCases := map[string]*testCase{
|
||||
"empty": {input: []GpuInfo{}, expect: 0},
|
||||
"cpu": {input: []GpuInfo{{Library: "cpu"}}, expect: 1},
|
||||
"cpu + GPU": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2},
|
||||
"cpu + 2 GPU no variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, expect: 2},
|
||||
"cpu + 2 GPU same variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, expect: 2},
|
||||
"cpu + 2 GPU diff variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, expect: 3},
|
||||
}
|
||||
|
||||
// Each case runs as a named subtest; only the number of groups is asserted, not their contents.
for k, v := range testCases {
|
||||
t.Run(k, func(t *testing.T) {
|
||||
resp := (GpuInfoList)(v.input).ByLibrary()
|
||||
if len(resp) != v.expect {
|
||||
t.Fatalf("expected length %d, got %d => %+v", v.expect, len(resp), resp)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TODO - add some logic to figure out card type through other means and actually verify we got back what we expected
|
||||
|
|
15
gpu/types.go
15
gpu/types.go
|
@ -19,7 +19,7 @@ type GpuInfo struct {
|
|||
Library string `json:"library,omitempty"`
|
||||
|
||||
// Optional variant to select (e.g. versions, cpu feature flags)
|
||||
Variant string `json:"variant"`
|
||||
Variant CPUCapability `json:"variant"`
|
||||
|
||||
// MinimumMemory represents the minimum memory required to use the GPU
|
||||
MinimumMemory uint64 `json:"-"`
|
||||
|
@ -53,10 +53,8 @@ type CPUInfo struct {
|
|||
|
||||
type CudaGPUInfo struct {
|
||||
GpuInfo
|
||||
OSOverhead uint64 // Memory overhead between the driver library and management library
|
||||
index int //nolint:unused,nolintlint
|
||||
computeMajor int //nolint:unused,nolintlint
|
||||
computeMinor int //nolint:unused,nolintlint
|
||||
OSOverhead uint64 // Memory overhead between the driver library and management library
|
||||
index int //nolint:unused,nolintlint
|
||||
}
|
||||
type CudaGPUInfoList []CudaGPUInfo
|
||||
|
||||
|
@ -83,8 +81,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
|
|||
for _, info := range l {
|
||||
found := false
|
||||
requested := info.Library
|
||||
if info.Variant != CPUCapabilityNone.String() {
|
||||
requested += "_" + info.Variant
|
||||
if info.Variant != CPUCapabilityNone {
|
||||
requested += "_" + info.Variant.String()
|
||||
}
|
||||
for i, lib := range libs {
|
||||
if lib == requested {
|
||||
|
@ -94,7 +92,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
|
|||
}
|
||||
}
|
||||
if !found {
|
||||
libs = append(libs, requested)
|
||||
libs = append(libs, info.Library)
|
||||
resp = append(resp, []GpuInfo{info})
|
||||
}
|
||||
}
|
||||
|
@ -107,7 +105,6 @@ func (l GpuInfoList) LogDetails() {
|
|||
slog.Info("inference compute",
|
||||
"id", g.ID,
|
||||
"library", g.Library,
|
||||
"variant", g.Variant,
|
||||
"compute", g.Compute,
|
||||
"driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor),
|
||||
"name", g.Name,
|
||||
|
|
|
@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
|
|||
t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
|
||||
}
|
||||
|
||||
if res.PromptEvalCount != 6 {
|
||||
t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
|
||||
if res.PromptEvalCount != 8 {
|
||||
t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
|
|||
t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
|
||||
}
|
||||
|
||||
if res.PromptEvalCount != 12 {
|
||||
t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
|
||||
if res.PromptEvalCount != 16 {
|
||||
t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
3
llm/ext_server/CMakeLists.txt
vendored
3
llm/ext_server/CMakeLists.txt
vendored
|
@ -1,13 +1,12 @@
|
|||
set(TARGET ollama_llama_server)
|
||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
||||
set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_compile_definitions(${TARGET} PRIVATE
|
||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||
)
|
||||
target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
|
||||
target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
|
||||
if (WIN32)
|
||||
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
||||
endif()
|
||||
|
|
8
llm/ext_server/server.cpp
vendored
8
llm/ext_server/server.cpp
vendored
|
@ -1429,13 +1429,7 @@ struct llama_server_context
|
|||
switch (task.type)
|
||||
{
|
||||
case TASK_TYPE_COMPLETION: {
|
||||
server_slot *slot = nullptr;
|
||||
if (task.embedding_mode) {
|
||||
// Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
|
||||
slot = slots[0].available() ? &slots[0] : nullptr;
|
||||
} else {
|
||||
slot = prefix_slot(task.data["prompt"]);
|
||||
}
|
||||
server_slot *slot = prefix_slot(task.data["prompt"]);
|
||||
if (slot == nullptr)
|
||||
{
|
||||
// if no slot is available, we defer this task for processing later
|
||||
|
|
|
@ -9,14 +9,11 @@ init_vars() {
|
|||
ARCH="arm64"
|
||||
;;
|
||||
*)
|
||||
echo "GOARCH must be set"
|
||||
echo "this script is meant to be run from within go generate"
|
||||
exit 1
|
||||
;;
|
||||
ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
|
||||
esac
|
||||
|
||||
LLAMACPP_DIR=../llama.cpp
|
||||
CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on"
|
||||
CMAKE_DEFS=""
|
||||
CMAKE_TARGETS="--target ollama_llama_server"
|
||||
if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
|
||||
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
|
||||
|
@ -30,7 +27,6 @@ init_vars() {
|
|||
WHOLE_ARCHIVE="-Wl,-force_load"
|
||||
NO_WHOLE_ARCHIVE=""
|
||||
GCC_ARCH="-arch ${ARCH}"
|
||||
DIST_BASE=../../dist/darwin-${GOARCH}/
|
||||
;;
|
||||
"Linux")
|
||||
LIB_EXT="so"
|
||||
|
@ -39,7 +35,6 @@ init_vars() {
|
|||
|
||||
# Cross compiling not supported on linux - Use docker
|
||||
GCC_ARCH=""
|
||||
DIST_BASE=../../dist/linux-${GOARCH}/
|
||||
;;
|
||||
*)
|
||||
;;
|
||||
|
@ -47,7 +42,6 @@ init_vars() {
|
|||
if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
|
||||
CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
|
||||
fi
|
||||
GZIP=$(which pigz 2>/dev/null || echo "gzip")
|
||||
}
|
||||
|
||||
git_module_setup() {
|
||||
|
@ -91,36 +85,26 @@ build() {
|
|||
|
||||
compress() {
|
||||
echo "Compressing payloads to reduce overall binary size..."
|
||||
pids=""
|
||||
rm -rf ${BUILD_DIR}/bin/*.gz
|
||||
for f in ${BUILD_DIR}/bin/* ; do
|
||||
${GZIP} -n --best -f ${f} &
|
||||
compress_pids+=" $!"
|
||||
gzip -n --best -f ${f} &
|
||||
pids+=" $!"
|
||||
done
|
||||
# check for lib directory
|
||||
if [ -d ${BUILD_DIR}/lib ]; then
|
||||
for f in ${BUILD_DIR}/lib/* ; do
|
||||
${GZIP} -n --best -f ${f} &
|
||||
compress_pids+=" $!"
|
||||
gzip -n --best -f ${f} &
|
||||
pids+=" $!"
|
||||
done
|
||||
fi
|
||||
echo
|
||||
}
|
||||
|
||||
wait_for_compress() {
|
||||
for pid in ${compress_pids}; do
|
||||
for pid in ${pids}; do
|
||||
wait $pid
|
||||
done
|
||||
echo "Finished compression"
|
||||
}
|
||||
|
||||
install() {
|
||||
echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
|
||||
for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do
|
||||
rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
|
||||
cp -af "${lib}" "${BUILD_DIR}/bin/"
|
||||
done
|
||||
}
|
||||
|
||||
# Keep the local tree clean after we're done with the build
|
||||
cleanup() {
|
||||
(cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt)
|
||||
|
|
|
@ -6,7 +6,6 @@
|
|||
|
||||
set -ex
|
||||
set -o pipefail
|
||||
compress_pids=""
|
||||
echo "Starting darwin generate script"
|
||||
source $(dirname $0)/gen_common.sh
|
||||
init_vars
|
||||
|
@ -99,5 +98,4 @@ case "${GOARCH}" in
|
|||
esac
|
||||
|
||||
cleanup
|
||||
wait_for_compress
|
||||
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
|
||||
|
|
|
@ -13,7 +13,6 @@
|
|||
|
||||
set -ex
|
||||
set -o pipefail
|
||||
compress_pids=""
|
||||
|
||||
# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
|
||||
amdGPUs() {
|
||||
|
@ -52,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
|
|||
export CUDACXX=$(command -v nvcc)
|
||||
fi
|
||||
fi
|
||||
COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
|
||||
COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
|
||||
source $(dirname $0)/gen_common.sh
|
||||
init_vars
|
||||
git_module_setup
|
||||
|
@ -78,11 +77,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
|
|||
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
|
||||
init_vars
|
||||
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
|
||||
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
|
||||
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/linux/${ARCH}/cpu"
|
||||
echo "Building custom CPU"
|
||||
build
|
||||
install
|
||||
compress
|
||||
else
|
||||
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
|
||||
|
@ -95,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
|
|||
# -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
|
||||
# -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake
|
||||
|
||||
COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
|
||||
COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
|
||||
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
|
||||
#
|
||||
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
|
||||
|
@ -105,7 +103,6 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
|
|||
BUILD_DIR="../build/linux/${ARCH}/cpu"
|
||||
echo "Building LCD CPU"
|
||||
build
|
||||
install
|
||||
compress
|
||||
fi
|
||||
|
||||
|
@ -123,7 +120,6 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
|
|||
BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
|
||||
echo "Building AVX CPU"
|
||||
build
|
||||
install
|
||||
compress
|
||||
fi
|
||||
|
||||
|
@ -137,7 +133,6 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
|
|||
BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
|
||||
echo "Building AVX2 CPU"
|
||||
build
|
||||
install
|
||||
compress
|
||||
fi
|
||||
fi
|
||||
|
@ -165,7 +160,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
|
|||
echo "CUDA libraries detected - building dynamic CUDA library"
|
||||
init_vars
|
||||
CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
|
||||
if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then
|
||||
if [ -n "${CUDA_MAJOR}" ]; then
|
||||
CUDA_VARIANT=_v${CUDA_MAJOR}
|
||||
fi
|
||||
if [ "${ARCH}" == "arm64" ]; then
|
||||
|
@ -183,19 +178,29 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
|
|||
CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
|
||||
echo "Building custom CUDA GPU"
|
||||
else
|
||||
CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
|
||||
CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
|
||||
fi
|
||||
export CUDAFLAGS="-t8"
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
|
||||
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
|
||||
export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
|
||||
CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
|
||||
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
|
||||
build
|
||||
install
|
||||
echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
|
||||
mkdir -p "${CUDA_DIST_DIR}"
|
||||
for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
|
||||
cp -a "${lib}" "${CUDA_DIST_DIR}"
|
||||
|
||||
# Carry the CUDA libs as payloads to help reduce dependency burden on users
|
||||
#
|
||||
# TODO - in the future we may shift to packaging these separately and conditionally
|
||||
# downloading them in the install script.
|
||||
DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )"
|
||||
for lib in libcudart.so libcublas.so libcublasLt.so ; do
|
||||
DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
|
||||
if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
|
||||
cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/"
|
||||
elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
|
||||
cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/"
|
||||
elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
|
||||
cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/"
|
||||
else
|
||||
cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/"
|
||||
fi
|
||||
done
|
||||
compress
|
||||
|
||||
|
@ -213,24 +218,21 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
|
|||
CC=icx
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
|
||||
BUILD_DIR="../build/linux/${ARCH}/oneapi"
|
||||
ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
|
||||
export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
|
||||
EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
|
||||
DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
|
||||
build
|
||||
|
||||
# copy oneAPI dependencies
|
||||
mkdir -p "${ONEAPI_DIST_DIR}"
|
||||
for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do
|
||||
cp -a "${dep}" "${ONEAPI_DIST_DIR}"
|
||||
cp "${dep}" "${BUILD_DIR}/bin/"
|
||||
done
|
||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}"
|
||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}"
|
||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}"
|
||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}"
|
||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}"
|
||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
|
||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
|
||||
install
|
||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/"
|
||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/"
|
||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/"
|
||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/"
|
||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/"
|
||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${BUILD_DIR}/bin/"
|
||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${BUILD_DIR}/bin/"
|
||||
compress
|
||||
fi
|
||||
|
||||
|
@ -252,7 +254,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
|
|||
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
|
||||
fi
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DGGML_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
|
||||
# Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
|
||||
if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
|
||||
echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
|
||||
|
@ -260,22 +262,23 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
|
|||
echo "Building custom ROCM GPU"
|
||||
fi
|
||||
BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
|
||||
# ROCm dependencies are too large to fit into a unified bundle
|
||||
ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
|
||||
# TODO figure out how to disable runpath (rpath)
|
||||
# export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work
|
||||
export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
|
||||
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
|
||||
build
|
||||
|
||||
# copy the ROCM dependencies
|
||||
mkdir -p "${ROCM_DIST_DIR}"
|
||||
for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do
|
||||
cp -a "${dep}"* "${ROCM_DIST_DIR}"
|
||||
# Record the ROCM dependencies
|
||||
rm -f "${BUILD_DIR}/bin/deps.txt"
|
||||
touch "${BUILD_DIR}/bin/deps.txt"
|
||||
for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
|
||||
echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt"
|
||||
done
|
||||
install
|
||||
# bomb out if for some reason we didn't get a few deps
|
||||
if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then
|
||||
cat "${BUILD_DIR}/bin/deps.txt"
|
||||
echo "ERROR: deps file short"
|
||||
exit 1
|
||||
fi
|
||||
compress
|
||||
fi
|
||||
|
||||
cleanup
|
||||
wait_for_compress
|
||||
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
|
||||
|
|
|
@ -35,7 +35,7 @@ function init_vars {
|
|||
)
|
||||
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
|
||||
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
|
||||
$script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners"
|
||||
$script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
|
||||
md "$script:DIST_BASE" -ea 0 > $null
|
||||
if ($env:CGO_CFLAGS -contains "-g") {
|
||||
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
|
||||
|
@ -117,7 +117,7 @@ function build {
|
|||
if ($cmakeDefs -contains "-G") {
|
||||
$extra=@("-j8")
|
||||
} else {
|
||||
$extra= @("--", "/maxCpuCount:8")
|
||||
$extra= @("--", "/p:CL_MPcount=8")
|
||||
}
|
||||
write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra"
|
||||
& cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra
|
||||
|
@ -261,7 +261,7 @@ function build_cuda() {
|
|||
if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
|
||||
# Then build cuda as a dynamically loaded library
|
||||
$nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
|
||||
$script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0]
|
||||
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
|
||||
if ($null -ne $script:CUDA_VERSION) {
|
||||
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
|
||||
}
|
||||
|
@ -273,9 +273,9 @@ function build_cuda() {
|
|||
"-DGGML_CUDA=ON",
|
||||
"-DGGML_AVX=on",
|
||||
"-DGGML_AVX2=off",
|
||||
"-DCMAKE_CUDA_FLAGS=-t6",
|
||||
"-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}",
|
||||
"-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH"
|
||||
"-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR",
|
||||
"-DCMAKE_CUDA_FLAGS=-t8",
|
||||
"-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}"
|
||||
)
|
||||
if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
|
||||
write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
|
||||
|
@ -286,11 +286,12 @@ function build_cuda() {
|
|||
sign
|
||||
install
|
||||
|
||||
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
|
||||
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
||||
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
||||
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
||||
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
||||
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
|
||||
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null
|
||||
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
|
||||
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
|
||||
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
|
||||
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\"
|
||||
} else {
|
||||
write-host "Skipping CUDA generation step"
|
||||
}
|
||||
|
@ -324,17 +325,18 @@ function build_oneapi() {
|
|||
sign
|
||||
install
|
||||
|
||||
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
||||
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
||||
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
||||
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
||||
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\"
|
||||
} else {
|
||||
Write-Host "Skipping oneAPI generation step"
|
||||
}
|
||||
|
@ -355,7 +357,7 @@ function build_rocm() {
|
|||
"-DCMAKE_C_COMPILER=clang.exe",
|
||||
"-DCMAKE_CXX_COMPILER=clang++.exe",
|
||||
"-DGGML_HIPBLAS=on",
|
||||
"-DGGML_CUDA_NO_PEER_COPY=on",
|
||||
"-DLLAMA_CUDA_NO_PEER_COPY=on",
|
||||
"-DHIP_PLATFORM=amd",
|
||||
"-DGGML_AVX=on",
|
||||
"-DGGML_AVX2=off",
|
||||
|
@ -384,11 +386,12 @@ function build_rocm() {
|
|||
sign
|
||||
install
|
||||
|
||||
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null
|
||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\"
|
||||
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
|
||||
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
|
||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
|
||||
# amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
|
||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
|
||||
} else {
|
||||
write-host "Skipping ROCm generation step"
|
||||
}
|
||||
|
|
|
@ -43,14 +43,6 @@ func (kv KV) Architecture() string {
|
|||
return "unknown"
|
||||
}
|
||||
|
||||
func (kv KV) Kind() string {
|
||||
if s, ok := kv["general.type"].(string); ok {
|
||||
return s
|
||||
}
|
||||
|
||||
return "unknown"
|
||||
}
|
||||
|
||||
func (kv KV) ParameterCount() uint64 {
|
||||
return kv.u64("general.parameter_count")
|
||||
}
|
||||
|
|
|
@ -33,6 +33,7 @@ func TestEstimateGPULayers(t *testing.T) {
|
|||
assert.Len(t, tensors, inputLayerCount+1)
|
||||
err = WriteGGUF(f, KV{
|
||||
"general.architecture": "llama",
|
||||
"general.name": "name",
|
||||
"llama.context_length": uint32(32),
|
||||
"llama.embedding_length": uint32(4096),
|
||||
"llama.block_count": uint32(inputLayerCount),
|
||||
|
|
60
llm/patches/08-pooling.diff
Normal file
60
llm/patches/08-pooling.diff
Normal file
|
@ -0,0 +1,60 @@
|
|||
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||
index 721b8f4e..cfe7ac40 100644
|
||||
--- a/src/llama.cpp
|
||||
+++ b/src/llama.cpp
|
||||
@@ -8420,14 +8420,14 @@ struct llm_build_context {
|
||||
}
|
||||
|
||||
struct ggml_tensor * build_inp_mean() {
|
||||
- lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
|
||||
+ lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max);
|
||||
cb(lctx.inp_mean, "inp_mean", -1);
|
||||
ggml_set_input(lctx.inp_mean);
|
||||
return lctx.inp_mean;
|
||||
}
|
||||
|
||||
struct ggml_tensor * build_inp_cls() {
|
||||
- lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
+ lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max);
|
||||
cb(lctx.inp_cls, "inp_cls", -1);
|
||||
ggml_set_input(lctx.inp_cls);
|
||||
return lctx.inp_cls;
|
||||
@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
|
||||
|
||||
float * data = (float *) lctx.inp_mean->data;
|
||||
- memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
|
||||
+ memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean));
|
||||
|
||||
std::vector<uint64_t> sum(n_tokens, 0);
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
const llama_seq_id seq_id = batch.seq_id[i][0];
|
||||
-
|
||||
- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
|
||||
-
|
||||
sum[seq_id] += 1;
|
||||
}
|
||||
|
||||
- std::vector<float> div(n_tokens, 0.0f);
|
||||
- for (int i = 0; i < n_tokens; ++i) {
|
||||
+ std::vector<float> div(cparams.n_seq_max, 0.0f);
|
||||
+ for (uint32_t i = 0; i < cparams.n_seq_max; ++i) {
|
||||
const uint64_t s = sum[i];
|
||||
if (s > 0) {
|
||||
div[i] = 1.0f/float(s);
|
||||
@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
|
||||
|
||||
uint32_t * data = (uint32_t *) lctx.inp_cls->data;
|
||||
- memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
|
||||
+ memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls));
|
||||
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
const llama_seq_id seq_id = batch.seq_id[i][0];
|
||||
const llama_pos pos = batch.pos[i];
|
||||
-
|
||||
- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
|
||||
-
|
||||
if (pos == 0) {
|
||||
data[seq_id] = i;
|
||||
}
|
|
@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
|
|||
// glob workDir for files that start with ollama_
|
||||
availableServers := getAvailableServers()
|
||||
requested := info.Library
|
||||
if info.Variant != gpu.CPUCapabilityNone.String() {
|
||||
requested += "_" + info.Variant
|
||||
if info.Variant != gpu.CPUCapabilityNone {
|
||||
requested += "_" + info.Variant.String()
|
||||
}
|
||||
|
||||
servers := []string{}
|
||||
|
|
|
@ -258,7 +258,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||
params = append(params, "--mlock")
|
||||
}
|
||||
|
||||
if gpu.IsNUMA() && gpus[0].Library == "cpu" {
|
||||
if gpu.IsNUMA() {
|
||||
numaMode := "distribute"
|
||||
if runtime.GOOS == "linux" {
|
||||
if _, err := exec.LookPath("numactl"); err == nil {
|
||||
|
@ -306,18 +306,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||
if runtime.GOOS == "windows" {
|
||||
pathEnv = "PATH"
|
||||
}
|
||||
// Start with the server directory for the LD_LIBRARY_PATH/PATH
|
||||
libraryPaths := []string{dir}
|
||||
// prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies
|
||||
libraryPaths := []string{dir, filepath.Dir(dir)}
|
||||
|
||||
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
|
||||
// favor our bundled library dependencies over system libraries
|
||||
// Append our runner directory to the path
|
||||
// This will favor system libraries over our bundled library dependencies
|
||||
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
|
||||
}
|
||||
|
||||
// Note: we always put the dependency path first
|
||||
// since this was the exact version we compiled/linked against
|
||||
// since this was the exact version we verified for AMD GPUs
|
||||
// and we favor what the user had in their path
|
||||
if gpus[0].DependencyPath != "" {
|
||||
// assume gpus from the same library have the same dependency path
|
||||
// TODO refine for multi-gpu support
|
||||
libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
|
||||
}
|
||||
|
||||
|
|
|
@ -4,7 +4,6 @@ set -eu
|
|||
|
||||
export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
|
||||
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
|
||||
GZIP=$(which pigz 2>/dev/null || echo "gzip")
|
||||
|
||||
BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
|
||||
export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""}
|
||||
|
@ -22,16 +21,11 @@ for TARGETARCH in ${BUILD_ARCH}; do
|
|||
-t builder:$TARGETARCH \
|
||||
.
|
||||
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
|
||||
rm -rf ./dist/linux-$TARGETARCH
|
||||
docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist
|
||||
if echo ${TARGETARCH} | grep "amd64" > /dev/null; then
|
||||
docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist
|
||||
docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/ollama ./dist/ollama-linux-$TARGETARCH
|
||||
|
||||
if [ "$TARGETARCH" = "amd64" ]; then
|
||||
docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/deps/ ./dist/
|
||||
fi
|
||||
|
||||
docker rm builder-$TARGETARCH
|
||||
echo "Compressing final linux bundle..."
|
||||
rm -f ./dist/ollama-linux-$TARGETARCH.tgz
|
||||
(cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz )
|
||||
if [ -d dist/linux-$TARGETARCH-rocm ]; then
|
||||
(cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz )
|
||||
fi
|
||||
done
|
||||
|
|
|
@ -7,7 +7,6 @@
|
|||
$ErrorActionPreference = "Stop"
|
||||
|
||||
function checkEnv() {
|
||||
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
|
||||
$script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower()
|
||||
Write-host "Building for ${script:TARGET_ARCH}"
|
||||
write-host "Locating required tools and paths"
|
||||
|
@ -16,23 +15,26 @@ function checkEnv() {
|
|||
$MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation
|
||||
$env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0]
|
||||
}
|
||||
# Locate CUDA versions
|
||||
# Note: this assumes every version found will be built
|
||||
$cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
|
||||
if ($cudaList.length -eq 0) {
|
||||
# Try to find the CUDA dir
|
||||
if ($null -eq $env:NVIDIA_DIR) {
|
||||
$d=(get-command -ea 'silentlycontinue' nvcc).path
|
||||
if ($null -ne $d) {
|
||||
$script:CUDA_DIRS=@($d| split-path -parent)
|
||||
if ($d -ne $null) {
|
||||
$script:NVIDIA_DIR=($d| split-path -parent)
|
||||
} else {
|
||||
$cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue')
|
||||
if ($cudaList.length > 0) {
|
||||
$script:NVIDIA_DIR=$cudaList[0]
|
||||
}
|
||||
}
|
||||
} else {
|
||||
$script:CUDA_DIRS=$cudaList
|
||||
$script:NVIDIA_DIR=$env:NVIDIA_DIR
|
||||
}
|
||||
|
||||
$script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0]
|
||||
|
||||
$script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}"
|
||||
$env:CGO_ENABLED="1"
|
||||
Write-Output "Checking version"
|
||||
echo "Checking version"
|
||||
if (!$env:VERSION) {
|
||||
$data=(git describe --tags --first-parent --abbrev=7 --long --dirty --always)
|
||||
$pattern="v(.+)"
|
||||
|
@ -69,48 +71,7 @@ function checkEnv() {
|
|||
function buildOllama() {
|
||||
write-host "Building ollama CLI"
|
||||
if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
|
||||
Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
|
||||
|
||||
# TODO - consider trying to parallelize this with Start-ThreadJob, but env vars can't be used to toggle
|
||||
# which targets to build
|
||||
|
||||
# Start by skipping CUDA to build everything else
|
||||
pwsh -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... }
|
||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
||||
|
||||
# Then skip everything else and build all the CUDA variants
|
||||
foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) {
|
||||
write-host "Building CUDA ${env:CUDA_LIB_DIR}"
|
||||
|
||||
if ($env:CUDA_LIB_DIR.Contains("v12")) {
|
||||
pwsh -Command {
|
||||
$env:OLLAMA_SKIP_CUDA_GENERATE=""
|
||||
$env:OLLAMA_SKIP_STATIC_GENERATE="1"
|
||||
$env:OLLAMA_SKIP_CPU_GENERATE="1"
|
||||
$env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
|
||||
$env:OLLAMA_SKIP_ROCM_GENERATE="1"
|
||||
$env:CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
|
||||
$env:OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on"
|
||||
$env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
|
||||
$env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
|
||||
& go generate ./...
|
||||
}
|
||||
} else {
|
||||
pwsh -Command {
|
||||
$env:OLLAMA_SKIP_CUDA_GENERATE=""
|
||||
$env:OLLAMA_SKIP_STATIC_GENERATE="1"
|
||||
$env:OLLAMA_SKIP_CPU_GENERATE="1"
|
||||
$env:OLLAMA_SKIP_ONEAPI_GENERATE="1"
|
||||
$env:OLLAMA_SKIP_ROCM_GENERATE="1"
|
||||
$env:CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
|
||||
$env:OLLAMA_CUSTOM_CUDA_DEFS=""
|
||||
$env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent
|
||||
$env:PATH="$envs:CUDA_LIB_DIR;$env:PATH"
|
||||
& go generate ./...
|
||||
}
|
||||
}
|
||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
||||
}
|
||||
& go generate ./...
|
||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
||||
} else {
|
||||
write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
|
||||
|
@ -122,8 +83,8 @@ function buildOllama() {
|
|||
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe
|
||||
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
|
||||
}
|
||||
New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\bin\ -Force
|
||||
cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\bin\
|
||||
New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force
|
||||
cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\
|
||||
}
|
||||
|
||||
function buildApp() {
|
||||
|
@ -142,22 +103,22 @@ function buildApp() {
|
|||
function gatherDependencies() {
|
||||
write-host "Gathering runtime dependencies"
|
||||
cd "${script:SRC_DIR}"
|
||||
md "${script:DEPS_DIR}\lib\ollama" -ea 0 > $null
|
||||
md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null
|
||||
|
||||
# TODO - this varies based on host build system and MSVC version - drive from dumpbin output
|
||||
# currently works for Win11 + MSVC 2019 + Cuda V11
|
||||
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\lib\ollama\"
|
||||
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\lib\ollama\"
|
||||
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\lib\ollama\"
|
||||
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_runners\"
|
||||
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\"
|
||||
cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\"
|
||||
foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) {
|
||||
cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\lib\ollama\"
|
||||
cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_runners\"
|
||||
}
|
||||
|
||||
|
||||
cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\"
|
||||
if ("${env:KEY_CONTAINER}") {
|
||||
write-host "about to sign"
|
||||
foreach ($file in (get-childitem "${script:DEPS_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
|
||||
foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){
|
||||
write-host "signing $file"
|
||||
& "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
|
||||
/csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file
|
||||
|
|
|
@ -63,36 +63,16 @@ if [ -n "$NEEDS" ]; then
|
|||
exit 1
|
||||
fi
|
||||
|
||||
status "Downloading ollama..."
|
||||
curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
|
||||
|
||||
for BINDIR in /usr/local/bin /usr/bin /bin; do
|
||||
echo $PATH | grep -q $BINDIR && break || continue
|
||||
done
|
||||
OLLAMA_INSTALL_DIR=$(dirname ${BINDIR})
|
||||
|
||||
status "Installing ollama to $OLLAMA_INSTALL_DIR"
|
||||
status "Installing ollama to $BINDIR..."
|
||||
$SUDO install -o0 -g0 -m755 -d $BINDIR
|
||||
$SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR"
|
||||
if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" >/dev/null ; then
|
||||
status "Downloading Linux ${ARCH} bundle"
|
||||
curl --fail --show-error --location --progress-bar \
|
||||
"https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \
|
||||
$SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
|
||||
BUNDLE=1
|
||||
if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then
|
||||
status "Making ollama accessible in the PATH in $BINDIR"
|
||||
$SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
|
||||
fi
|
||||
else
|
||||
status "Downloading Linux ${ARCH} CLI"
|
||||
curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\
|
||||
"https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
|
||||
$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama
|
||||
BUNDLE=0
|
||||
if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then
|
||||
status "Making ollama accessible in the PATH in $BINDIR"
|
||||
$SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
|
||||
fi
|
||||
fi
|
||||
|
||||
$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama
|
||||
|
||||
install_success() {
|
||||
status 'The Ollama API is now available at 127.0.0.1:11434.'
|
||||
|
@ -198,16 +178,6 @@ if ! check_gpu lspci nvidia && ! check_gpu lshw nvidia && ! check_gpu lspci amdg
|
|||
fi
|
||||
|
||||
if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
|
||||
if [ $BUNDLE -ne 0 ]; then
|
||||
status "Downloading Linux ROCm ${ARCH} bundle"
|
||||
curl --fail --show-error --location --progress-bar \
|
||||
"https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \
|
||||
$SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
|
||||
|
||||
install_success
|
||||
status "AMD GPU ready."
|
||||
exit 0
|
||||
fi
|
||||
# Look for pre-existing ROCm v6 before downloading the dependencies
|
||||
for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do
|
||||
if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
# Script for common Dockerfile dependency installation in redhat linux based images
|
||||
|
||||
set -ex
|
||||
set -o pipefail
|
||||
MACHINE=$(uname -m)
|
||||
|
||||
if grep -i "centos" /etc/system-release >/dev/null; then
|
||||
|
@ -30,7 +29,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then
|
|||
dnf install -y rh-git227-git
|
||||
ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git
|
||||
fi
|
||||
dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz
|
||||
dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++
|
||||
elif grep -i "rocky" /etc/system-release >/dev/null; then
|
||||
# Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC)
|
||||
cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo
|
||||
|
@ -44,21 +43,12 @@ gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-rockyofficial
|
|||
EOF
|
||||
dnf install -y git \
|
||||
gcc-toolset-10-gcc-10.2.1-8.2.el8 \
|
||||
gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \
|
||||
pigz
|
||||
gcc-toolset-10-gcc-c++-10.2.1-8.2.el8
|
||||
else
|
||||
echo "ERROR Unexpected distro"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ "${MACHINE}" = "x86_64" ] ; then
|
||||
curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /tmp --strip-components 1 && \
|
||||
mv /tmp/ccache /usr/local/bin/
|
||||
else
|
||||
yum -y install epel-release
|
||||
yum install -y ccache
|
||||
fi
|
||||
|
||||
if [ -n "${CMAKE_VERSION}" ]; then
|
||||
curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1
|
||||
fi
|
||||
|
|
103
server/images.go
103
server/images.go
|
@ -215,20 +215,25 @@ func GetManifest(mp ModelPath) (*Manifest, string, error) {
|
|||
return nil, "", err
|
||||
}
|
||||
|
||||
f, err := os.Open(fp)
|
||||
if _, err = os.Stat(fp); err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
|
||||
var manifest *Manifest
|
||||
|
||||
bts, err := os.ReadFile(fp)
|
||||
if err != nil {
|
||||
return nil, "", err
|
||||
return nil, "", fmt.Errorf("couldn't open file '%s'", fp)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
sha256sum := sha256.New()
|
||||
shaSum := sha256.Sum256(bts)
|
||||
shaStr := hex.EncodeToString(shaSum[:])
|
||||
|
||||
var manifest Manifest
|
||||
if err := json.NewDecoder(io.TeeReader(f, sha256sum)).Decode(&manifest); err != nil {
|
||||
if err := json.Unmarshal(bts, &manifest); err != nil {
|
||||
return nil, "", err
|
||||
}
|
||||
|
||||
return &manifest, hex.EncodeToString(sha256sum.Sum(nil)), nil
|
||||
return manifest, shaStr, nil
|
||||
}
|
||||
|
||||
func GetModel(name string) (*Model, error) {
|
||||
|
@ -369,14 +374,13 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
|
|||
parameters := make(map[string]any)
|
||||
|
||||
var layers []Layer
|
||||
var baseLayers []*layerGGML
|
||||
for _, c := range modelfile.Commands {
|
||||
mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name)
|
||||
command := c.Name
|
||||
|
||||
switch command {
|
||||
switch c.Name {
|
||||
case "model", "adapter":
|
||||
if name := model.ParseName(c.Args); name.IsValid() && command == "model" {
|
||||
var baseLayers []*layerGGML
|
||||
if name := model.ParseName(c.Args); name.IsValid() {
|
||||
baseLayers, err = parseFromModel(ctx, name, fn)
|
||||
if err != nil {
|
||||
return err
|
||||
|
@ -410,14 +414,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
|
|||
}
|
||||
defer blob.Close()
|
||||
|
||||
baseLayers, err = parseFromFile(ctx, command, baseLayers, blob, digest, fn)
|
||||
baseLayers, err = parseFromFile(ctx, blob, digest, fn)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
} else if file, err := os.Open(realpath(modelFileDir, c.Args)); err == nil {
|
||||
defer file.Close()
|
||||
|
||||
baseLayers, err = parseFromFile(ctx, command, baseLayers, file, "", fn)
|
||||
baseLayers, err = parseFromFile(ctx, file, "", fn)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -688,18 +692,43 @@ func CopyModel(src, dst model.Name) error {
|
|||
return err
|
||||
}
|
||||
|
||||
func deleteUnusedLayers(deleteMap map[string]struct{}) error {
|
||||
manifests, err := Manifests()
|
||||
func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error {
|
||||
fp, err := GetManifestPath()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, manifest := range manifests {
|
||||
walkFunc := func(path string, info os.FileInfo, _ error) error {
|
||||
if info.IsDir() {
|
||||
return nil
|
||||
}
|
||||
|
||||
dir, file := filepath.Split(path)
|
||||
dir = strings.Trim(strings.TrimPrefix(dir, fp), string(os.PathSeparator))
|
||||
tag := strings.Join([]string{dir, file}, ":")
|
||||
fmp := ParseModelPath(tag)
|
||||
|
||||
// skip the manifest we're trying to delete
|
||||
if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() {
|
||||
return nil
|
||||
}
|
||||
|
||||
// save (i.e. delete from the deleteMap) any files used in other manifests
|
||||
manifest, _, err := GetManifest(fmp)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, layer := range manifest.Layers {
|
||||
delete(deleteMap, layer.Digest)
|
||||
}
|
||||
|
||||
delete(deleteMap, manifest.Config.Digest)
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := filepath.Walk(fp, walkFunc); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// only delete the files which are still in the deleteMap
|
||||
|
@ -752,7 +781,8 @@ func PruneLayers() error {
|
|||
|
||||
slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap)))
|
||||
|
||||
if err := deleteUnusedLayers(deleteMap); err != nil {
|
||||
err = deleteUnusedLayers(nil, deleteMap)
|
||||
if err != nil {
|
||||
slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
|
||||
return nil
|
||||
}
|
||||
|
@ -847,19 +877,26 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
|
|||
func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error {
|
||||
mp := ParseModelPath(name)
|
||||
|
||||
var manifest *Manifest
|
||||
var err error
|
||||
var noprune string
|
||||
|
||||
// build deleteMap to prune unused layers
|
||||
deleteMap := make(map[string]struct{})
|
||||
manifest, _, err := GetManifest(mp)
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
// noop
|
||||
} else if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return err
|
||||
} else {
|
||||
for _, l := range manifest.Layers {
|
||||
deleteMap[l.Digest] = struct{}{}
|
||||
|
||||
if !envconfig.NoPrune() {
|
||||
manifest, _, err = GetManifest(mp)
|
||||
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return err
|
||||
}
|
||||
if manifest.Config.Digest != "" {
|
||||
deleteMap[manifest.Config.Digest] = struct{}{}
|
||||
|
||||
if manifest != nil {
|
||||
for _, l := range manifest.Layers {
|
||||
deleteMap[l.Digest] = struct{}{}
|
||||
}
|
||||
if manifest.Config.Digest != "" {
|
||||
deleteMap[manifest.Config.Digest] = struct{}{}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -938,9 +975,11 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
|
|||
return err
|
||||
}
|
||||
|
||||
if !envconfig.NoPrune() && len(deleteMap) > 0 {
|
||||
fn(api.ProgressResponse{Status: "removing unused layers"})
|
||||
if err := deleteUnusedLayers(deleteMap); err != nil {
|
||||
if noprune == "" {
|
||||
fn(api.ProgressResponse{Status: "removing any unused layers"})
|
||||
err = deleteUnusedLayers(nil, deleteMap)
|
||||
if err != nil {
|
||||
slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
|
||||
fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)})
|
||||
}
|
||||
}
|
||||
|
@ -961,12 +1000,12 @@ func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *registryOptio
|
|||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
var m Manifest
|
||||
var m *Manifest
|
||||
if err := json.NewDecoder(resp.Body).Decode(&m); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &m, err
|
||||
return m, err
|
||||
}
|
||||
|
||||
// GetSHA256Digest returns the SHA256 hash of a given buffer and returns it, and the size of buffer
|
||||
|
|
|
@ -51,9 +51,6 @@ func NewLayer(r io.Reader, mediatype string) (Layer, error) {
|
|||
if err := os.Rename(temp.Name(), blob); err != nil {
|
||||
return Layer{}, err
|
||||
}
|
||||
if err := os.Chmod(blob, 0o644); err != nil {
|
||||
return Layer{}, err
|
||||
}
|
||||
}
|
||||
|
||||
return Layer{
|
||||
|
|
|
@ -5,7 +5,6 @@ import (
|
|||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
|
@ -151,16 +150,14 @@ func Manifests() (map[model.Name]*Manifest, error) {
|
|||
|
||||
n := model.ParseNameFromFilepath(rel)
|
||||
if !n.IsValid() {
|
||||
slog.Warn("bad manifest name", "path", rel)
|
||||
slog.Warn("bad manifest name", "path", rel, "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
m, err := ParseNamedManifest(n)
|
||||
if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
|
||||
if err != nil {
|
||||
slog.Warn("bad manifest", "name", n, "error", err)
|
||||
continue
|
||||
} else if err != nil {
|
||||
return nil, fmt.Errorf("%s: %w", n, err)
|
||||
}
|
||||
|
||||
ms[n] = m
|
||||
|
|
|
@ -81,7 +81,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
|
|||
return layers, nil
|
||||
}
|
||||
|
||||
func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
|
||||
func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
|
||||
fi, err := f.Stat()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -108,38 +108,16 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML
|
|||
defer t.Close()
|
||||
defer os.Remove(t.Name())
|
||||
|
||||
var layerType string
|
||||
|
||||
switch command {
|
||||
case "adapter":
|
||||
var baseModel *llm.GGML
|
||||
for _, l := range baseLayers {
|
||||
if l.GGML != nil {
|
||||
baseModel = l.GGML
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if baseModel == nil {
|
||||
return nil, fmt.Errorf("no base model specified for the adapter")
|
||||
}
|
||||
|
||||
if err := convert.ConvertAdapter(convert.NewZipReader(r, p, 32<<20), t, baseModel.KV()); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
layerType = "application/vnd.ollama.image.adapter"
|
||||
case "model":
|
||||
if err := convert.ConvertModel(convert.NewZipReader(r, p, 32<<20), t); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
layerType = "application/vnd.ollama.image.model"
|
||||
fn(api.ProgressResponse{Status: "converting model"})
|
||||
if err := convert.Convert(convert.NewZipReader(r, p, 32<<20), t); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if _, err := t.Seek(0, io.SeekStart); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
layer, err := NewLayer(t, layerType)
|
||||
layer, err := NewLayer(t, "application/vnd.ollama.image.model")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@ -161,7 +139,7 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML
|
|||
return detectChatTemplate(layers)
|
||||
}
|
||||
|
||||
func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
|
||||
func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
|
||||
sr := io.NewSectionReader(file, 0, 512)
|
||||
contentType, err := detectContentType(sr)
|
||||
if err != nil {
|
||||
|
@ -172,7 +150,7 @@ func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML,
|
|||
case "gguf", "ggla":
|
||||
// noop
|
||||
case "application/zip":
|
||||
return parseFromZipFile(ctx, command, baseLayers, file, digest, fn)
|
||||
return parseFromZipFile(ctx, file, digest, fn)
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported content type: %s", contentType)
|
||||
}
|
||||
|
@ -192,7 +170,7 @@ func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML,
|
|||
}
|
||||
|
||||
mediatype := "application/vnd.ollama.image.model"
|
||||
if ggml.Name() == "ggla" || ggml.KV().Kind() == "adapter" {
|
||||
if ggml.Name() == "ggla" {
|
||||
mediatype = "application/vnd.ollama.image.adapter"
|
||||
} else if ggml.KV().Architecture() == "clip" {
|
||||
mediatype = "application/vnd.ollama.image.projector"
|
||||
|
|
|
@ -153,7 +153,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
|
|||
t.Fatalf("failed to seek to start: %v", err)
|
||||
}
|
||||
|
||||
layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, "", func(api.ProgressResponse) {})
|
||||
layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {})
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse from file: %v", err)
|
||||
}
|
||||
|
@ -166,7 +166,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
|
|||
t.Fatalf("failed to seek to start: %v", err)
|
||||
}
|
||||
|
||||
layers2, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, layers[0].Digest, func(api.ProgressResponse) {})
|
||||
layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {})
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse from file: %v", err)
|
||||
}
|
||||
|
@ -206,7 +206,7 @@ func TestParseLayerFromCopy(t *testing.T) {
|
|||
t.Fatalf("failed to seek to start: %v", err)
|
||||
}
|
||||
|
||||
layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file2, "", func(api.ProgressResponse) {})
|
||||
layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {})
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse from file: %v", err)
|
||||
}
|
||||
|
|
|
@ -193,11 +193,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
|||
break
|
||||
}
|
||||
|
||||
// Embedding models should always be loaded with parallel=1
|
||||
if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
|
||||
numParallel = 1
|
||||
}
|
||||
|
||||
// Evaluate if the model will fit in the available system memory, or if we should unload a model first
|
||||
if len(gpus) == 1 && gpus[0].Library == "cpu" {
|
||||
// simplifying assumption of defaultParallel when in CPU mode
|
||||
|
@ -739,10 +734,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
|
|||
|
||||
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
|
||||
func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
|
||||
if *numParallel <= 0 {
|
||||
*numParallel = 1
|
||||
req.opts.NumCtx = req.origNumCtx
|
||||
}
|
||||
*numParallel = 1
|
||||
byLibrary := gpus.ByLibrary()
|
||||
if len(byLibrary) <= 1 {
|
||||
return gpus
|
||||
|
|
|
@ -117,6 +117,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
|
|||
|
||||
require.NoError(t, llm.WriteGGUF(f, llm.KV{
|
||||
"general.architecture": "llama",
|
||||
"general.name": "name",
|
||||
"llama.context_length": uint32(32),
|
||||
"llama.embedding_length": uint32(4096),
|
||||
"llama.block_count": uint32(1),
|
||||
|
|
|
@ -45,7 +45,7 @@ type blobUpload struct {
|
|||
}
|
||||
|
||||
const (
|
||||
numUploadParts = 16
|
||||
numUploadParts = 64
|
||||
minUploadPartSize int64 = 100 * format.MegaByte
|
||||
maxUploadPartSize int64 = 1000 * format.MegaByte
|
||||
)
|
||||
|
|
Loading…
Reference in a new issue