Signed-off-by: baalajimaestro <baalajimaestro@ptr.moe>
This commit is contained in:
commit
7f1565721c
51 changed files with 1383 additions and 868 deletions
|
@ -7,3 +7,5 @@ llm/llama.cpp
|
|||
.env
|
||||
.cache
|
||||
test_data
|
||||
llm/build
|
||||
llama/build
|
||||
|
|
209
.github/workflows/release.yaml
vendored
209
.github/workflows/release.yaml
vendored
|
@ -102,8 +102,8 @@ jobs:
|
|||
with:
|
||||
name: generate-windows-cpu
|
||||
path: |
|
||||
llm/build/**/bin/*
|
||||
llm/build/**/*.a
|
||||
build/**/*
|
||||
build/**/*.a
|
||||
dist/windows-amd64/**
|
||||
|
||||
# ROCm generation step
|
||||
|
@ -176,7 +176,7 @@ jobs:
|
|||
with:
|
||||
name: generate-windows-rocm
|
||||
path: |
|
||||
llm/build/**/bin/*
|
||||
build/**/*
|
||||
dist/windows-amd64/**
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
|
@ -265,7 +265,7 @@ jobs:
|
|||
with:
|
||||
name: generate-windows-cuda-${{ matrix.cuda.version }}
|
||||
path: |
|
||||
llm/build/**/bin/*
|
||||
build/**/*
|
||||
dist/windows-amd64/**
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
|
@ -338,7 +338,7 @@ jobs:
|
|||
- uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: generate-windows-rocm
|
||||
- run: dir llm/build
|
||||
- run: dir build
|
||||
- run: |
|
||||
$gopath=(get-command go).source | split-path -parent
|
||||
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
|
||||
|
@ -359,9 +359,7 @@ jobs:
|
|||
environment: release
|
||||
runs-on: linux
|
||||
env:
|
||||
OLLAMA_SKIP_MANIFEST_CREATE: '1'
|
||||
BUILD_ARCH: amd64
|
||||
PUSH: '1'
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
|
@ -369,14 +367,8 @@ jobs:
|
|||
- name: Set Version
|
||||
shell: bash
|
||||
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ vars.DOCKER_USER }}
|
||||
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
|
||||
- run: |
|
||||
./scripts/build_linux.sh
|
||||
./scripts/build_docker.sh
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: dist-linux-amd64
|
||||
|
@ -390,9 +382,7 @@ jobs:
|
|||
environment: release
|
||||
runs-on: linux-arm64
|
||||
env:
|
||||
OLLAMA_SKIP_MANIFEST_CREATE: '1'
|
||||
BUILD_ARCH: arm64
|
||||
PUSH: '1'
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
|
@ -421,14 +411,8 @@ jobs:
|
|||
sudo usermod -aG docker $USER
|
||||
sudo apt-get install acl
|
||||
sudo setfacl --modify user:$USER:rw /var/run/docker.sock
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ vars.DOCKER_USER }}
|
||||
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
|
||||
- run: |
|
||||
./scripts/build_linux.sh
|
||||
./scripts/build_docker.sh
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: dist-linux-arm64
|
||||
|
@ -436,6 +420,181 @@ jobs:
|
|||
dist/*linux*
|
||||
!dist/*-cov
|
||||
|
||||
# Container image build
|
||||
build-linux:
|
||||
environment: release
|
||||
strategy:
|
||||
matrix:
|
||||
runner:
|
||||
- linux
|
||||
- linux-arm64
|
||||
runs-on: ${{ matrix.runner }}
|
||||
env:
|
||||
FINAL_IMAGE_REPO: ollama/ollama
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: recursive
|
||||
- name: 'Install Docker'
|
||||
if: ${{ startsWith(matrix.runner, 'linux-arm64') }}
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y ca-certificates curl
|
||||
sudo install -m 0755 -d /etc/apt/keyrings
|
||||
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
|
||||
sudo chmod a+r /etc/apt/keyrings/docker.asc
|
||||
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
|
||||
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
|
||||
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
|
||||
sudo usermod -aG docker $USER
|
||||
sudo apt-get install acl
|
||||
sudo setfacl --modify user:$USER:rw /var/run/docker.sock
|
||||
- name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.FINAL_IMAGE_REPO }}
|
||||
flavor: |
|
||||
latest=false
|
||||
tags: |
|
||||
type=ref,event=tag
|
||||
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
|
||||
type=semver,pattern={{version}}
|
||||
- name: Set Version
|
||||
shell: bash
|
||||
run: |
|
||||
machine=$(uname -m)
|
||||
case ${machine} in
|
||||
x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
|
||||
aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
|
||||
esac >>$GITHUB_ENV
|
||||
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ vars.DOCKER_USER }}
|
||||
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
|
||||
- name: Build and push by digest
|
||||
id: build
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: "."
|
||||
platforms: linux/${{ env.ARCH }}
|
||||
build-args: |
|
||||
GOFLAGS
|
||||
outputs: type=image,name=${{ env.FINAL_IMAGE_REPO }},push-by-digest=true,name-canonical=true,push=true
|
||||
- name: Export digest
|
||||
run: |
|
||||
mkdir -p /tmp/digests
|
||||
digest="${{ steps.build.outputs.digest }}"
|
||||
touch "/tmp/digests/${digest#sha256:}"
|
||||
- name: Upload digest
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: digests-${{ env.PLATFORM_PAIR }}
|
||||
path: /tmp/digests/*
|
||||
if-no-files-found: error
|
||||
retention-days: 1
|
||||
merge:
|
||||
environment: release
|
||||
runs-on: linux
|
||||
needs:
|
||||
- build-linux
|
||||
env:
|
||||
FINAL_IMAGE_REPO: ollama/ollama
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: recursive
|
||||
- name: Download digests
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: /tmp/digests
|
||||
pattern: digests-*
|
||||
merge-multiple: true
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.FINAL_IMAGE_REPO }}
|
||||
flavor: |
|
||||
latest=false
|
||||
tags: |
|
||||
type=ref,event=tag
|
||||
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
|
||||
type=semver,pattern={{version}}
|
||||
- name: Set Version
|
||||
shell: bash
|
||||
run: |
|
||||
machine=$(uname -m)
|
||||
case ${machine} in
|
||||
x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
|
||||
aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
|
||||
esac >>$GITHUB_ENV
|
||||
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ vars.DOCKER_USER }}
|
||||
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
|
||||
- name: Create manifest list and push
|
||||
working-directory: /tmp/digests
|
||||
run: |
|
||||
docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
|
||||
$(printf '${{ env.FINAL_IMAGE_REPO }}@sha256:%s ' *)
|
||||
- name: Inspect image
|
||||
run: |
|
||||
docker buildx imagetools inspect ${{ env.FINAL_IMAGE_REPO }}:${{ steps.meta.outputs.version }}
|
||||
build-linux-rocm:
|
||||
environment: release
|
||||
runs-on: linux
|
||||
env:
|
||||
FINAL_IMAGE_REPO: ollama/ollama
|
||||
ARCH: amd64
|
||||
PLATFORM_PAIR: linux-amd64
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: recursive
|
||||
- name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.FINAL_IMAGE_REPO }}
|
||||
flavor: |
|
||||
latest=false
|
||||
tags: |
|
||||
type=ref,event=tag
|
||||
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
|
||||
type=semver,pattern={{version}}
|
||||
- name: Set Version
|
||||
shell: bash
|
||||
run: |
|
||||
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ vars.DOCKER_USER }}
|
||||
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
|
||||
- name: Build and push by digest
|
||||
id: build
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: "."
|
||||
target: runtime-rocm
|
||||
build-args: |
|
||||
GOFLAGS
|
||||
tags: ${{ env.FINAL_IMAGE_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION}}-rocm,${{ env.FINAL_IMAGE_REPO }}:rocm
|
||||
push: true
|
||||
|
||||
# Aggregate all the assets and ship a release
|
||||
release:
|
||||
needs:
|
||||
|
@ -448,8 +607,6 @@ jobs:
|
|||
permissions:
|
||||
contents: write
|
||||
env:
|
||||
OLLAMA_SKIP_IMAGE_BUILD: '1'
|
||||
PUSH: '1'
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
@ -458,12 +615,6 @@ jobs:
|
|||
run: |
|
||||
echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
|
||||
echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ vars.DOCKER_USER }}
|
||||
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
|
||||
- run: ./scripts/build_docker.sh
|
||||
- name: Retrieve built artifact
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
|
|
43
.github/workflows/test.yaml
vendored
43
.github/workflows/test.yaml
vendored
|
@ -81,12 +81,6 @@ jobs:
|
|||
if: ${{ ! startsWith(matrix.os, 'windows-') }}
|
||||
name: 'Unix Go Generate'
|
||||
- run: go build .
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
|
||||
path: |
|
||||
llm/build/**/bin/*
|
||||
llm/build/**/*.a
|
||||
generate-cuda:
|
||||
needs: [changes]
|
||||
if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
|
||||
|
@ -114,12 +108,6 @@ jobs:
|
|||
go generate -x ./...
|
||||
env:
|
||||
OLLAMA_SKIP_CPU_GENERATE: '1'
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: cuda-${{ matrix.cuda-version }}-libraries
|
||||
path: |
|
||||
llm/build/**/bin/*
|
||||
dist/windows-amd64/**
|
||||
generate-rocm:
|
||||
needs: [changes]
|
||||
if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
|
||||
|
@ -147,12 +135,6 @@ jobs:
|
|||
go generate -x ./...
|
||||
env:
|
||||
OLLAMA_SKIP_CPU_GENERATE: '1'
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: rocm-${{ matrix.rocm-version }}-libraries
|
||||
path: |
|
||||
llm/build/**/bin/*
|
||||
dist/windows-amd64/**
|
||||
|
||||
# ROCm generation step
|
||||
generate-windows-rocm:
|
||||
|
@ -189,7 +171,6 @@ jobs:
|
|||
name: go generate
|
||||
env:
|
||||
OLLAMA_SKIP_CPU_GENERATE: '1'
|
||||
# TODO - do we need any artifacts?
|
||||
|
||||
# CUDA generation step
|
||||
generate-windows-cuda:
|
||||
|
@ -231,7 +212,6 @@ jobs:
|
|||
go generate -x ./...
|
||||
env:
|
||||
OLLAMA_SKIP_CPU_GENERATE: '1'
|
||||
# TODO - do we need any artifacts?
|
||||
|
||||
lint:
|
||||
strategy:
|
||||
|
@ -263,14 +243,6 @@ jobs:
|
|||
arm64) echo ARCH=arm64 ;;
|
||||
esac >>$GITHUB_ENV
|
||||
shell: bash
|
||||
- run: |
|
||||
mkdir -p llm/build/linux/$ARCH/stub/bin
|
||||
touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
|
||||
if: ${{ startsWith(matrix.os, 'ubuntu-') }}
|
||||
- run: |
|
||||
mkdir -p llm/build/darwin/$ARCH/stub/bin
|
||||
touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
|
||||
if: ${{ startsWith(matrix.os, 'macos-') }}
|
||||
- uses: golangci/golangci-lint-action@v6
|
||||
with:
|
||||
args: --timeout 8m0s -v
|
||||
|
@ -301,23 +273,10 @@ jobs:
|
|||
cache: true
|
||||
- run: |
|
||||
case ${{ matrix.arch }} in
|
||||
amd64) echo ARCH=x86_64 ;;
|
||||
amd64) echo ARCH=amd64 ;;
|
||||
arm64) echo ARCH=arm64 ;;
|
||||
esac >>$GITHUB_ENV
|
||||
shell: bash
|
||||
- run: |
|
||||
mkdir -p llm/build/linux/$ARCH/stub/bin
|
||||
touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
|
||||
if: ${{ startsWith(matrix.os, 'ubuntu-') }}
|
||||
- run: |
|
||||
mkdir -p llm/build/darwin/$ARCH/stub/bin
|
||||
touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
|
||||
if: ${{ startsWith(matrix.os, 'macos-') }}
|
||||
shell: bash
|
||||
- run: go generate ./...
|
||||
- run: go build
|
||||
- run: go test -v ./...
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ matrix.os }}-binaries
|
||||
path: ollama
|
||||
|
|
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -12,4 +12,7 @@ ggml-metal.metal
|
|||
test_data
|
||||
*.crt
|
||||
llm/build
|
||||
build/*/*/*
|
||||
!build/**/placeholder
|
||||
llama/build
|
||||
__debug_bin*
|
|
@ -312,6 +312,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
|||
- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
|
||||
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
|
||||
- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
|
||||
- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
|
||||
|
||||
### Terminal
|
||||
|
||||
|
@ -336,6 +337,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
|||
- [podman-ollama](https://github.com/ericcurtin/podman-ollama)
|
||||
- [gollama](https://github.com/sammcj/gollama)
|
||||
- [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
|
||||
- [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe)
|
||||
|
||||
### Apple Vision Pro
|
||||
- [Enchanted](https://github.com/AugustDev/enchanted)
|
||||
|
@ -358,6 +360,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
|||
|
||||
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
|
||||
- [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
|
||||
- [crewAI](https://github.com/crewAIInc/crewAI)
|
||||
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
|
||||
- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
|
||||
- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
|
||||
|
@ -427,6 +430,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
|||
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
|
||||
- [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
|
||||
- [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
|
||||
- [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
|
||||
- [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
|
||||
|
||||
### Supported backends
|
||||
|
||||
|
|
1
build/darwin/amd64/placeholder
Normal file
1
build/darwin/amd64/placeholder
Normal file
|
@ -0,0 +1 @@
|
|||
This is here to make sure the build/ directory exists for the go:embed command
|
1
build/darwin/arm64/placeholder
Normal file
1
build/darwin/arm64/placeholder
Normal file
|
@ -0,0 +1 @@
|
|||
This is here to make sure the build/ directory exists for the go:embed command
|
8
build/embed_darwin_amd64.go
Normal file
8
build/embed_darwin_amd64.go
Normal file
|
@ -0,0 +1,8 @@
|
|||
package build
|
||||
|
||||
import "embed"
|
||||
|
||||
// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
|
||||
|
||||
//go:embed darwin/amd64/*
|
||||
var EmbedFS embed.FS
|
8
build/embed_darwin_arm64.go
Normal file
8
build/embed_darwin_arm64.go
Normal file
|
@ -0,0 +1,8 @@
|
|||
package build
|
||||
|
||||
import "embed"
|
||||
|
||||
// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
|
||||
|
||||
//go:embed darwin/arm64/*
|
||||
var EmbedFS embed.FS
|
6
build/embed_linux.go
Normal file
6
build/embed_linux.go
Normal file
|
@ -0,0 +1,6 @@
|
|||
package build
|
||||
|
||||
import "embed"
|
||||
|
||||
//go:embed linux/*
|
||||
var EmbedFS embed.FS
|
8
build/embed_unused.go
Normal file
8
build/embed_unused.go
Normal file
|
@ -0,0 +1,8 @@
|
|||
//go:build !linux && !darwin
|
||||
|
||||
package build
|
||||
|
||||
import "embed"
|
||||
|
||||
// unused on windows
|
||||
var EmbedFS embed.FS
|
1
build/linux/amd64/placeholder
Normal file
1
build/linux/amd64/placeholder
Normal file
|
@ -0,0 +1 @@
|
|||
This is here to make sure the build/ directory exists for the go:embed command
|
1
build/linux/arm64/placeholder
Normal file
1
build/linux/arm64/placeholder
Normal file
|
@ -0,0 +1 @@
|
|||
This is here to make sure the build/ directory exists for the go:embed command
|
226
cmd/cmd.go
226
cmd/cmd.go
|
@ -2,6 +2,7 @@ package cmd
|
|||
|
||||
import (
|
||||
"archive/zip"
|
||||
"bufio"
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/ed25519"
|
||||
|
@ -21,6 +22,7 @@ import (
|
|||
"regexp"
|
||||
"runtime"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"syscall"
|
||||
|
@ -344,6 +346,39 @@ func (w *progressWriter) Write(p []byte) (n int, err error) {
|
|||
return len(p), nil
|
||||
}
|
||||
|
||||
func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
|
||||
p := progress.NewProgress(os.Stderr)
|
||||
defer p.StopAndClear()
|
||||
|
||||
spinner := progress.NewSpinner("")
|
||||
p.Add("", spinner)
|
||||
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
req := &api.GenerateRequest{
|
||||
Model: opts.Model,
|
||||
KeepAlive: opts.KeepAlive,
|
||||
}
|
||||
|
||||
return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil })
|
||||
}
|
||||
|
||||
func StopHandler(cmd *cobra.Command, args []string) error {
|
||||
opts := &runOptions{
|
||||
Model: args[0],
|
||||
KeepAlive: &api.Duration{Duration: 0},
|
||||
}
|
||||
if err := loadOrUnloadModel(cmd, opts); err != nil {
|
||||
if strings.Contains(err.Error(), "not found") {
|
||||
return fmt.Errorf("couldn't find model \"%s\" to stop", args[0])
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func RunHandler(cmd *cobra.Command, args []string) error {
|
||||
interactive := true
|
||||
|
||||
|
@ -422,7 +457,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
|
|||
opts.ParentModel = info.Details.ParentModel
|
||||
|
||||
if interactive {
|
||||
if err := loadModel(cmd, &opts); err != nil {
|
||||
if err := loadOrUnloadModel(cmd, &opts); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -578,7 +613,7 @@ func ListHandler(cmd *cobra.Command, args []string) error {
|
|||
table.SetHeaderLine(false)
|
||||
table.SetBorder(false)
|
||||
table.SetNoWhiteSpace(true)
|
||||
table.SetTablePadding("\t")
|
||||
table.SetTablePadding(" ")
|
||||
table.AppendBulk(data)
|
||||
table.Render()
|
||||
|
||||
|
@ -613,7 +648,15 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
|
|||
cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
|
||||
procStr = fmt.Sprintf("%d%%/%d%% CPU/GPU", int(cpuPercent), int(100-cpuPercent))
|
||||
}
|
||||
data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, format.HumanTime(m.ExpiresAt, "Never")})
|
||||
|
||||
var until string
|
||||
delta := time.Since(m.ExpiresAt)
|
||||
if delta > 0 {
|
||||
until = "Stopping..."
|
||||
} else {
|
||||
until = format.HumanTime(m.ExpiresAt, "Never")
|
||||
}
|
||||
data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, until})
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -624,7 +667,7 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
|
|||
table.SetHeaderLine(false)
|
||||
table.SetBorder(false)
|
||||
table.SetNoWhiteSpace(true)
|
||||
table.SetTablePadding("\t")
|
||||
table.SetTablePadding(" ")
|
||||
table.AppendBulk(data)
|
||||
table.Render()
|
||||
|
||||
|
@ -720,125 +763,89 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
showInfo(resp)
|
||||
|
||||
return nil
|
||||
return showInfo(resp, os.Stdout)
|
||||
}
|
||||
|
||||
func showInfo(resp *api.ShowResponse) {
|
||||
modelData := [][]string{
|
||||
{"parameters", resp.Details.ParameterSize},
|
||||
{"quantization", resp.Details.QuantizationLevel},
|
||||
func showInfo(resp *api.ShowResponse, w io.Writer) error {
|
||||
tableRender := func(header string, rows func() [][]string) {
|
||||
fmt.Fprintln(w, " ", header)
|
||||
table := tablewriter.NewWriter(w)
|
||||
table.SetAlignment(tablewriter.ALIGN_LEFT)
|
||||
table.SetBorder(false)
|
||||
table.SetNoWhiteSpace(true)
|
||||
table.SetTablePadding(" ")
|
||||
|
||||
switch header {
|
||||
case "Template", "System", "License":
|
||||
table.SetColWidth(100)
|
||||
}
|
||||
|
||||
table.AppendBulk(rows())
|
||||
table.Render()
|
||||
fmt.Fprintln(w)
|
||||
}
|
||||
|
||||
tableRender("Model", func() (rows [][]string) {
|
||||
if resp.ModelInfo != nil {
|
||||
arch := resp.ModelInfo["general.architecture"].(string)
|
||||
modelData = append(modelData,
|
||||
[]string{"arch", arch},
|
||||
[]string{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
|
||||
[]string{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
|
||||
)
|
||||
}
|
||||
|
||||
mainTableData := [][]string{
|
||||
{"Model"},
|
||||
{renderSubTable(modelData, false)},
|
||||
rows = append(rows, []string{"", "architecture", arch})
|
||||
rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ModelInfo["general.parameter_count"].(float64)))})
|
||||
rows = append(rows, []string{"", "context length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64), 'f', -1, 64)})
|
||||
rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64), 'f', -1, 64)})
|
||||
} else {
|
||||
rows = append(rows, []string{"", "architecture", resp.Details.Family})
|
||||
rows = append(rows, []string{"", "parameters", resp.Details.ParameterSize})
|
||||
}
|
||||
rows = append(rows, []string{"", "quantization", resp.Details.QuantizationLevel})
|
||||
return
|
||||
})
|
||||
|
||||
if resp.ProjectorInfo != nil {
|
||||
projectorData := [][]string{
|
||||
{"arch", "clip"},
|
||||
{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
|
||||
}
|
||||
|
||||
if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok {
|
||||
projectorData = append(projectorData, []string{"projector type", projectorType.(string)})
|
||||
}
|
||||
|
||||
projectorData = append(projectorData,
|
||||
[]string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
|
||||
[]string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
|
||||
)
|
||||
|
||||
mainTableData = append(mainTableData,
|
||||
[]string{"Projector"},
|
||||
[]string{renderSubTable(projectorData, false)},
|
||||
)
|
||||
tableRender("Projector", func() (rows [][]string) {
|
||||
arch := resp.ProjectorInfo["general.architecture"].(string)
|
||||
rows = append(rows, []string{"", "architecture", arch})
|
||||
rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))})
|
||||
rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.embedding_length", arch)].(float64), 'f', -1, 64)})
|
||||
rows = append(rows, []string{"", "dimensions", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.projection_dim", arch)].(float64), 'f', -1, 64)})
|
||||
return
|
||||
})
|
||||
}
|
||||
|
||||
if resp.Parameters != "" {
|
||||
mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)})
|
||||
tableRender("Parameters", func() (rows [][]string) {
|
||||
scanner := bufio.NewScanner(strings.NewReader(resp.Parameters))
|
||||
for scanner.Scan() {
|
||||
if text := scanner.Text(); text != "" {
|
||||
rows = append(rows, append([]string{""}, strings.Fields(text)...))
|
||||
}
|
||||
}
|
||||
return
|
||||
})
|
||||
}
|
||||
|
||||
head := func(s string, n int) (rows [][]string) {
|
||||
scanner := bufio.NewScanner(strings.NewReader(s))
|
||||
for scanner.Scan() && (len(rows) < n || n < 0) {
|
||||
if text := scanner.Text(); text != "" {
|
||||
rows = append(rows, []string{"", strings.TrimSpace(text)})
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
if resp.System != "" {
|
||||
mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)})
|
||||
tableRender("System", func() [][]string {
|
||||
return head(resp.System, 2)
|
||||
})
|
||||
}
|
||||
|
||||
if resp.License != "" {
|
||||
mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)})
|
||||
tableRender("License", func() [][]string {
|
||||
return head(resp.License, 2)
|
||||
})
|
||||
}
|
||||
|
||||
table := tablewriter.NewWriter(os.Stdout)
|
||||
table.SetAutoWrapText(false)
|
||||
table.SetBorder(false)
|
||||
table.SetAlignment(tablewriter.ALIGN_LEFT)
|
||||
|
||||
for _, v := range mainTableData {
|
||||
table.Append(v)
|
||||
}
|
||||
|
||||
table.Render()
|
||||
}
|
||||
|
||||
func renderSubTable(data [][]string, file bool) string {
|
||||
var buf bytes.Buffer
|
||||
table := tablewriter.NewWriter(&buf)
|
||||
table.SetAutoWrapText(!file)
|
||||
table.SetBorder(false)
|
||||
table.SetNoWhiteSpace(true)
|
||||
table.SetTablePadding("\t")
|
||||
table.SetAlignment(tablewriter.ALIGN_LEFT)
|
||||
|
||||
for _, v := range data {
|
||||
table.Append(v)
|
||||
}
|
||||
|
||||
table.Render()
|
||||
|
||||
renderedTable := buf.String()
|
||||
lines := strings.Split(renderedTable, "\n")
|
||||
for i, line := range lines {
|
||||
lines[i] = "\t" + line
|
||||
}
|
||||
|
||||
return strings.Join(lines, "\n")
|
||||
}
|
||||
|
||||
func twoLines(s string) [][]string {
|
||||
lines := strings.Split(s, "\n")
|
||||
res := [][]string{}
|
||||
|
||||
count := 0
|
||||
for _, line := range lines {
|
||||
line = strings.TrimSpace(line)
|
||||
if line != "" {
|
||||
count++
|
||||
res = append(res, []string{line})
|
||||
if count == 2 {
|
||||
return res
|
||||
}
|
||||
}
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
func formatParams(s string) string {
|
||||
lines := strings.Split(s, "\n")
|
||||
table := [][]string{}
|
||||
|
||||
for _, line := range lines {
|
||||
table = append(table, strings.Fields(line))
|
||||
}
|
||||
return renderSubTable(table, false)
|
||||
return nil
|
||||
}
|
||||
|
||||
func CopyHandler(cmd *cobra.Command, args []string) error {
|
||||
|
@ -1328,6 +1335,15 @@ func NewCLI() *cobra.Command {
|
|||
runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
|
||||
runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
|
||||
runCmd.Flags().String("format", "", "Response format (e.g. json)")
|
||||
|
||||
stopCmd := &cobra.Command{
|
||||
Use: "stop MODEL",
|
||||
Short: "Stop a running model",
|
||||
Args: cobra.ExactArgs(1),
|
||||
PreRunE: checkServerHeartbeat,
|
||||
RunE: StopHandler,
|
||||
}
|
||||
|
||||
serveCmd := &cobra.Command{
|
||||
Use: "serve",
|
||||
Aliases: []string{"start"},
|
||||
|
@ -1395,6 +1411,7 @@ func NewCLI() *cobra.Command {
|
|||
createCmd,
|
||||
showCmd,
|
||||
runCmd,
|
||||
stopCmd,
|
||||
pullCmd,
|
||||
pushCmd,
|
||||
listCmd,
|
||||
|
@ -1434,6 +1451,7 @@ func NewCLI() *cobra.Command {
|
|||
createCmd,
|
||||
showCmd,
|
||||
runCmd,
|
||||
stopCmd,
|
||||
pullCmd,
|
||||
pushCmd,
|
||||
listCmd,
|
||||
|
|
206
cmd/cmd_test.go
Normal file
206
cmd/cmd_test.go
Normal file
|
@ -0,0 +1,206 @@
|
|||
package cmd
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
)
|
||||
|
||||
func TestShowInfo(t *testing.T) {
|
||||
t.Run("bare details", func(t *testing.T) {
|
||||
var b bytes.Buffer
|
||||
if err := showInfo(&api.ShowResponse{
|
||||
Details: api.ModelDetails{
|
||||
Family: "test",
|
||||
ParameterSize: "7B",
|
||||
QuantizationLevel: "FP16",
|
||||
},
|
||||
}, &b); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
expect := ` Model
|
||||
architecture test
|
||||
parameters 7B
|
||||
quantization FP16
|
||||
|
||||
`
|
||||
|
||||
if diff := cmp.Diff(expect, b.String()); diff != "" {
|
||||
t.Errorf("unexpected output (-want +got):\n%s", diff)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("bare model info", func(t *testing.T) {
|
||||
var b bytes.Buffer
|
||||
if err := showInfo(&api.ShowResponse{
|
||||
ModelInfo: map[string]any{
|
||||
"general.architecture": "test",
|
||||
"general.parameter_count": float64(7_000_000_000),
|
||||
"test.context_length": float64(0),
|
||||
"test.embedding_length": float64(0),
|
||||
},
|
||||
Details: api.ModelDetails{
|
||||
Family: "test",
|
||||
ParameterSize: "7B",
|
||||
QuantizationLevel: "FP16",
|
||||
},
|
||||
}, &b); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
expect := ` Model
|
||||
architecture test
|
||||
parameters 7B
|
||||
context length 0
|
||||
embedding length 0
|
||||
quantization FP16
|
||||
|
||||
`
|
||||
if diff := cmp.Diff(expect, b.String()); diff != "" {
|
||||
t.Errorf("unexpected output (-want +got):\n%s", diff)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("parameters", func(t *testing.T) {
|
||||
var b bytes.Buffer
|
||||
if err := showInfo(&api.ShowResponse{
|
||||
Details: api.ModelDetails{
|
||||
Family: "test",
|
||||
ParameterSize: "7B",
|
||||
QuantizationLevel: "FP16",
|
||||
},
|
||||
Parameters: `
|
||||
stop never
|
||||
stop gonna
|
||||
stop give
|
||||
stop you
|
||||
stop up
|
||||
temperature 99`,
|
||||
}, &b); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
expect := ` Model
|
||||
architecture test
|
||||
parameters 7B
|
||||
quantization FP16
|
||||
|
||||
Parameters
|
||||
stop never
|
||||
stop gonna
|
||||
stop give
|
||||
stop you
|
||||
stop up
|
||||
temperature 99
|
||||
|
||||
`
|
||||
if diff := cmp.Diff(expect, b.String()); diff != "" {
|
||||
t.Errorf("unexpected output (-want +got):\n%s", diff)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("project info", func(t *testing.T) {
|
||||
var b bytes.Buffer
|
||||
if err := showInfo(&api.ShowResponse{
|
||||
Details: api.ModelDetails{
|
||||
Family: "test",
|
||||
ParameterSize: "7B",
|
||||
QuantizationLevel: "FP16",
|
||||
},
|
||||
ProjectorInfo: map[string]any{
|
||||
"general.architecture": "clip",
|
||||
"general.parameter_count": float64(133_700_000),
|
||||
"clip.vision.embedding_length": float64(0),
|
||||
"clip.vision.projection_dim": float64(0),
|
||||
},
|
||||
}, &b); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
expect := ` Model
|
||||
architecture test
|
||||
parameters 7B
|
||||
quantization FP16
|
||||
|
||||
Projector
|
||||
architecture clip
|
||||
parameters 133.70M
|
||||
embedding length 0
|
||||
dimensions 0
|
||||
|
||||
`
|
||||
if diff := cmp.Diff(expect, b.String()); diff != "" {
|
||||
t.Errorf("unexpected output (-want +got):\n%s", diff)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("system", func(t *testing.T) {
|
||||
var b bytes.Buffer
|
||||
if err := showInfo(&api.ShowResponse{
|
||||
Details: api.ModelDetails{
|
||||
Family: "test",
|
||||
ParameterSize: "7B",
|
||||
QuantizationLevel: "FP16",
|
||||
},
|
||||
System: `You are a pirate!
|
||||
Ahoy, matey!
|
||||
Weigh anchor!
|
||||
`,
|
||||
}, &b); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
expect := ` Model
|
||||
architecture test
|
||||
parameters 7B
|
||||
quantization FP16
|
||||
|
||||
System
|
||||
You are a pirate!
|
||||
Ahoy, matey!
|
||||
|
||||
`
|
||||
if diff := cmp.Diff(expect, b.String()); diff != "" {
|
||||
t.Errorf("unexpected output (-want +got):\n%s", diff)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("license", func(t *testing.T) {
|
||||
var b bytes.Buffer
|
||||
license, err := os.ReadFile(filepath.Join("..", "LICENSE"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := showInfo(&api.ShowResponse{
|
||||
Details: api.ModelDetails{
|
||||
Family: "test",
|
||||
ParameterSize: "7B",
|
||||
QuantizationLevel: "FP16",
|
||||
},
|
||||
License: string(license),
|
||||
}, &b); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
expect := ` Model
|
||||
architecture test
|
||||
parameters 7B
|
||||
quantization FP16
|
||||
|
||||
License
|
||||
MIT License
|
||||
Copyright (c) Ollama
|
||||
|
||||
`
|
||||
if diff := cmp.Diff(expect, b.String()); diff != "" {
|
||||
t.Errorf("unexpected output (-want +got):\n%s", diff)
|
||||
}
|
||||
})
|
||||
}
|
|
@ -18,7 +18,6 @@ import (
|
|||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/parser"
|
||||
"github.com/ollama/ollama/progress"
|
||||
"github.com/ollama/ollama/readline"
|
||||
"github.com/ollama/ollama/types/errtypes"
|
||||
)
|
||||
|
@ -31,26 +30,6 @@ const (
|
|||
MultilineSystem
|
||||
)
|
||||
|
||||
func loadModel(cmd *cobra.Command, opts *runOptions) error {
|
||||
p := progress.NewProgress(os.Stderr)
|
||||
defer p.StopAndClear()
|
||||
|
||||
spinner := progress.NewSpinner("")
|
||||
p.Add("", spinner)
|
||||
|
||||
client, err := api.ClientFromEnvironment()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
chatReq := &api.ChatRequest{
|
||||
Model: opts.Model,
|
||||
KeepAlive: opts.KeepAlive,
|
||||
}
|
||||
|
||||
return client.Chat(cmd.Context(), chatReq, func(api.ChatResponse) error { return nil })
|
||||
}
|
||||
|
||||
func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
usage := func() {
|
||||
fmt.Fprintln(os.Stderr, "Available Commands:")
|
||||
|
@ -217,7 +196,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
|||
opts.Model = args[1]
|
||||
opts.Messages = []api.Message{}
|
||||
fmt.Printf("Loading model '%s'\n", opts.Model)
|
||||
if err := loadModel(cmd, &opts); err != nil {
|
||||
if err := loadOrUnloadModel(cmd, &opts); err != nil {
|
||||
return err
|
||||
}
|
||||
continue
|
||||
|
@ -371,7 +350,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
|||
|
||||
switch args[1] {
|
||||
case "info":
|
||||
showInfo(resp)
|
||||
_ = showInfo(resp, os.Stderr)
|
||||
case "license":
|
||||
if resp.License == "" {
|
||||
fmt.Println("No license was specified for this model.")
|
||||
|
|
|
@ -208,14 +208,18 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
|
|||
return err
|
||||
}
|
||||
|
||||
if vocabSize := int(p.VocabSize); vocabSize > len(t.Vocabulary.Tokens) {
|
||||
slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", p.VocabSize, "actual", len(t.Vocabulary.Tokens))
|
||||
vocabSize := int(p.VocabSize)
|
||||
switch {
|
||||
case vocabSize > len(t.Vocabulary.Tokens):
|
||||
slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
|
||||
for i := range vocabSize - len(t.Vocabulary.Tokens) {
|
||||
t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
|
||||
t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
|
||||
t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
|
||||
}
|
||||
} else {
|
||||
case vocabSize < len(t.Vocabulary.Tokens):
|
||||
return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
|
||||
default:
|
||||
slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
|
||||
}
|
||||
|
||||
|
|
48
docs/api.md
48
docs/api.md
|
@ -69,7 +69,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur
|
|||
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"prompt": "Why is the sky blue?"
|
||||
}'
|
||||
```
|
||||
|
@ -80,7 +80,7 @@ A stream of JSON objects is returned:
|
|||
|
||||
```json
|
||||
{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"created_at": "2023-08-04T08:52:19.385406455-07:00",
|
||||
"response": "The",
|
||||
"done": false
|
||||
|
@ -102,7 +102,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
|
|||
|
||||
```json
|
||||
{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||
"response": "",
|
||||
"done": true,
|
||||
|
@ -124,7 +124,7 @@ A response can be received in one reply when streaming is off.
|
|||
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"prompt": "Why is the sky blue?",
|
||||
"stream": false
|
||||
}'
|
||||
|
@ -136,7 +136,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
|
|||
|
||||
```json
|
||||
{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||
"response": "The sky is blue because it is the color of the sky.",
|
||||
"done": true,
|
||||
|
@ -194,7 +194,7 @@ curl http://localhost:11434/api/generate -d '{
|
|||
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"prompt": "What color is the sky at different times of the day? Respond using JSON",
|
||||
"format": "json",
|
||||
"stream": false
|
||||
|
@ -205,7 +205,7 @@ curl http://localhost:11434/api/generate -d '{
|
|||
|
||||
```json
|
||||
{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"created_at": "2023-11-09T21:07:55.186497Z",
|
||||
"response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
|
||||
"done": true,
|
||||
|
@ -327,7 +327,7 @@ If you want to set custom options for the model at runtime rather than in the Mo
|
|||
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"prompt": "Why is the sky blue?",
|
||||
"stream": false,
|
||||
"options": {
|
||||
|
@ -368,7 +368,7 @@ curl http://localhost:11434/api/generate -d '{
|
|||
|
||||
```json
|
||||
{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||
"response": "The sky is blue because it is the color of the sky.",
|
||||
"done": true,
|
||||
|
@ -390,7 +390,7 @@ If an empty prompt is provided, the model will be loaded into memory.
|
|||
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{
|
||||
"model": "llama3"
|
||||
"model": "llama3.1"
|
||||
}'
|
||||
```
|
||||
|
||||
|
@ -400,7 +400,7 @@ A single JSON object is returned:
|
|||
|
||||
```json
|
||||
{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"created_at": "2023-12-18T19:52:07.071755Z",
|
||||
"response": "",
|
||||
"done": true
|
||||
|
@ -445,7 +445,7 @@ Send a chat message with a streaming response.
|
|||
|
||||
```shell
|
||||
curl http://localhost:11434/api/chat -d '{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
|
@ -461,7 +461,7 @@ A stream of JSON objects is returned:
|
|||
|
||||
```json
|
||||
{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"created_at": "2023-08-04T08:52:19.385406455-07:00",
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
|
@ -476,7 +476,7 @@ Final response:
|
|||
|
||||
```json
|
||||
{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||
"done": true,
|
||||
"total_duration": 4883583458,
|
||||
|
@ -494,7 +494,7 @@ Final response:
|
|||
|
||||
```shell
|
||||
curl http://localhost:11434/api/chat -d '{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
|
@ -509,7 +509,7 @@ curl http://localhost:11434/api/chat -d '{
|
|||
|
||||
```json
|
||||
{
|
||||
"model": "registry.ollama.ai/library/llama3:latest",
|
||||
"model": "llama3.1",
|
||||
"created_at": "2023-12-12T14:13:43.416799Z",
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
|
@ -533,7 +533,7 @@ Send a chat message with a conversation history. You can use this same approach
|
|||
|
||||
```shell
|
||||
curl http://localhost:11434/api/chat -d '{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
|
@ -557,7 +557,7 @@ A stream of JSON objects is returned:
|
|||
|
||||
```json
|
||||
{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"created_at": "2023-08-04T08:52:19.385406455-07:00",
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
|
@ -571,7 +571,7 @@ Final response:
|
|||
|
||||
```json
|
||||
{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||
"done": true,
|
||||
"total_duration": 8113331500,
|
||||
|
@ -629,7 +629,7 @@ curl http://localhost:11434/api/chat -d '{
|
|||
|
||||
```shell
|
||||
curl http://localhost:11434/api/chat -d '{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
|
@ -647,7 +647,7 @@ curl http://localhost:11434/api/chat -d '{
|
|||
|
||||
```json
|
||||
{
|
||||
"model": "registry.ollama.ai/library/llama3:latest",
|
||||
"model": "llama3.1",
|
||||
"created_at": "2023-12-12T14:13:43.416799Z",
|
||||
"message": {
|
||||
"role": "assistant",
|
||||
|
@ -904,7 +904,7 @@ Show information about a model including details, modelfile, template, parameter
|
|||
|
||||
```shell
|
||||
curl http://localhost:11434/api/show -d '{
|
||||
"name": "llama3"
|
||||
"name": "llama3.1"
|
||||
}'
|
||||
```
|
||||
|
||||
|
@ -965,7 +965,7 @@ Copy a model. Creates a model with another name from an existing model.
|
|||
|
||||
```shell
|
||||
curl http://localhost:11434/api/copy -d '{
|
||||
"source": "llama3",
|
||||
"source": "llama3.1",
|
||||
"destination": "llama3-backup"
|
||||
}'
|
||||
```
|
||||
|
@ -1020,7 +1020,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
|
|||
|
||||
```shell
|
||||
curl http://localhost:11434/api/pull -d '{
|
||||
"name": "llama3"
|
||||
"name": "llama3.1"
|
||||
}'
|
||||
```
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:
|
|||
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"prompt": "Why is the sky blue?",
|
||||
"options": {
|
||||
"num_ctx": 4096
|
||||
|
@ -247,12 +247,12 @@ The `keep_alive` parameter can be set to:
|
|||
|
||||
For example, to preload a model and leave it in memory use:
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}'
|
||||
curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": -1}'
|
||||
```
|
||||
|
||||
To unload the model and free up memory use:
|
||||
```shell
|
||||
curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}'
|
||||
curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": 0}'
|
||||
```
|
||||
|
||||
Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
|
||||
|
|
|
@ -11,7 +11,7 @@ A model file is the blueprint to create and share models with Ollama.
|
|||
- [Examples](#examples)
|
||||
- [Instructions](#instructions)
|
||||
- [FROM (Required)](#from-required)
|
||||
- [Build from llama3.1](#build-from-llama31)
|
||||
- [Build from existing model](#build-from-existing-model)
|
||||
- [Build from a Safetensors model](#build-from-a-safetensors-model)
|
||||
- [Build from a GGUF file](#build-from-a-gguf-file)
|
||||
- [PARAMETER](#parameter)
|
||||
|
@ -50,7 +50,7 @@ INSTRUCTION arguments
|
|||
An example of a `Modelfile` creating a mario blueprint:
|
||||
|
||||
```modelfile
|
||||
FROM llama3
|
||||
FROM llama3.1
|
||||
# sets the temperature to 1 [higher is more creative, lower is more coherent]
|
||||
PARAMETER temperature 1
|
||||
# sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
|
||||
|
@ -72,10 +72,10 @@ More examples are available in the [examples directory](../examples).
|
|||
To view the Modelfile of a given model, use the `ollama show --modelfile` command.
|
||||
|
||||
```bash
|
||||
> ollama show --modelfile llama3
|
||||
> ollama show --modelfile llama3.1
|
||||
# Modelfile generated by "ollama show"
|
||||
# To build a new Modelfile based on this one, replace the FROM line with:
|
||||
# FROM llama3:latest
|
||||
# FROM llama3.1:latest
|
||||
FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
|
||||
TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
|
||||
|
||||
|
@ -100,7 +100,7 @@ The `FROM` instruction defines the base model to use when creating a model.
|
|||
FROM <model name>:<tag>
|
||||
```
|
||||
|
||||
#### Build from llama3.1
|
||||
#### Build from existing model
|
||||
|
||||
```modelfile
|
||||
FROM llama3.1
|
||||
|
|
|
@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
|
|||
'content': 'Say this is a test',
|
||||
}
|
||||
],
|
||||
model='llama3',
|
||||
model='llama3.1',
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
|
@ -46,13 +46,13 @@ response = client.chat.completions.create(
|
|||
)
|
||||
|
||||
completion = client.completions.create(
|
||||
model="llama3",
|
||||
model="llama3.1",
|
||||
prompt="Say this is a test",
|
||||
)
|
||||
|
||||
list_completion = client.models.list()
|
||||
|
||||
model = client.models.retrieve("llama3")
|
||||
model = client.models.retrieve("llama3.1")
|
||||
|
||||
embeddings = client.embeddings.create(
|
||||
model="all-minilm",
|
||||
|
@ -74,7 +74,7 @@ const openai = new OpenAI({
|
|||
|
||||
const chatCompletion = await openai.chat.completions.create({
|
||||
messages: [{ role: 'user', content: 'Say this is a test' }],
|
||||
model: 'llama3',
|
||||
model: 'llama3.1',
|
||||
})
|
||||
|
||||
const response = await openai.chat.completions.create({
|
||||
|
@ -94,13 +94,13 @@ const response = await openai.chat.completions.create({
|
|||
})
|
||||
|
||||
const completion = await openai.completions.create({
|
||||
model: "llama3",
|
||||
model: "llama3.1",
|
||||
prompt: "Say this is a test.",
|
||||
})
|
||||
|
||||
const listCompletion = await openai.models.list()
|
||||
|
||||
const model = await openai.models.retrieve("llama3")
|
||||
const model = await openai.models.retrieve("llama3.1")
|
||||
|
||||
const embedding = await openai.embeddings.create({
|
||||
model: "all-minilm",
|
||||
|
@ -114,7 +114,7 @@ const embedding = await openai.embeddings.create({
|
|||
curl http://localhost:11434/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
|
@ -154,13 +154,13 @@ curl http://localhost:11434/v1/chat/completions \
|
|||
curl http://localhost:11434/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "llama3",
|
||||
"model": "llama3.1",
|
||||
"prompt": "Say this is a test"
|
||||
}'
|
||||
|
||||
curl http://localhost:11434/v1/models
|
||||
|
||||
curl http://localhost:11434/v1/models/llama3
|
||||
curl http://localhost:11434/v1/models/llama3.1
|
||||
|
||||
curl http://localhost:11434/v1/embeddings \
|
||||
-H "Content-Type: application/json" \
|
||||
|
@ -274,7 +274,7 @@ curl http://localhost:11434/v1/embeddings \
|
|||
Before using a model, pull it locally `ollama pull`:
|
||||
|
||||
```shell
|
||||
ollama pull llama3
|
||||
ollama pull llama3.1
|
||||
```
|
||||
|
||||
### Default model names
|
||||
|
@ -282,7 +282,7 @@ ollama pull llama3
|
|||
For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
|
||||
|
||||
```
|
||||
ollama cp llama3 gpt-3.5-turbo
|
||||
ollama cp llama3.1 gpt-3.5-turbo
|
||||
```
|
||||
|
||||
Afterwards, this new model name can be specified the `model` field:
|
||||
|
|
|
@ -33,7 +33,7 @@ Omitting a template in these models puts the responsibility of correctly templat
|
|||
To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3.
|
||||
|
||||
```dockerfile
|
||||
FROM llama3
|
||||
FROM llama3.1
|
||||
|
||||
TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|>
|
||||
|
||||
|
|
|
@ -91,6 +91,17 @@ If none of those resolve the problem, gather additional information and file an
|
|||
- Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
|
||||
|
||||
|
||||
## AMD GPU Discovery
|
||||
|
||||
On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log.
|
||||
|
||||
When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.
|
||||
|
||||
If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
|
||||
- `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. This can help show more detailed error codes that can help troubleshoot problems
|
||||
- `OLLAMA_DEBUG=1` During GPU discovery additional information will be reported
|
||||
- Check dmesg for any errors from amdgpu or kfd drivers `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd`
|
||||
|
||||
## Windows Terminal Errors
|
||||
|
||||
Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly. This can result in a long string of strings like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect` To resolve this problem, please update to Win 10 22H1 or newer.
|
||||
|
|
|
@ -29,7 +29,7 @@ Ollama uses unicode characters for progress indication, which may render as unkn
|
|||
|
||||
Here's a quick example showing API access from `powershell`
|
||||
```powershell
|
||||
(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
|
||||
(Invoke-WebRequest -method POST -Body '{"model":"llama3.1", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
|
|
@ -179,53 +179,6 @@ var (
|
|||
HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
|
||||
)
|
||||
|
||||
func RunnersDir() (p string) {
|
||||
if p := Var("OLLAMA_RUNNERS_DIR"); p != "" {
|
||||
return p
|
||||
}
|
||||
|
||||
if runtime.GOOS != "windows" {
|
||||
return
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if p == "" {
|
||||
slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
|
||||
}
|
||||
}()
|
||||
|
||||
// On Windows we do not carry the payloads inside the main executable
|
||||
exe, err := os.Executable()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
cwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
var paths []string
|
||||
for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), LibRelativeToExe()), cwd} {
|
||||
paths = append(paths,
|
||||
root,
|
||||
filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
|
||||
filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
|
||||
)
|
||||
}
|
||||
|
||||
// Try a few variations to improve developer experience when building from source in the local tree
|
||||
for _, path := range paths {
|
||||
candidate := filepath.Join(path, "lib", "ollama", "runners")
|
||||
if _, err := os.Stat(candidate); err == nil {
|
||||
p = candidate
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return p
|
||||
}
|
||||
|
||||
func Uint(key string, defaultValue uint) func() uint {
|
||||
return func() uint {
|
||||
if s := Var(key); s != "" {
|
||||
|
@ -290,10 +243,22 @@ func AsMap() map[string]EnvVar {
|
|||
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
|
||||
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
|
||||
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
|
||||
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"},
|
||||
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
|
||||
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
|
||||
|
||||
// Informational
|
||||
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
|
||||
"HTTPS_PROXY": {"HTTPS_PROXY", String("HTTPS_PROXY")(), "HTTPS proxy"},
|
||||
"NO_PROXY": {"NO_PROXY", String("NO_PROXY")(), "No proxy"},
|
||||
}
|
||||
|
||||
if runtime.GOOS != "windows" {
|
||||
// Windows environment variables are case-insensitive so there's no need to duplicate them
|
||||
ret["http_proxy"] = EnvVar{"http_proxy", String("http_proxy")(), "HTTP proxy"}
|
||||
ret["https_proxy"] = EnvVar{"https_proxy", String("https_proxy")(), "HTTPS proxy"}
|
||||
ret["no_proxy"] = EnvVar{"no_proxy", String("no_proxy")(), "No proxy"}
|
||||
}
|
||||
|
||||
if runtime.GOOS != "darwin" {
|
||||
ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
|
||||
ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"}
|
||||
|
@ -302,6 +267,7 @@ func AsMap() map[string]EnvVar {
|
|||
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
|
||||
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
|
||||
}
|
||||
|
||||
return ret
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
langchain==0.0.274
|
||||
gpt4all==1.0.8
|
||||
chromadb==0.4.7
|
||||
chromadb==0.5.0
|
||||
llama-cpp-python==0.1.81
|
||||
urllib3==2.0.4
|
||||
PyMuPDF==1.23.5
|
||||
|
|
|
@ -4,5 +4,5 @@ SYSTEM """
|
|||
You are a log file analyzer. You will receive a set of lines from a log file for some software application, find the errors and other interesting aspects of the logs, and explain them so a new user can understand what they mean. If there are any steps they can do to resolve them, list the steps in your answer.
|
||||
"""
|
||||
|
||||
PARAMETER TEMPERATURE 0.3
|
||||
PARAMETER temperature 0.3
|
||||
|
||||
|
|
|
@ -21,6 +21,8 @@ You can try this with the `logtest.logfile` file included in this directory.
|
|||
2. Install the Python Requirements.
|
||||
|
||||
```bash
|
||||
python3 -m venv .venv
|
||||
source .venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
Requests==2.31.0
|
||||
Requests>=2.32.3
|
||||
|
|
|
@ -5,6 +5,7 @@ import (
|
|||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
@ -359,6 +360,10 @@ func AMDGetGPUInfo() []RocmGPUInfo {
|
|||
if len(resp) == 0 {
|
||||
slog.Info("no compatible amdgpu devices detected")
|
||||
}
|
||||
if err := verifyKFDDriverAccess(); err != nil {
|
||||
slog.Error("amdgpu devices detected but permission problems block access", "error", err)
|
||||
return nil
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
|
@ -455,3 +460,19 @@ func getFreeMemory(usedFile string) (uint64, error) {
|
|||
}
|
||||
return usedMemory, nil
|
||||
}
|
||||
|
||||
func verifyKFDDriverAccess() error {
|
||||
// Verify we have permissions - either running as root, or we have group access to the driver
|
||||
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0o666)
|
||||
if err != nil {
|
||||
if errors.Is(err, fs.ErrPermission) {
|
||||
return fmt.Errorf("permissions not set up properly. Either run ollama as root, or add you user account to the render group. %w", err)
|
||||
} else if errors.Is(err, fs.ErrNotExist) {
|
||||
// Container runtime failure?
|
||||
return fmt.Errorf("kfd driver not loaded. If running in a container, remember to include '--device /dev/kfd --device /dev/dri'")
|
||||
}
|
||||
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
|
||||
}
|
||||
fd.Close()
|
||||
return nil
|
||||
}
|
||||
|
|
148
gpu/assets.go
148
gpu/assets.go
|
@ -1,148 +0,0 @@
|
|||
package gpu
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
)
|
||||
|
||||
var (
|
||||
lock sync.Mutex
|
||||
payloadsDir = ""
|
||||
)
|
||||
|
||||
func PayloadsDir() (string, error) {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
var err error
|
||||
if payloadsDir == "" {
|
||||
runnersDir := envconfig.RunnersDir()
|
||||
|
||||
if runnersDir != "" {
|
||||
payloadsDir = runnersDir
|
||||
return payloadsDir, nil
|
||||
}
|
||||
|
||||
// The remainder only applies on non-windows where we still carry payloads in the main executable
|
||||
cleanupTmpDirs()
|
||||
tmpDir := envconfig.TmpDir()
|
||||
if tmpDir == "" {
|
||||
tmpDir, err = os.MkdirTemp("", "ollama")
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
|
||||
}
|
||||
} else {
|
||||
err = os.MkdirAll(tmpDir, 0o755)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Track our pid so we can clean up orphaned tmpdirs
|
||||
n := filepath.Join(tmpDir, "ollama.pid")
|
||||
if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
|
||||
return "", fmt.Errorf("failed to write pid file %s: %w", n, err)
|
||||
}
|
||||
|
||||
// We create a distinct subdirectory for payloads within the tmpdir
|
||||
// This will typically look like /tmp/ollama3208993108/runners on linux
|
||||
payloadsDir = filepath.Join(tmpDir, "runners")
|
||||
}
|
||||
return payloadsDir, nil
|
||||
}
|
||||
|
||||
// Best effort to clean up prior tmpdirs
|
||||
func cleanupTmpDirs() {
|
||||
matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
for _, match := range matches {
|
||||
raw, err := os.ReadFile(match)
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
slog.Debug("not a ollama runtime directory, skipping", "path", match)
|
||||
continue
|
||||
} else if err != nil {
|
||||
slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
pid, err := strconv.Atoi(string(raw))
|
||||
if err != nil {
|
||||
slog.Warn("invalid pid, skipping", "path", match, "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
p, err := os.FindProcess(pid)
|
||||
if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
|
||||
slog.Warn("process still running, skipping", "pid", pid, "path", match)
|
||||
continue
|
||||
}
|
||||
|
||||
if err := os.Remove(match); err != nil {
|
||||
slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
|
||||
}
|
||||
|
||||
runners := filepath.Join(filepath.Dir(match), "runners")
|
||||
if err := os.RemoveAll(runners); err != nil {
|
||||
slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
|
||||
}
|
||||
|
||||
if err := os.Remove(filepath.Dir(match)); err != nil {
|
||||
slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func Cleanup() {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
runnersDir := envconfig.RunnersDir()
|
||||
if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
|
||||
// We want to fully clean up the tmpdir parent of the payloads dir
|
||||
tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
|
||||
slog.Debug("cleaning up", "dir", tmpDir)
|
||||
err := os.RemoveAll(tmpDir)
|
||||
if err != nil {
|
||||
// On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove
|
||||
time.Sleep(1000 * time.Millisecond)
|
||||
err = os.RemoveAll(tmpDir)
|
||||
if err != nil {
|
||||
slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func UpdatePath(dir string) {
|
||||
if runtime.GOOS == "windows" {
|
||||
tmpDir := filepath.Dir(dir)
|
||||
pathComponents := strings.Split(os.Getenv("PATH"), ";")
|
||||
i := 0
|
||||
for _, comp := range pathComponents {
|
||||
if strings.EqualFold(comp, dir) {
|
||||
return
|
||||
}
|
||||
// Remove any other prior paths to our temp dir
|
||||
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
|
||||
pathComponents[i] = comp
|
||||
i++
|
||||
}
|
||||
}
|
||||
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
|
||||
slog.Info("updating", "PATH", newPath)
|
||||
os.Setenv("PATH", newPath)
|
||||
}
|
||||
// linux and darwin rely on rpath
|
||||
}
|
|
@ -93,10 +93,9 @@ func initCudaHandles() *cudaHandles {
|
|||
localAppData := os.Getenv("LOCALAPPDATA")
|
||||
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
|
||||
}
|
||||
tmpDir, _ := PayloadsDir()
|
||||
if tmpDir != "" {
|
||||
// TODO - add "payloads" for subprocess
|
||||
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
|
||||
libDir := LibraryDir()
|
||||
if libDir != "" {
|
||||
cudartMgmtPatterns = []string{filepath.Join(libDir, CudartMgmtName)}
|
||||
}
|
||||
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
|
||||
|
||||
|
|
10
llm/ext_server/server.cpp
vendored
10
llm/ext_server/server.cpp
vendored
|
@ -913,7 +913,9 @@ struct llama_server_context
|
|||
slot.sampled = result.tok;
|
||||
|
||||
// search stop word and delete it
|
||||
if (!llama_token_is_eog(model, result.tok))
|
||||
slot.generated_text += token_str;
|
||||
|
||||
slot.has_next_token = true;
|
||||
|
||||
if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
|
||||
|
@ -954,6 +956,8 @@ struct llama_server_context
|
|||
if (!incomplete)
|
||||
{
|
||||
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
|
||||
|
||||
if (!llama_token_is_eog(model, result.tok)) {
|
||||
const std::string str_test = slot.generated_text.substr(pos);
|
||||
bool is_stop_full = false;
|
||||
size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
|
||||
|
@ -979,6 +983,10 @@ struct llama_server_context
|
|||
slot.n_sent_text += result.text_to_send.size();
|
||||
// add the token to slot queue and cache
|
||||
}
|
||||
} else {
|
||||
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
|
||||
slot.n_sent_text += result.text_to_send.size();
|
||||
}
|
||||
|
||||
if (slot.params.stream)
|
||||
{
|
||||
|
@ -1117,9 +1125,7 @@ struct llama_server_context
|
|||
{"multimodal", multimodal}
|
||||
};
|
||||
|
||||
if (!llama_token_is_eog(model, tkn.tok)) {
|
||||
res.result_json["content"] = tkn.text_to_send;
|
||||
}
|
||||
|
||||
if (slot.sparams.n_probs > 0)
|
||||
{
|
||||
|
|
|
@ -31,6 +31,7 @@ init_vars() {
|
|||
NO_WHOLE_ARCHIVE=""
|
||||
GCC_ARCH="-arch ${ARCH}"
|
||||
DIST_BASE=../../dist/darwin-${GOARCH}/
|
||||
PAYLOAD_BASE=../../build/darwin/${GOARCH}
|
||||
;;
|
||||
"Linux")
|
||||
LIB_EXT="so"
|
||||
|
@ -40,6 +41,7 @@ init_vars() {
|
|||
# Cross compiling not supported on linux - Use docker
|
||||
GCC_ARCH=""
|
||||
DIST_BASE=../../dist/linux-${GOARCH}/
|
||||
PAYLOAD_BASE=../../build/linux/${GOARCH}
|
||||
;;
|
||||
*)
|
||||
;;
|
||||
|
@ -47,7 +49,8 @@ init_vars() {
|
|||
if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
|
||||
CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
|
||||
fi
|
||||
GZIP=$(which pigz 2>/dev/null || echo "gzip")
|
||||
GZIP=$(command -v pigz 2>/dev/null || echo "gzip")
|
||||
RUNNER_BASE="${DIST_BASE}/lib/ollama/runners"
|
||||
}
|
||||
|
||||
git_module_setup() {
|
||||
|
@ -91,17 +94,34 @@ build() {
|
|||
rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
|
||||
}
|
||||
|
||||
compress() {
|
||||
echo "Compressing payloads to reduce overall binary size..."
|
||||
rm -rf ${BUILD_DIR}/bin/*.gz
|
||||
dist() {
|
||||
[ -z "${RUNNER}" ] && exit 1
|
||||
mkdir -p ${RUNNER_BASE}/${RUNNER}/
|
||||
for f in ${BUILD_DIR}/bin/* ; do
|
||||
${GZIP} -n --best -f ${f} &
|
||||
cp ${f} ${RUNNER_BASE}/${RUNNER}/
|
||||
done
|
||||
# check for lib directory
|
||||
if [ -d ${BUILD_DIR}/lib ]; then
|
||||
for f in ${BUILD_DIR}/lib/* ; do
|
||||
cp ${f} ${RUNNER_BASE}/${RUNNER}/
|
||||
done
|
||||
fi
|
||||
}
|
||||
|
||||
# Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir
|
||||
compress() {
|
||||
[ -z "${RUNNER}" ] && exit 1
|
||||
echo "Compressing payloads with ${GZIP} to reduce overall binary size..."
|
||||
rm -rf "${PAYLOAD_BASE}/${RUNNER}/"
|
||||
mkdir -p "${PAYLOAD_BASE}/${RUNNER}/"
|
||||
for f in ${BUILD_DIR}/bin/* ; do
|
||||
${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
|
||||
compress_pids+=" $!"
|
||||
done
|
||||
# check for lib directory
|
||||
if [ -d ${BUILD_DIR}/lib ]; then
|
||||
for f in ${BUILD_DIR}/lib/* ; do
|
||||
${GZIP} -n --best -f ${f} &
|
||||
${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
|
||||
compress_pids+=" $!"
|
||||
done
|
||||
fi
|
||||
|
@ -117,7 +137,7 @@ wait_for_compress() {
|
|||
|
||||
install() {
|
||||
echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
|
||||
for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do
|
||||
for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT} | grep -v "${BUILD_DIR}/bin/" ); do
|
||||
rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
|
||||
cp -af "${lib}" "${BUILD_DIR}/bin/"
|
||||
done
|
||||
|
|
|
@ -39,7 +39,8 @@ case "${GOARCH}" in
|
|||
#
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/darwin/${ARCH}/cpu"
|
||||
RUNNER=cpu
|
||||
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
|
||||
echo "Building LCD CPU"
|
||||
build
|
||||
sign ${BUILD_DIR}/bin/ollama_llama_server
|
||||
|
@ -51,7 +52,8 @@ case "${GOARCH}" in
|
|||
#
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
|
||||
RUNNER=cpu_avx
|
||||
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
|
||||
echo "Building AVX CPU"
|
||||
build
|
||||
sign ${BUILD_DIR}/bin/ollama_llama_server
|
||||
|
@ -63,7 +65,8 @@ case "${GOARCH}" in
|
|||
#
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
|
||||
RUNNER=cpu_avx2
|
||||
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
|
||||
echo "Building AVX2 CPU"
|
||||
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
|
||||
build
|
||||
|
@ -84,7 +87,8 @@ case "${GOARCH}" in
|
|||
if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/darwin/${ARCH}/metal"
|
||||
RUNNER="metal"
|
||||
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
|
||||
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
|
||||
build
|
||||
sign ${BUILD_DIR}/bin/ollama_llama_server
|
||||
|
|
|
@ -79,10 +79,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
|
|||
init_vars
|
||||
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
|
||||
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/linux/${ARCH}/cpu"
|
||||
RUNNER="cpu"
|
||||
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
|
||||
echo "Building custom CPU"
|
||||
build
|
||||
install
|
||||
dist
|
||||
compress
|
||||
else
|
||||
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
|
||||
|
@ -102,10 +104,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
|
|||
#
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/linux/${ARCH}/cpu"
|
||||
RUNNER=cpu
|
||||
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
|
||||
echo "Building LCD CPU"
|
||||
build
|
||||
install
|
||||
dist
|
||||
compress
|
||||
fi
|
||||
|
||||
|
@ -120,10 +124,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
|
|||
#
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
|
||||
RUNNER=cpu_avx
|
||||
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
|
||||
echo "Building AVX CPU"
|
||||
build
|
||||
install
|
||||
dist
|
||||
compress
|
||||
fi
|
||||
|
||||
|
@ -134,10 +140,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
|
|||
#
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
|
||||
RUNNER=cpu_avx2
|
||||
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
|
||||
echo "Building AVX2 CPU"
|
||||
build
|
||||
install
|
||||
dist
|
||||
compress
|
||||
fi
|
||||
fi
|
||||
|
@ -187,11 +195,13 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
|
|||
fi
|
||||
export CUDAFLAGS="-t8"
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
|
||||
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
|
||||
RUNNER=cuda${CUDA_VARIANT}
|
||||
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
|
||||
export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
|
||||
CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
|
||||
build
|
||||
install
|
||||
dist
|
||||
echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
|
||||
mkdir -p "${CUDA_DIST_DIR}"
|
||||
for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
|
||||
|
@ -212,7 +222,8 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
|
|||
source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
|
||||
CC=icx
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
|
||||
BUILD_DIR="../build/linux/${ARCH}/oneapi"
|
||||
RUNNER=oneapi
|
||||
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
|
||||
ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
|
||||
export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
|
||||
DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
|
||||
|
@ -231,6 +242,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
|
|||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
|
||||
cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
|
||||
install
|
||||
dist
|
||||
compress
|
||||
fi
|
||||
|
||||
|
@ -259,7 +271,8 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
|
|||
CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
|
||||
echo "Building custom ROCM GPU"
|
||||
fi
|
||||
BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
|
||||
RUNNER=rocm${ROCM_VARIANT}
|
||||
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
|
||||
# ROCm dependencies are too large to fit into a unified bundle
|
||||
ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
|
||||
# TODO figure out how to disable runpath (rpath)
|
||||
|
@ -269,13 +282,17 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
|
|||
|
||||
# copy the ROCM dependencies
|
||||
mkdir -p "${ROCM_DIST_DIR}"
|
||||
for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do
|
||||
for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${GOARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do
|
||||
cp -a "${dep}"* "${ROCM_DIST_DIR}"
|
||||
if [ $(readlink -f "${dep}") != "${dep}" ] ; then
|
||||
cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}"
|
||||
fi
|
||||
done
|
||||
install
|
||||
dist
|
||||
compress
|
||||
fi
|
||||
|
||||
cleanup
|
||||
wait_for_compress
|
||||
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
|
||||
echo "go generate completed. LLM runners: $(cd ${PAYLOAD_BASE}; echo *)"
|
||||
|
|
|
@ -1,11 +1,7 @@
|
|||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
//go:embed build/darwin/arm64/*/bin/*
|
||||
var libEmbed embed.FS
|
||||
|
||||
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
|
|
@ -1,11 +0,0 @@
|
|||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
//go:embed build/darwin/x86_64/*/bin/*
|
||||
var libEmbed embed.FS
|
||||
|
||||
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
|
|
@ -1,11 +1,7 @@
|
|||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
//go:embed build/linux/*/*/bin/*
|
||||
var libEmbed embed.FS
|
||||
|
||||
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
|
||||
|
|
|
@ -1,13 +1,9 @@
|
|||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// unused on windows
|
||||
var libEmbed embed.FS
|
||||
|
||||
const CREATE_DEFAULT_ERROR_MODE = 0x04000000
|
||||
|
||||
var LlamaServerSysProcAttr = &syscall.SysProcAttr{
|
||||
|
|
233
llm/payload.go
233
llm/payload.go
|
@ -1,233 +0,0 @@
|
|||
package llm
|
||||
|
||||
import (
|
||||
"compress/gzip"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/sync/errgroup"
|
||||
|
||||
"github.com/ollama/ollama/gpu"
|
||||
)
|
||||
|
||||
var errPayloadMissing = errors.New("expected payloads not included in this build of ollama")
|
||||
|
||||
func Init() error {
|
||||
payloadsDir, err := gpu.PayloadsDir()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if runtime.GOOS != "windows" {
|
||||
slog.Info("extracting embedded files", "dir", payloadsDir)
|
||||
binGlob := "build/*/*/*/bin/*"
|
||||
|
||||
// extract server libraries
|
||||
err = extractFiles(payloadsDir, binGlob)
|
||||
if err != nil {
|
||||
return fmt.Errorf("extract binaries: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
var variants []string
|
||||
for v := range getAvailableServers() {
|
||||
variants = append(variants, v)
|
||||
}
|
||||
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
|
||||
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// binary names may contain an optional variant separated by '_'
|
||||
// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2"
|
||||
// Any library without a variant is the lowest common denominator
|
||||
func getAvailableServers() map[string]string {
|
||||
payloadsDir, err := gpu.PayloadsDir()
|
||||
if err != nil {
|
||||
slog.Error("payload lookup error", "error", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
// glob payloadsDir for files that start with ollama_
|
||||
pattern := filepath.Join(payloadsDir, "*", "ollama_*")
|
||||
|
||||
files, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
slog.Debug("could not glob", "pattern", pattern, "error", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
servers := make(map[string]string)
|
||||
for _, file := range files {
|
||||
slog.Debug("availableServers : found", "file", file)
|
||||
servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
|
||||
}
|
||||
|
||||
return servers
|
||||
}
|
||||
|
||||
// serversForGpu returns a list of compatible servers give the provided GPU
|
||||
// info, ordered by performance. assumes Init() has been called
|
||||
// TODO - switch to metadata based mapping
|
||||
func serversForGpu(info gpu.GpuInfo) []string {
|
||||
// glob workDir for files that start with ollama_
|
||||
availableServers := getAvailableServers()
|
||||
requested := info.Library
|
||||
if info.Variant != gpu.CPUCapabilityNone.String() {
|
||||
requested += "_" + info.Variant
|
||||
}
|
||||
|
||||
servers := []string{}
|
||||
|
||||
// exact match first
|
||||
for a := range availableServers {
|
||||
if a == requested {
|
||||
servers = []string{a}
|
||||
|
||||
if a == "metal" {
|
||||
return servers
|
||||
}
|
||||
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
alt := []string{}
|
||||
|
||||
// Then for GPUs load alternates and sort the list for consistent load ordering
|
||||
if info.Library != "cpu" {
|
||||
for a := range availableServers {
|
||||
if info.Library == strings.Split(a, "_")[0] && a != requested {
|
||||
alt = append(alt, a)
|
||||
}
|
||||
}
|
||||
|
||||
slices.Sort(alt)
|
||||
servers = append(servers, alt...)
|
||||
}
|
||||
|
||||
if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
|
||||
// Load up the best CPU variant if not primary requested
|
||||
if info.Library != "cpu" {
|
||||
variant := gpu.GetCPUCapability()
|
||||
// If no variant, then we fall back to default
|
||||
// If we have a variant, try that if we find an exact match
|
||||
// Attempting to run the wrong CPU instructions will panic the
|
||||
// process
|
||||
if variant != gpu.CPUCapabilityNone {
|
||||
for cmp := range availableServers {
|
||||
if cmp == "cpu_"+variant.String() {
|
||||
servers = append(servers, cmp)
|
||||
break
|
||||
}
|
||||
}
|
||||
} else {
|
||||
servers = append(servers, "cpu")
|
||||
}
|
||||
}
|
||||
|
||||
if len(servers) == 0 {
|
||||
servers = []string{"cpu"}
|
||||
}
|
||||
}
|
||||
|
||||
return servers
|
||||
}
|
||||
|
||||
// Return the optimal server for this CPU architecture
|
||||
func serverForCpu() string {
|
||||
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
|
||||
return "metal"
|
||||
}
|
||||
variant := gpu.GetCPUCapability()
|
||||
availableServers := getAvailableServers()
|
||||
if variant != gpu.CPUCapabilityNone {
|
||||
for cmp := range availableServers {
|
||||
if cmp == "cpu_"+variant.String() {
|
||||
return cmp
|
||||
}
|
||||
}
|
||||
}
|
||||
return "cpu"
|
||||
}
|
||||
|
||||
// extract extracts the embedded files to the target directory
|
||||
func extractFiles(targetDir string, glob string) error {
|
||||
files, err := fs.Glob(libEmbed, glob)
|
||||
if err != nil || len(files) == 0 {
|
||||
return errPayloadMissing
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(targetDir, 0o755); err != nil {
|
||||
return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
|
||||
}
|
||||
|
||||
g := new(errgroup.Group)
|
||||
|
||||
// build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE
|
||||
for _, file := range files {
|
||||
filename := file
|
||||
|
||||
variant := filepath.Base(filepath.Dir(filepath.Dir(filename)))
|
||||
|
||||
slog.Debug("extracting", "variant", variant, "file", filename)
|
||||
|
||||
g.Go(func() error {
|
||||
srcf, err := libEmbed.Open(filename)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer srcf.Close()
|
||||
|
||||
src := io.Reader(srcf)
|
||||
if strings.HasSuffix(filename, ".gz") {
|
||||
src, err = gzip.NewReader(src)
|
||||
if err != nil {
|
||||
return fmt.Errorf("decompress payload %s: %v", filename, err)
|
||||
}
|
||||
filename = strings.TrimSuffix(filename, ".gz")
|
||||
}
|
||||
|
||||
variantDir := filepath.Join(targetDir, variant)
|
||||
if err := os.MkdirAll(variantDir, 0o755); err != nil {
|
||||
return fmt.Errorf("extractFiles could not mkdir %s: %v", variantDir, err)
|
||||
}
|
||||
|
||||
base := filepath.Base(filename)
|
||||
destFilename := filepath.Join(variantDir, base)
|
||||
|
||||
_, err = os.Stat(destFilename)
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
return fmt.Errorf("write payload %s: %v", filename, err)
|
||||
}
|
||||
defer destFile.Close()
|
||||
if _, err := io.Copy(destFile, src); err != nil {
|
||||
return fmt.Errorf("copy payload %s: %v", filename, err)
|
||||
}
|
||||
case err != nil:
|
||||
return fmt.Errorf("stat payload %s: %v", filename, err)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
err = g.Wait()
|
||||
if err != nil {
|
||||
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
|
||||
gpu.Cleanup()
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
|
@ -24,9 +24,11 @@ import (
|
|||
"golang.org/x/sync/semaphore"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/build"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
"github.com/ollama/ollama/runners"
|
||||
)
|
||||
|
||||
type LlamaServer interface {
|
||||
|
@ -106,7 +108,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||
gpus = gpu.GetCPUInfo()
|
||||
}
|
||||
if len(gpus) == 1 && gpus[0].Library == "cpu" {
|
||||
cpuRunner = serverForCpu()
|
||||
cpuRunner = runners.ServerForCpu()
|
||||
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
} else {
|
||||
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
|
@ -118,7 +120,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||
opts.NumGPU = 0
|
||||
case gpus[0].Library != "metal" && estimate.Layers == 0:
|
||||
// Don't bother loading into the GPU if no layers can fit
|
||||
cpuRunner = serverForCpu()
|
||||
cpuRunner = runners.ServerForCpu()
|
||||
gpus = gpu.GetCPUInfo()
|
||||
case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
|
||||
opts.NumGPU = estimate.Layers
|
||||
|
@ -145,25 +147,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
|
||||
}
|
||||
|
||||
availableServers := getAvailableServers()
|
||||
if len(availableServers) == 0 {
|
||||
if runtime.GOOS != "windows" {
|
||||
slog.Warn("llama server binary disappeared, reinitializing payloads")
|
||||
err = Init()
|
||||
rDir, err := runners.Refresh(build.EmbedFS)
|
||||
if err != nil {
|
||||
slog.Warn("failed to reinitialize payloads", "error", err)
|
||||
return nil, err
|
||||
}
|
||||
availableServers = getAvailableServers()
|
||||
} else {
|
||||
|
||||
availableServers := runners.GetAvailableServers(rDir)
|
||||
if len(availableServers) == 0 {
|
||||
return nil, finalErr
|
||||
}
|
||||
}
|
||||
var servers []string
|
||||
if cpuRunner != "" {
|
||||
servers = []string{cpuRunner}
|
||||
} else {
|
||||
servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
|
||||
servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
|
||||
}
|
||||
demandLib := envconfig.LLMLibrary()
|
||||
if demandLib != "" {
|
||||
|
@ -274,7 +271,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||
params = append(params, "--tensor-split", estimate.TensorSplit)
|
||||
}
|
||||
|
||||
for i := range len(servers) {
|
||||
for i := range servers {
|
||||
dir := availableServers[servers[i]]
|
||||
if dir == "" {
|
||||
// Shouldn't happen
|
||||
|
@ -330,7 +327,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||
_, err := os.Stat(server)
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
|
||||
err = Init()
|
||||
_, err = runners.Refresh(build.EmbedFS)
|
||||
if err != nil {
|
||||
slog.Warn("failed to reinitialize payloads", "error", err)
|
||||
return nil, err
|
||||
|
|
384
runners/common.go
Normal file
384
runners/common.go
Normal file
|
@ -0,0 +1,384 @@
|
|||
package runners
|
||||
|
||||
import (
|
||||
"compress/gzip"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
|
||||
"golang.org/x/sync/errgroup"
|
||||
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
)
|
||||
|
||||
const (
|
||||
binGlob = "*/*/*/*"
|
||||
)
|
||||
|
||||
var (
|
||||
lock sync.Mutex
|
||||
runnersDir = ""
|
||||
)
|
||||
|
||||
// Return the location where runners are stored
|
||||
// If runners are payloads, this will either extract them
|
||||
// or refresh them if any have disappeared due to tmp cleaners
|
||||
func Refresh(payloadFS fs.FS) (string, error) {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
var err error
|
||||
|
||||
// Wire up extra logging on our first load
|
||||
if runnersDir == "" {
|
||||
defer func() {
|
||||
var runners []string
|
||||
for v := range GetAvailableServers(runnersDir) {
|
||||
runners = append(runners, v)
|
||||
}
|
||||
slog.Info("Dynamic LLM libraries", "runners", runners)
|
||||
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
|
||||
}()
|
||||
}
|
||||
|
||||
if hasPayloads(payloadFS) {
|
||||
if runnersDir == "" {
|
||||
runnersDir, err = extractRunners(payloadFS)
|
||||
} else {
|
||||
err = refreshRunners(payloadFS, runnersDir)
|
||||
}
|
||||
} else if runnersDir == "" {
|
||||
runnersDir, err = locateRunners()
|
||||
}
|
||||
|
||||
return runnersDir, err
|
||||
}
|
||||
|
||||
func Cleanup(payloadFS fs.FS) {
|
||||
lock.Lock()
|
||||
defer lock.Unlock()
|
||||
if hasPayloads(payloadFS) && runnersDir != "" {
|
||||
// We want to fully clean up the tmpdir parent of the payloads dir
|
||||
tmpDir := filepath.Clean(filepath.Join(runnersDir, ".."))
|
||||
slog.Debug("cleaning up", "dir", tmpDir)
|
||||
err := os.RemoveAll(tmpDir)
|
||||
if err != nil {
|
||||
slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func locateRunners() (string, error) {
|
||||
exe, err := os.Executable()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
cwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
var paths []string
|
||||
for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe()), cwd} {
|
||||
paths = append(paths,
|
||||
root,
|
||||
filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
|
||||
filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
|
||||
)
|
||||
}
|
||||
|
||||
// Try a few variations to improve developer experience when building from source in the local tree
|
||||
for _, path := range paths {
|
||||
candidate := filepath.Join(path, "lib", "ollama", "runners")
|
||||
if _, err := os.Stat(candidate); err == nil {
|
||||
return candidate, nil
|
||||
}
|
||||
}
|
||||
return "", fmt.Errorf("unable to locate runners in any search path %v", paths)
|
||||
}
|
||||
|
||||
// Return true if we're carying nested payloads for the runners
|
||||
func hasPayloads(payloadFS fs.FS) bool {
|
||||
files, err := fs.Glob(payloadFS, binGlob)
|
||||
if err != nil || len(files) == 0 || (len(files) == 1 && strings.Contains(files[0], "placeholder")) {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func extractRunners(payloadFS fs.FS) (string, error) {
|
||||
cleanupTmpDirs()
|
||||
tmpDir, err := os.MkdirTemp(envconfig.TmpDir(), "ollama")
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
|
||||
}
|
||||
// Track our pid so we can clean up orphaned tmpdirs
|
||||
n := filepath.Join(tmpDir, "ollama.pid")
|
||||
if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
|
||||
slog.Warn("failed to write pid file", "file", n, "error", err)
|
||||
}
|
||||
// We create a distinct subdirectory for payloads within the tmpdir
|
||||
// This will typically look like /tmp/ollama3208993108/runners on linux
|
||||
rDir := filepath.Join(tmpDir, "runners")
|
||||
|
||||
slog.Info("extracting embedded files", "dir", rDir)
|
||||
return rDir, refreshRunners(payloadFS, rDir)
|
||||
}
|
||||
|
||||
func refreshRunners(payloadFS fs.FS, rDir string) error {
|
||||
// extract or refresh server libraries
|
||||
err := extractFiles(payloadFS, rDir, binGlob)
|
||||
if err != nil {
|
||||
return fmt.Errorf("extract binaries: %v", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// extract extracts the embedded files to the target directory
|
||||
func extractFiles(payloadFS fs.FS, targetDir string, glob string) error {
|
||||
files, err := fs.Glob(payloadFS, glob)
|
||||
if err != nil || len(files) == 0 {
|
||||
// Should not happen
|
||||
return fmt.Errorf("extractFiles called without payload present")
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(targetDir, 0o755); err != nil {
|
||||
return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
|
||||
}
|
||||
|
||||
g := new(errgroup.Group)
|
||||
|
||||
// $OS/$GOARCH/$RUNNER/$FILE
|
||||
for _, file := range files {
|
||||
filename := file
|
||||
|
||||
runner := filepath.Base(filepath.Dir(filename))
|
||||
|
||||
slog.Debug("extracting", "runner", runner, "payload", filename)
|
||||
|
||||
g.Go(func() error {
|
||||
srcf, err := payloadFS.Open(filename)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer srcf.Close()
|
||||
|
||||
src := io.Reader(srcf)
|
||||
if strings.HasSuffix(filename, ".gz") {
|
||||
src, err = gzip.NewReader(src)
|
||||
if err != nil {
|
||||
return fmt.Errorf("decompress payload %s: %v", filename, err)
|
||||
}
|
||||
filename = strings.TrimSuffix(filename, ".gz")
|
||||
}
|
||||
|
||||
runnerDir := filepath.Join(targetDir, runner)
|
||||
if err := os.MkdirAll(runnerDir, 0o755); err != nil {
|
||||
return fmt.Errorf("extractFiles could not mkdir %s: %v", runnerDir, err)
|
||||
}
|
||||
|
||||
base := filepath.Base(filename)
|
||||
destFilename := filepath.Join(runnerDir, base)
|
||||
|
||||
_, err = os.Stat(destFilename)
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
return fmt.Errorf("write payload %s: %v", filename, err)
|
||||
}
|
||||
defer destFile.Close()
|
||||
if _, err := io.Copy(destFile, src); err != nil {
|
||||
return fmt.Errorf("copy payload %s: %v", filename, err)
|
||||
}
|
||||
case err != nil:
|
||||
return fmt.Errorf("stat payload %s: %v", filename, err)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
err = g.Wait()
|
||||
if err != nil {
|
||||
slog.Error("failed to extract files", "error", err)
|
||||
// If we fail to extract, the payload dir is most likely unusable, so cleanup whatever we extracted
|
||||
err := os.RemoveAll(targetDir)
|
||||
if err != nil {
|
||||
slog.Warn("failed to cleanup incomplete payload dir", "dir", targetDir, "error", err)
|
||||
}
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Best effort to clean up prior tmpdirs
|
||||
func cleanupTmpDirs() {
|
||||
tmpDir := envconfig.TmpDir()
|
||||
if tmpDir == "" {
|
||||
tmpDir = os.TempDir()
|
||||
}
|
||||
matches, err := filepath.Glob(filepath.Join(tmpDir, "ollama*", "ollama.pid"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
for _, match := range matches {
|
||||
raw, err := os.ReadFile(match)
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
slog.Debug("not a ollama runtime directory, skipping", "path", match)
|
||||
continue
|
||||
} else if err != nil {
|
||||
slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
pid, err := strconv.Atoi(string(raw))
|
||||
if err != nil {
|
||||
slog.Warn("invalid pid, skipping", "path", match, "error", err)
|
||||
continue
|
||||
}
|
||||
|
||||
p, err := os.FindProcess(pid)
|
||||
if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
|
||||
slog.Warn("process still running, skipping", "pid", pid, "path", match)
|
||||
continue
|
||||
}
|
||||
|
||||
if err := os.Remove(match); err != nil {
|
||||
slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
|
||||
}
|
||||
|
||||
runners := filepath.Join(filepath.Dir(match), "runners")
|
||||
if err := os.RemoveAll(runners); err != nil {
|
||||
slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
|
||||
}
|
||||
|
||||
if err := os.Remove(filepath.Dir(match)); err != nil {
|
||||
slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// directory names are the name of the runner and may contain an optional
|
||||
// variant prefixed with '_' as the separator. For example, "cuda_v11" and
|
||||
// "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the
|
||||
// lowest common denominator
|
||||
func GetAvailableServers(payloadsDir string) map[string]string {
|
||||
if payloadsDir == "" {
|
||||
slog.Error("empty runner dir")
|
||||
return nil
|
||||
}
|
||||
|
||||
// glob payloadsDir for files that start with ollama_
|
||||
pattern := filepath.Join(payloadsDir, "*", "ollama_*")
|
||||
|
||||
files, err := filepath.Glob(pattern)
|
||||
if err != nil {
|
||||
slog.Debug("could not glob", "pattern", pattern, "error", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
servers := make(map[string]string)
|
||||
for _, file := range files {
|
||||
slog.Debug("availableServers : found", "file", file)
|
||||
servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
|
||||
}
|
||||
|
||||
return servers
|
||||
}
|
||||
|
||||
// serversForGpu returns a list of compatible servers give the provided GPU
|
||||
// info, ordered by performance. assumes Init() has been called
|
||||
// TODO - switch to metadata based mapping
|
||||
func ServersForGpu(info gpu.GpuInfo) []string {
|
||||
// glob workDir for files that start with ollama_
|
||||
availableServers := GetAvailableServers(runnersDir)
|
||||
requested := info.Library
|
||||
if info.Variant != gpu.CPUCapabilityNone.String() {
|
||||
requested += "_" + info.Variant
|
||||
}
|
||||
|
||||
servers := []string{}
|
||||
|
||||
// exact match first
|
||||
for a := range availableServers {
|
||||
if a == requested {
|
||||
servers = []string{a}
|
||||
|
||||
if a == "metal" {
|
||||
return servers
|
||||
}
|
||||
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
alt := []string{}
|
||||
|
||||
// Then for GPUs load alternates and sort the list for consistent load ordering
|
||||
if info.Library != "cpu" {
|
||||
for a := range availableServers {
|
||||
if info.Library == strings.Split(a, "_")[0] && a != requested {
|
||||
alt = append(alt, a)
|
||||
}
|
||||
}
|
||||
|
||||
slices.Sort(alt)
|
||||
servers = append(servers, alt...)
|
||||
}
|
||||
|
||||
if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
|
||||
// Load up the best CPU variant if not primary requested
|
||||
if info.Library != "cpu" {
|
||||
variant := gpu.GetCPUCapability()
|
||||
// If no variant, then we fall back to default
|
||||
// If we have a variant, try that if we find an exact match
|
||||
// Attempting to run the wrong CPU instructions will panic the
|
||||
// process
|
||||
if variant != gpu.CPUCapabilityNone {
|
||||
for cmp := range availableServers {
|
||||
if cmp == "cpu_"+variant.String() {
|
||||
servers = append(servers, cmp)
|
||||
break
|
||||
}
|
||||
}
|
||||
} else {
|
||||
servers = append(servers, "cpu")
|
||||
}
|
||||
}
|
||||
|
||||
if len(servers) == 0 {
|
||||
servers = []string{"cpu"}
|
||||
}
|
||||
}
|
||||
|
||||
return servers
|
||||
}
|
||||
|
||||
// Return the optimal server for this CPU architecture
|
||||
func ServerForCpu() string {
|
||||
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
|
||||
return "metal"
|
||||
}
|
||||
variant := gpu.GetCPUCapability()
|
||||
availableServers := GetAvailableServers(runnersDir)
|
||||
if variant != gpu.CPUCapabilityNone {
|
||||
for cmp := range availableServers {
|
||||
if cmp == "cpu_"+variant.String() {
|
||||
return cmp
|
||||
}
|
||||
}
|
||||
}
|
||||
return "cpu"
|
||||
}
|
50
runners/runners_test.go
Normal file
50
runners/runners_test.go
Normal file
|
@ -0,0 +1,50 @@
|
|||
package runners
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"os"
|
||||
"path"
|
||||
"runtime"
|
||||
"strings"
|
||||
"testing"
|
||||
"testing/fstest"
|
||||
)
|
||||
|
||||
func TestRefreshRunners(t *testing.T) {
|
||||
slog.SetLogLoggerLevel(slog.LevelDebug)
|
||||
|
||||
payloadFS := fstest.MapFS{
|
||||
path.Join(runtime.GOOS, runtime.GOARCH, "foo", "ollama_llama_server"): {Data: []byte("hello, world\n")},
|
||||
}
|
||||
tmpDir, err := os.MkdirTemp("", "testing")
|
||||
if err != nil {
|
||||
t.Fatalf("failed to make tmp dir %s", err)
|
||||
}
|
||||
t.Setenv("OLLAMA_TMPDIR", tmpDir)
|
||||
rDir, err := Refresh(payloadFS)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to extract to %s %s", tmpDir, err)
|
||||
}
|
||||
if !strings.Contains(rDir, tmpDir) {
|
||||
t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
|
||||
}
|
||||
|
||||
// spot check results
|
||||
servers := GetAvailableServers(rDir)
|
||||
if len(servers) < 1 {
|
||||
t.Fatalf("expected at least 1 server")
|
||||
}
|
||||
|
||||
// Refresh contents
|
||||
rDir, err = extractRunners(payloadFS)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to extract to %s %s", tmpDir, err)
|
||||
}
|
||||
if !strings.Contains(rDir, tmpDir) {
|
||||
t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
|
||||
}
|
||||
|
||||
cleanupTmpDirs()
|
||||
|
||||
Cleanup(payloadFS)
|
||||
}
|
|
@ -2,8 +2,7 @@
|
|||
|
||||
set -e
|
||||
|
||||
export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
|
||||
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
|
||||
. $(dirname $0)/env.sh
|
||||
|
||||
mkdir -p dist
|
||||
|
||||
|
|
|
@ -2,76 +2,34 @@
|
|||
|
||||
set -eu
|
||||
|
||||
export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
|
||||
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
|
||||
|
||||
# We use 2 different image repositories to handle combining architecture images into multiarch manifest
|
||||
# (The ROCm image is x86 only and is not a multiarch manifest)
|
||||
# For developers, you can override the DOCKER_ORG to generate multiarch manifests
|
||||
# DOCKER_ORG=jdoe PUSH=1 ./scripts/build_docker.sh
|
||||
DOCKER_ORG=${DOCKER_ORG:-"ollama"}
|
||||
RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
|
||||
FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
|
||||
|
||||
BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
|
||||
. $(dirname $0)/env.sh
|
||||
|
||||
# Set PUSH to a non-empty string to trigger push instead of load
|
||||
PUSH=${PUSH:-""}
|
||||
|
||||
# In CI mode, we break things down
|
||||
OLLAMA_SKIP_MANIFEST_CREATE=${OLLAMA_SKIP_MANIFEST_CREATE:-""}
|
||||
OLLAMA_SKIP_IMAGE_BUILD=${OLLAMA_SKIP_IMAGE_BUILD:-""}
|
||||
|
||||
if [ -z "${PUSH}" ] ; then
|
||||
echo "Building ${FINAL_IMAGE_REPO}:$VERSION locally. set PUSH=1 to push"
|
||||
LOAD_OR_PUSH="--load"
|
||||
else
|
||||
echo "Will be pushing ${RELEASE_IMAGE_REPO}:$VERSION for ${BUILD_ARCH}"
|
||||
echo "Will be pushing ${FINAL_IMAGE_REPO}:$VERSION"
|
||||
LOAD_OR_PUSH="--push"
|
||||
fi
|
||||
|
||||
if [ -z "${OLLAMA_SKIP_IMAGE_BUILD}" ]; then
|
||||
for TARGETARCH in ${BUILD_ARCH}; do
|
||||
docker build \
|
||||
docker buildx build \
|
||||
${LOAD_OR_PUSH} \
|
||||
--platform=linux/${TARGETARCH} \
|
||||
--build-arg=VERSION \
|
||||
--build-arg=GOFLAGS \
|
||||
--platform=${PLATFORM} \
|
||||
${OLLAMA_COMMON_BUILD_ARGS} \
|
||||
-f Dockerfile \
|
||||
-t ${RELEASE_IMAGE_REPO}:$VERSION-${TARGETARCH} \
|
||||
-t ${FINAL_IMAGE_REPO}:$VERSION \
|
||||
.
|
||||
done
|
||||
|
||||
if echo ${BUILD_ARCH} | grep "amd64" > /dev/null; then
|
||||
docker build \
|
||||
if echo $PLATFORM | grep "amd64" > /dev/null; then
|
||||
docker buildx build \
|
||||
${LOAD_OR_PUSH} \
|
||||
--platform=linux/amd64 \
|
||||
--build-arg=VERSION \
|
||||
--build-arg=GOFLAGS \
|
||||
${OLLAMA_COMMON_BUILD_ARGS} \
|
||||
--target runtime-rocm \
|
||||
-f Dockerfile \
|
||||
-t ${RELEASE_IMAGE_REPO}:$VERSION-rocm \
|
||||
-t ${FINAL_IMAGE_REPO}:$VERSION-rocm \
|
||||
.
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ -z "${OLLAMA_SKIP_MANIFEST_CREATE}" ]; then
|
||||
if [ -n "${PUSH}" ]; then
|
||||
docker manifest create ${FINAL_IMAGE_REPO}:$VERSION \
|
||||
${RELEASE_IMAGE_REPO}:$VERSION-amd64 \
|
||||
${RELEASE_IMAGE_REPO}:$VERSION-arm64
|
||||
docker manifest push ${FINAL_IMAGE_REPO}:$VERSION
|
||||
|
||||
# For symmetry, tag/push the rocm image
|
||||
if [ "${RELEASE_IMAGE_REPO}" != "${FINAL_IMAGE_REPO}" ]; then
|
||||
echo "Tagging and pushing rocm image"
|
||||
docker pull ${RELEASE_IMAGE_REPO}:$VERSION-rocm
|
||||
docker tag ${RELEASE_IMAGE_REPO}:$VERSION-rocm ${FINAL_IMAGE_REPO}:$VERSION-rocm
|
||||
docker push ${FINAL_IMAGE_REPO}:$VERSION-rocm
|
||||
fi
|
||||
else
|
||||
echo "Skipping manifest generation when not pushing images are available locally as "
|
||||
echo " ${RELEASE_IMAGE_REPO}:$VERSION-amd64"
|
||||
echo " ${RELEASE_IMAGE_REPO}:$VERSION-arm64"
|
||||
echo " ${RELEASE_IMAGE_REPO}:$VERSION-rocm"
|
||||
fi
|
||||
fi
|
|
@ -1,37 +1,29 @@
|
|||
#!/bin/sh
|
||||
#
|
||||
# Mac ARM users, rosetta can be flaky, so to use a remote x86 builder
|
||||
#
|
||||
# docker context create amd64 --docker host=ssh://mybuildhost
|
||||
# docker buildx create --name mybuilder amd64 --platform linux/amd64
|
||||
# docker buildx create --name mybuilder --append desktop-linux --platform linux/arm64
|
||||
# docker buildx use mybuilder
|
||||
|
||||
|
||||
set -eu
|
||||
|
||||
export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
|
||||
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
|
||||
GZIP=$(which pigz 2>/dev/null || echo "gzip")
|
||||
. $(dirname $0)/env.sh
|
||||
|
||||
BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
|
||||
export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""}
|
||||
mkdir -p dist
|
||||
|
||||
for TARGETARCH in ${BUILD_ARCH}; do
|
||||
docker build \
|
||||
--platform=linux/$TARGETARCH \
|
||||
--build-arg=GOFLAGS \
|
||||
--build-arg=CGO_CFLAGS \
|
||||
--build-arg=OLLAMA_CUSTOM_CPU_DEFS \
|
||||
--build-arg=AMDGPU_TARGETS \
|
||||
--target build-$TARGETARCH \
|
||||
docker buildx build \
|
||||
--output type=local,dest=./dist/ \
|
||||
--platform=${PLATFORM} \
|
||||
${OLLAMA_COMMON_BUILD_ARGS} \
|
||||
--target dist \
|
||||
-f Dockerfile \
|
||||
-t builder:$TARGETARCH \
|
||||
.
|
||||
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
|
||||
rm -rf ./dist/linux-$TARGETARCH
|
||||
docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist
|
||||
if echo ${TARGETARCH} | grep "amd64" > /dev/null; then
|
||||
docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist
|
||||
fi
|
||||
docker rm builder-$TARGETARCH
|
||||
echo "Compressing final linux bundle..."
|
||||
rm -f ./dist/ollama-linux-$TARGETARCH.tgz
|
||||
(cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz )
|
||||
if [ -d dist/linux-$TARGETARCH-rocm ]; then
|
||||
(cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz )
|
||||
fi
|
||||
done
|
||||
|
||||
# buildx behavior changes for single vs. multiplatform
|
||||
if echo $PLATFORM | grep "," > /dev/null ; then
|
||||
mv -f ./dist/linux_*64/ollama* ./dist/
|
||||
rmdir ./dist/linux_*64
|
||||
fi
|
14
scripts/env.sh
Normal file
14
scripts/env.sh
Normal file
|
@ -0,0 +1,14 @@
|
|||
# Common environment setup across build*.sh scripts
|
||||
|
||||
export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
|
||||
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
|
||||
# TODO - consider `docker buildx ls --format=json` to autodiscover platform capability
|
||||
PLATFORM=${PLATFORM:-"linux/arm64,linux/amd64"}
|
||||
DOCKER_ORG=${DOCKER_ORG:-"ollama"}
|
||||
RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
|
||||
FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
|
||||
OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION --build-arg=GOFLAGS --build-arg=OLLAMA_CUSTOM_CPU_DEFS --build-arg=AMDGPU_TARGETS"
|
||||
|
||||
echo "Building Ollama"
|
||||
echo "VERSION=$VERSION"
|
||||
echo "PLATFORM=$PLATFORM"
|
|
@ -26,11 +26,13 @@ import (
|
|||
"golang.org/x/sync/errgroup"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/build"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/openai"
|
||||
"github.com/ollama/ollama/parser"
|
||||
"github.com/ollama/ollama/runners"
|
||||
"github.com/ollama/ollama/template"
|
||||
"github.com/ollama/ollama/types/errtypes"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
|
@ -117,6 +119,32 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
|||
return
|
||||
}
|
||||
|
||||
// expire the runner
|
||||
if req.Prompt == "" && req.KeepAlive != nil && int(req.KeepAlive.Seconds()) == 0 {
|
||||
model, err := GetModel(req.Model)
|
||||
if err != nil {
|
||||
switch {
|
||||
case os.IsNotExist(err):
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
|
||||
case err.Error() == "invalid model name":
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
default:
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
}
|
||||
return
|
||||
}
|
||||
s.sched.expireRunner(model)
|
||||
|
||||
c.JSON(http.StatusOK, api.GenerateResponse{
|
||||
Model: req.Model,
|
||||
CreatedAt: time.Now().UTC(),
|
||||
Response: "",
|
||||
Done: true,
|
||||
DoneReason: "unload",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
if req.Format != "" && req.Format != "json" {
|
||||
c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be empty or \"json\""})
|
||||
return
|
||||
|
@ -1190,12 +1218,12 @@ func Serve(ln net.Listener) error {
|
|||
srvr.Close()
|
||||
schedDone()
|
||||
sched.unloadAllRunners()
|
||||
gpu.Cleanup()
|
||||
runners.Cleanup(build.EmbedFS)
|
||||
done()
|
||||
}()
|
||||
|
||||
if err := llm.Init(); err != nil {
|
||||
return fmt.Errorf("unable to initialize llm library %w", err)
|
||||
if _, err := runners.Refresh(build.EmbedFS); err != nil {
|
||||
return fmt.Errorf("unable to initialize llm runners %w", err)
|
||||
}
|
||||
|
||||
s.sched.Run(schedCtx)
|
||||
|
@ -1322,6 +1350,32 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
|||
return
|
||||
}
|
||||
|
||||
// expire the runner
|
||||
if len(req.Messages) == 0 && req.KeepAlive != nil && int(req.KeepAlive.Seconds()) == 0 {
|
||||
model, err := GetModel(req.Model)
|
||||
if err != nil {
|
||||
switch {
|
||||
case os.IsNotExist(err):
|
||||
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
|
||||
case err.Error() == "invalid model name":
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||
default:
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
}
|
||||
return
|
||||
}
|
||||
s.sched.expireRunner(model)
|
||||
|
||||
c.JSON(http.StatusOK, api.ChatResponse{
|
||||
Model: req.Model,
|
||||
CreatedAt: time.Now().UTC(),
|
||||
Message: api.Message{Role: "assistant"},
|
||||
Done: true,
|
||||
DoneReason: "unload",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
caps := []Capability{CapabilityCompletion}
|
||||
if len(req.Tools) > 0 {
|
||||
caps = append(caps, CapabilityTools)
|
||||
|
|
|
@ -360,7 +360,6 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
|
|||
slog.Debug("runner expired event received", "modelPath", runner.modelPath)
|
||||
runner.refMu.Lock()
|
||||
if runner.refCount > 0 {
|
||||
// Shouldn't happen, but safeguard to ensure no leaked runners
|
||||
slog.Debug("expired event with positive ref count, retrying", "modelPath", runner.modelPath, "refCount", runner.refCount)
|
||||
go func(runner *runnerRef) {
|
||||
// We can't unload yet, but want to as soon as the current request completes
|
||||
|
@ -802,6 +801,25 @@ func (s *Scheduler) unloadAllRunners() {
|
|||
}
|
||||
}
|
||||
|
||||
func (s *Scheduler) expireRunner(model *Model) {
|
||||
s.loadedMu.Lock()
|
||||
defer s.loadedMu.Unlock()
|
||||
runner, ok := s.loaded[model.ModelPath]
|
||||
if ok {
|
||||
runner.refMu.Lock()
|
||||
runner.expiresAt = time.Now()
|
||||
if runner.expireTimer != nil {
|
||||
runner.expireTimer.Stop()
|
||||
runner.expireTimer = nil
|
||||
}
|
||||
runner.sessionDuration = 0
|
||||
if runner.refCount <= 0 {
|
||||
s.expiredCh <- runner
|
||||
}
|
||||
runner.refMu.Unlock()
|
||||
}
|
||||
}
|
||||
|
||||
// If other runners are loaded, make sure the pending request will fit in system memory
|
||||
// If not, pick a runner to unload, else return nil and the request can be loaded
|
||||
func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
|
||||
|
|
|
@ -406,6 +406,52 @@ func TestGetRunner(t *testing.T) {
|
|||
b.ctxDone()
|
||||
}
|
||||
|
||||
func TestExpireRunner(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
|
||||
defer done()
|
||||
s := InitScheduler(ctx)
|
||||
req := &LlmRequest{
|
||||
ctx: ctx,
|
||||
model: &Model{ModelPath: "foo"},
|
||||
opts: api.DefaultOptions(),
|
||||
successCh: make(chan *runnerRef, 1),
|
||||
errCh: make(chan error, 1),
|
||||
sessionDuration: &api.Duration{Duration: 2 * time.Minute},
|
||||
}
|
||||
|
||||
var ggml *llm.GGML
|
||||
gpus := gpu.GpuInfoList{}
|
||||
server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
|
||||
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
return server, nil
|
||||
}
|
||||
s.load(req, ggml, gpus, 0)
|
||||
|
||||
select {
|
||||
case err := <-req.errCh:
|
||||
if err != nil {
|
||||
t.Fatalf("expected no errors when loading, got '%s'", err.Error())
|
||||
}
|
||||
case resp := <-req.successCh:
|
||||
s.loadedMu.Lock()
|
||||
if resp.refCount != uint(1) || len(s.loaded) != 1 {
|
||||
t.Fatalf("expected a model to be loaded")
|
||||
}
|
||||
s.loadedMu.Unlock()
|
||||
}
|
||||
|
||||
s.expireRunner(&Model{ModelPath: "foo"})
|
||||
|
||||
s.finishedReqCh <- req
|
||||
s.processCompleted(ctx)
|
||||
|
||||
s.loadedMu.Lock()
|
||||
if len(s.loaded) != 0 {
|
||||
t.Fatalf("expected model to be unloaded")
|
||||
}
|
||||
s.loadedMu.Unlock()
|
||||
}
|
||||
|
||||
// TODO - add one scenario that triggers the bogus finished event with positive ref count
|
||||
func TestPrematureExpired(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
|
||||
|
|
Loading…
Reference in a new issue