Compare commits


22 commits

Author SHA1 Message Date
7f1565721c
Merge https://github.com/ollama/ollama
Signed-off-by: baalajimaestro <baalajimaestro@ptr.moe>
2024-09-15 23:49:24 +05:30
Edward Cui
d889c6fd07
readme: add Obsidian Quiz Generator plugin to community integrations (#6789) 2024-09-14 23:52:37 -04:00
Daniel Hiltgen
56b9af336a
Fix incremental builds on linux (#6780)
scripts: fix incremental builds on linux or similar
2024-09-13 08:24:08 -07:00
Daniel Hiltgen
fda0d3be52
Use GOARCH for build dirs (#6779)
Corrects x86_64 vs amd64 discrepancy
2024-09-12 16:38:05 -07:00
Daniel Hiltgen
cd5c8f6471
Optimize container images for startup (#6547)
* Optimize container images for startup

This change adjusts how runner payloads are handled so that container builds
can keep them extracted in the filesystem. This makes it easier to optimize the
cpu/cuda and cpu/rocm images for size, and should result in faster startup
times for container images.

* Refactor payload logic and add buildx support for faster builds

* Move payloads around

* Review comments

* Converge to buildx based helper scripts

* Use docker buildx action for release
2024-09-12 12:10:30 -07:00
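The payload handling described in this commit reappears later in this compare as the new `build/embed_*.go` files, which expose the runner payloads through an `embed.FS`. As a rough sketch of the pattern only (the `payloads/` directory, helper name, and target path are hypothetical, not the project's actual runner-loading code), extracting such an embedded tree to the filesystem at startup could look like this:

```go
package main

import (
	"embed"
	"io/fs"
	"os"
	"path/filepath"
)

// Hypothetical embedded payload tree; the real project embeds per-OS/arch
// runner directories in build/embed_*.go instead.
//go:embed payloads/*
var payloadFS embed.FS

// extractPayloads walks an embedded payload tree and writes it under destDir
// so the binaries can be executed from the filesystem (or skipped entirely
// when a container image already ships them extracted).
func extractPayloads(efs fs.FS, destDir string) error {
	return fs.WalkDir(efs, ".", func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		target := filepath.Join(destDir, path)
		if d.IsDir() {
			return os.MkdirAll(target, 0o755)
		}
		data, err := fs.ReadFile(efs, path)
		if err != nil {
			return err
		}
		return os.WriteFile(target, data, 0o755)
	})
}

func main() {
	if err := extractPayloads(payloadFS, "/tmp/ollama-runners"); err != nil {
		panic(err)
	}
}
```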
dcasota
fef257c5c5
examples: updated requirements.txt for privategpt example 2024-09-11 18:56:56 -07:00
Adrian Cole
d066d9b8e0
examples: polish loganalyzer example (#6744) 2024-09-11 18:37:37 -07:00
RAPID ARCHITECT
5a00dc9fc9
readme: add ollama_moe to community integrations (#6752) 2024-09-11 18:36:26 -07:00
Jesse Gross
c354e87809
Merge pull request #6767 from ollama/jessegross/bug_6707
runner: Flush pending responses before returning
2024-09-11 17:20:22 -07:00
Jesse Gross
93ac3760cb runner: Flush pending responses before returning
If there are any pending responses (such as from potential stop
tokens), we should send them back before ending the sequence.
Otherwise, tokens can be missing at the end of a response.

Fixes #6707
2024-09-11 16:39:32 -07:00
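A generic sketch of the pattern behind this fix, with hypothetical type and field names rather than the runner's actual code: tokens withheld while matching stop strings are sent out before the output channel is closed.

```go
package main

import "fmt"

// sequence holds tokens that were held back because they might be the start
// of a stop sequence (illustrative sketch only).
type sequence struct {
	pending   []string      // tokens withheld while matching stop strings
	responses chan<- string // streamed back to the caller
}

// flushPending sends any withheld tokens before the sequence ends, so the
// tail of the response is not silently dropped.
func (s *sequence) flushPending() {
	for _, tok := range s.pending {
		s.responses <- tok
	}
	s.pending = s.pending[:0]
}

func (s *sequence) close() {
	s.flushPending() // send pending responses before returning
	close(s.responses)
}

func main() {
	ch := make(chan string, 8)
	s := &sequence{pending: []string{" world"}, responses: ch}
	s.responses <- "hello"
	s.close()
	for tok := range ch {
		fmt.Print(tok)
	}
	fmt.Println()
}
```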
Patrick Devine
abed273de3
add "stop" command (#6739) 2024-09-11 16:36:21 -07:00
Michael Yang
034392624c
Merge pull request #6762 from ollama/mxyng/show-output
refactor show output
2024-09-11 14:58:40 -07:00
Michael Yang
ecab6f1cc5 refactor show output
fixes line wrapping on long texts
2024-09-11 14:23:09 -07:00
Petr Mironychev
7d6900827d
readme: add QodeAssist to community integrations (#6754) 2024-09-11 13:19:49 -07:00
Daniel Hiltgen
9246e6dd15
Verify permissions for AMD GPU (#6736)
This adds back a check, lost several releases ago, that verifies /dev/kfd permissions.
When these permissions are missing, it can lead to confusing failure modes such as:
  "rocBLAS error: Could not initialize Tensile host: No devices found"

This implementation does not hard-fail the serve command; instead it falls back to CPU
with an error log. In the future we can surface this in the GPU discovery UX to show
detected but unsupported devices.
2024-09-11 11:38:25 -07:00
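A minimal sketch of that kind of permission probe, assuming it simply tries to open the device node; this is illustrative only, not the exact check added in this commit:

```go
package main

import (
	"log/slog"
	"os"
)

// haveKFDAccess reports whether the current user can open /dev/kfd for
// read/write, which ROCm needs; lacking access is what produces the
// confusing rocBLAS "No devices found" error mentioned above.
func haveKFDAccess() bool {
	f, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0)
	if err != nil {
		slog.Error("AMD GPU detected but /dev/kfd is not accessible; falling back to CPU", "error", err)
		return false
	}
	f.Close()
	return true
}

func main() {
	if !haveKFDAccess() {
		slog.Info("continuing with CPU-only inference")
	}
}
```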
Michael Yang
735a0ca2e4
Merge pull request #6732 from ollama/mxyng/debug-proxy
add *_proxy to env map for debugging
2024-09-10 16:13:25 -07:00
Michael Yang
dddb72e084 add *_proxy for debugging 2024-09-10 09:43:35 -07:00
Jeffrey Morgan
83a9b5271a
docs: update examples to use llama3.1 (#6718) 2024-09-09 22:47:16 -07:00
Daniel Hiltgen
4a8069f9c4
Quiet down Docker's new lint warnings (#6716)
* Quiet down Docker's new lint warnings

Docker has recently added lint warnings to builds. This change cleans up those warnings.

* Fix go lint regression
2024-09-09 17:22:20 -07:00
Patrick Devine
84b84ce2db
catch when model vocab size is set correctly (#6714) 2024-09-09 17:18:54 -07:00
Jeffrey Morgan
bb6a086d63
readme: add crewAI to community integrations (#6699) 2024-09-08 00:36:24 -07:00
RAPID ARCHITECT
30c8f201cc
readme: add crewAI with mesop to community integrations 2024-09-08 00:35:59 -07:00
51 changed files with 1383 additions and 868 deletions

View file

@@ -7,3 +7,5 @@ llm/llama.cpp
.env
.cache
test_data
+llm/build
+llama/build

View file

@@ -102,8 +102,8 @@ jobs:
with:
name: generate-windows-cpu
path: |
-llm/build/**/bin/*
-llm/build/**/*.a
+build/**/*
+build/**/*.a
dist/windows-amd64/**
# ROCm generation step
@@ -176,7 +176,7 @@ jobs:
with:
name: generate-windows-rocm
path: |
-llm/build/**/bin/*
+build/**/*
dist/windows-amd64/**
- uses: actions/upload-artifact@v4
with:
@@ -265,7 +265,7 @@ jobs:
with:
name: generate-windows-cuda-${{ matrix.cuda.version }}
path: |
-llm/build/**/bin/*
+build/**/*
dist/windows-amd64/**
- uses: actions/upload-artifact@v4
with:
@@ -338,7 +338,7 @@ jobs:
- uses: actions/download-artifact@v4
with:
name: generate-windows-rocm
-- run: dir llm/build
+- run: dir build
- run: |
$gopath=(get-command go).source | split-path -parent
& "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
@@ -359,9 +359,7 @@ jobs:
environment: release
runs-on: linux
env:
-OLLAMA_SKIP_MANIFEST_CREATE: '1'
BUILD_ARCH: amd64
-PUSH: '1'
steps:
- uses: actions/checkout@v4
with:
@@ -369,14 +367,8 @@ jobs:
- name: Set Version
shell: bash
run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-- name: Login to Docker Hub
-uses: docker/login-action@v3
-with:
-username: ${{ vars.DOCKER_USER }}
-password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- run: |
./scripts/build_linux.sh
-./scripts/build_docker.sh
- uses: actions/upload-artifact@v4
with:
name: dist-linux-amd64
@@ -390,9 +382,7 @@ jobs:
environment: release
runs-on: linux-arm64
env:
-OLLAMA_SKIP_MANIFEST_CREATE: '1'
BUILD_ARCH: arm64
-PUSH: '1'
steps:
- uses: actions/checkout@v4
with:
@@ -421,14 +411,8 @@ jobs:
sudo usermod -aG docker $USER
sudo apt-get install acl
sudo setfacl --modify user:$USER:rw /var/run/docker.sock
-- name: Login to Docker Hub
-uses: docker/login-action@v3
-with:
-username: ${{ vars.DOCKER_USER }}
-password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- run: |
./scripts/build_linux.sh
-./scripts/build_docker.sh
- uses: actions/upload-artifact@v4
with:
name: dist-linux-arm64
@@ -436,6 +420,181 @@ jobs:
dist/*linux*
!dist/*-cov
# Container image build
build-linux:
environment: release
strategy:
matrix:
runner:
- linux
- linux-arm64
runs-on: ${{ matrix.runner }}
env:
FINAL_IMAGE_REPO: ollama/ollama
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: 'Install Docker'
if: ${{ startsWith(matrix.runner, 'linux-arm64') }}
run: |
sudo apt-get update
sudo apt-get install -y ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update
sudo apt-get install -y docker-ce docker-ce-cli containerd.io
sudo usermod -aG docker $USER
sudo apt-get install acl
sudo setfacl --modify user:$USER:rw /var/run/docker.sock
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.FINAL_IMAGE_REPO }}
flavor: |
latest=false
tags: |
type=ref,event=tag
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
shell: bash
run: |
machine=$(uname -m)
case ${machine} in
x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
esac >>$GITHUB_ENV
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: Build and push by digest
id: build
uses: docker/build-push-action@v6
with:
context: "."
platforms: linux/${{ env.ARCH }}
build-args: |
GOFLAGS
outputs: type=image,name=${{ env.FINAL_IMAGE_REPO }},push-by-digest=true,name-canonical=true,push=true
- name: Export digest
run: |
mkdir -p /tmp/digests
digest="${{ steps.build.outputs.digest }}"
touch "/tmp/digests/${digest#sha256:}"
- name: Upload digest
uses: actions/upload-artifact@v4
with:
name: digests-${{ env.PLATFORM_PAIR }}
path: /tmp/digests/*
if-no-files-found: error
retention-days: 1
merge:
environment: release
runs-on: linux
needs:
- build-linux
env:
FINAL_IMAGE_REPO: ollama/ollama
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Download digests
uses: actions/download-artifact@v4
with:
path: /tmp/digests
pattern: digests-*
merge-multiple: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.FINAL_IMAGE_REPO }}
flavor: |
latest=false
tags: |
type=ref,event=tag
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
shell: bash
run: |
machine=$(uname -m)
case ${machine} in
x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
esac >>$GITHUB_ENV
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: Create manifest list and push
working-directory: /tmp/digests
run: |
docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
$(printf '${{ env.FINAL_IMAGE_REPO }}@sha256:%s ' *)
- name: Inspect image
run: |
docker buildx imagetools inspect ${{ env.FINAL_IMAGE_REPO }}:${{ steps.meta.outputs.version }}
build-linux-rocm:
environment: release
runs-on: linux
env:
FINAL_IMAGE_REPO: ollama/ollama
ARCH: amd64
PLATFORM_PAIR: linux-amd64
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.FINAL_IMAGE_REPO }}
flavor: |
latest=false
tags: |
type=ref,event=tag
type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
type=semver,pattern={{version}}
- name: Set Version
shell: bash
run: |
echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ vars.DOCKER_USER }}
password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
- name: Build and push by digest
id: build
uses: docker/build-push-action@v6
with:
context: "."
target: runtime-rocm
build-args: |
GOFLAGS
tags: ${{ env.FINAL_IMAGE_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION}}-rocm,${{ env.FINAL_IMAGE_REPO }}:rocm
push: true
# Aggregate all the assets and ship a release
release:
needs:
@@ -448,8 +607,6 @@ jobs:
permissions:
contents: write
env:
-OLLAMA_SKIP_IMAGE_BUILD: '1'
-PUSH: '1'
GH_TOKEN: ${{ github.token }}
steps:
- uses: actions/checkout@v4
@@ -458,12 +615,6 @@ jobs:
run: |
echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
-- name: Login to Docker Hub
-uses: docker/login-action@v3
-with:
-username: ${{ vars.DOCKER_USER }}
-password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
-- run: ./scripts/build_docker.sh
- name: Retrieve built artifact
uses: actions/download-artifact@v4
with:
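The new `build-linux` and `merge` jobs above push each architecture by digest and then stitch those digests into a single multi-arch tag with `docker buildx imagetools`. Outside of CI the same flow looks roughly like this; the repository, tag, and digests are placeholders:

```shell
# push per-arch images by digest
docker buildx build --platform linux/amd64 --output type=image,name=ollama/ollama,push-by-digest=true,push=true .
docker buildx build --platform linux/arm64 --output type=image,name=ollama/ollama,push-by-digest=true,push=true .

# combine the digests under a single multi-arch tag
docker buildx imagetools create -t ollama/ollama:0.0.0-example \
  ollama/ollama@sha256:<amd64-digest> \
  ollama/ollama@sha256:<arm64-digest>

# verify the manifest list
docker buildx imagetools inspect ollama/ollama:0.0.0-example
```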

View file

@@ -81,12 +81,6 @@ jobs:
if: ${{ ! startsWith(matrix.os, 'windows-') }}
name: 'Unix Go Generate'
- run: go build .
-- uses: actions/upload-artifact@v4
-with:
-name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
-path: |
-llm/build/**/bin/*
-llm/build/**/*.a
generate-cuda:
needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
@@ -114,12 +108,6 @@ jobs:
go generate -x ./...
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
-- uses: actions/upload-artifact@v4
-with:
-name: cuda-${{ matrix.cuda-version }}-libraries
-path: |
-llm/build/**/bin/*
-dist/windows-amd64/**
generate-rocm:
needs: [changes]
if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
@@ -147,12 +135,6 @@ jobs:
go generate -x ./...
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
-- uses: actions/upload-artifact@v4
-with:
-name: rocm-${{ matrix.rocm-version }}-libraries
-path: |
-llm/build/**/bin/*
-dist/windows-amd64/**
# ROCm generation step
generate-windows-rocm:
@@ -189,7 +171,6 @@ jobs:
name: go generate
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
-# TODO - do we need any artifacts?
# CUDA generation step
generate-windows-cuda:
@@ -231,7 +212,6 @@ jobs:
go generate -x ./...
env:
OLLAMA_SKIP_CPU_GENERATE: '1'
-# TODO - do we need any artifacts?
lint:
strategy:
@@ -263,14 +243,6 @@ jobs:
arm64) echo ARCH=arm64 ;;
esac >>$GITHUB_ENV
shell: bash
-- run: |
-mkdir -p llm/build/linux/$ARCH/stub/bin
-touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
-if: ${{ startsWith(matrix.os, 'ubuntu-') }}
-- run: |
-mkdir -p llm/build/darwin/$ARCH/stub/bin
-touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
-if: ${{ startsWith(matrix.os, 'macos-') }}
- uses: golangci/golangci-lint-action@v6
with:
args: --timeout 8m0s -v
@@ -301,23 +273,10 @@ jobs:
cache: true
- run: |
case ${{ matrix.arch }} in
-amd64) echo ARCH=x86_64 ;;
+amd64) echo ARCH=amd64 ;;
arm64) echo ARCH=arm64 ;;
esac >>$GITHUB_ENV
shell: bash
-- run: |
-mkdir -p llm/build/linux/$ARCH/stub/bin
-touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
-if: ${{ startsWith(matrix.os, 'ubuntu-') }}
-- run: |
-mkdir -p llm/build/darwin/$ARCH/stub/bin
-touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
-if: ${{ startsWith(matrix.os, 'macos-') }}
-shell: bash
- run: go generate ./...
- run: go build
- run: go test -v ./...
-- uses: actions/upload-artifact@v4
-with:
-name: ${{ matrix.os }}-binaries
-path: ollama

3
.gitignore vendored
View file

@@ -12,4 +12,7 @@ ggml-metal.metal
test_data
*.crt
llm/build
+build/*/*/*
+!build/**/placeholder
+llama/build
__debug_bin*

View file

@@ -312,6 +312,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
- [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
+- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)
### Terminal
@@ -336,6 +337,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [podman-ollama](https://github.com/ericcurtin/podman-ollama)
- [gollama](https://github.com/sammcj/gollama)
- [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
+- [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe)
### Apple Vision Pro
- [Enchanted](https://github.com/AugustDev/enchanted)
@@ -358,6 +360,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
- [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
+- [crewAI](https://github.com/crewAIInc/crewAI)
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
@@ -427,6 +430,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
- [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
- [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
+- [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
+- [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)
### Supported backends

View file

@@ -0,0 +1 @@
This is here to make sure the build/ directory exists for the go:embed command

View file

@@ -0,0 +1 @@
This is here to make sure the build/ directory exists for the go:embed command

View file

@@ -0,0 +1,8 @@
package build

import "embed"

// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
//go:embed darwin/amd64/*
var EmbedFS embed.FS

View file

@@ -0,0 +1,8 @@
package build

import "embed"

// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
//go:embed darwin/arm64/*
var EmbedFS embed.FS

6
build/embed_linux.go Normal file
View file

@@ -0,0 +1,6 @@
package build

import "embed"

//go:embed linux/*
var EmbedFS embed.FS

8
build/embed_unused.go Normal file
View file

@@ -0,0 +1,8 @@
//go:build !linux && !darwin

package build

import "embed"

// unused on windows
var EmbedFS embed.FS

View file

@@ -0,0 +1 @@
This is here to make sure the build/ directory exists for the go:embed command

View file

@@ -0,0 +1 @@
This is here to make sure the build/ directory exists for the go:embed command

View file

@@ -2,6 +2,7 @@ package cmd
import (
"archive/zip"
+"bufio"
"bytes"
"context"
"crypto/ed25519"
@@ -21,6 +22,7 @@ import (
"regexp"
"runtime"
"slices"
+"strconv"
"strings"
"sync/atomic"
"syscall"
@@ -344,6 +346,39 @@ func (w *progressWriter) Write(p []byte) (n int, err error) {
return len(p), nil
}
func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
p := progress.NewProgress(os.Stderr)
defer p.StopAndClear()
spinner := progress.NewSpinner("")
p.Add("", spinner)
client, err := api.ClientFromEnvironment()
if err != nil {
return err
}
req := &api.GenerateRequest{
Model: opts.Model,
KeepAlive: opts.KeepAlive,
}
return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil })
}
func StopHandler(cmd *cobra.Command, args []string) error {
opts := &runOptions{
Model: args[0],
KeepAlive: &api.Duration{Duration: 0},
}
if err := loadOrUnloadModel(cmd, opts); err != nil {
if strings.Contains(err.Error(), "not found") {
return fmt.Errorf("couldn't find model \"%s\" to stop", args[0])
}
}
return nil
}
func RunHandler(cmd *cobra.Command, args []string) error {
interactive := true
@@ -422,7 +457,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
opts.ParentModel = info.Details.ParentModel
if interactive {
-if err := loadModel(cmd, &opts); err != nil {
+if err := loadOrUnloadModel(cmd, &opts); err != nil {
return err
}
@@ -578,7 +613,7 @@ func ListHandler(cmd *cobra.Command, args []string) error {
table.SetHeaderLine(false)
table.SetBorder(false)
table.SetNoWhiteSpace(true)
-table.SetTablePadding("\t")
+table.SetTablePadding(" ")
table.AppendBulk(data)
table.Render()
@@ -613,7 +648,15 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
procStr = fmt.Sprintf("%d%%/%d%% CPU/GPU", int(cpuPercent), int(100-cpuPercent))
}
-data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, format.HumanTime(m.ExpiresAt, "Never")})
+var until string
+delta := time.Since(m.ExpiresAt)
+if delta > 0 {
+until = "Stopping..."
+} else {
+until = format.HumanTime(m.ExpiresAt, "Never")
+}
+data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, until})
}
}
@@ -624,7 +667,7 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
table.SetHeaderLine(false)
table.SetBorder(false)
table.SetNoWhiteSpace(true)
-table.SetTablePadding("\t")
+table.SetTablePadding(" ")
table.AppendBulk(data)
table.Render()
@@ -720,125 +763,89 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
return nil
}
-showInfo(resp)
-return nil
+return showInfo(resp, os.Stdout)
}
-func showInfo(resp *api.ShowResponse) {
-modelData := [][]string{
-{"parameters", resp.Details.ParameterSize},
-{"quantization", resp.Details.QuantizationLevel},
-}
-if resp.ModelInfo != nil {
-arch := resp.ModelInfo["general.architecture"].(string)
-modelData = append(modelData,
-[]string{"arch", arch},
-[]string{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
-[]string{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
-)
-}
-mainTableData := [][]string{
-{"Model"},
-{renderSubTable(modelData, false)},
-}
+func showInfo(resp *api.ShowResponse, w io.Writer) error {
+tableRender := func(header string, rows func() [][]string) {
+fmt.Fprintln(w, " ", header)
+table := tablewriter.NewWriter(w)
+table.SetAlignment(tablewriter.ALIGN_LEFT)
+table.SetBorder(false)
+table.SetNoWhiteSpace(true)
+table.SetTablePadding(" ")
+switch header {
+case "Template", "System", "License":
+table.SetColWidth(100)
+}
+table.AppendBulk(rows())
+table.Render()
+fmt.Fprintln(w)
+}
+tableRender("Model", func() (rows [][]string) {
+if resp.ModelInfo != nil {
+arch := resp.ModelInfo["general.architecture"].(string)
+rows = append(rows, []string{"", "architecture", arch})
+rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ModelInfo["general.parameter_count"].(float64)))})
+rows = append(rows, []string{"", "context length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64), 'f', -1, 64)})
+rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64), 'f', -1, 64)})
+} else {
+rows = append(rows, []string{"", "architecture", resp.Details.Family})
+rows = append(rows, []string{"", "parameters", resp.Details.ParameterSize})
+}
+rows = append(rows, []string{"", "quantization", resp.Details.QuantizationLevel})
+return
+})
if resp.ProjectorInfo != nil {
-projectorData := [][]string{
-{"arch", "clip"},
-{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
-}
-if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok {
-projectorData = append(projectorData, []string{"projector type", projectorType.(string)})
-}
-projectorData = append(projectorData,
-[]string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
-[]string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
-)
-mainTableData = append(mainTableData,
-[]string{"Projector"},
-[]string{renderSubTable(projectorData, false)},
-)
+tableRender("Projector", func() (rows [][]string) {
+arch := resp.ProjectorInfo["general.architecture"].(string)
+rows = append(rows, []string{"", "architecture", arch})
+rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))})
+rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.embedding_length", arch)].(float64), 'f', -1, 64)})
+rows = append(rows, []string{"", "dimensions", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.projection_dim", arch)].(float64), 'f', -1, 64)})
+return
+})
}
if resp.Parameters != "" {
-mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)})
+tableRender("Parameters", func() (rows [][]string) {
+scanner := bufio.NewScanner(strings.NewReader(resp.Parameters))
+for scanner.Scan() {
+if text := scanner.Text(); text != "" {
+rows = append(rows, append([]string{""}, strings.Fields(text)...))
+}
+}
+return
+})
}
+head := func(s string, n int) (rows [][]string) {
+scanner := bufio.NewScanner(strings.NewReader(s))
+for scanner.Scan() && (len(rows) < n || n < 0) {
+if text := scanner.Text(); text != "" {
+rows = append(rows, []string{"", strings.TrimSpace(text)})
+}
+}
+return
+}
if resp.System != "" {
-mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)})
+tableRender("System", func() [][]string {
+return head(resp.System, 2)
+})
}
if resp.License != "" {
-mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)})
+tableRender("License", func() [][]string {
+return head(resp.License, 2)
+})
}
-table := tablewriter.NewWriter(os.Stdout)
-table.SetAutoWrapText(false)
-table.SetBorder(false)
-table.SetAlignment(tablewriter.ALIGN_LEFT)
-for _, v := range mainTableData {
-table.Append(v)
-}
-table.Render()
+return nil
}
-func renderSubTable(data [][]string, file bool) string {
-var buf bytes.Buffer
-table := tablewriter.NewWriter(&buf)
-table.SetAutoWrapText(!file)
-table.SetBorder(false)
-table.SetNoWhiteSpace(true)
-table.SetTablePadding("\t")
-table.SetAlignment(tablewriter.ALIGN_LEFT)
-for _, v := range data {
-table.Append(v)
-}
-table.Render()
-renderedTable := buf.String()
-lines := strings.Split(renderedTable, "\n")
-for i, line := range lines {
-lines[i] = "\t" + line
-}
-return strings.Join(lines, "\n")
-}
-func twoLines(s string) [][]string {
-lines := strings.Split(s, "\n")
-res := [][]string{}
-count := 0
-for _, line := range lines {
-line = strings.TrimSpace(line)
-if line != "" {
-count++
-res = append(res, []string{line})
-if count == 2 {
-return res
-}
-}
-}
-return res
-}
-func formatParams(s string) string {
-lines := strings.Split(s, "\n")
-table := [][]string{}
-for _, line := range lines {
-table = append(table, strings.Fields(line))
-}
-return renderSubTable(table, false)
-}
func CopyHandler(cmd *cobra.Command, args []string) error {
@@ -1328,6 +1335,15 @@ func NewCLI() *cobra.Command {
runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
runCmd.Flags().String("format", "", "Response format (e.g. json)")
+stopCmd := &cobra.Command{
+Use: "stop MODEL",
+Short: "Stop a running model",
+Args: cobra.ExactArgs(1),
+PreRunE: checkServerHeartbeat,
+RunE: StopHandler,
+}
serveCmd := &cobra.Command{
Use: "serve",
Aliases: []string{"start"},
@@ -1395,6 +1411,7 @@ func NewCLI() *cobra.Command {
createCmd,
showCmd,
runCmd,
+stopCmd,
pullCmd,
pushCmd,
listCmd,
@@ -1434,6 +1451,7 @@ func NewCLI() *cobra.Command {
createCmd,
showCmd,
runCmd,
+stopCmd,
pullCmd,
pushCmd,
listCmd,

206
cmd/cmd_test.go Normal file
View file

@@ -0,0 +1,206 @@
package cmd
import (
"bytes"
"os"
"path/filepath"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/ollama/ollama/api"
)
func TestShowInfo(t *testing.T) {
t.Run("bare details", func(t *testing.T) {
var b bytes.Buffer
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
quantization FP16
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
t.Run("bare model info", func(t *testing.T) {
var b bytes.Buffer
if err := showInfo(&api.ShowResponse{
ModelInfo: map[string]any{
"general.architecture": "test",
"general.parameter_count": float64(7_000_000_000),
"test.context_length": float64(0),
"test.embedding_length": float64(0),
},
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
context length 0
embedding length 0
quantization FP16
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
t.Run("parameters", func(t *testing.T) {
var b bytes.Buffer
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
Parameters: `
stop never
stop gonna
stop give
stop you
stop up
temperature 99`,
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
quantization FP16
Parameters
stop never
stop gonna
stop give
stop you
stop up
temperature 99
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
t.Run("project info", func(t *testing.T) {
var b bytes.Buffer
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
ProjectorInfo: map[string]any{
"general.architecture": "clip",
"general.parameter_count": float64(133_700_000),
"clip.vision.embedding_length": float64(0),
"clip.vision.projection_dim": float64(0),
},
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
quantization FP16
Projector
architecture clip
parameters 133.70M
embedding length 0
dimensions 0
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
t.Run("system", func(t *testing.T) {
var b bytes.Buffer
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
System: `You are a pirate!
Ahoy, matey!
Weigh anchor!
`,
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
quantization FP16
System
You are a pirate!
Ahoy, matey!
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
t.Run("license", func(t *testing.T) {
var b bytes.Buffer
license, err := os.ReadFile(filepath.Join("..", "LICENSE"))
if err != nil {
t.Fatal(err)
}
if err := showInfo(&api.ShowResponse{
Details: api.ModelDetails{
Family: "test",
ParameterSize: "7B",
QuantizationLevel: "FP16",
},
License: string(license),
}, &b); err != nil {
t.Fatal(err)
}
expect := ` Model
architecture test
parameters 7B
quantization FP16
License
MIT License
Copyright (c) Ollama
`
if diff := cmp.Diff(expect, b.String()); diff != "" {
t.Errorf("unexpected output (-want +got):\n%s", diff)
}
})
}

View file

@@ -18,7 +18,6 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/parser"
-"github.com/ollama/ollama/progress"
"github.com/ollama/ollama/readline"
"github.com/ollama/ollama/types/errtypes"
)
@@ -31,26 +30,6 @@ const (
MultilineSystem
)
-func loadModel(cmd *cobra.Command, opts *runOptions) error {
-p := progress.NewProgress(os.Stderr)
-defer p.StopAndClear()
-spinner := progress.NewSpinner("")
-p.Add("", spinner)
-client, err := api.ClientFromEnvironment()
-if err != nil {
-return err
-}
-chatReq := &api.ChatRequest{
-Model: opts.Model,
-KeepAlive: opts.KeepAlive,
-}
-return client.Chat(cmd.Context(), chatReq, func(api.ChatResponse) error { return nil })
-}
func generateInteractive(cmd *cobra.Command, opts runOptions) error {
usage := func() {
fmt.Fprintln(os.Stderr, "Available Commands:")
@@ -217,7 +196,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
opts.Model = args[1]
opts.Messages = []api.Message{}
fmt.Printf("Loading model '%s'\n", opts.Model)
-if err := loadModel(cmd, &opts); err != nil {
+if err := loadOrUnloadModel(cmd, &opts); err != nil {
return err
}
continue
@@ -371,7 +350,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
switch args[1] {
case "info":
-showInfo(resp)
+_ = showInfo(resp, os.Stderr)
case "license":
if resp.License == "" {
fmt.Println("No license was specified for this model.")

View file

@@ -208,14 +208,18 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
return err
}
-if vocabSize := int(p.VocabSize); vocabSize > len(t.Vocabulary.Tokens) {
-slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", p.VocabSize, "actual", len(t.Vocabulary.Tokens))
+vocabSize := int(p.VocabSize)
+switch {
+case vocabSize > len(t.Vocabulary.Tokens):
+slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
for i := range vocabSize - len(t.Vocabulary.Tokens) {
t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
}
-} else {
+case vocabSize < len(t.Vocabulary.Tokens):
+return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
+default:
slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
}

View file

@@ -69,7 +69,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur
```shell
curl http://localhost:11434/api/generate -d '{
-"model": "llama3",
+"model": "llama3.1",
"prompt": "Why is the sky blue?"
}'
```
@@ -80,7 +80,7 @@ A stream of JSON objects is returned:
```json
{
-"model": "llama3",
+"model": "llama3.1",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"response": "The",
"done": false
@@ -102,7 +102,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
```json
{
-"model": "llama3",
+"model": "llama3.1",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "",
"done": true,
@@ -124,7 +124,7 @@ A response can be received in one reply when streaming is off.
```shell
curl http://localhost:11434/api/generate -d '{
-"model": "llama3",
+"model": "llama3.1",
"prompt": "Why is the sky blue?",
"stream": false
}'
@@ -136,7 +136,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
```json
{
-"model": "llama3",
+"model": "llama3.1",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "The sky is blue because it is the color of the sky.",
"done": true,
@@ -194,7 +194,7 @@ curl http://localhost:11434/api/generate -d '{
```shell
curl http://localhost:11434/api/generate -d '{
-"model": "llama3",
+"model": "llama3.1",
"prompt": "What color is the sky at different times of the day? Respond using JSON",
"format": "json",
"stream": false
@@ -205,7 +205,7 @@ curl http://localhost:11434/api/generate -d '{
```json
{
-"model": "llama3",
+"model": "llama3.1",
"created_at": "2023-11-09T21:07:55.186497Z",
"response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
"done": true,
@@ -327,7 +327,7 @@ If you want to set custom options for the model at runtime rather than in the Mo
```shell
curl http://localhost:11434/api/generate -d '{
-"model": "llama3",
+"model": "llama3.1",
"prompt": "Why is the sky blue?",
"stream": false,
"options": {
@@ -368,7 +368,7 @@ curl http://localhost:11434/api/generate -d '{
```json
{
-"model": "llama3",
+"model": "llama3.1",
"created_at": "2023-08-04T19:22:45.499127Z",
"response": "The sky is blue because it is the color of the sky.",
"done": true,
@@ -390,7 +390,7 @@ If an empty prompt is provided, the model will be loaded into memory.
```shell
curl http://localhost:11434/api/generate -d '{
-"model": "llama3"
+"model": "llama3.1"
}'
```
@@ -400,7 +400,7 @@ A single JSON object is returned:
```json
{
-"model": "llama3",
+"model": "llama3.1",
"created_at": "2023-12-18T19:52:07.071755Z",
"response": "",
"done": true
@@ -445,7 +445,7 @@ Send a chat message with a streaming response.
```shell
curl http://localhost:11434/api/chat -d '{
-"model": "llama3",
+"model": "llama3.1",
"messages": [
{
"role": "user",
@@ -461,7 +461,7 @@ A stream of JSON objects is returned:
```json
{
-"model": "llama3",
+"model": "llama3.1",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": {
"role": "assistant",
@@ -476,7 +476,7 @@ Final response:
```json
{
-"model": "llama3",
+"model": "llama3.1",
"created_at": "2023-08-04T19:22:45.499127Z",
"done": true,
"total_duration": 4883583458,
@@ -494,7 +494,7 @@ Final response:
```shell
curl http://localhost:11434/api/chat -d '{
-"model": "llama3",
+"model": "llama3.1",
"messages": [
{
"role": "user",
@@ -509,7 +509,7 @@ curl http://localhost:11434/api/chat -d '{
```json
{
-"model": "registry.ollama.ai/library/llama3:latest",
+"model": "llama3.1",
"created_at": "2023-12-12T14:13:43.416799Z",
"message": {
"role": "assistant",
@@ -533,7 +533,7 @@ Send a chat message with a conversation history. You can use this same approach
```shell
curl http://localhost:11434/api/chat -d '{
-"model": "llama3",
+"model": "llama3.1",
"messages": [
{
"role": "user",
@@ -557,7 +557,7 @@ A stream of JSON objects is returned:
```json
{
-"model": "llama3",
+"model": "llama3.1",
"created_at": "2023-08-04T08:52:19.385406455-07:00",
"message": {
"role": "assistant",
@@ -571,7 +571,7 @@ Final response:
```json
{
-"model": "llama3",
+"model": "llama3.1",
"created_at": "2023-08-04T19:22:45.499127Z",
"done": true,
"total_duration": 8113331500,
@@ -629,7 +629,7 @@ curl http://localhost:11434/api/chat -d '{
```shell
curl http://localhost:11434/api/chat -d '{
-"model": "llama3",
+"model": "llama3.1",
"messages": [
{
"role": "user",
@@ -647,7 +647,7 @@ curl http://localhost:11434/api/chat -d '{
```json
{
-"model": "registry.ollama.ai/library/llama3:latest",
+"model": "llama3.1",
"created_at": "2023-12-12T14:13:43.416799Z",
"message": {
"role": "assistant",
@@ -904,7 +904,7 @@ Show information about a model including details, modelfile, template, parameter
```shell
curl http://localhost:11434/api/show -d '{
-"name": "llama3"
+"name": "llama3.1"
}'
```
@@ -965,7 +965,7 @@ Copy a model. Creates a model with another name from an existing model.
```shell
curl http://localhost:11434/api/copy -d '{
-"source": "llama3",
+"source": "llama3.1",
"destination": "llama3-backup"
}'
```
@@ -1020,7 +1020,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
```shell
curl http://localhost:11434/api/pull -d '{
-"name": "llama3"
+"name": "llama3.1"
}'
```

View file

@@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:
```shell
curl http://localhost:11434/api/generate -d '{
-"model": "llama3",
+"model": "llama3.1",
"prompt": "Why is the sky blue?",
"options": {
"num_ctx": 4096
@@ -247,12 +247,12 @@ The `keep_alive` parameter can be set to:
For example, to preload a model and leave it in memory use:
```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": -1}'
```
To unload the model and free up memory use:
```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": 0}'
```
Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.

View file

@@ -11,7 +11,7 @@ A model file is the blueprint to create and share models with Ollama.
- [Examples](#examples)
- [Instructions](#instructions)
- [FROM (Required)](#from-required)
-- [Build from llama3.1](#build-from-llama31)
+- [Build from existing model](#build-from-existing-model)
- [Build from a Safetensors model](#build-from-a-safetensors-model)
- [Build from a GGUF file](#build-from-a-gguf-file)
- [PARAMETER](#parameter)
@@ -50,7 +50,7 @@ INSTRUCTION arguments
An example of a `Modelfile` creating a mario blueprint:
```modelfile
-FROM llama3
+FROM llama3.1
# sets the temperature to 1 [higher is more creative, lower is more coherent]
PARAMETER temperature 1
# sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token
@@ -72,10 +72,10 @@ More examples are available in the [examples directory](../examples).
To view the Modelfile of a given model, use the `ollama show --modelfile` command.
```bash
-> ollama show --modelfile llama3
+> ollama show --modelfile llama3.1
# Modelfile generated by "ollama show"
# To build a new Modelfile based on this one, replace the FROM line with:
-# FROM llama3:latest
+# FROM llama3.1:latest
FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>
@@ -100,7 +100,7 @@ The `FROM` instruction defines the base model to use when creating a model.
FROM <model name>:<tag>
```
-#### Build from llama3.1
+#### Build from existing model
```modelfile
FROM llama3.1

View file

@@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
'content': 'Say this is a test',
}
],
-model='llama3',
+model='llama3.1',
)
response = client.chat.completions.create(
@@ -46,13 +46,13 @@ response = client.chat.completions.create(
)
completion = client.completions.create(
-model="llama3",
+model="llama3.1",
prompt="Say this is a test",
)
list_completion = client.models.list()
-model = client.models.retrieve("llama3")
+model = client.models.retrieve("llama3.1")
embeddings = client.embeddings.create(
model="all-minilm",
@@ -74,7 +74,7 @@ const openai = new OpenAI({
const chatCompletion = await openai.chat.completions.create({
messages: [{ role: 'user', content: 'Say this is a test' }],
-model: 'llama3',
+model: 'llama3.1',
})
const response = await openai.chat.completions.create({
@@ -94,13 +94,13 @@ const response = await openai.chat.completions.create({
})
const completion = await openai.completions.create({
-model: "llama3",
+model: "llama3.1",
prompt: "Say this is a test.",
})
const listCompletion = await openai.models.list()
-const model = await openai.models.retrieve("llama3")
+const model = await openai.models.retrieve("llama3.1")
const embedding = await openai.embeddings.create({
model: "all-minilm",
@@ -114,7 +114,7 @@ const embedding = await openai.embeddings.create({
curl http://localhost:11434/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
-"model": "llama3",
+"model": "llama3.1",
"messages": [
{
"role": "system",
@@ -154,13 +154,13 @@ curl http://localhost:11434/v1/chat/completions \
curl http://localhost:11434/v1/completions \
-H "Content-Type: application/json" \
-d '{
-"model": "llama3",
+"model": "llama3.1",
"prompt": "Say this is a test"
}'
curl http://localhost:11434/v1/models
-curl http://localhost:11434/v1/models/llama3
+curl http://localhost:11434/v1/models/llama3.1
curl http://localhost:11434/v1/embeddings \
-H "Content-Type: application/json" \
@@ -274,7 +274,7 @@ curl http://localhost:11434/v1/embeddings \
Before using a model, pull it locally `ollama pull`:
```shell
-ollama pull llama3
+ollama pull llama3.1
```
### Default model names
@@ -282,7 +282,7 @@ ollama pull llama3
For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
```
-ollama cp llama3 gpt-3.5-turbo
+ollama cp llama3.1 gpt-3.5-turbo
```
Afterwards, this new model name can be specified the `model` field:

View file

@@ -33,7 +33,7 @@ Omitting a template in these models puts the responsibility of correctly templat
To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3.
```dockerfile
-FROM llama3
+FROM llama3.1
TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|>

View file

@@ -91,6 +91,17 @@ If none of those resolve the problem, gather additional information and file an
- Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
## AMD GPU Discovery
On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log.
When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.
If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
- `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. This can show more detailed error codes that help troubleshoot problems
- `OLLAMA_DEBUG=1` Report additional information during GPU discovery
- Check dmesg for any errors from amdgpu or kfd drivers `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd`
## Windows Terminal Errors
Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly. This can result in a long string of control sequences like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect`. To resolve this problem, please update to Windows 10 22H1 or newer.


@ -29,7 +29,7 @@ Ollama uses unicode characters for progress indication, which may render as unkn
Here's a quick example showing API access from `powershell` Here's a quick example showing API access from `powershell`
```powershell ```powershell
(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json (Invoke-WebRequest -method POST -Body '{"model":"llama3.1", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
``` ```
## Troubleshooting ## Troubleshooting


@ -179,53 +179,6 @@ var (
HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION") HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
) )
func RunnersDir() (p string) {
if p := Var("OLLAMA_RUNNERS_DIR"); p != "" {
return p
}
if runtime.GOOS != "windows" {
return
}
defer func() {
if p == "" {
slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
}
}()
// On Windows we do not carry the payloads inside the main executable
exe, err := os.Executable()
if err != nil {
return
}
cwd, err := os.Getwd()
if err != nil {
return
}
var paths []string
for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), LibRelativeToExe()), cwd} {
paths = append(paths,
root,
filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
)
}
// Try a few variations to improve developer experience when building from source in the local tree
for _, path := range paths {
candidate := filepath.Join(path, "lib", "ollama", "runners")
if _, err := os.Stat(candidate); err == nil {
p = candidate
break
}
}
return p
}
func Uint(key string, defaultValue uint) func() uint { func Uint(key string, defaultValue uint) func() uint {
return func() uint { return func() uint {
if s := Var(key); s != "" { if s := Var(key); s != "" {
@ -290,10 +243,22 @@ func AsMap() map[string]EnvVar {
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"}, "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"}, "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"}, "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"}, "OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
// Informational
"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
"HTTPS_PROXY": {"HTTPS_PROXY", String("HTTPS_PROXY")(), "HTTPS proxy"},
"NO_PROXY": {"NO_PROXY", String("NO_PROXY")(), "No proxy"},
} }
if runtime.GOOS != "windows" {
// Windows environment variables are case-insensitive so there's no need to duplicate them
ret["http_proxy"] = EnvVar{"http_proxy", String("http_proxy")(), "HTTP proxy"}
ret["https_proxy"] = EnvVar{"https_proxy", String("https_proxy")(), "HTTPS proxy"}
ret["no_proxy"] = EnvVar{"no_proxy", String("no_proxy")(), "No proxy"}
}
if runtime.GOOS != "darwin" { if runtime.GOOS != "darwin" {
ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"} ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"} ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"}
@ -302,6 +267,7 @@ func AsMap() map[string]EnvVar {
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"} ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"} ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
} }
return ret return ret
} }


@ -1,6 +1,6 @@
langchain==0.0.274 langchain==0.0.274
gpt4all==1.0.8 gpt4all==1.0.8
chromadb==0.4.7 chromadb==0.5.0
llama-cpp-python==0.1.81 llama-cpp-python==0.1.81
urllib3==2.0.4 urllib3==2.0.4
PyMuPDF==1.23.5 PyMuPDF==1.23.5


@ -4,5 +4,5 @@ SYSTEM """
You are a log file analyzer. You will receive a set of lines from a log file for some software application, find the errors and other interesting aspects of the logs, and explain them so a new user can understand what they mean. If there are any steps they can do to resolve them, list the steps in your answer. You are a log file analyzer. You will receive a set of lines from a log file for some software application, find the errors and other interesting aspects of the logs, and explain them so a new user can understand what they mean. If there are any steps they can do to resolve them, list the steps in your answer.
""" """
PARAMETER TEMPERATURE 0.3 PARAMETER temperature 0.3


@ -21,6 +21,8 @@ You can try this with the `logtest.logfile` file included in this directory.
2. Install the Python Requirements. 2. Install the Python Requirements.
```bash ```bash
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt pip install -r requirements.txt
``` ```


@ -1 +1 @@
Requests==2.31.0 Requests>=2.32.3


@ -5,6 +5,7 @@ import (
"errors" "errors"
"fmt" "fmt"
"io" "io"
"io/fs"
"log/slog" "log/slog"
"os" "os"
"path/filepath" "path/filepath"
@ -359,6 +360,10 @@ func AMDGetGPUInfo() []RocmGPUInfo {
if len(resp) == 0 { if len(resp) == 0 {
slog.Info("no compatible amdgpu devices detected") slog.Info("no compatible amdgpu devices detected")
} }
if err := verifyKFDDriverAccess(); err != nil {
slog.Error("amdgpu devices detected but permission problems block access", "error", err)
return nil
}
return resp return resp
} }
@ -455,3 +460,19 @@ func getFreeMemory(usedFile string) (uint64, error) {
} }
return usedMemory, nil return usedMemory, nil
} }
func verifyKFDDriverAccess() error {
// Verify we have permissions - either running as root, or we have group access to the driver
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0o666)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
return fmt.Errorf("permissions not set up properly. Either run ollama as root, or add you user account to the render group. %w", err)
} else if errors.Is(err, fs.ErrNotExist) {
// Container runtime failure?
return fmt.Errorf("kfd driver not loaded. If running in a container, remember to include '--device /dev/kfd --device /dev/dri'")
}
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
}
fd.Close()
return nil
}


@ -1,148 +0,0 @@
package gpu
import (
"errors"
"fmt"
"log/slog"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/ollama/ollama/envconfig"
)
var (
lock sync.Mutex
payloadsDir = ""
)
func PayloadsDir() (string, error) {
lock.Lock()
defer lock.Unlock()
var err error
if payloadsDir == "" {
runnersDir := envconfig.RunnersDir()
if runnersDir != "" {
payloadsDir = runnersDir
return payloadsDir, nil
}
// The remainder only applies on non-windows where we still carry payloads in the main executable
cleanupTmpDirs()
tmpDir := envconfig.TmpDir()
if tmpDir == "" {
tmpDir, err = os.MkdirTemp("", "ollama")
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
}
} else {
err = os.MkdirAll(tmpDir, 0o755)
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err)
}
}
// Track our pid so we can clean up orphaned tmpdirs
n := filepath.Join(tmpDir, "ollama.pid")
if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
return "", fmt.Errorf("failed to write pid file %s: %w", n, err)
}
// We create a distinct subdirectory for payloads within the tmpdir
// This will typically look like /tmp/ollama3208993108/runners on linux
payloadsDir = filepath.Join(tmpDir, "runners")
}
return payloadsDir, nil
}
// Best effort to clean up prior tmpdirs
func cleanupTmpDirs() {
matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid"))
if err != nil {
return
}
for _, match := range matches {
raw, err := os.ReadFile(match)
if errors.Is(err, os.ErrNotExist) {
slog.Debug("not a ollama runtime directory, skipping", "path", match)
continue
} else if err != nil {
slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
continue
}
pid, err := strconv.Atoi(string(raw))
if err != nil {
slog.Warn("invalid pid, skipping", "path", match, "error", err)
continue
}
p, err := os.FindProcess(pid)
if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
slog.Warn("process still running, skipping", "pid", pid, "path", match)
continue
}
if err := os.Remove(match); err != nil {
slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
}
runners := filepath.Join(filepath.Dir(match), "runners")
if err := os.RemoveAll(runners); err != nil {
slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
}
if err := os.Remove(filepath.Dir(match)); err != nil {
slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
}
}
}
func Cleanup() {
lock.Lock()
defer lock.Unlock()
runnersDir := envconfig.RunnersDir()
if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
// We want to fully clean up the tmpdir parent of the payloads dir
tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
slog.Debug("cleaning up", "dir", tmpDir)
err := os.RemoveAll(tmpDir)
if err != nil {
// On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove
time.Sleep(1000 * time.Millisecond)
err = os.RemoveAll(tmpDir)
if err != nil {
slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
}
}
}
}
func UpdatePath(dir string) {
if runtime.GOOS == "windows" {
tmpDir := filepath.Dir(dir)
pathComponents := strings.Split(os.Getenv("PATH"), ";")
i := 0
for _, comp := range pathComponents {
if strings.EqualFold(comp, dir) {
return
}
// Remove any other prior paths to our temp dir
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
pathComponents[i] = comp
i++
}
}
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
slog.Info("updating", "PATH", newPath)
os.Setenv("PATH", newPath)
}
// linux and darwin rely on rpath
}


@ -93,10 +93,9 @@ func initCudaHandles() *cudaHandles {
localAppData := os.Getenv("LOCALAPPDATA") localAppData := os.Getenv("LOCALAPPDATA")
cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)} cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
} }
tmpDir, _ := PayloadsDir() libDir := LibraryDir()
if tmpDir != "" { if libDir != "" {
// TODO - add "payloads" for subprocess cudartMgmtPatterns = []string{filepath.Join(libDir, CudartMgmtName)}
cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
} }
cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...) cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)


@ -913,7 +913,9 @@ struct llama_server_context
slot.sampled = result.tok; slot.sampled = result.tok;
// search stop word and delete it // search stop word and delete it
slot.generated_text += token_str; if (!llama_token_is_eog(model, result.tok))
slot.generated_text += token_str;
slot.has_next_token = true; slot.has_next_token = true;
if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1) if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
@ -954,30 +956,36 @@ struct llama_server_context
if (!incomplete) if (!incomplete)
{ {
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
const std::string str_test = slot.generated_text.substr(pos);
bool is_stop_full = false;
size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
if (stop_pos != std::string::npos)
{
is_stop_full = true;
slot.generated_text.erase(
slot.generated_text.begin() + pos + stop_pos,
slot.generated_text.end());
pos = std::min(slot.n_sent_text, slot.generated_text.size());
}
else
{
is_stop_full = false;
stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
}
// check if there is any token to predict if (!llama_token_is_eog(model, result.tok)) {
if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0)) const std::string str_test = slot.generated_text.substr(pos);
{ bool is_stop_full = false;
// no send the stop word in the response size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
result.text_to_send = slot.generated_text.substr(pos, std::string::npos); if (stop_pos != std::string::npos)
slot.n_sent_text += result.text_to_send.size(); {
// add the token to slot queue and cache is_stop_full = true;
slot.generated_text.erase(
slot.generated_text.begin() + pos + stop_pos,
slot.generated_text.end());
pos = std::min(slot.n_sent_text, slot.generated_text.size());
}
else
{
is_stop_full = false;
stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
}
// check if there is any token to predict
if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
{
// no send the stop word in the response
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
slot.n_sent_text += result.text_to_send.size();
// add the token to slot queue and cache
}
} else {
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
slot.n_sent_text += result.text_to_send.size();
} }
if (slot.params.stream) if (slot.params.stream)
@ -1117,9 +1125,7 @@ struct llama_server_context
{"multimodal", multimodal} {"multimodal", multimodal}
}; };
if (!llama_token_is_eog(model, tkn.tok)) { res.result_json["content"] = tkn.text_to_send;
res.result_json["content"] = tkn.text_to_send;
}
if (slot.sparams.n_probs > 0) if (slot.sparams.n_probs > 0)
{ {


@ -31,6 +31,7 @@ init_vars() {
NO_WHOLE_ARCHIVE="" NO_WHOLE_ARCHIVE=""
GCC_ARCH="-arch ${ARCH}" GCC_ARCH="-arch ${ARCH}"
DIST_BASE=../../dist/darwin-${GOARCH}/ DIST_BASE=../../dist/darwin-${GOARCH}/
PAYLOAD_BASE=../../build/darwin/${GOARCH}
;; ;;
"Linux") "Linux")
LIB_EXT="so" LIB_EXT="so"
@ -40,6 +41,7 @@ init_vars() {
# Cross compiling not supported on linux - Use docker # Cross compiling not supported on linux - Use docker
GCC_ARCH="" GCC_ARCH=""
DIST_BASE=../../dist/linux-${GOARCH}/ DIST_BASE=../../dist/linux-${GOARCH}/
PAYLOAD_BASE=../../build/linux/${GOARCH}
;; ;;
*) *)
;; ;;
@ -47,7 +49,8 @@ init_vars() {
if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80" CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
fi fi
GZIP=$(which pigz 2>/dev/null || echo "gzip") GZIP=$(command -v pigz 2>/dev/null || echo "gzip")
RUNNER_BASE="${DIST_BASE}/lib/ollama/runners"
} }
git_module_setup() { git_module_setup() {
@ -91,17 +94,34 @@ build() {
rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
} }
compress() { dist() {
echo "Compressing payloads to reduce overall binary size..." [ -z "${RUNNER}" ] && exit 1
rm -rf ${BUILD_DIR}/bin/*.gz mkdir -p ${RUNNER_BASE}/${RUNNER}/
for f in ${BUILD_DIR}/bin/* ; do for f in ${BUILD_DIR}/bin/* ; do
${GZIP} -n --best -f ${f} & cp ${f} ${RUNNER_BASE}/${RUNNER}/
done
# check for lib directory
if [ -d ${BUILD_DIR}/lib ]; then
for f in ${BUILD_DIR}/lib/* ; do
cp ${f} ${RUNNER_BASE}/${RUNNER}/
done
fi
}
# Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir
compress() {
[ -z "${RUNNER}" ] && exit 1
echo "Compressing payloads with ${GZIP} to reduce overall binary size..."
rm -rf "${PAYLOAD_BASE}/${RUNNER}/"
mkdir -p "${PAYLOAD_BASE}/${RUNNER}/"
for f in ${BUILD_DIR}/bin/* ; do
${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
compress_pids+=" $!" compress_pids+=" $!"
done done
# check for lib directory # check for lib directory
if [ -d ${BUILD_DIR}/lib ]; then if [ -d ${BUILD_DIR}/lib ]; then
for f in ${BUILD_DIR}/lib/* ; do for f in ${BUILD_DIR}/lib/* ; do
${GZIP} -n --best -f ${f} & ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
compress_pids+=" $!" compress_pids+=" $!"
done done
fi fi
@ -117,7 +137,7 @@ wait_for_compress() {
install() { install() {
echo "Installing libraries to bin dir ${BUILD_DIR}/bin/" echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT} | grep -v "${BUILD_DIR}/bin/" ); do
rm -f "${BUILD_DIR}/bin/$(basename ${lib})" rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
cp -af "${lib}" "${BUILD_DIR}/bin/" cp -af "${lib}" "${BUILD_DIR}/bin/"
done done


@ -39,7 +39,8 @@ case "${GOARCH}" in
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu" RUNNER=cpu
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
echo "Building LCD CPU" echo "Building LCD CPU"
build build
sign ${BUILD_DIR}/bin/ollama_llama_server sign ${BUILD_DIR}/bin/ollama_llama_server
@ -51,7 +52,8 @@ case "${GOARCH}" in
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx" RUNNER=cpu_avx
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
echo "Building AVX CPU" echo "Building AVX CPU"
build build
sign ${BUILD_DIR}/bin/ollama_llama_server sign ${BUILD_DIR}/bin/ollama_llama_server
@ -63,7 +65,8 @@ case "${GOARCH}" in
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2" RUNNER=cpu_avx2
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
echo "Building AVX2 CPU" echo "Building AVX2 CPU"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
build build
@ -84,7 +87,8 @@ case "${GOARCH}" in
if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
init_vars init_vars
CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
BUILD_DIR="../build/darwin/${ARCH}/metal" RUNNER="metal"
BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
build build
sign ${BUILD_DIR}/bin/ollama_llama_server sign ${BUILD_DIR}/bin/ollama_llama_server


@ -79,10 +79,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
init_vars init_vars
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu" RUNNER="cpu"
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
echo "Building custom CPU" echo "Building custom CPU"
build build
install install
dist
compress compress
else else
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512 # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
@ -102,10 +104,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu" RUNNER=cpu
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
echo "Building LCD CPU" echo "Building LCD CPU"
build build
install install
dist
compress compress
fi fi
@ -120,10 +124,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu_avx" RUNNER=cpu_avx
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
echo "Building AVX CPU" echo "Building AVX CPU"
build build
install install
dist
compress compress
fi fi
@ -134,10 +140,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# #
init_vars init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}" CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="../build/linux/${ARCH}/cpu_avx2" RUNNER=cpu_avx2
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
echo "Building AVX2 CPU" echo "Building AVX2 CPU"
build build
install install
dist
compress compress
fi fi
fi fi
@ -187,11 +195,13 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
fi fi
export CUDAFLAGS="-t8" export CUDAFLAGS="-t8"
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" RUNNER=cuda${CUDA_VARIANT}
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}" CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
build build
install install
dist
echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}" echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
mkdir -p "${CUDA_DIST_DIR}" mkdir -p "${CUDA_DIST_DIR}"
for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
@ -212,7 +222,8 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
CC=icx CC=icx
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF" CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
BUILD_DIR="../build/linux/${ARCH}/oneapi" RUNNER=oneapi
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama" ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
@ -231,6 +242,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}" cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}" cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
install install
dist
compress compress
fi fi
@ -259,7 +271,8 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}" CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
echo "Building custom ROCM GPU" echo "Building custom ROCM GPU"
fi fi
BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" RUNNER=rocm${ROCM_VARIANT}
BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
# ROCm dependencies are too large to fit into a unified bundle # ROCm dependencies are too large to fit into a unified bundle
ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama" ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
# TODO figure out how to disable runpath (rpath) # TODO figure out how to disable runpath (rpath)
@ -269,13 +282,17 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
# copy the ROCM dependencies # copy the ROCM dependencies
mkdir -p "${ROCM_DIST_DIR}" mkdir -p "${ROCM_DIST_DIR}"
for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${GOARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do
cp -a "${dep}"* "${ROCM_DIST_DIR}" cp -a "${dep}"* "${ROCM_DIST_DIR}"
if [ $(readlink -f "${dep}") != "${dep}" ] ; then
cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}"
fi
done done
install install
dist
compress compress
fi fi
cleanup cleanup
wait_for_compress wait_for_compress
echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)" echo "go generate completed. LLM runners: $(cd ${PAYLOAD_BASE}; echo *)"


@ -1,11 +1,7 @@
package llm package llm
import ( import (
"embed"
"syscall" "syscall"
) )
//go:embed build/darwin/arm64/*/bin/*
var libEmbed embed.FS
var LlamaServerSysProcAttr = &syscall.SysProcAttr{} var LlamaServerSysProcAttr = &syscall.SysProcAttr{}


@ -1,11 +0,0 @@
package llm
import (
"embed"
"syscall"
)
//go:embed build/darwin/x86_64/*/bin/*
var libEmbed embed.FS
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}


@ -1,11 +1,7 @@
package llm package llm
import ( import (
"embed"
"syscall" "syscall"
) )
//go:embed build/linux/*/*/bin/*
var libEmbed embed.FS
var LlamaServerSysProcAttr = &syscall.SysProcAttr{} var LlamaServerSysProcAttr = &syscall.SysProcAttr{}


@ -1,13 +1,9 @@
package llm package llm
import ( import (
"embed"
"syscall" "syscall"
) )
// unused on windows
var libEmbed embed.FS
const CREATE_DEFAULT_ERROR_MODE = 0x04000000 const CREATE_DEFAULT_ERROR_MODE = 0x04000000
var LlamaServerSysProcAttr = &syscall.SysProcAttr{ var LlamaServerSysProcAttr = &syscall.SysProcAttr{


@ -1,233 +0,0 @@
package llm
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"runtime"
"slices"
"strings"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/gpu"
)
var errPayloadMissing = errors.New("expected payloads not included in this build of ollama")
func Init() error {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
return err
}
if runtime.GOOS != "windows" {
slog.Info("extracting embedded files", "dir", payloadsDir)
binGlob := "build/*/*/*/bin/*"
// extract server libraries
err = extractFiles(payloadsDir, binGlob)
if err != nil {
return fmt.Errorf("extract binaries: %v", err)
}
}
var variants []string
for v := range getAvailableServers() {
variants = append(variants, v)
}
slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
return nil
}
// binary names may contain an optional variant separated by '_'
// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2"
// Any library without a variant is the lowest common denominator
func getAvailableServers() map[string]string {
payloadsDir, err := gpu.PayloadsDir()
if err != nil {
slog.Error("payload lookup error", "error", err)
return nil
}
// glob payloadsDir for files that start with ollama_
pattern := filepath.Join(payloadsDir, "*", "ollama_*")
files, err := filepath.Glob(pattern)
if err != nil {
slog.Debug("could not glob", "pattern", pattern, "error", err)
return nil
}
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
}
return servers
}
// serversForGpu returns a list of compatible servers give the provided GPU
// info, ordered by performance. assumes Init() has been called
// TODO - switch to metadata based mapping
func serversForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_
availableServers := getAvailableServers()
requested := info.Library
if info.Variant != gpu.CPUCapabilityNone.String() {
requested += "_" + info.Variant
}
servers := []string{}
// exact match first
for a := range availableServers {
if a == requested {
servers = []string{a}
if a == "metal" {
return servers
}
break
}
}
alt := []string{}
// Then for GPUs load alternates and sort the list for consistent load ordering
if info.Library != "cpu" {
for a := range availableServers {
if info.Library == strings.Split(a, "_")[0] && a != requested {
alt = append(alt, a)
}
}
slices.Sort(alt)
servers = append(servers, alt...)
}
if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
// Load up the best CPU variant if not primary requested
if info.Library != "cpu" {
variant := gpu.GetCPUCapability()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant.String() {
servers = append(servers, cmp)
break
}
}
} else {
servers = append(servers, "cpu")
}
}
if len(servers) == 0 {
servers = []string{"cpu"}
}
}
return servers
}
// Return the optimal server for this CPU architecture
func serverForCpu() string {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
return "metal"
}
variant := gpu.GetCPUCapability()
availableServers := getAvailableServers()
if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant.String() {
return cmp
}
}
}
return "cpu"
}
// extract extracts the embedded files to the target directory
func extractFiles(targetDir string, glob string) error {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return errPayloadMissing
}
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
}
g := new(errgroup.Group)
// build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE
for _, file := range files {
filename := file
variant := filepath.Base(filepath.Dir(filepath.Dir(filename)))
slog.Debug("extracting", "variant", variant, "file", filename)
g.Go(func() error {
srcf, err := libEmbed.Open(filename)
if err != nil {
return err
}
defer srcf.Close()
src := io.Reader(srcf)
if strings.HasSuffix(filename, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", filename, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
variantDir := filepath.Join(targetDir, variant)
if err := os.MkdirAll(variantDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", variantDir, err)
}
base := filepath.Base(filename)
destFilename := filepath.Join(variantDir, base)
_, err = os.Stat(destFilename)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", filename, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", filename, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", filename, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
gpu.Cleanup()
return err
}
return nil
}


@ -24,9 +24,11 @@ import (
"golang.org/x/sync/semaphore" "golang.org/x/sync/semaphore"
"github.com/ollama/ollama/api" "github.com/ollama/ollama/api"
"github.com/ollama/ollama/build"
"github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format" "github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu" "github.com/ollama/ollama/gpu"
"github.com/ollama/ollama/runners"
) )
type LlamaServer interface { type LlamaServer interface {
@ -106,7 +108,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
gpus = gpu.GetCPUInfo() gpus = gpu.GetCPUInfo()
} }
if len(gpus) == 1 && gpus[0].Library == "cpu" { if len(gpus) == 1 && gpus[0].Library == "cpu" {
cpuRunner = serverForCpu() cpuRunner = runners.ServerForCpu()
estimate = EstimateGPULayers(gpus, ggml, projectors, opts) estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
} else { } else {
estimate = EstimateGPULayers(gpus, ggml, projectors, opts) estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
@ -118,7 +120,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
opts.NumGPU = 0 opts.NumGPU = 0
case gpus[0].Library != "metal" && estimate.Layers == 0: case gpus[0].Library != "metal" && estimate.Layers == 0:
// Don't bother loading into the GPU if no layers can fit // Don't bother loading into the GPU if no layers can fit
cpuRunner = serverForCpu() cpuRunner = runners.ServerForCpu()
gpus = gpu.GetCPUInfo() gpus = gpu.GetCPUInfo()
case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu": case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
opts.NumGPU = estimate.Layers opts.NumGPU = estimate.Layers
@ -145,25 +147,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided") return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
} }
availableServers := getAvailableServers() rDir, err := runners.Refresh(build.EmbedFS)
if err != nil {
return nil, err
}
availableServers := runners.GetAvailableServers(rDir)
if len(availableServers) == 0 { if len(availableServers) == 0 {
if runtime.GOOS != "windows" { return nil, finalErr
slog.Warn("llama server binary disappeared, reinitializing payloads")
err = Init()
if err != nil {
slog.Warn("failed to reinitialize payloads", "error", err)
return nil, err
}
availableServers = getAvailableServers()
} else {
return nil, finalErr
}
} }
var servers []string var servers []string
if cpuRunner != "" { if cpuRunner != "" {
servers = []string{cpuRunner} servers = []string{cpuRunner}
} else { } else {
servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
} }
demandLib := envconfig.LLMLibrary() demandLib := envconfig.LLMLibrary()
if demandLib != "" { if demandLib != "" {
@ -274,7 +271,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--tensor-split", estimate.TensorSplit) params = append(params, "--tensor-split", estimate.TensorSplit)
} }
for i := range len(servers) { for i := range servers {
dir := availableServers[servers[i]] dir := availableServers[servers[i]]
if dir == "" { if dir == "" {
// Shouldn't happen // Shouldn't happen
@ -330,7 +327,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
_, err := os.Stat(server) _, err := os.Stat(server)
if errors.Is(err, os.ErrNotExist) { if errors.Is(err, os.ErrNotExist) {
slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err) slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
err = Init() _, err = runners.Refresh(build.EmbedFS)
if err != nil { if err != nil {
slog.Warn("failed to reinitialize payloads", "error", err) slog.Warn("failed to reinitialize payloads", "error", err)
return nil, err return nil, err

runners/common.go (new file, 384 lines)

@ -0,0 +1,384 @@
package runners
import (
"compress/gzip"
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
"runtime"
"slices"
"strconv"
"strings"
"sync"
"syscall"
"golang.org/x/sync/errgroup"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/gpu"
)
const (
binGlob = "*/*/*/*"
)
var (
lock sync.Mutex
runnersDir = ""
)
// Return the location where runners are stored
// If runners are payloads, this will either extract them
// or refresh them if any have disappeared due to tmp cleaners
func Refresh(payloadFS fs.FS) (string, error) {
lock.Lock()
defer lock.Unlock()
var err error
// Wire up extra logging on our first load
if runnersDir == "" {
defer func() {
var runners []string
for v := range GetAvailableServers(runnersDir) {
runners = append(runners, v)
}
slog.Info("Dynamic LLM libraries", "runners", runners)
slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
}()
}
if hasPayloads(payloadFS) {
if runnersDir == "" {
runnersDir, err = extractRunners(payloadFS)
} else {
err = refreshRunners(payloadFS, runnersDir)
}
} else if runnersDir == "" {
runnersDir, err = locateRunners()
}
return runnersDir, err
}
func Cleanup(payloadFS fs.FS) {
lock.Lock()
defer lock.Unlock()
if hasPayloads(payloadFS) && runnersDir != "" {
// We want to fully clean up the tmpdir parent of the payloads dir
tmpDir := filepath.Clean(filepath.Join(runnersDir, ".."))
slog.Debug("cleaning up", "dir", tmpDir)
err := os.RemoveAll(tmpDir)
if err != nil {
slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
}
}
}
func locateRunners() (string, error) {
exe, err := os.Executable()
if err != nil {
return "", err
}
cwd, err := os.Getwd()
if err != nil {
return "", err
}
var paths []string
for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe()), cwd} {
paths = append(paths,
root,
filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
)
}
// Try a few variations to improve developer experience when building from source in the local tree
for _, path := range paths {
candidate := filepath.Join(path, "lib", "ollama", "runners")
if _, err := os.Stat(candidate); err == nil {
return candidate, nil
}
}
return "", fmt.Errorf("unable to locate runners in any search path %v", paths)
}
// Return true if we're carrying nested payloads for the runners
func hasPayloads(payloadFS fs.FS) bool {
files, err := fs.Glob(payloadFS, binGlob)
if err != nil || len(files) == 0 || (len(files) == 1 && strings.Contains(files[0], "placeholder")) {
return false
}
return true
}
func extractRunners(payloadFS fs.FS) (string, error) {
cleanupTmpDirs()
tmpDir, err := os.MkdirTemp(envconfig.TmpDir(), "ollama")
if err != nil {
return "", fmt.Errorf("failed to generate tmp dir: %w", err)
}
// Track our pid so we can clean up orphaned tmpdirs
n := filepath.Join(tmpDir, "ollama.pid")
if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
slog.Warn("failed to write pid file", "file", n, "error", err)
}
// We create a distinct subdirectory for payloads within the tmpdir
// This will typically look like /tmp/ollama3208993108/runners on linux
rDir := filepath.Join(tmpDir, "runners")
slog.Info("extracting embedded files", "dir", rDir)
return rDir, refreshRunners(payloadFS, rDir)
}
func refreshRunners(payloadFS fs.FS, rDir string) error {
// extract or refresh server libraries
err := extractFiles(payloadFS, rDir, binGlob)
if err != nil {
return fmt.Errorf("extract binaries: %v", err)
}
return nil
}
// extract extracts the embedded files to the target directory
func extractFiles(payloadFS fs.FS, targetDir string, glob string) error {
files, err := fs.Glob(payloadFS, glob)
if err != nil || len(files) == 0 {
// Should not happen
return fmt.Errorf("extractFiles called without payload present")
}
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
}
g := new(errgroup.Group)
// $OS/$GOARCH/$RUNNER/$FILE
for _, file := range files {
filename := file
runner := filepath.Base(filepath.Dir(filename))
slog.Debug("extracting", "runner", runner, "payload", filename)
g.Go(func() error {
srcf, err := payloadFS.Open(filename)
if err != nil {
return err
}
defer srcf.Close()
src := io.Reader(srcf)
if strings.HasSuffix(filename, ".gz") {
src, err = gzip.NewReader(src)
if err != nil {
return fmt.Errorf("decompress payload %s: %v", filename, err)
}
filename = strings.TrimSuffix(filename, ".gz")
}
runnerDir := filepath.Join(targetDir, runner)
if err := os.MkdirAll(runnerDir, 0o755); err != nil {
return fmt.Errorf("extractFiles could not mkdir %s: %v", runnerDir, err)
}
base := filepath.Base(filename)
destFilename := filepath.Join(runnerDir, base)
_, err = os.Stat(destFilename)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", filename, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, src); err != nil {
return fmt.Errorf("copy payload %s: %v", filename, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", filename, err)
}
return nil
})
}
err = g.Wait()
if err != nil {
slog.Error("failed to extract files", "error", err)
// If we fail to extract, the payload dir is most likely unusable, so cleanup whatever we extracted
err := os.RemoveAll(targetDir)
if err != nil {
slog.Warn("failed to cleanup incomplete payload dir", "dir", targetDir, "error", err)
}
return err
}
return nil
}
// Best effort to clean up prior tmpdirs
func cleanupTmpDirs() {
tmpDir := envconfig.TmpDir()
if tmpDir == "" {
tmpDir = os.TempDir()
}
matches, err := filepath.Glob(filepath.Join(tmpDir, "ollama*", "ollama.pid"))
if err != nil {
return
}
for _, match := range matches {
raw, err := os.ReadFile(match)
if errors.Is(err, os.ErrNotExist) {
slog.Debug("not a ollama runtime directory, skipping", "path", match)
continue
} else if err != nil {
slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
continue
}
pid, err := strconv.Atoi(string(raw))
if err != nil {
slog.Warn("invalid pid, skipping", "path", match, "error", err)
continue
}
p, err := os.FindProcess(pid)
if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
slog.Warn("process still running, skipping", "pid", pid, "path", match)
continue
}
if err := os.Remove(match); err != nil {
slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
}
runners := filepath.Join(filepath.Dir(match), "runners")
if err := os.RemoveAll(runners); err != nil {
slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
}
if err := os.Remove(filepath.Dir(match)); err != nil {
slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
}
}
}
// directory names are the name of the runner and may contain an optional
// variant prefixed with '_' as the separator. For example, "cuda_v11" and
// "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the
// lowest common denominator
func GetAvailableServers(payloadsDir string) map[string]string {
if payloadsDir == "" {
slog.Error("empty runner dir")
return nil
}
// glob payloadsDir for files that start with ollama_
pattern := filepath.Join(payloadsDir, "*", "ollama_*")
files, err := filepath.Glob(pattern)
if err != nil {
slog.Debug("could not glob", "pattern", pattern, "error", err)
return nil
}
servers := make(map[string]string)
for _, file := range files {
slog.Debug("availableServers : found", "file", file)
servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
}
return servers
}
// ServersForGpu returns a list of compatible servers given the provided GPU
// info, ordered by performance. Assumes Refresh() has been called
// TODO - switch to metadata based mapping
func ServersForGpu(info gpu.GpuInfo) []string {
// glob workDir for files that start with ollama_
availableServers := GetAvailableServers(runnersDir)
requested := info.Library
if info.Variant != gpu.CPUCapabilityNone.String() {
requested += "_" + info.Variant
}
servers := []string{}
// exact match first
for a := range availableServers {
if a == requested {
servers = []string{a}
if a == "metal" {
return servers
}
break
}
}
alt := []string{}
// Then for GPUs load alternates and sort the list for consistent load ordering
if info.Library != "cpu" {
for a := range availableServers {
if info.Library == strings.Split(a, "_")[0] && a != requested {
alt = append(alt, a)
}
}
slices.Sort(alt)
servers = append(servers, alt...)
}
if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
// Load up the best CPU variant if not primary requested
if info.Library != "cpu" {
variant := gpu.GetCPUCapability()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant.String() {
servers = append(servers, cmp)
break
}
}
} else {
servers = append(servers, "cpu")
}
}
if len(servers) == 0 {
servers = []string{"cpu"}
}
}
return servers
}
// Return the optimal server for this CPU architecture
func ServerForCpu() string {
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
return "metal"
}
variant := gpu.GetCPUCapability()
availableServers := GetAvailableServers(runnersDir)
if variant != gpu.CPUCapabilityNone {
for cmp := range availableServers {
if cmp == "cpu_"+variant.String() {
return cmp
}
}
}
return "cpu"
}

runners/runners_test.go (new file, 50 lines)

@ -0,0 +1,50 @@
package runners
import (
"log/slog"
"os"
"path"
"runtime"
"strings"
"testing"
"testing/fstest"
)
func TestRefreshRunners(t *testing.T) {
slog.SetLogLoggerLevel(slog.LevelDebug)
payloadFS := fstest.MapFS{
path.Join(runtime.GOOS, runtime.GOARCH, "foo", "ollama_llama_server"): {Data: []byte("hello, world\n")},
}
tmpDir, err := os.MkdirTemp("", "testing")
if err != nil {
t.Fatalf("failed to make tmp dir %s", err)
}
t.Setenv("OLLAMA_TMPDIR", tmpDir)
rDir, err := Refresh(payloadFS)
if err != nil {
t.Fatalf("failed to extract to %s %s", tmpDir, err)
}
if !strings.Contains(rDir, tmpDir) {
t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
}
// spot check results
servers := GetAvailableServers(rDir)
if len(servers) < 1 {
t.Fatalf("expected at least 1 server")
}
// Refresh contents
rDir, err = extractRunners(payloadFS)
if err != nil {
t.Fatalf("failed to extract to %s %s", tmpDir, err)
}
if !strings.Contains(rDir, tmpDir) {
t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
}
cleanupTmpDirs()
Cleanup(payloadFS)
}


@ -2,8 +2,7 @@
set -e set -e
export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} . $(dirname $0)/env.sh
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
mkdir -p dist mkdir -p dist


@ -2,76 +2,34 @@
set -eu set -eu
export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} . $(dirname $0)/env.sh
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
# We use 2 different image repositories to handle combining architecture images into multiarch manifest
# (The ROCm image is x86 only and is not a multiarch manifest)
# For developers, you can override the DOCKER_ORG to generate multiarch manifests
# DOCKER_ORG=jdoe PUSH=1 ./scripts/build_docker.sh
DOCKER_ORG=${DOCKER_ORG:-"ollama"}
RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
# Set PUSH to a non-empty string to trigger push instead of load # Set PUSH to a non-empty string to trigger push instead of load
PUSH=${PUSH:-""} PUSH=${PUSH:-""}
# In CI mode, we break things down
OLLAMA_SKIP_MANIFEST_CREATE=${OLLAMA_SKIP_MANIFEST_CREATE:-""}
OLLAMA_SKIP_IMAGE_BUILD=${OLLAMA_SKIP_IMAGE_BUILD:-""}
if [ -z "${PUSH}" ] ; then if [ -z "${PUSH}" ] ; then
echo "Building ${FINAL_IMAGE_REPO}:$VERSION locally. set PUSH=1 to push"
LOAD_OR_PUSH="--load" LOAD_OR_PUSH="--load"
else else
echo "Will be pushing ${RELEASE_IMAGE_REPO}:$VERSION for ${BUILD_ARCH}" echo "Will be pushing ${FINAL_IMAGE_REPO}:$VERSION"
LOAD_OR_PUSH="--push" LOAD_OR_PUSH="--push"
fi fi
if [ -z "${OLLAMA_SKIP_IMAGE_BUILD}" ]; then docker buildx build \
for TARGETARCH in ${BUILD_ARCH}; do ${LOAD_OR_PUSH} \
docker build \ --platform=${PLATFORM} \
${LOAD_OR_PUSH} \ ${OLLAMA_COMMON_BUILD_ARGS} \
--platform=linux/${TARGETARCH} \ -f Dockerfile \
--build-arg=VERSION \ -t ${FINAL_IMAGE_REPO}:$VERSION \
--build-arg=GOFLAGS \ .
-f Dockerfile \
-t ${RELEASE_IMAGE_REPO}:$VERSION-${TARGETARCH} \
.
done
if echo ${BUILD_ARCH} | grep "amd64" > /dev/null; then if echo $PLATFORM | grep "amd64" > /dev/null; then
docker build \ docker buildx build \
${LOAD_OR_PUSH} \ ${LOAD_OR_PUSH} \
--platform=linux/amd64 \ --platform=linux/amd64 \
--build-arg=VERSION \ ${OLLAMA_COMMON_BUILD_ARGS} \
--build-arg=GOFLAGS \ --target runtime-rocm \
--target runtime-rocm \ -f Dockerfile \
-f Dockerfile \ -t ${FINAL_IMAGE_REPO}:$VERSION-rocm \
-t ${RELEASE_IMAGE_REPO}:$VERSION-rocm \ .
.
fi
fi
if [ -z "${OLLAMA_SKIP_MANIFEST_CREATE}" ]; then
if [ -n "${PUSH}" ]; then
docker manifest create ${FINAL_IMAGE_REPO}:$VERSION \
${RELEASE_IMAGE_REPO}:$VERSION-amd64 \
${RELEASE_IMAGE_REPO}:$VERSION-arm64
docker manifest push ${FINAL_IMAGE_REPO}:$VERSION
# For symmetry, tag/push the rocm image
if [ "${RELEASE_IMAGE_REPO}" != "${FINAL_IMAGE_REPO}" ]; then
echo "Tagging and pushing rocm image"
docker pull ${RELEASE_IMAGE_REPO}:$VERSION-rocm
docker tag ${RELEASE_IMAGE_REPO}:$VERSION-rocm ${FINAL_IMAGE_REPO}:$VERSION-rocm
docker push ${FINAL_IMAGE_REPO}:$VERSION-rocm
fi
else
echo "Skipping manifest generation when not pushing images are available locally as "
echo " ${RELEASE_IMAGE_REPO}:$VERSION-amd64"
echo " ${RELEASE_IMAGE_REPO}:$VERSION-arm64"
echo " ${RELEASE_IMAGE_REPO}:$VERSION-rocm"
fi
fi fi


@ -1,37 +1,29 @@
#!/bin/sh #!/bin/sh
#
# Mac ARM users, rosetta can be flaky, so to use a remote x86 builder
#
# docker context create amd64 --docker host=ssh://mybuildhost
# docker buildx create --name mybuilder amd64 --platform linux/amd64
# docker buildx create --name mybuilder --append desktop-linux --platform linux/arm64
# docker buildx use mybuilder
set -eu set -eu
export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} . $(dirname $0)/env.sh
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
GZIP=$(which pigz 2>/dev/null || echo "gzip")
BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""}
mkdir -p dist mkdir -p dist
for TARGETARCH in ${BUILD_ARCH}; do docker buildx build \
docker build \ --output type=local,dest=./dist/ \
--platform=linux/$TARGETARCH \ --platform=${PLATFORM} \
--build-arg=GOFLAGS \ ${OLLAMA_COMMON_BUILD_ARGS} \
--build-arg=CGO_CFLAGS \ --target dist \
--build-arg=OLLAMA_CUSTOM_CPU_DEFS \
--build-arg=AMDGPU_TARGETS \
--target build-$TARGETARCH \
-f Dockerfile \ -f Dockerfile \
-t builder:$TARGETARCH \
. .
docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
rm -rf ./dist/linux-$TARGETARCH # buildx behavior changes for single vs. multiplatform
docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist if echo $PLATFORM | grep "," > /dev/null ; then
if echo ${TARGETARCH} | grep "amd64" > /dev/null; then mv -f ./dist/linux_*64/ollama* ./dist/
docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist rmdir ./dist/linux_*64
fi fi
docker rm builder-$TARGETARCH
echo "Compressing final linux bundle..."
rm -f ./dist/ollama-linux-$TARGETARCH.tgz
(cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz )
if [ -d dist/linux-$TARGETARCH-rocm ]; then
(cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz )
fi
done

scripts/env.sh (new file, 14 lines)

@ -0,0 +1,14 @@
# Common environment setup across build*.sh scripts
export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
# TODO - consider `docker buildx ls --format=json` to autodiscover platform capability
PLATFORM=${PLATFORM:-"linux/arm64,linux/amd64"}
DOCKER_ORG=${DOCKER_ORG:-"ollama"}
RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION --build-arg=GOFLAGS --build-arg=OLLAMA_CUSTOM_CPU_DEFS --build-arg=AMDGPU_TARGETS"
echo "Building Ollama"
echo "VERSION=$VERSION"
echo "PLATFORM=$PLATFORM"


@@ -26,11 +26,13 @@ import (
     "golang.org/x/sync/errgroup"
     "github.com/ollama/ollama/api"
+    "github.com/ollama/ollama/build"
     "github.com/ollama/ollama/envconfig"
     "github.com/ollama/ollama/gpu"
     "github.com/ollama/ollama/llm"
     "github.com/ollama/ollama/openai"
     "github.com/ollama/ollama/parser"
+    "github.com/ollama/ollama/runners"
     "github.com/ollama/ollama/template"
     "github.com/ollama/ollama/types/errtypes"
     "github.com/ollama/ollama/types/model"
@@ -117,6 +119,32 @@ func (s *Server) GenerateHandler(c *gin.Context) {
         return
     }
+    // expire the runner
+    if req.Prompt == "" && req.KeepAlive != nil && int(req.KeepAlive.Seconds()) == 0 {
+        model, err := GetModel(req.Model)
+        if err != nil {
+            switch {
+            case os.IsNotExist(err):
+                c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
+            case err.Error() == "invalid model name":
+                c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+            default:
+                c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+            }
+            return
+        }
+        s.sched.expireRunner(model)
+        c.JSON(http.StatusOK, api.GenerateResponse{
+            Model:      req.Model,
+            CreatedAt:  time.Now().UTC(),
+            Response:   "",
+            Done:       true,
+            DoneReason: "unload",
+        })
+        return
+    }
     if req.Format != "" && req.Format != "json" {
         c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be empty or \"json\""})
         return
@@ -1190,12 +1218,12 @@ func Serve(ln net.Listener) error {
         srvr.Close()
         schedDone()
         sched.unloadAllRunners()
-        gpu.Cleanup()
+        runners.Cleanup(build.EmbedFS)
         done()
     }()
-    if err := llm.Init(); err != nil {
-        return fmt.Errorf("unable to initialize llm library %w", err)
+    if _, err := runners.Refresh(build.EmbedFS); err != nil {
+        return fmt.Errorf("unable to initialize llm runners %w", err)
     }
     s.sched.Run(schedCtx)
@@ -1322,6 +1350,32 @@ func (s *Server) ChatHandler(c *gin.Context) {
         return
     }
+    // expire the runner
+    if len(req.Messages) == 0 && req.KeepAlive != nil && int(req.KeepAlive.Seconds()) == 0 {
+        model, err := GetModel(req.Model)
+        if err != nil {
+            switch {
+            case os.IsNotExist(err):
+                c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
+            case err.Error() == "invalid model name":
+                c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+            default:
+                c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+            }
+            return
+        }
+        s.sched.expireRunner(model)
+        c.JSON(http.StatusOK, api.ChatResponse{
+            Model:      req.Model,
+            CreatedAt:  time.Now().UTC(),
+            Message:    api.Message{Role: "assistant"},
+            Done:       true,
+            DoneReason: "unload",
+        })
+        return
+    }
     caps := []Capability{CapabilityCompletion}
     if len(req.Tools) > 0 {
         caps = append(caps, CapabilityTools)

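From the client side, the new branch in both handlers means a request with keep_alive 0 and no prompt (or an empty message list) unloads the model instead of generating. A rough sketch of how that looks over the HTTP API; the model name is just an example.

# Unload via the generate endpoint: empty prompt + keep_alive 0.
curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": 0}'

# Same idea through the chat endpoint with an empty message list.
curl http://localhost:11434/api/chat -d '{"model": "llama3.1", "messages": [], "keep_alive": 0}'

# Per the handlers above, both responses report "done": true and "done_reason": "unload".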

@@ -360,7 +360,6 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
             slog.Debug("runner expired event received", "modelPath", runner.modelPath)
             runner.refMu.Lock()
             if runner.refCount > 0 {
-                // Shouldn't happen, but safeguard to ensure no leaked runners
                 slog.Debug("expired event with positive ref count, retrying", "modelPath", runner.modelPath, "refCount", runner.refCount)
                 go func(runner *runnerRef) {
                     // We can't unload yet, but want to as soon as the current request completes
@@ -802,6 +801,25 @@ func (s *Scheduler) unloadAllRunners() {
         }
     }
+func (s *Scheduler) expireRunner(model *Model) {
+    s.loadedMu.Lock()
+    defer s.loadedMu.Unlock()
+    runner, ok := s.loaded[model.ModelPath]
+    if ok {
+        runner.refMu.Lock()
+        runner.expiresAt = time.Now()
+        if runner.expireTimer != nil {
+            runner.expireTimer.Stop()
+            runner.expireTimer = nil
+        }
+        runner.sessionDuration = 0
+        if runner.refCount <= 0 {
+            s.expiredCh <- runner
+        }
+        runner.refMu.Unlock()
+    }
+}
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
 func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {

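expireRunner only pushes the runner onto expiredCh once its refCount has dropped to zero; otherwise it just zeroes the session duration and lets the normal completion path retire it, so in-flight requests still finish. One hedged way to watch this from a shell, assuming the ollama CLI is installed and a model is loaded (the model name is an example):

ollama ps        # the model is listed while its runner is loaded
curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": 0}'
ollama ps        # after the expire/unload cycle the entry should disappear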

@@ -406,6 +406,52 @@ func TestGetRunner(t *testing.T) {
     b.ctxDone()
 }
+func TestExpireRunner(t *testing.T) {
+    ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
+    defer done()
+    s := InitScheduler(ctx)
+    req := &LlmRequest{
+        ctx:             ctx,
+        model:           &Model{ModelPath: "foo"},
+        opts:            api.DefaultOptions(),
+        successCh:       make(chan *runnerRef, 1),
+        errCh:           make(chan error, 1),
+        sessionDuration: &api.Duration{Duration: 2 * time.Minute},
+    }
+    var ggml *llm.GGML
+    gpus := gpu.GpuInfoList{}
+    server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
+    s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+        return server, nil
+    }
+    s.load(req, ggml, gpus, 0)
+    select {
+    case err := <-req.errCh:
+        if err != nil {
+            t.Fatalf("expected no errors when loading, got '%s'", err.Error())
+        }
+    case resp := <-req.successCh:
+        s.loadedMu.Lock()
+        if resp.refCount != uint(1) || len(s.loaded) != 1 {
+            t.Fatalf("expected a model to be loaded")
+        }
+        s.loadedMu.Unlock()
+    }
+    s.expireRunner(&Model{ModelPath: "foo"})
+    s.finishedReqCh <- req
+    s.processCompleted(ctx)
+    s.loadedMu.Lock()
+    if len(s.loaded) != 0 {
+        t.Fatalf("expected model to be unloaded")
+    }
+    s.loadedMu.Unlock()
+}
 // TODO - add one scenario that triggers the bogus finished event with positive ref count
 func TestPrematureExpired(t *testing.T) {
     ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)