Compare commits: 76c9dc57fd...7f1565721c
22 commits
Commits in this range:
7f1565721c, d889c6fd07, 56b9af336a, fda0d3be52, cd5c8f6471, fef257c5c5, d066d9b8e0, 5a00dc9fc9, c354e87809, 93ac3760cb, abed273de3, 034392624c, ecab6f1cc5, 7d6900827d, 9246e6dd15, 735a0ca2e4, dddb72e084, 83a9b5271a, 4a8069f9c4, 84b84ce2db, bb6a086d63, 30c8f201cc
51 changed files with 1383 additions and 868 deletions

@@ -7,3 +7,5 @@ llm/llama.cpp
 .env
 .cache
 test_data
+llm/build
+llama/build

.github/workflows/release.yaml (vendored, 209 lines changed)

@@ -102,8 +102,8 @@ jobs:
 with:
 name: generate-windows-cpu
 path: |
-llm/build/**/bin/*
-llm/build/**/*.a
+build/**/*
+build/**/*.a
 dist/windows-amd64/**

 # ROCm generation step

@@ -176,7 +176,7 @@ jobs:
 with:
 name: generate-windows-rocm
 path: |
-llm/build/**/bin/*
+build/**/*
 dist/windows-amd64/**
 - uses: actions/upload-artifact@v4
 with:

@@ -265,7 +265,7 @@ jobs:
 with:
 name: generate-windows-cuda-${{ matrix.cuda.version }}
 path: |
-llm/build/**/bin/*
+build/**/*
 dist/windows-amd64/**
 - uses: actions/upload-artifact@v4
 with:

@@ -338,7 +338,7 @@ jobs:
 - uses: actions/download-artifact@v4
 with:
 name: generate-windows-rocm
-- run: dir llm/build
+- run: dir build
 - run: |
 $gopath=(get-command go).source | split-path -parent
 & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"

@@ -359,9 +359,7 @@ jobs:
 environment: release
 runs-on: linux
 env:
-OLLAMA_SKIP_MANIFEST_CREATE: '1'
 BUILD_ARCH: amd64
-PUSH: '1'
 steps:
 - uses: actions/checkout@v4
 with:

@@ -369,14 +367,8 @@ jobs:
 - name: Set Version
 shell: bash
 run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-- name: Login to Docker Hub
-uses: docker/login-action@v3
-with:
-username: ${{ vars.DOCKER_USER }}
-password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
 - run: |
 ./scripts/build_linux.sh
-./scripts/build_docker.sh
 - uses: actions/upload-artifact@v4
 with:
 name: dist-linux-amd64

@@ -390,9 +382,7 @@ jobs:
 environment: release
 runs-on: linux-arm64
 env:
-OLLAMA_SKIP_MANIFEST_CREATE: '1'
 BUILD_ARCH: arm64
-PUSH: '1'
 steps:
 - uses: actions/checkout@v4
 with:

@@ -421,14 +411,8 @@ jobs:
 sudo usermod -aG docker $USER
 sudo apt-get install acl
 sudo setfacl --modify user:$USER:rw /var/run/docker.sock
-- name: Login to Docker Hub
-uses: docker/login-action@v3
-with:
-username: ${{ vars.DOCKER_USER }}
-password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
 - run: |
 ./scripts/build_linux.sh
-./scripts/build_docker.sh
 - uses: actions/upload-artifact@v4
 with:
 name: dist-linux-arm64

@@ -436,6 +420,181 @@ jobs:
 dist/*linux*
 !dist/*-cov

+# Container image build
+build-linux:
+environment: release
+strategy:
+matrix:
+runner:
+- linux
+- linux-arm64
+runs-on: ${{ matrix.runner }}
+env:
+FINAL_IMAGE_REPO: ollama/ollama
+steps:
+- uses: actions/checkout@v4
+with:
+submodules: recursive
+- name: 'Install Docker'
+if: ${{ startsWith(matrix.runner, 'linux-arm64') }}
+run: |
+sudo apt-get update
+sudo apt-get install -y ca-certificates curl
+sudo install -m 0755 -d /etc/apt/keyrings
+sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+sudo chmod a+r /etc/apt/keyrings/docker.asc
+echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+sudo apt-get update
+sudo apt-get install -y docker-ce docker-ce-cli containerd.io
+sudo usermod -aG docker $USER
+sudo apt-get install acl
+sudo setfacl --modify user:$USER:rw /var/run/docker.sock
+- name: Docker meta
+id: meta
+uses: docker/metadata-action@v5
+with:
+images: ${{ env.FINAL_IMAGE_REPO }}
+flavor: |
+latest=false
+tags: |
+type=ref,event=tag
+type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
+type=semver,pattern={{version}}
+- name: Set Version
+shell: bash
+run: |
+machine=$(uname -m)
+case ${machine} in
+x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
+aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
+esac >>$GITHUB_ENV
+echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
+- name: Set up Docker Buildx
+uses: docker/setup-buildx-action@v3
+- name: Login to Docker Hub
+uses: docker/login-action@v3
+with:
+username: ${{ vars.DOCKER_USER }}
+password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
+- name: Build and push by digest
+id: build
+uses: docker/build-push-action@v6
+with:
+context: "."
+platforms: linux/${{ env.ARCH }}
+build-args: |
+GOFLAGS
+outputs: type=image,name=${{ env.FINAL_IMAGE_REPO }},push-by-digest=true,name-canonical=true,push=true
+- name: Export digest
+run: |
+mkdir -p /tmp/digests
+digest="${{ steps.build.outputs.digest }}"
+touch "/tmp/digests/${digest#sha256:}"
+- name: Upload digest
+uses: actions/upload-artifact@v4
+with:
+name: digests-${{ env.PLATFORM_PAIR }}
+path: /tmp/digests/*
+if-no-files-found: error
+retention-days: 1
+merge:
+environment: release
+runs-on: linux
+needs:
+- build-linux
+env:
+FINAL_IMAGE_REPO: ollama/ollama
+steps:
+- uses: actions/checkout@v4
+with:
+submodules: recursive
+- name: Download digests
+uses: actions/download-artifact@v4
+with:
+path: /tmp/digests
+pattern: digests-*
+merge-multiple: true
+- name: Set up Docker Buildx
+uses: docker/setup-buildx-action@v3
+- name: Docker meta
+id: meta
+uses: docker/metadata-action@v5
+with:
+images: ${{ env.FINAL_IMAGE_REPO }}
+flavor: |
+latest=false
+tags: |
+type=ref,event=tag
+type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
+type=semver,pattern={{version}}
+- name: Set Version
+shell: bash
+run: |
+machine=$(uname -m)
+case ${machine} in
+x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
+aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
+esac >>$GITHUB_ENV
+echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
+- name: Login to Docker Hub
+uses: docker/login-action@v3
+with:
+username: ${{ vars.DOCKER_USER }}
+password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
+- name: Create manifest list and push
+working-directory: /tmp/digests
+run: |
+docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
+$(printf '${{ env.FINAL_IMAGE_REPO }}@sha256:%s ' *)
+- name: Inspect image
+run: |
+docker buildx imagetools inspect ${{ env.FINAL_IMAGE_REPO }}:${{ steps.meta.outputs.version }}
+build-linux-rocm:
+environment: release
+runs-on: linux
+env:
+FINAL_IMAGE_REPO: ollama/ollama
+ARCH: amd64
+PLATFORM_PAIR: linux-amd64
+steps:
+- uses: actions/checkout@v4
+with:
+submodules: recursive
+- name: Docker meta
+id: meta
+uses: docker/metadata-action@v5
+with:
+images: ${{ env.FINAL_IMAGE_REPO }}
+flavor: |
+latest=false
+tags: |
+type=ref,event=tag
+type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
+type=semver,pattern={{version}}
+- name: Set Version
+shell: bash
+run: |
+echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
+- name: Set up Docker Buildx
+uses: docker/setup-buildx-action@v3
+- name: Login to Docker Hub
+uses: docker/login-action@v3
+with:
+username: ${{ vars.DOCKER_USER }}
+password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
+- name: Build and push by digest
+id: build
+uses: docker/build-push-action@v6
+with:
+context: "."
+target: runtime-rocm
+build-args: |
+GOFLAGS
+tags: ${{ env.FINAL_IMAGE_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION}}-rocm,${{ env.FINAL_IMAGE_REPO }}:rocm
+push: true
 # Aggregate all the assets and ship a release
 release:
 needs:

@@ -448,8 +607,6 @@ jobs:
 permissions:
 contents: write
 env:
-OLLAMA_SKIP_IMAGE_BUILD: '1'
-PUSH: '1'
 GH_TOKEN: ${{ github.token }}
 steps:
 - uses: actions/checkout@v4

@@ -458,12 +615,6 @@ jobs:
 run: |
 echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
 echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
-- name: Login to Docker Hub
-uses: docker/login-action@v3
-with:
-username: ${{ vars.DOCKER_USER }}
-password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
-- run: ./scripts/build_docker.sh
 - name: Retrieve built artifact
 uses: actions/download-artifact@v4
 with:
.github/workflows/test.yaml (vendored, 43 lines changed)

@@ -81,12 +81,6 @@ jobs:
 if: ${{ ! startsWith(matrix.os, 'windows-') }}
 name: 'Unix Go Generate'
 - run: go build .
-- uses: actions/upload-artifact@v4
-with:
-name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
-path: |
-llm/build/**/bin/*
-llm/build/**/*.a
 generate-cuda:
 needs: [changes]
 if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}

@@ -114,12 +108,6 @@ jobs:
 go generate -x ./...
 env:
 OLLAMA_SKIP_CPU_GENERATE: '1'
-- uses: actions/upload-artifact@v4
-with:
-name: cuda-${{ matrix.cuda-version }}-libraries
-path: |
-llm/build/**/bin/*
-dist/windows-amd64/**
 generate-rocm:
 needs: [changes]
 if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}

@@ -147,12 +135,6 @@ jobs:
 go generate -x ./...
 env:
 OLLAMA_SKIP_CPU_GENERATE: '1'
-- uses: actions/upload-artifact@v4
-with:
-name: rocm-${{ matrix.rocm-version }}-libraries
-path: |
-llm/build/**/bin/*
-dist/windows-amd64/**

 # ROCm generation step
 generate-windows-rocm:

@@ -189,7 +171,6 @@ jobs:
 name: go generate
 env:
 OLLAMA_SKIP_CPU_GENERATE: '1'
-# TODO - do we need any artifacts?

 # CUDA generation step
 generate-windows-cuda:

@@ -231,7 +212,6 @@ jobs:
 go generate -x ./...
 env:
 OLLAMA_SKIP_CPU_GENERATE: '1'
-# TODO - do we need any artifacts?

 lint:
 strategy:

@@ -263,14 +243,6 @@ jobs:
 arm64) echo ARCH=arm64 ;;
 esac >>$GITHUB_ENV
 shell: bash
-- run: |
-mkdir -p llm/build/linux/$ARCH/stub/bin
-touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
-if: ${{ startsWith(matrix.os, 'ubuntu-') }}
-- run: |
-mkdir -p llm/build/darwin/$ARCH/stub/bin
-touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
-if: ${{ startsWith(matrix.os, 'macos-') }}
 - uses: golangci/golangci-lint-action@v6
 with:
 args: --timeout 8m0s -v

@@ -301,23 +273,10 @@ jobs:
 cache: true
 - run: |
 case ${{ matrix.arch }} in
-amd64) echo ARCH=x86_64 ;;
+amd64) echo ARCH=amd64 ;;
 arm64) echo ARCH=arm64 ;;
 esac >>$GITHUB_ENV
 shell: bash
-- run: |
-mkdir -p llm/build/linux/$ARCH/stub/bin
-touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
-if: ${{ startsWith(matrix.os, 'ubuntu-') }}
-- run: |
-mkdir -p llm/build/darwin/$ARCH/stub/bin
-touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
-if: ${{ startsWith(matrix.os, 'macos-') }}
-shell: bash
 - run: go generate ./...
 - run: go build
 - run: go test -v ./...
-- uses: actions/upload-artifact@v4
-with:
-name: ${{ matrix.os }}-binaries
-path: ollama
.gitignore (vendored, 3 lines changed)

@@ -12,4 +12,7 @@ ggml-metal.metal
 test_data
 *.crt
 llm/build
+build/*/*/*
+!build/**/placeholder
+llama/build
 __debug_bin*

@@ -312,6 +312,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support)
 - [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
 - [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library)
+- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama)

 ### Terminal

@@ -336,6 +337,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [podman-ollama](https://github.com/ericcurtin/podman-ollama)
 - [gollama](https://github.com/sammcj/gollama)
 - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/)
+- [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe)

 ### Apple Vision Pro
 - [Enchanted](https://github.com/AugustDev/enchanted)

@@ -358,6 +360,7 @@ See the [API documentation](./docs/api.md) for all endpoints.

 - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
 - [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
+- [crewAI](https://github.com/crewAIInc/crewAI)
 - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
 - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
 - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)

@@ -427,6 +430,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server)
 - [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links)
 - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality)
+- [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator)
+- [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator)

 ### Supported backends

build/darwin/amd64/placeholder (new file, 1 line)

@@ -0,0 +1 @@
+This is here to make sure the build/ directory exists for the go:embed command

build/darwin/arm64/placeholder (new file, 1 line)

@@ -0,0 +1 @@
+This is here to make sure the build/ directory exists for the go:embed command

build/embed_darwin_amd64.go (new file, 8 lines)

@@ -0,0 +1,8 @@
+package build
+
+import "embed"
+
+// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
+
+//go:embed darwin/amd64/*
+var EmbedFS embed.FS

build/embed_darwin_arm64.go (new file, 8 lines)

@@ -0,0 +1,8 @@
+package build
+
+import "embed"
+
+// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
+
+//go:embed darwin/arm64/*
+var EmbedFS embed.FS

build/embed_linux.go (new file, 6 lines)

@@ -0,0 +1,6 @@
+package build
+
+import "embed"
+
+//go:embed linux/*
+var EmbedFS embed.FS

build/embed_unused.go (new file, 8 lines)

@@ -0,0 +1,8 @@
+//go:build !linux && !darwin
+
+package build
+
+import "embed"
+
+// unused on windows
+var EmbedFS embed.FS

build/linux/amd64/placeholder (new file, 1 line)

@@ -0,0 +1 @@
+This is here to make sure the build/ directory exists for the go:embed command

build/linux/arm64/placeholder (new file, 1 line)

@@ -0,0 +1 @@
+This is here to make sure the build/ directory exists for the go:embed command
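The placeholder files above exist because `//go:embed` fails to compile when its pattern matches no files, so the otherwise-empty `build/` subdirectories are kept non-empty until real payloads are generated into them. As a rough sketch only (not part of this changeset; the import path and the idea of listing payloads are assumptions), a consumer of the new `build` package could walk `build.EmbedFS` to see which payloads were compiled into the binary:

```go
package main

import (
	"fmt"
	"io/fs"

	"github.com/ollama/ollama/build" // assumed import path for the new build package
)

func main() {
	// Walk the embedded filesystem and print every payload, skipping the
	// checked-in placeholder files that only keep the directories non-empty.
	err := fs.WalkDir(build.EmbedFS, ".", func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if !d.IsDir() && d.Name() != "placeholder" {
			fmt.Println("embedded payload:", path)
		}
		return nil
	})
	if err != nil {
		fmt.Println("walk failed:", err)
	}
}
```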
cmd/cmd.go (230 lines changed)

@@ -2,6 +2,7 @@ package cmd
 
 import (
 	"archive/zip"
+	"bufio"
 	"bytes"
 	"context"
 	"crypto/ed25519"

@@ -21,6 +22,7 @@ import (
 	"regexp"
 	"runtime"
 	"slices"
+	"strconv"
 	"strings"
 	"sync/atomic"
 	"syscall"

@@ -344,6 +346,39 @@ func (w *progressWriter) Write(p []byte) (n int, err error) {
 	return len(p), nil
 }
 
+func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
+	p := progress.NewProgress(os.Stderr)
+	defer p.StopAndClear()
+
+	spinner := progress.NewSpinner("")
+	p.Add("", spinner)
+
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return err
+	}
+
+	req := &api.GenerateRequest{
+		Model:     opts.Model,
+		KeepAlive: opts.KeepAlive,
+	}
+
+	return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil })
+}
+
+func StopHandler(cmd *cobra.Command, args []string) error {
+	opts := &runOptions{
+		Model:     args[0],
+		KeepAlive: &api.Duration{Duration: 0},
+	}
+	if err := loadOrUnloadModel(cmd, opts); err != nil {
+		if strings.Contains(err.Error(), "not found") {
+			return fmt.Errorf("couldn't find model \"%s\" to stop", args[0])
+		}
+	}
+	return nil
+}
+
 func RunHandler(cmd *cobra.Command, args []string) error {
 	interactive := true
 
@@ -422,7 +457,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	opts.ParentModel = info.Details.ParentModel
 
 	if interactive {
-		if err := loadModel(cmd, &opts); err != nil {
+		if err := loadOrUnloadModel(cmd, &opts); err != nil {
 			return err
 		}
 
@@ -578,7 +613,7 @@ func ListHandler(cmd *cobra.Command, args []string) error {
 	table.SetHeaderLine(false)
 	table.SetBorder(false)
 	table.SetNoWhiteSpace(true)
-	table.SetTablePadding("\t")
+	table.SetTablePadding(" ")
 	table.AppendBulk(data)
 	table.Render()
 
@@ -613,7 +648,15 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
 				cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
 				procStr = fmt.Sprintf("%d%%/%d%% CPU/GPU", int(cpuPercent), int(100-cpuPercent))
 			}
-			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, format.HumanTime(m.ExpiresAt, "Never")})
+			var until string
+			delta := time.Since(m.ExpiresAt)
+			if delta > 0 {
+				until = "Stopping..."
+			} else {
+				until = format.HumanTime(m.ExpiresAt, "Never")
+			}
+			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, until})
 		}
 	}
 
@@ -624,7 +667,7 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
 	table.SetHeaderLine(false)
 	table.SetBorder(false)
 	table.SetNoWhiteSpace(true)
-	table.SetTablePadding("\t")
+	table.SetTablePadding(" ")
 	table.AppendBulk(data)
 	table.Render()
 
@@ -720,125 +763,89 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		return nil
 	}
 
-	showInfo(resp)
-
-	return nil
+	return showInfo(resp, os.Stdout)
 }
 
-func showInfo(resp *api.ShowResponse) {
-	modelData := [][]string{
-		{"parameters", resp.Details.ParameterSize},
-		{"quantization", resp.Details.QuantizationLevel},
-	}
-	if resp.ModelInfo != nil {
-		arch := resp.ModelInfo["general.architecture"].(string)
-		modelData = append(modelData,
-			[]string{"arch", arch},
-			[]string{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))},
-			[]string{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))},
-		)
-	}
-
-	mainTableData := [][]string{
-		{"Model"},
-		{renderSubTable(modelData, false)},
-	}
+func showInfo(resp *api.ShowResponse, w io.Writer) error {
+	tableRender := func(header string, rows func() [][]string) {
+		fmt.Fprintln(w, " ", header)
+		table := tablewriter.NewWriter(w)
+		table.SetAlignment(tablewriter.ALIGN_LEFT)
+		table.SetBorder(false)
+		table.SetNoWhiteSpace(true)
+		table.SetTablePadding(" ")
+
+		switch header {
+		case "Template", "System", "License":
+			table.SetColWidth(100)
+		}
+
+		table.AppendBulk(rows())
+		table.Render()
+		fmt.Fprintln(w)
+	}
+
+	tableRender("Model", func() (rows [][]string) {
+		if resp.ModelInfo != nil {
+			arch := resp.ModelInfo["general.architecture"].(string)
+			rows = append(rows, []string{"", "architecture", arch})
+			rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ModelInfo["general.parameter_count"].(float64)))})
+			rows = append(rows, []string{"", "context length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64), 'f', -1, 64)})
+			rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64), 'f', -1, 64)})
+		} else {
+			rows = append(rows, []string{"", "architecture", resp.Details.Family})
+			rows = append(rows, []string{"", "parameters", resp.Details.ParameterSize})
+		}
+		rows = append(rows, []string{"", "quantization", resp.Details.QuantizationLevel})
+		return
+	})
 
 	if resp.ProjectorInfo != nil {
-		projectorData := [][]string{
-			{"arch", "clip"},
-			{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
-		}
-
-		if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok {
-			projectorData = append(projectorData, []string{"projector type", projectorType.(string)})
-		}
-
-		projectorData = append(projectorData,
-			[]string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
-			[]string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
-		)
-
-		mainTableData = append(mainTableData,
-			[]string{"Projector"},
-			[]string{renderSubTable(projectorData, false)},
-		)
+		tableRender("Projector", func() (rows [][]string) {
+			arch := resp.ProjectorInfo["general.architecture"].(string)
+			rows = append(rows, []string{"", "architecture", arch})
+			rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))})
+			rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.embedding_length", arch)].(float64), 'f', -1, 64)})
+			rows = append(rows, []string{"", "dimensions", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.projection_dim", arch)].(float64), 'f', -1, 64)})
+			return
+		})
 	}
 
 	if resp.Parameters != "" {
-		mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)})
+		tableRender("Parameters", func() (rows [][]string) {
+			scanner := bufio.NewScanner(strings.NewReader(resp.Parameters))
+			for scanner.Scan() {
+				if text := scanner.Text(); text != "" {
+					rows = append(rows, append([]string{""}, strings.Fields(text)...))
+				}
+			}
+			return
+		})
+	}
+
+	head := func(s string, n int) (rows [][]string) {
+		scanner := bufio.NewScanner(strings.NewReader(s))
+		for scanner.Scan() && (len(rows) < n || n < 0) {
+			if text := scanner.Text(); text != "" {
+				rows = append(rows, []string{"", strings.TrimSpace(text)})
+			}
+		}
+		return
 	}
 
 	if resp.System != "" {
-		mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)})
+		tableRender("System", func() [][]string {
+			return head(resp.System, 2)
+		})
 	}
 
 	if resp.License != "" {
-		mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)})
-	}
-
-	table := tablewriter.NewWriter(os.Stdout)
-	table.SetAutoWrapText(false)
-	table.SetBorder(false)
-	table.SetAlignment(tablewriter.ALIGN_LEFT)
-
-	for _, v := range mainTableData {
-		table.Append(v)
-	}
-
-	table.Render()
-}
-
-func renderSubTable(data [][]string, file bool) string {
-	var buf bytes.Buffer
-	table := tablewriter.NewWriter(&buf)
-	table.SetAutoWrapText(!file)
-	table.SetBorder(false)
-	table.SetNoWhiteSpace(true)
-	table.SetTablePadding("\t")
-	table.SetAlignment(tablewriter.ALIGN_LEFT)
-
-	for _, v := range data {
-		table.Append(v)
-	}
-
-	table.Render()
-
-	renderedTable := buf.String()
-	lines := strings.Split(renderedTable, "\n")
-	for i, line := range lines {
-		lines[i] = "\t" + line
-	}
-
-	return strings.Join(lines, "\n")
-}
-
-func twoLines(s string) [][]string {
-	lines := strings.Split(s, "\n")
-	res := [][]string{}
-
-	count := 0
-	for _, line := range lines {
-		line = strings.TrimSpace(line)
-		if line != "" {
-			count++
-			res = append(res, []string{line})
-			if count == 2 {
-				return res
-			}
-		}
-	}
-	return res
-}
-
-func formatParams(s string) string {
-	lines := strings.Split(s, "\n")
-	table := [][]string{}
-
-	for _, line := range lines {
-		table = append(table, strings.Fields(line))
-	}
-	return renderSubTable(table, false)
+		tableRender("License", func() [][]string {
+			return head(resp.License, 2)
+		})
+	}
+
+	return nil
 }
 
 func CopyHandler(cmd *cobra.Command, args []string) error {

@@ -1328,6 +1335,15 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
 	runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
 	runCmd.Flags().String("format", "", "Response format (e.g. json)")
+
+	stopCmd := &cobra.Command{
+		Use:     "stop MODEL",
+		Short:   "Stop a running model",
+		Args:    cobra.ExactArgs(1),
+		PreRunE: checkServerHeartbeat,
+		RunE:    StopHandler,
+	}
+
 	serveCmd := &cobra.Command{
 		Use:     "serve",
 		Aliases: []string{"start"},

@@ -1395,6 +1411,7 @@ func NewCLI() *cobra.Command {
 		createCmd,
 		showCmd,
 		runCmd,
+		stopCmd,
 		pullCmd,
 		pushCmd,
 		listCmd,

@@ -1434,6 +1451,7 @@ func NewCLI() *cobra.Command {
 		createCmd,
 		showCmd,
 		runCmd,
+		stopCmd,
 		pullCmd,
 		pushCmd,
 		listCmd,

cmd/cmd_test.go (new file, 206 lines)

@@ -0,0 +1,206 @@
+package cmd
+
+import (
+	"bytes"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+
+	"github.com/ollama/ollama/api"
+)
+
+func TestShowInfo(t *testing.T) {
+	t.Run("bare details", func(t *testing.T) {
+		var b bytes.Buffer
+		if err := showInfo(&api.ShowResponse{
+			Details: api.ModelDetails{
+				Family:            "test",
+				ParameterSize:     "7B",
+				QuantizationLevel: "FP16",
+			},
+		}, &b); err != nil {
+			t.Fatal(err)
+		}
+
+		expect := ` Model
+architecture test
+parameters 7B
+quantization FP16
+
+`
+
+		if diff := cmp.Diff(expect, b.String()); diff != "" {
+			t.Errorf("unexpected output (-want +got):\n%s", diff)
+		}
+	})
+
+	t.Run("bare model info", func(t *testing.T) {
+		var b bytes.Buffer
+		if err := showInfo(&api.ShowResponse{
+			ModelInfo: map[string]any{
+				"general.architecture":    "test",
+				"general.parameter_count": float64(7_000_000_000),
+				"test.context_length":     float64(0),
+				"test.embedding_length":   float64(0),
+			},
+			Details: api.ModelDetails{
+				Family:            "test",
+				ParameterSize:     "7B",
+				QuantizationLevel: "FP16",
+			},
+		}, &b); err != nil {
+			t.Fatal(err)
+		}
+
+		expect := ` Model
+architecture test
+parameters 7B
+context length 0
+embedding length 0
+quantization FP16
+
+`
+		if diff := cmp.Diff(expect, b.String()); diff != "" {
+			t.Errorf("unexpected output (-want +got):\n%s", diff)
+		}
+	})
+
+	t.Run("parameters", func(t *testing.T) {
+		var b bytes.Buffer
+		if err := showInfo(&api.ShowResponse{
+			Details: api.ModelDetails{
+				Family:            "test",
+				ParameterSize:     "7B",
+				QuantizationLevel: "FP16",
+			},
+			Parameters: `
+stop never
+stop gonna
+stop give
+stop you
+stop up
+temperature 99`,
+		}, &b); err != nil {
+			t.Fatal(err)
+		}
+
+		expect := ` Model
+architecture test
+parameters 7B
+quantization FP16
+
+Parameters
+stop never
+stop gonna
+stop give
+stop you
+stop up
+temperature 99
+
+`
+		if diff := cmp.Diff(expect, b.String()); diff != "" {
+			t.Errorf("unexpected output (-want +got):\n%s", diff)
+		}
+	})
+
+	t.Run("project info", func(t *testing.T) {
+		var b bytes.Buffer
+		if err := showInfo(&api.ShowResponse{
+			Details: api.ModelDetails{
+				Family:            "test",
+				ParameterSize:     "7B",
+				QuantizationLevel: "FP16",
+			},
+			ProjectorInfo: map[string]any{
+				"general.architecture":         "clip",
+				"general.parameter_count":      float64(133_700_000),
+				"clip.vision.embedding_length": float64(0),
+				"clip.vision.projection_dim":   float64(0),
+			},
+		}, &b); err != nil {
+			t.Fatal(err)
+		}
+
+		expect := ` Model
+architecture test
+parameters 7B
+quantization FP16
+
+Projector
+architecture clip
+parameters 133.70M
+embedding length 0
+dimensions 0
+
+`
+		if diff := cmp.Diff(expect, b.String()); diff != "" {
+			t.Errorf("unexpected output (-want +got):\n%s", diff)
+		}
+	})
+
+	t.Run("system", func(t *testing.T) {
+		var b bytes.Buffer
+		if err := showInfo(&api.ShowResponse{
+			Details: api.ModelDetails{
+				Family:            "test",
+				ParameterSize:     "7B",
+				QuantizationLevel: "FP16",
+			},
+			System: `You are a pirate!
+Ahoy, matey!
+Weigh anchor!
+`,
+		}, &b); err != nil {
+			t.Fatal(err)
+		}
+
+		expect := ` Model
+architecture test
+parameters 7B
+quantization FP16
+
+System
+You are a pirate!
+Ahoy, matey!
+
+`
+		if diff := cmp.Diff(expect, b.String()); diff != "" {
+			t.Errorf("unexpected output (-want +got):\n%s", diff)
+		}
+	})
+
+	t.Run("license", func(t *testing.T) {
+		var b bytes.Buffer
+		license, err := os.ReadFile(filepath.Join("..", "LICENSE"))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if err := showInfo(&api.ShowResponse{
+			Details: api.ModelDetails{
+				Family:            "test",
+				ParameterSize:     "7B",
+				QuantizationLevel: "FP16",
+			},
+			License: string(license),
+		}, &b); err != nil {
+			t.Fatal(err)
+		}
+
+		expect := ` Model
+architecture test
+parameters 7B
+quantization FP16
+
+License
+MIT License
+Copyright (c) Ollama
+
+`
+		if diff := cmp.Diff(expect, b.String()); diff != "" {
+			t.Errorf("unexpected output (-want +got):\n%s", diff)
+		}
+	})
+}

@@ -18,7 +18,6 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/parser"
-	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/readline"
 	"github.com/ollama/ollama/types/errtypes"
 )

@@ -31,26 +30,6 @@ const (
 	MultilineSystem
 )
 
-func loadModel(cmd *cobra.Command, opts *runOptions) error {
-	p := progress.NewProgress(os.Stderr)
-	defer p.StopAndClear()
-
-	spinner := progress.NewSpinner("")
-	p.Add("", spinner)
-
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return err
-	}
-
-	chatReq := &api.ChatRequest{
-		Model:     opts.Model,
-		KeepAlive: opts.KeepAlive,
-	}
-
-	return client.Chat(cmd.Context(), chatReq, func(api.ChatResponse) error { return nil })
-}
-
 func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 	usage := func() {
 		fmt.Fprintln(os.Stderr, "Available Commands:")

@@ -217,7 +196,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 	opts.Model = args[1]
 	opts.Messages = []api.Message{}
 	fmt.Printf("Loading model '%s'\n", opts.Model)
-	if err := loadModel(cmd, &opts); err != nil {
+	if err := loadOrUnloadModel(cmd, &opts); err != nil {
 		return err
 	}
 	continue

@@ -371,7 +350,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 
 	switch args[1] {
 	case "info":
-		showInfo(resp)
+		_ = showInfo(resp, os.Stderr)
 	case "license":
 		if resp.License == "" {
 			fmt.Println("No license was specified for this model.")

@@ -208,14 +208,18 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error {
 		return err
 	}
 
-	if vocabSize := int(p.VocabSize); vocabSize > len(t.Vocabulary.Tokens) {
-		slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", p.VocabSize, "actual", len(t.Vocabulary.Tokens))
+	vocabSize := int(p.VocabSize)
+	switch {
+	case vocabSize > len(t.Vocabulary.Tokens):
+		slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens))
 		for i := range vocabSize - len(t.Vocabulary.Tokens) {
 			t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i))
 			t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1)
 			t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined)
 		}
-	} else {
+	case vocabSize < len(t.Vocabulary.Tokens):
+		return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize)
+	default:
 		slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens))
 	}
 
48
docs/api.md
48
docs/api.md
|
@ -69,7 +69,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/generate -d '{
|
curl http://localhost:11434/api/generate -d '{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"prompt": "Why is the sky blue?"
|
"prompt": "Why is the sky blue?"
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
@ -80,7 +80,7 @@ A stream of JSON objects is returned:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"created_at": "2023-08-04T08:52:19.385406455-07:00",
|
"created_at": "2023-08-04T08:52:19.385406455-07:00",
|
||||||
"response": "The",
|
"response": "The",
|
||||||
"done": false
|
"done": false
|
||||||
|
@ -102,7 +102,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||||
"response": "",
|
"response": "",
|
||||||
"done": true,
|
"done": true,
|
||||||
|
@ -124,7 +124,7 @@ A response can be received in one reply when streaming is off.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/generate -d '{
|
curl http://localhost:11434/api/generate -d '{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"prompt": "Why is the sky blue?",
|
"prompt": "Why is the sky blue?",
|
||||||
"stream": false
|
"stream": false
|
||||||
}'
|
}'
|
||||||
|
@ -136,7 +136,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||||
"response": "The sky is blue because it is the color of the sky.",
|
"response": "The sky is blue because it is the color of the sky.",
|
||||||
"done": true,
|
"done": true,
|
||||||
|
@ -194,7 +194,7 @@ curl http://localhost:11434/api/generate -d '{
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/generate -d '{
|
curl http://localhost:11434/api/generate -d '{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"prompt": "What color is the sky at different times of the day? Respond using JSON",
|
"prompt": "What color is the sky at different times of the day? Respond using JSON",
|
||||||
"format": "json",
|
"format": "json",
|
||||||
"stream": false
|
"stream": false
|
||||||
|
@ -205,7 +205,7 @@ curl http://localhost:11434/api/generate -d '{
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"created_at": "2023-11-09T21:07:55.186497Z",
|
"created_at": "2023-11-09T21:07:55.186497Z",
|
||||||
"response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
|
"response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
|
||||||
"done": true,
|
"done": true,
|
||||||
|
@ -327,7 +327,7 @@ If you want to set custom options for the model at runtime rather than in the Mo
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/generate -d '{
|
curl http://localhost:11434/api/generate -d '{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"prompt": "Why is the sky blue?",
|
"prompt": "Why is the sky blue?",
|
||||||
"stream": false,
|
"stream": false,
|
||||||
"options": {
|
"options": {
|
||||||
|
@ -368,7 +368,7 @@ curl http://localhost:11434/api/generate -d '{
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||||
"response": "The sky is blue because it is the color of the sky.",
|
"response": "The sky is blue because it is the color of the sky.",
|
||||||
"done": true,
|
"done": true,
|
||||||
|
@ -390,7 +390,7 @@ If an empty prompt is provided, the model will be loaded into memory.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/generate -d '{
|
curl http://localhost:11434/api/generate -d '{
|
||||||
"model": "llama3"
|
"model": "llama3.1"
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -400,7 +400,7 @@ A single JSON object is returned:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "llama3",
|
"model": "llama3.1",
|
||||||
"created_at": "2023-12-18T19:52:07.071755Z",
|
"created_at": "2023-12-18T19:52:07.071755Z",
|
||||||
"response": "",
|
"response": "",
|
||||||
"done": true
|
"done": true
|
||||||
|
@ -445,7 +445,7 @@ Send a chat message with a streaming response.
|
||||||
 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3",
+  "model": "llama3.1",
   "messages": [
     {
       "role": "user",

@ -461,7 +461,7 @@ A stream of JSON objects is returned:
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.1",
   "created_at": "2023-08-04T08:52:19.385406455-07:00",
   "message": {
     "role": "assistant",

@ -476,7 +476,7 @@ Final response:
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.1",
   "created_at": "2023-08-04T19:22:45.499127Z",
   "done": true,
   "total_duration": 4883583458,

@ -494,7 +494,7 @@ Final response:
 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3",
+  "model": "llama3.1",
   "messages": [
     {
       "role": "user",

@ -509,7 +509,7 @@ curl http://localhost:11434/api/chat -d '{
 ```json
 {
-  "model": "registry.ollama.ai/library/llama3:latest",
+  "model": "llama3.1",
   "created_at": "2023-12-12T14:13:43.416799Z",
   "message": {
     "role": "assistant",

@ -533,7 +533,7 @@ Send a chat message with a conversation history. You can use this same approach
 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3",
+  "model": "llama3.1",
   "messages": [
     {
       "role": "user",

@ -557,7 +557,7 @@ A stream of JSON objects is returned:
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.1",
   "created_at": "2023-08-04T08:52:19.385406455-07:00",
   "message": {
     "role": "assistant",

@ -571,7 +571,7 @@ Final response:
 ```json
 {
-  "model": "llama3",
+  "model": "llama3.1",
   "created_at": "2023-08-04T19:22:45.499127Z",
   "done": true,
   "total_duration": 8113331500,

@ -629,7 +629,7 @@ curl http://localhost:11434/api/chat -d '{
 ```shell
 curl http://localhost:11434/api/chat -d '{
-  "model": "llama3",
+  "model": "llama3.1",
   "messages": [
     {
       "role": "user",

@ -647,7 +647,7 @@ curl http://localhost:11434/api/chat -d '{
 ```json
 {
-  "model": "registry.ollama.ai/library/llama3:latest",
+  "model": "llama3.1",
   "created_at": "2023-12-12T14:13:43.416799Z",
   "message": {
     "role": "assistant",

@ -904,7 +904,7 @@ Show information about a model including details, modelfile, template, parameter
 ```shell
 curl http://localhost:11434/api/show -d '{
-  "name": "llama3"
+  "name": "llama3.1"
 }'
 ```

@ -965,7 +965,7 @@ Copy a model. Creates a model with another name from an existing model.
 ```shell
 curl http://localhost:11434/api/copy -d '{
-  "source": "llama3",
+  "source": "llama3.1",
   "destination": "llama3-backup"
 }'
 ```

@ -1020,7 +1020,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
 ```shell
 curl http://localhost:11434/api/pull -d '{
-  "name": "llama3"
+  "name": "llama3.1"
 }'
 ```
@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter:
 ```shell
 curl http://localhost:11434/api/generate -d '{
-  "model": "llama3",
+  "model": "llama3.1",
   "prompt": "Why is the sky blue?",
   "options": {
     "num_ctx": 4096

@ -247,12 +247,12 @@ The `keep_alive` parameter can be set to:
 For example, to preload a model and leave it in memory use:
 ```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": -1}'
 ```
 
 To unload the model and free up memory use:
 ```shell
-curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}'
+curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": 0}'
 ```
 
 Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable.
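For context on the `OLLAMA_KEEP_ALIVE` note in the hunk above: the variable is read when the server starts, so a minimal sketch of trying it (assuming a plain foreground launch rather than a service manager, and the same value syntax as the `keep_alive` API parameter) is:

```shell
# keep loaded models resident for 24 hours before unloading
OLLAMA_KEEP_ALIVE=24h ollama serve
```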
@ -11,7 +11,7 @@ A model file is the blueprint to create and share models with Ollama.
 - [Examples](#examples)
 - [Instructions](#instructions)
   - [FROM (Required)](#from-required)
-    - [Build from llama3.1](#build-from-llama31)
+    - [Build from existing model](#build-from-existing-model)
     - [Build from a Safetensors model](#build-from-a-safetensors-model)
     - [Build from a GGUF file](#build-from-a-gguf-file)
   - [PARAMETER](#parameter)

@ -50,7 +50,7 @@ INSTRUCTION arguments
 An example of a `Modelfile` creating a mario blueprint:
 
 ```modelfile
-FROM llama3
+FROM llama3.1
 # sets the temperature to 1 [higher is more creative, lower is more coherent]
 PARAMETER temperature 1
 # sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token

@ -72,10 +72,10 @@ More examples are available in the [examples directory](../examples).
 To view the Modelfile of a given model, use the `ollama show --modelfile` command.
 
 ```bash
-> ollama show --modelfile llama3
+> ollama show --modelfile llama3.1
 # Modelfile generated by "ollama show"
 # To build a new Modelfile based on this one, replace the FROM line with:
-# FROM llama3:latest
+# FROM llama3.1:latest
 FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29
 TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>

@ -100,7 +100,7 @@ The `FROM` instruction defines the base model to use when creating a model.
 FROM <model name>:<tag>
 ```
 
-#### Build from llama3.1
+#### Build from existing model
 
 ```modelfile
 FROM llama3.1
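For context on the mario `Modelfile` example touched above, the usual flow for turning such a file into a runnable model is (the `mario` name is just the example's choice):

```shell
ollama create mario -f ./Modelfile
ollama run mario
```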
@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create(
             'content': 'Say this is a test',
         }
     ],
-    model='llama3',
+    model='llama3.1',
 )
 
 response = client.chat.completions.create(

@ -46,13 +46,13 @@ response = client.chat.completions.create(
 )
 
 completion = client.completions.create(
-    model="llama3",
+    model="llama3.1",
     prompt="Say this is a test",
 )
 
 list_completion = client.models.list()
 
-model = client.models.retrieve("llama3")
+model = client.models.retrieve("llama3.1")
 
 embeddings = client.embeddings.create(
     model="all-minilm",

@ -74,7 +74,7 @@ const openai = new OpenAI({
 
 const chatCompletion = await openai.chat.completions.create({
     messages: [{ role: 'user', content: 'Say this is a test' }],
-    model: 'llama3',
+    model: 'llama3.1',
 })
 
 const response = await openai.chat.completions.create({

@ -94,13 +94,13 @@ const response = await openai.chat.completions.create({
 })
 
 const completion = await openai.completions.create({
-    model: "llama3",
+    model: "llama3.1",
     prompt: "Say this is a test.",
 })
 
 const listCompletion = await openai.models.list()
 
-const model = await openai.models.retrieve("llama3")
+const model = await openai.models.retrieve("llama3.1")
 
 const embedding = await openai.embeddings.create({
     model: "all-minilm",

@ -114,7 +114,7 @@ const embedding = await openai.embeddings.create({
 curl http://localhost:11434/v1/chat/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "model": "llama3",
+        "model": "llama3.1",
         "messages": [
             {
                 "role": "system",

@ -154,13 +154,13 @@ curl http://localhost:11434/v1/chat/completions \
 curl http://localhost:11434/v1/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "model": "llama3",
+        "model": "llama3.1",
         "prompt": "Say this is a test"
     }'
 
 curl http://localhost:11434/v1/models
 
-curl http://localhost:11434/v1/models/llama3
+curl http://localhost:11434/v1/models/llama3.1
 
 curl http://localhost:11434/v1/embeddings \
     -H "Content-Type: application/json" \

@ -274,7 +274,7 @@ curl http://localhost:11434/v1/embeddings \
 Before using a model, pull it locally `ollama pull`:
 
 ```shell
-ollama pull llama3
+ollama pull llama3.1
 ```
 
 ### Default model names

@ -282,7 +282,7 @@ ollama pull llama3
 For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name:
 
 ```
-ollama cp llama3 gpt-3.5-turbo
+ollama cp llama3.1 gpt-3.5-turbo
 ```
 
 Afterwards, this new model name can be specified the `model` field:
@ -33,7 +33,7 @@ Omitting a template in these models puts the responsibility of correctly templat
 To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3.
 
 ```dockerfile
-FROM llama3
+FROM llama3.1
 
 TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|>
@ -91,6 +91,17 @@ If none of those resolve the problem, gather additional information and file an
 - Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia`
 
+
+## AMD GPU Discovery
+
+On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log.
+
+When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices.
+
+If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure.
+- `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. This can help show more detailed error codes that can help troubleshoot problems
+- `OLLAMA_DEBUG=1` During GPU discovery additional information will be reported
+- Check dmesg for any errors from amdgpu or kfd drivers `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd`
+
 ## Windows Terminal Errors
 
 Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly. This can result in a long string of strings like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect` To resolve this problem, please update to Win 10 22H1 or newer.
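To make the container guidance in the new section above concrete, here is a sketch of the host-side check and the matching container flags. The `video`/`render` group names and the `ollama/ollama:rocm` image tag are assumptions based on typical setups; verify them against your own `ls -ld` output and registry.

```shell
# inspect which groups own the GPU device nodes on the host
ls -ld /dev/kfd /dev/dri /dev/dri/*

# pass the devices and their owning groups through to the container
docker run -d --device /dev/kfd --device /dev/dri \
  --group-add video --group-add render \
  -v ollama:/root/.ollama -p 11434:11434 --name ollama ollama/ollama:rocm
```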
@ -29,7 +29,7 @@ Ollama uses unicode characters for progress indication, which may render as unkn
 Here's a quick example showing API access from `powershell`
 ```powershell
-(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
+(Invoke-WebRequest -method POST -Body '{"model":"llama3.1", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json
 ```
 
 ## Troubleshooting
@ -179,53 +179,6 @@ var (
     HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
 )
 
-func RunnersDir() (p string) {
-    if p := Var("OLLAMA_RUNNERS_DIR"); p != "" {
-        return p
-    }
-
-    if runtime.GOOS != "windows" {
-        return
-    }
-
-    defer func() {
-        if p == "" {
-            slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
-        }
-    }()
-
-    // On Windows we do not carry the payloads inside the main executable
-    exe, err := os.Executable()
-    if err != nil {
-        return
-    }
-
-    cwd, err := os.Getwd()
-    if err != nil {
-        return
-    }
-
-    var paths []string
-    for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), LibRelativeToExe()), cwd} {
-        paths = append(paths,
-            root,
-            filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
-            filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
-        )
-    }
-
-    // Try a few variations to improve developer experience when building from source in the local tree
-    for _, path := range paths {
-        candidate := filepath.Join(path, "lib", "ollama", "runners")
-        if _, err := os.Stat(candidate); err == nil {
-            p = candidate
-            break
-        }
-    }
-
-    return p
-}
-
 func Uint(key string, defaultValue uint) func() uint {
     return func() uint {
         if s := Var(key); s != "" {

@ -290,10 +243,22 @@ func AsMap() map[string]EnvVar {
         "OLLAMA_NOPRUNE":      {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
         "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
         "OLLAMA_ORIGINS":      {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
-        "OLLAMA_RUNNERS_DIR":  {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"},
         "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
         "OLLAMA_TMPDIR":       {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
 
+        // Informational
+        "HTTP_PROXY":  {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
+        "HTTPS_PROXY": {"HTTPS_PROXY", String("HTTPS_PROXY")(), "HTTPS proxy"},
+        "NO_PROXY":    {"NO_PROXY", String("NO_PROXY")(), "No proxy"},
     }
+
+    if runtime.GOOS != "windows" {
+        // Windows environment variables are case-insensitive so there's no need to duplicate them
+        ret["http_proxy"] = EnvVar{"http_proxy", String("http_proxy")(), "HTTP proxy"}
+        ret["https_proxy"] = EnvVar{"https_proxy", String("https_proxy")(), "HTTPS proxy"}
+        ret["no_proxy"] = EnvVar{"no_proxy", String("no_proxy")(), "No proxy"}
+    }
+
     if runtime.GOOS != "darwin" {
         ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
         ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"}

@ -302,6 +267,7 @@ func AsMap() map[string]EnvVar {
         ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
         ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
     }
 
     return ret
 }
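The hunk above appears to only surface the proxy variables informationally (so they show up alongside the other settings); a sketch of how they would typically be set before launching the server, with a placeholder proxy URL, is:

```shell
# route outbound requests (e.g. model pulls) through a proxy; URL is illustrative
HTTPS_PROXY=https://proxy.example.com ollama serve
```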
@ -1,6 +1,6 @@
 langchain==0.0.274
 gpt4all==1.0.8
-chromadb==0.4.7
+chromadb==0.5.0
 llama-cpp-python==0.1.81
 urllib3==2.0.4
 PyMuPDF==1.23.5

@ -12,4 +12,4 @@ pandoc==2.3
 pypandoc==1.11
 tqdm==4.66.1
 sentence_transformers==2.2.2
 numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability
@ -4,5 +4,5 @@ SYSTEM """
 You are a log file analyzer. You will receive a set of lines from a log file for some software application, find the errors and other interesting aspects of the logs, and explain them so a new user can understand what they mean. If there are any steps they can do to resolve them, list the steps in your answer.
 """
 
-PARAMETER TEMPERATURE 0.3
+PARAMETER temperature 0.3
@ -21,6 +21,8 @@ You can try this with the `logtest.logfile` file included in this directory.
 2. Install the Python Requirements.
 
    ```bash
+   python3 -m venv .venv
+   source .venv/bin/activate
    pip install -r requirements.txt
    ```

@ -1 +1 @@
-Requests==2.31.0
+Requests>=2.32.3
@ -5,6 +5,7 @@ import (
     "errors"
     "fmt"
     "io"
+    "io/fs"
     "log/slog"
     "os"
     "path/filepath"

@ -359,6 +360,10 @@ func AMDGetGPUInfo() []RocmGPUInfo {
     if len(resp) == 0 {
         slog.Info("no compatible amdgpu devices detected")
     }
+    if err := verifyKFDDriverAccess(); err != nil {
+        slog.Error("amdgpu devices detected but permission problems block access", "error", err)
+        return nil
+    }
     return resp
 }

@ -455,3 +460,19 @@ func getFreeMemory(usedFile string) (uint64, error) {
     }
     return usedMemory, nil
 }
+
+func verifyKFDDriverAccess() error {
+    // Verify we have permissions - either running as root, or we have group access to the driver
+    fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0o666)
+    if err != nil {
+        if errors.Is(err, fs.ErrPermission) {
+            return fmt.Errorf("permissions not set up properly. Either run ollama as root, or add you user account to the render group. %w", err)
+        } else if errors.Is(err, fs.ErrNotExist) {
+            // Container runtime failure?
+            return fmt.Errorf("kfd driver not loaded. If running in a container, remember to include '--device /dev/kfd --device /dev/dri'")
+        }
+        return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
+    }
+    fd.Close()
+    return nil
+}
148  gpu/assets.go (file removed)
@ -1,148 +0,0 @@
-package gpu
-
-import (
-    "errors"
-    "fmt"
-    "log/slog"
-    "os"
-    "path/filepath"
-    "runtime"
-    "strconv"
-    "strings"
-    "sync"
-    "syscall"
-    "time"
-
-    "github.com/ollama/ollama/envconfig"
-)
-
-var (
-    lock        sync.Mutex
-    payloadsDir = ""
-)
-
-func PayloadsDir() (string, error) {
-    lock.Lock()
-    defer lock.Unlock()
-    var err error
-    if payloadsDir == "" {
-        runnersDir := envconfig.RunnersDir()
-
-        if runnersDir != "" {
-            payloadsDir = runnersDir
-            return payloadsDir, nil
-        }
-
-        // The remainder only applies on non-windows where we still carry payloads in the main executable
-        cleanupTmpDirs()
-        tmpDir := envconfig.TmpDir()
-        if tmpDir == "" {
-            tmpDir, err = os.MkdirTemp("", "ollama")
-            if err != nil {
-                return "", fmt.Errorf("failed to generate tmp dir: %w", err)
-            }
-        } else {
-            err = os.MkdirAll(tmpDir, 0o755)
-            if err != nil {
-                return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err)
-            }
-        }
-
-        // Track our pid so we can clean up orphaned tmpdirs
-        n := filepath.Join(tmpDir, "ollama.pid")
-        if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
-            return "", fmt.Errorf("failed to write pid file %s: %w", n, err)
-        }
-
-        // We create a distinct subdirectory for payloads within the tmpdir
-        // This will typically look like /tmp/ollama3208993108/runners on linux
-        payloadsDir = filepath.Join(tmpDir, "runners")
-    }
-    return payloadsDir, nil
-}
-
-// Best effort to clean up prior tmpdirs
-func cleanupTmpDirs() {
-    matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid"))
-    if err != nil {
-        return
-    }
-
-    for _, match := range matches {
-        raw, err := os.ReadFile(match)
-        if errors.Is(err, os.ErrNotExist) {
-            slog.Debug("not a ollama runtime directory, skipping", "path", match)
-            continue
-        } else if err != nil {
-            slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
-            continue
-        }
-
-        pid, err := strconv.Atoi(string(raw))
-        if err != nil {
-            slog.Warn("invalid pid, skipping", "path", match, "error", err)
-            continue
-        }
-
-        p, err := os.FindProcess(pid)
-        if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
-            slog.Warn("process still running, skipping", "pid", pid, "path", match)
-            continue
-        }
-
-        if err := os.Remove(match); err != nil {
-            slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
-        }
-
-        runners := filepath.Join(filepath.Dir(match), "runners")
-        if err := os.RemoveAll(runners); err != nil {
-            slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
-        }
-
-        if err := os.Remove(filepath.Dir(match)); err != nil {
-            slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
-        }
-    }
-}
-
-func Cleanup() {
-    lock.Lock()
-    defer lock.Unlock()
-    runnersDir := envconfig.RunnersDir()
-    if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
-        // We want to fully clean up the tmpdir parent of the payloads dir
-        tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
-        slog.Debug("cleaning up", "dir", tmpDir)
-        err := os.RemoveAll(tmpDir)
-        if err != nil {
-            // On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove
-            time.Sleep(1000 * time.Millisecond)
-            err = os.RemoveAll(tmpDir)
-            if err != nil {
-                slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
-            }
-        }
-    }
-}
-
-func UpdatePath(dir string) {
-    if runtime.GOOS == "windows" {
-        tmpDir := filepath.Dir(dir)
-        pathComponents := strings.Split(os.Getenv("PATH"), ";")
-        i := 0
-        for _, comp := range pathComponents {
-            if strings.EqualFold(comp, dir) {
-                return
-            }
-            // Remove any other prior paths to our temp dir
-            if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
-                pathComponents[i] = comp
-                i++
-            }
-        }
-        newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
-        slog.Info("updating", "PATH", newPath)
-        os.Setenv("PATH", newPath)
-    }
-    // linux and darwin rely on rpath
-}
@ -93,10 +93,9 @@ func initCudaHandles() *cudaHandles {
         localAppData := os.Getenv("LOCALAPPDATA")
         cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
     }
-    tmpDir, _ := PayloadsDir()
-    if tmpDir != "" {
-        // TODO - add "payloads" for subprocess
-        cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
+    libDir := LibraryDir()
+    if libDir != "" {
+        cudartMgmtPatterns = []string{filepath.Join(libDir, CudartMgmtName)}
     }
     cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
60  llm/ext_server/server.cpp (vendored)
@ -913,7 +913,9 @@ struct llama_server_context
         slot.sampled = result.tok;
 
         // search stop word and delete it
-        slot.generated_text += token_str;
+        if (!llama_token_is_eog(model, result.tok))
+            slot.generated_text += token_str;
+
         slot.has_next_token = true;
 
         if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)

@ -954,30 +956,36 @@ struct llama_server_context
         if (!incomplete)
         {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            const std::string str_test = slot.generated_text.substr(pos);
-            bool is_stop_full = false;
-            size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-            if (stop_pos != std::string::npos)
-            {
-                is_stop_full = true;
-                slot.generated_text.erase(
-                    slot.generated_text.begin() + pos + stop_pos,
-                    slot.generated_text.end());
-                pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            }
-            else
-            {
-                is_stop_full = false;
-                stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
-            }
 
-            // check if there is any token to predict
-            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
-            {
-                // no send the stop word in the response
-                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-                slot.n_sent_text += result.text_to_send.size();
-                // add the token to slot queue and cache
+            if (!llama_token_is_eog(model, result.tok)) {
+                const std::string str_test = slot.generated_text.substr(pos);
+                bool is_stop_full = false;
+                size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
+                if (stop_pos != std::string::npos)
+                {
+                    is_stop_full = true;
+                    slot.generated_text.erase(
+                        slot.generated_text.begin() + pos + stop_pos,
+                        slot.generated_text.end());
+                    pos = std::min(slot.n_sent_text, slot.generated_text.size());
+                }
+                else
+                {
+                    is_stop_full = false;
+                    stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+                }
+
+                // check if there is any token to predict
+                if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
+                {
+                    // no send the stop word in the response
+                    result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                    slot.n_sent_text += result.text_to_send.size();
+                    // add the token to slot queue and cache
+                }
+            } else {
+                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                slot.n_sent_text += result.text_to_send.size();
             }
 
             if (slot.params.stream)

@ -1117,9 +1125,7 @@ struct llama_server_context
             {"multimodal", multimodal}
         };
 
-        if (!llama_token_is_eog(model, tkn.tok)) {
-            res.result_json["content"] = tkn.text_to_send;
-        }
+        res.result_json["content"] = tkn.text_to_send;
 
         if (slot.sparams.n_probs > 0)
         {
@ -31,6 +31,7 @@ init_vars() {
         NO_WHOLE_ARCHIVE=""
         GCC_ARCH="-arch ${ARCH}"
         DIST_BASE=../../dist/darwin-${GOARCH}/
+        PAYLOAD_BASE=../../build/darwin/${GOARCH}
         ;;
     "Linux")
         LIB_EXT="so"

@ -40,6 +41,7 @@ init_vars() {
         # Cross compiling not supported on linux - Use docker
         GCC_ARCH=""
         DIST_BASE=../../dist/linux-${GOARCH}/
+        PAYLOAD_BASE=../../build/linux/${GOARCH}
         ;;
     *)
         ;;

@ -47,7 +49,8 @@ init_vars() {
     if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
         CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
     fi
-    GZIP=$(which pigz 2>/dev/null || echo "gzip")
+    GZIP=$(command -v pigz 2>/dev/null || echo "gzip")
+    RUNNER_BASE="${DIST_BASE}/lib/ollama/runners"
 }
 
 git_module_setup() {

@ -91,17 +94,34 @@ build() {
     rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
 }
 
-compress() {
-    echo "Compressing payloads to reduce overall binary size..."
-    rm -rf ${BUILD_DIR}/bin/*.gz
+dist() {
+    [ -z "${RUNNER}" ] && exit 1
+    mkdir -p ${RUNNER_BASE}/${RUNNER}/
     for f in ${BUILD_DIR}/bin/* ; do
-        ${GZIP} -n --best -f ${f} &
+        cp ${f} ${RUNNER_BASE}/${RUNNER}/
+    done
+    # check for lib directory
+    if [ -d ${BUILD_DIR}/lib ]; then
+        for f in ${BUILD_DIR}/lib/* ; do
+            cp ${f} ${RUNNER_BASE}/${RUNNER}/
+        done
+    fi
+}
+
+# Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir
+compress() {
+    [ -z "${RUNNER}" ] && exit 1
+    echo "Compressing payloads with ${GZIP} to reduce overall binary size..."
+    rm -rf "${PAYLOAD_BASE}/${RUNNER}/"
+    mkdir -p "${PAYLOAD_BASE}/${RUNNER}/"
+    for f in ${BUILD_DIR}/bin/* ; do
+        ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
         compress_pids+=" $!"
     done
     # check for lib directory
     if [ -d ${BUILD_DIR}/lib ]; then
        for f in ${BUILD_DIR}/lib/* ; do
-            ${GZIP} -n --best -f ${f} &
+            ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
             compress_pids+=" $!"
         done
     fi

@ -117,7 +137,7 @@ wait_for_compress() {
 
 install() {
     echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
-    for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do
+    for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT} | grep -v "${BUILD_DIR}/bin/" ); do
         rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
         cp -af "${lib}" "${BUILD_DIR}/bin/"
     done
@ -39,7 +39,8 @@ case "${GOARCH}" in
     #
     init_vars
     CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-    BUILD_DIR="../build/darwin/${ARCH}/cpu"
+    RUNNER=cpu
+    BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
     echo "Building LCD CPU"
     build
     sign ${BUILD_DIR}/bin/ollama_llama_server

@ -51,7 +52,8 @@ case "${GOARCH}" in
     #
     init_vars
     CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-    BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
+    RUNNER=cpu_avx
+    BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
     echo "Building AVX CPU"
     build
     sign ${BUILD_DIR}/bin/ollama_llama_server

@ -63,7 +65,8 @@ case "${GOARCH}" in
     #
     init_vars
     CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
-    BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
+    RUNNER=cpu_avx2
+    BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
     echo "Building AVX2 CPU"
     EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
     build

@ -84,7 +87,8 @@ case "${GOARCH}" in
     if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
         init_vars
         CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
-        BUILD_DIR="../build/darwin/${ARCH}/metal"
+        RUNNER="metal"
+        BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}"
         EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
         build
         sign ${BUILD_DIR}/bin/ollama_llama_server
@ -79,10 +79,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
         init_vars
         echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
         CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
-        BUILD_DIR="../build/linux/${ARCH}/cpu"
+        RUNNER="cpu"
+        BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
         echo "Building custom CPU"
         build
         install
+        dist
         compress
     else
         # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512

@ -102,10 +104,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
         #
         init_vars
         CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-        BUILD_DIR="../build/linux/${ARCH}/cpu"
+        RUNNER=cpu
+        BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
         echo "Building LCD CPU"
         build
         install
+        dist
         compress
     fi

@ -120,10 +124,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
         #
         init_vars
         CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-        BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
+        RUNNER=cpu_avx
+        BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
        echo "Building AVX CPU"
         build
         install
+        dist
         compress
     fi

@ -134,10 +140,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
         #
         init_vars
         CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
-        BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
+        RUNNER=cpu_avx2
+        BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
         echo "Building AVX2 CPU"
         build
         install
+        dist
         compress
     fi
 fi

@ -187,11 +195,13 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
     fi
     export CUDAFLAGS="-t8"
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
-    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
+    RUNNER=cuda${CUDA_VARIANT}
+    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
     export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
     CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
     build
     install
+    dist
     echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
     mkdir -p "${CUDA_DIST_DIR}"
     for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do

@ -212,7 +222,8 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
     source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
     CC=icx
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
-    BUILD_DIR="../build/linux/${ARCH}/oneapi"
+    RUNNER=oneapi
+    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
     ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
     export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
     DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it

@ -231,6 +242,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
     cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
     cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
     install
+    dist
     compress
 fi

@ -259,7 +271,8 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
         CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
         echo "Building custom ROCM GPU"
     fi
-    BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
+    RUNNER=rocm${ROCM_VARIANT}
+    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
     # ROCm dependencies are too large to fit into a unified bundle
     ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
     # TODO figure out how to disable runpath (rpath)

@ -269,13 +282,17 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
 
     # copy the ROCM dependencies
     mkdir -p "${ROCM_DIST_DIR}"
-    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do
+    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${GOARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do
         cp -a "${dep}"* "${ROCM_DIST_DIR}"
+        if [ $(readlink -f "${dep}") != "${dep}" ] ; then
+            cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}"
+        fi
     done
     install
+    dist
     compress
 fi
 
 cleanup
 wait_for_compress
-echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
+echo "go generate completed.  LLM runners: $(cd ${PAYLOAD_BASE}; echo *)"
@ -1,11 +1,7 @@
 package llm
 
 import (
-    "embed"
     "syscall"
 )
 
-//go:embed build/darwin/arm64/*/bin/*
-var libEmbed embed.FS
-
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

@ -1,11 +0,0 @@
-package llm
-
-import (
-    "embed"
-    "syscall"
-)
-
-//go:embed build/darwin/x86_64/*/bin/*
-var libEmbed embed.FS
-
-var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

@ -1,11 +1,7 @@
 package llm
 
 import (
-    "embed"
     "syscall"
 )
 
-//go:embed build/linux/*/*/bin/*
-var libEmbed embed.FS
-
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

@ -1,13 +1,9 @@
 package llm
 
 import (
-    "embed"
     "syscall"
 )
 
-// unused on windows
-var libEmbed embed.FS
-
 const CREATE_DEFAULT_ERROR_MODE = 0x04000000
 
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{
233  llm/payload.go (file removed)
@ -1,233 +0,0 @@
-package llm
-
-import (
-    "compress/gzip"
-    "errors"
-    "fmt"
-    "io"
-    "io/fs"
-    "log/slog"
-    "os"
-    "path/filepath"
-    "runtime"
-    "slices"
-    "strings"
-
-    "golang.org/x/sync/errgroup"
-
-    "github.com/ollama/ollama/gpu"
-)
-
-var errPayloadMissing = errors.New("expected payloads not included in this build of ollama")
-
-func Init() error {
-    payloadsDir, err := gpu.PayloadsDir()
-    if err != nil {
-        return err
-    }
-
-    if runtime.GOOS != "windows" {
-        slog.Info("extracting embedded files", "dir", payloadsDir)
-        binGlob := "build/*/*/*/bin/*"
-
-        // extract server libraries
-        err = extractFiles(payloadsDir, binGlob)
-        if err != nil {
-            return fmt.Errorf("extract binaries: %v", err)
-        }
-    }
-
-    var variants []string
-    for v := range getAvailableServers() {
-        variants = append(variants, v)
-    }
-    slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
-    slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
-
-    return nil
-}
-
-// binary names may contain an optional variant separated by '_'
-// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2"
-// Any library without a variant is the lowest common denominator
-func getAvailableServers() map[string]string {
-    payloadsDir, err := gpu.PayloadsDir()
-    if err != nil {
-        slog.Error("payload lookup error", "error", err)
-        return nil
-    }
-
-    // glob payloadsDir for files that start with ollama_
-    pattern := filepath.Join(payloadsDir, "*", "ollama_*")
-
-    files, err := filepath.Glob(pattern)
-    if err != nil {
-        slog.Debug("could not glob", "pattern", pattern, "error", err)
-        return nil
-    }
-
-    servers := make(map[string]string)
-    for _, file := range files {
-        slog.Debug("availableServers : found", "file", file)
-        servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
-    }
-
-    return servers
-}
-
-// serversForGpu returns a list of compatible servers give the provided GPU
-// info, ordered by performance. assumes Init() has been called
-// TODO - switch to metadata based mapping
-func serversForGpu(info gpu.GpuInfo) []string {
-    // glob workDir for files that start with ollama_
-    availableServers := getAvailableServers()
-    requested := info.Library
-    if info.Variant != gpu.CPUCapabilityNone.String() {
-        requested += "_" + info.Variant
-    }
-
-    servers := []string{}
-
-    // exact match first
-    for a := range availableServers {
-        if a == requested {
-            servers = []string{a}
-
-            if a == "metal" {
-                return servers
-            }
-
-            break
-        }
-    }
-
-    alt := []string{}
-
-    // Then for GPUs load alternates and sort the list for consistent load ordering
-    if info.Library != "cpu" {
-        for a := range availableServers {
-            if info.Library == strings.Split(a, "_")[0] && a != requested {
-                alt = append(alt, a)
-            }
-        }
-
-        slices.Sort(alt)
-        servers = append(servers, alt...)
-    }
-
-    if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
-        // Load up the best CPU variant if not primary requested
-        if info.Library != "cpu" {
-            variant := gpu.GetCPUCapability()
-            // If no variant, then we fall back to default
-            // If we have a variant, try that if we find an exact match
-            // Attempting to run the wrong CPU instructions will panic the
-            // process
-            if variant != gpu.CPUCapabilityNone {
-                for cmp := range availableServers {
-                    if cmp == "cpu_"+variant.String() {
-                        servers = append(servers, cmp)
-                        break
-                    }
-                }
-            } else {
-                servers = append(servers, "cpu")
-            }
-        }
-
-        if len(servers) == 0 {
-            servers = []string{"cpu"}
-        }
-    }
-
-    return servers
-}
-
-// Return the optimal server for this CPU architecture
-func serverForCpu() string {
-    if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
-        return "metal"
-    }
-    variant := gpu.GetCPUCapability()
-    availableServers := getAvailableServers()
-    if variant != gpu.CPUCapabilityNone {
-        for cmp := range availableServers {
-            if cmp == "cpu_"+variant.String() {
-                return cmp
-            }
-        }
-    }
-    return "cpu"
-}
-
-// extract extracts the embedded files to the target directory
-func extractFiles(targetDir string, glob string) error {
-    files, err := fs.Glob(libEmbed, glob)
-    if err != nil || len(files) == 0 {
-        return errPayloadMissing
-    }
-
-    if err := os.MkdirAll(targetDir, 0o755); err != nil {
-        return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
-    }
-
-    g := new(errgroup.Group)
-
-    // build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE
-    for _, file := range files {
-        filename := file
-
-        variant := filepath.Base(filepath.Dir(filepath.Dir(filename)))
-
-        slog.Debug("extracting", "variant", variant, "file", filename)
-
-        g.Go(func() error {
-            srcf, err := libEmbed.Open(filename)
-            if err != nil {
-                return err
-            }
-            defer srcf.Close()
-
-            src := io.Reader(srcf)
-            if strings.HasSuffix(filename, ".gz") {
-                src, err = gzip.NewReader(src)
-                if err != nil {
-                    return fmt.Errorf("decompress payload %s: %v", filename, err)
-                }
-                filename = strings.TrimSuffix(filename, ".gz")
-            }
-
-            variantDir := filepath.Join(targetDir, variant)
-            if err := os.MkdirAll(variantDir, 0o755); err != nil {
-                return fmt.Errorf("extractFiles could not mkdir %s: %v", variantDir, err)
-            }
-
-            base := filepath.Base(filename)
-            destFilename := filepath.Join(variantDir, base)
-
-            _, err = os.Stat(destFilename)
-            switch {
-            case errors.Is(err, os.ErrNotExist):
-                destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
-                if err != nil {
-                    return fmt.Errorf("write payload %s: %v", filename, err)
-                }
-                defer destFile.Close()
-                if _, err := io.Copy(destFile, src); err != nil {
-                    return fmt.Errorf("copy payload %s: %v", filename, err)
-                }
-            case err != nil:
-                return fmt.Errorf("stat payload %s: %v", filename, err)
-            }
-            return nil
-        })
-    }
-
-    err = g.Wait()
-    if err != nil {
-        // If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
-        gpu.Cleanup()
-        return err
-    }
-    return nil
-}
@@ -24,9 +24,11 @@ import (
     "golang.org/x/sync/semaphore"

     "github.com/ollama/ollama/api"
+    "github.com/ollama/ollama/build"
     "github.com/ollama/ollama/envconfig"
     "github.com/ollama/ollama/format"
     "github.com/ollama/ollama/gpu"
+    "github.com/ollama/ollama/runners"
 )

 type LlamaServer interface {
@@ -106,7 +108,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         gpus = gpu.GetCPUInfo()
     }
     if len(gpus) == 1 && gpus[0].Library == "cpu" {
-        cpuRunner = serverForCpu()
+        cpuRunner = runners.ServerForCpu()
         estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
     } else {
         estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
@@ -118,7 +120,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         opts.NumGPU = 0
     case gpus[0].Library != "metal" && estimate.Layers == 0:
         // Don't bother loading into the GPU if no layers can fit
-        cpuRunner = serverForCpu()
+        cpuRunner = runners.ServerForCpu()
         gpus = gpu.GetCPUInfo()
     case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
         opts.NumGPU = estimate.Layers
@@ -145,25 +147,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
     }

-    availableServers := getAvailableServers()
+    rDir, err := runners.Refresh(build.EmbedFS)
+    if err != nil {
+        return nil, err
+    }
+
+    availableServers := runners.GetAvailableServers(rDir)
     if len(availableServers) == 0 {
-        if runtime.GOOS != "windows" {
-            slog.Warn("llama server binary disappeared, reinitializing payloads")
-            err = Init()
-            if err != nil {
-                slog.Warn("failed to reinitialize payloads", "error", err)
-                return nil, err
-            }
-            availableServers = getAvailableServers()
-        } else {
-            return nil, finalErr
-        }
+        return nil, finalErr
     }
     var servers []string
     if cpuRunner != "" {
         servers = []string{cpuRunner}
     } else {
-        servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
+        servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
     }
     demandLib := envconfig.LLMLibrary()
     if demandLib != "" {
@@ -274,7 +271,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         params = append(params, "--tensor-split", estimate.TensorSplit)
     }

-    for i := range len(servers) {
+    for i := range servers {
         dir := availableServers[servers[i]]
         if dir == "" {
             // Shouldn't happen
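The loop rewrite in the hunk above is behavior-preserving: when only the index is used, Go 1.22's range-over-int form `for i := range len(servers)` and the plain slice form `for i := range servers` visit the same indices, so ranging the slice simply drops the newer language construct. A minimal, self-contained illustration (the runner names are hypothetical):

package main

import "fmt"

func main() {
    servers := []string{"cuda_v12", "cpu_avx2", "cpu"} // hypothetical runner names

    // Both loops print indices 0, 1, 2 and the same elements.
    for i := range len(servers) { // Go 1.22 range-over-int
        fmt.Println(i, servers[i])
    }
    for i := range servers { // equivalent when only the index is needed
        fmt.Println(i, servers[i])
    }
}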
@@ -330,7 +327,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     _, err := os.Stat(server)
     if errors.Is(err, os.ErrNotExist) {
         slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
-        err = Init()
+        _, err = runners.Refresh(build.EmbedFS)
         if err != nil {
             slog.Warn("failed to reinitialize payloads", "error", err)
             return nil, err
runners/common.go  (new file, 384 lines)
@@ -0,0 +1,384 @@
package runners

import (
    "compress/gzip"
    "errors"
    "fmt"
    "io"
    "io/fs"
    "log/slog"
    "os"
    "path/filepath"
    "runtime"
    "slices"
    "strconv"
    "strings"
    "sync"
    "syscall"

    "golang.org/x/sync/errgroup"

    "github.com/ollama/ollama/envconfig"
    "github.com/ollama/ollama/gpu"
)

const (
    binGlob = "*/*/*/*"
)

var (
    lock       sync.Mutex
    runnersDir = ""
)

// Return the location where runners are stored
// If runners are payloads, this will either extract them
// or refresh them if any have disappeared due to tmp cleaners
func Refresh(payloadFS fs.FS) (string, error) {
    lock.Lock()
    defer lock.Unlock()
    var err error

    // Wire up extra logging on our first load
    if runnersDir == "" {
        defer func() {
            var runners []string
            for v := range GetAvailableServers(runnersDir) {
                runners = append(runners, v)
            }
            slog.Info("Dynamic LLM libraries", "runners", runners)
            slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
        }()
    }

    if hasPayloads(payloadFS) {
        if runnersDir == "" {
            runnersDir, err = extractRunners(payloadFS)
        } else {
            err = refreshRunners(payloadFS, runnersDir)
        }
    } else if runnersDir == "" {
        runnersDir, err = locateRunners()
    }

    return runnersDir, err
}

func Cleanup(payloadFS fs.FS) {
    lock.Lock()
    defer lock.Unlock()
    if hasPayloads(payloadFS) && runnersDir != "" {
        // We want to fully clean up the tmpdir parent of the payloads dir
        tmpDir := filepath.Clean(filepath.Join(runnersDir, ".."))
        slog.Debug("cleaning up", "dir", tmpDir)
        err := os.RemoveAll(tmpDir)
        if err != nil {
            slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
        }
    }
}

func locateRunners() (string, error) {
    exe, err := os.Executable()
    if err != nil {
        return "", err
    }

    cwd, err := os.Getwd()
    if err != nil {
        return "", err
    }

    var paths []string
    for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe()), cwd} {
        paths = append(paths,
            root,
            filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
            filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
        )
    }

    // Try a few variations to improve developer experience when building from source in the local tree
    for _, path := range paths {
        candidate := filepath.Join(path, "lib", "ollama", "runners")
        if _, err := os.Stat(candidate); err == nil {
            return candidate, nil
        }
    }
    return "", fmt.Errorf("unable to locate runners in any search path %v", paths)
}

// Return true if we're carying nested payloads for the runners
func hasPayloads(payloadFS fs.FS) bool {
    files, err := fs.Glob(payloadFS, binGlob)
    if err != nil || len(files) == 0 || (len(files) == 1 && strings.Contains(files[0], "placeholder")) {
        return false
    }
    return true
}

func extractRunners(payloadFS fs.FS) (string, error) {
    cleanupTmpDirs()
    tmpDir, err := os.MkdirTemp(envconfig.TmpDir(), "ollama")
    if err != nil {
        return "", fmt.Errorf("failed to generate tmp dir: %w", err)
    }
    // Track our pid so we can clean up orphaned tmpdirs
    n := filepath.Join(tmpDir, "ollama.pid")
    if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
        slog.Warn("failed to write pid file", "file", n, "error", err)
    }
    // We create a distinct subdirectory for payloads within the tmpdir
    // This will typically look like /tmp/ollama3208993108/runners on linux
    rDir := filepath.Join(tmpDir, "runners")

    slog.Info("extracting embedded files", "dir", rDir)
    return rDir, refreshRunners(payloadFS, rDir)
}

func refreshRunners(payloadFS fs.FS, rDir string) error {
    // extract or refresh server libraries
    err := extractFiles(payloadFS, rDir, binGlob)
    if err != nil {
        return fmt.Errorf("extract binaries: %v", err)
    }
    return nil
}

// extract extracts the embedded files to the target directory
func extractFiles(payloadFS fs.FS, targetDir string, glob string) error {
    files, err := fs.Glob(payloadFS, glob)
    if err != nil || len(files) == 0 {
        // Should not happen
        return fmt.Errorf("extractFiles called without payload present")
    }

    if err := os.MkdirAll(targetDir, 0o755); err != nil {
        return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
    }

    g := new(errgroup.Group)

    // $OS/$GOARCH/$RUNNER/$FILE
    for _, file := range files {
        filename := file

        runner := filepath.Base(filepath.Dir(filename))

        slog.Debug("extracting", "runner", runner, "payload", filename)

        g.Go(func() error {
            srcf, err := payloadFS.Open(filename)
            if err != nil {
                return err
            }
            defer srcf.Close()

            src := io.Reader(srcf)
            if strings.HasSuffix(filename, ".gz") {
                src, err = gzip.NewReader(src)
                if err != nil {
                    return fmt.Errorf("decompress payload %s: %v", filename, err)
                }
                filename = strings.TrimSuffix(filename, ".gz")
            }

            runnerDir := filepath.Join(targetDir, runner)
            if err := os.MkdirAll(runnerDir, 0o755); err != nil {
                return fmt.Errorf("extractFiles could not mkdir %s: %v", runnerDir, err)
            }

            base := filepath.Base(filename)
            destFilename := filepath.Join(runnerDir, base)

            _, err = os.Stat(destFilename)
            switch {
            case errors.Is(err, os.ErrNotExist):
                destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
                if err != nil {
                    return fmt.Errorf("write payload %s: %v", filename, err)
                }
                defer destFile.Close()
                if _, err := io.Copy(destFile, src); err != nil {
                    return fmt.Errorf("copy payload %s: %v", filename, err)
                }
            case err != nil:
                return fmt.Errorf("stat payload %s: %v", filename, err)
            }
            return nil
        })
    }

    err = g.Wait()
    if err != nil {
        slog.Error("failed to extract files", "error", err)
        // If we fail to extract, the payload dir is most likely unusable, so cleanup whatever we extracted
        err := os.RemoveAll(targetDir)
        if err != nil {
            slog.Warn("failed to cleanup incomplete payload dir", "dir", targetDir, "error", err)
        }
        return err
    }
    return nil
}

// Best effort to clean up prior tmpdirs
func cleanupTmpDirs() {
    tmpDir := envconfig.TmpDir()
    if tmpDir == "" {
        tmpDir = os.TempDir()
    }
    matches, err := filepath.Glob(filepath.Join(tmpDir, "ollama*", "ollama.pid"))
    if err != nil {
        return
    }

    for _, match := range matches {
        raw, err := os.ReadFile(match)
        if errors.Is(err, os.ErrNotExist) {
            slog.Debug("not a ollama runtime directory, skipping", "path", match)
            continue
        } else if err != nil {
            slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
            continue
        }

        pid, err := strconv.Atoi(string(raw))
        if err != nil {
            slog.Warn("invalid pid, skipping", "path", match, "error", err)
            continue
        }

        p, err := os.FindProcess(pid)
        if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
            slog.Warn("process still running, skipping", "pid", pid, "path", match)
            continue
        }

        if err := os.Remove(match); err != nil {
            slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
        }

        runners := filepath.Join(filepath.Dir(match), "runners")
        if err := os.RemoveAll(runners); err != nil {
            slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
        }

        if err := os.Remove(filepath.Dir(match)); err != nil {
            slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
        }
    }
}

// directory names are the name of the runner and may contain an optional
// variant prefixed with '_' as the separator. For example, "cuda_v11" and
// "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the
// lowest common denominator
func GetAvailableServers(payloadsDir string) map[string]string {
    if payloadsDir == "" {
        slog.Error("empty runner dir")
        return nil
    }

    // glob payloadsDir for files that start with ollama_
    pattern := filepath.Join(payloadsDir, "*", "ollama_*")

    files, err := filepath.Glob(pattern)
    if err != nil {
        slog.Debug("could not glob", "pattern", pattern, "error", err)
        return nil
    }

    servers := make(map[string]string)
    for _, file := range files {
        slog.Debug("availableServers : found", "file", file)
        servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
    }

    return servers
}

// serversForGpu returns a list of compatible servers give the provided GPU
// info, ordered by performance. assumes Init() has been called
// TODO - switch to metadata based mapping
func ServersForGpu(info gpu.GpuInfo) []string {
    // glob workDir for files that start with ollama_
    availableServers := GetAvailableServers(runnersDir)
    requested := info.Library
    if info.Variant != gpu.CPUCapabilityNone.String() {
        requested += "_" + info.Variant
    }

    servers := []string{}

    // exact match first
    for a := range availableServers {
        if a == requested {
            servers = []string{a}

            if a == "metal" {
                return servers
            }

            break
        }
    }

    alt := []string{}

    // Then for GPUs load alternates and sort the list for consistent load ordering
    if info.Library != "cpu" {
        for a := range availableServers {
            if info.Library == strings.Split(a, "_")[0] && a != requested {
                alt = append(alt, a)
            }
        }

        slices.Sort(alt)
        servers = append(servers, alt...)
    }

    if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
        // Load up the best CPU variant if not primary requested
        if info.Library != "cpu" {
            variant := gpu.GetCPUCapability()
            // If no variant, then we fall back to default
            // If we have a variant, try that if we find an exact match
            // Attempting to run the wrong CPU instructions will panic the
            // process
            if variant != gpu.CPUCapabilityNone {
                for cmp := range availableServers {
                    if cmp == "cpu_"+variant.String() {
                        servers = append(servers, cmp)
                        break
                    }
                }
            } else {
                servers = append(servers, "cpu")
            }
        }

        if len(servers) == 0 {
            servers = []string{"cpu"}
        }
    }

    return servers
}

// Return the optimal server for this CPU architecture
func ServerForCpu() string {
    if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
        return "metal"
    }
    variant := gpu.GetCPUCapability()
    availableServers := GetAvailableServers(runnersDir)
    if variant != gpu.CPUCapabilityNone {
        for cmp := range availableServers {
            if cmp == "cpu_"+variant.String() {
                return cmp
            }
        }
    }
    return "cpu"
}
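The exported surface of the new package is small: Refresh extracts or locates the runner payloads, GetAvailableServers enumerates what is on disk, and ServerForCpu/ServersForGpu choose among them. Below is a minimal sketch of a caller wiring these together; the error handling and printing are illustrative only, and gpu.GetGPUInfo is assumed to be the existing discovery entry point rather than part of this change.

package main

import (
    "fmt"
    "log"

    "github.com/ollama/ollama/build"
    "github.com/ollama/ollama/gpu"
    "github.com/ollama/ollama/runners"
)

func main() {
    // Extract (or locate) the runner binaries once at startup.
    rDir, err := runners.Refresh(build.EmbedFS)
    if err != nil {
        log.Fatalf("unable to initialize llm runners: %v", err)
    }
    defer runners.Cleanup(build.EmbedFS)

    // Enumerate what was unpacked, keyed by runner name (e.g. "cpu_avx2", "cuda_v12").
    for name, dir := range runners.GetAvailableServers(rDir) {
        fmt.Println(name, "=>", dir)
    }

    // Pick a runner: ServerForCpu for CPU-only loads, ServersForGpu per detected GPU.
    fmt.Println("cpu runner:", runners.ServerForCpu())
    for _, info := range gpu.GetGPUInfo() {
        fmt.Println(info.Library, "candidates:", runners.ServersForGpu(info))
    }
}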
runners/runners_test.go  (new file, 50 lines)
@@ -0,0 +1,50 @@
package runners

import (
    "log/slog"
    "os"
    "path"
    "runtime"
    "strings"
    "testing"
    "testing/fstest"
)

func TestRefreshRunners(t *testing.T) {
    slog.SetLogLoggerLevel(slog.LevelDebug)

    payloadFS := fstest.MapFS{
        path.Join(runtime.GOOS, runtime.GOARCH, "foo", "ollama_llama_server"): {Data: []byte("hello, world\n")},
    }
    tmpDir, err := os.MkdirTemp("", "testing")
    if err != nil {
        t.Fatalf("failed to make tmp dir %s", err)
    }
    t.Setenv("OLLAMA_TMPDIR", tmpDir)
    rDir, err := Refresh(payloadFS)
    if err != nil {
        t.Fatalf("failed to extract to %s %s", tmpDir, err)
    }
    if !strings.Contains(rDir, tmpDir) {
        t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
    }

    // spot check results
    servers := GetAvailableServers(rDir)
    if len(servers) < 1 {
        t.Fatalf("expected at least 1 server")
    }

    // Refresh contents
    rDir, err = extractRunners(payloadFS)
    if err != nil {
        t.Fatalf("failed to extract to %s %s", tmpDir, err)
    }
    if !strings.Contains(rDir, tmpDir) {
        t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
    }

    cleanupTmpDirs()

    Cleanup(payloadFS)
}
@@ -2,8 +2,7 @@

 set -e

-export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
-export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
+. $(dirname $0)/env.sh

 mkdir -p dist

@@ -2,76 +2,34 @@

 set -eu

-export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
-export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
-
-# We use 2 different image repositories to handle combining architecture images into multiarch manifest
-# (The ROCm image is x86 only and is not a multiarch manifest)
-# For developers, you can override the DOCKER_ORG to generate multiarch manifests
-# DOCKER_ORG=jdoe PUSH=1 ./scripts/build_docker.sh
-DOCKER_ORG=${DOCKER_ORG:-"ollama"}
-RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
-FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
-
-BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
+. $(dirname $0)/env.sh

 # Set PUSH to a non-empty string to trigger push instead of load
 PUSH=${PUSH:-""}

-# In CI mode, we break things down
-OLLAMA_SKIP_MANIFEST_CREATE=${OLLAMA_SKIP_MANIFEST_CREATE:-""}
-OLLAMA_SKIP_IMAGE_BUILD=${OLLAMA_SKIP_IMAGE_BUILD:-""}
-
 if [ -z "${PUSH}" ] ; then
+    echo "Building ${FINAL_IMAGE_REPO}:$VERSION locally. set PUSH=1 to push"
     LOAD_OR_PUSH="--load"
 else
-    echo "Will be pushing ${RELEASE_IMAGE_REPO}:$VERSION for ${BUILD_ARCH}"
+    echo "Will be pushing ${FINAL_IMAGE_REPO}:$VERSION"
     LOAD_OR_PUSH="--push"
 fi

-if [ -z "${OLLAMA_SKIP_IMAGE_BUILD}" ]; then
-    for TARGETARCH in ${BUILD_ARCH}; do
-        docker build \
-            ${LOAD_OR_PUSH} \
-            --platform=linux/${TARGETARCH} \
-            --build-arg=VERSION \
-            --build-arg=GOFLAGS \
-            -f Dockerfile \
-            -t ${RELEASE_IMAGE_REPO}:$VERSION-${TARGETARCH} \
-            .
-    done
+docker buildx build \
+    ${LOAD_OR_PUSH} \
+    --platform=${PLATFORM} \
+    ${OLLAMA_COMMON_BUILD_ARGS} \
+    -f Dockerfile \
+    -t ${FINAL_IMAGE_REPO}:$VERSION \
+    .

-    if echo ${BUILD_ARCH} | grep "amd64" > /dev/null; then
-        docker build \
-            ${LOAD_OR_PUSH} \
-            --platform=linux/amd64 \
-            --build-arg=VERSION \
-            --build-arg=GOFLAGS \
-            --target runtime-rocm \
-            -f Dockerfile \
-            -t ${RELEASE_IMAGE_REPO}:$VERSION-rocm \
-            .
-    fi
-fi
+if echo $PLATFORM | grep "amd64" > /dev/null; then
+    docker buildx build \
+        ${LOAD_OR_PUSH} \
+        --platform=linux/amd64 \
+        ${OLLAMA_COMMON_BUILD_ARGS} \
+        --target runtime-rocm \
+        -f Dockerfile \
+        -t ${FINAL_IMAGE_REPO}:$VERSION-rocm \
+        .
+fi
-
-if [ -z "${OLLAMA_SKIP_MANIFEST_CREATE}" ]; then
-    if [ -n "${PUSH}" ]; then
-        docker manifest create ${FINAL_IMAGE_REPO}:$VERSION \
-            ${RELEASE_IMAGE_REPO}:$VERSION-amd64 \
-            ${RELEASE_IMAGE_REPO}:$VERSION-arm64
-        docker manifest push ${FINAL_IMAGE_REPO}:$VERSION
-
-        # For symmetry, tag/push the rocm image
-        if [ "${RELEASE_IMAGE_REPO}" != "${FINAL_IMAGE_REPO}" ]; then
-            echo "Tagging and pushing rocm image"
-            docker pull ${RELEASE_IMAGE_REPO}:$VERSION-rocm
-            docker tag ${RELEASE_IMAGE_REPO}:$VERSION-rocm ${FINAL_IMAGE_REPO}:$VERSION-rocm
-            docker push ${FINAL_IMAGE_REPO}:$VERSION-rocm
-        fi
-    else
-        echo "Skipping manifest generation when not pushing images are available locally as "
-        echo "  ${RELEASE_IMAGE_REPO}:$VERSION-amd64"
-        echo "  ${RELEASE_IMAGE_REPO}:$VERSION-arm64"
-        echo "  ${RELEASE_IMAGE_REPO}:$VERSION-rocm"
-    fi
-fi
@@ -1,37 +1,29 @@
 #!/bin/sh
+#
+# Mac ARM users, rosetta can be flaky, so to use a remote x86 builder
+#
+# docker context create amd64 --docker host=ssh://mybuildhost
+# docker buildx create --name mybuilder amd64 --platform linux/amd64
+# docker buildx create --name mybuilder --append desktop-linux --platform linux/arm64
+# docker buildx use mybuilder

 set -eu

-export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
-export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
-GZIP=$(which pigz 2>/dev/null || echo "gzip")
-
-BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
-export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""}
+. $(dirname $0)/env.sh
+
 mkdir -p dist

-for TARGETARCH in ${BUILD_ARCH}; do
-    docker build \
-        --platform=linux/$TARGETARCH \
-        --build-arg=GOFLAGS \
-        --build-arg=CGO_CFLAGS \
-        --build-arg=OLLAMA_CUSTOM_CPU_DEFS \
-        --build-arg=AMDGPU_TARGETS \
-        --target build-$TARGETARCH \
-        -f Dockerfile \
-        -t builder:$TARGETARCH \
-        .
-    docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
-    rm -rf ./dist/linux-$TARGETARCH
-    docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist
-    if echo ${TARGETARCH} | grep "amd64" > /dev/null; then
-        docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist
-    fi
-    docker rm builder-$TARGETARCH
-    echo "Compressing final linux bundle..."
-    rm -f ./dist/ollama-linux-$TARGETARCH.tgz
-    (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz )
-    if [ -d dist/linux-$TARGETARCH-rocm ]; then
-        (cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz )
-    fi
-done
+docker buildx build \
+    --output type=local,dest=./dist/ \
+    --platform=${PLATFORM} \
+    ${OLLAMA_COMMON_BUILD_ARGS} \
+    --target dist \
+    -f Dockerfile \
+    .
+
+# buildx behavior changes for single vs. multiplatform
+if echo $PLATFORM | grep "," > /dev/null ; then
+    mv -f ./dist/linux_*64/ollama* ./dist/
+    rmdir ./dist/linux_*64
+fi
scripts/env.sh  (new file, 14 lines)
@@ -0,0 +1,14 @@
# Common environment setup across build*.sh scripts

export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
# TODO - consider `docker buildx ls --format=json` to autodiscover platform capability
PLATFORM=${PLATFORM:-"linux/arm64,linux/amd64"}
DOCKER_ORG=${DOCKER_ORG:-"ollama"}
RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION --build-arg=GOFLAGS --build-arg=OLLAMA_CUSTOM_CPU_DEFS --build-arg=AMDGPU_TARGETS"

echo "Building Ollama"
echo "VERSION=$VERSION"
echo "PLATFORM=$PLATFORM"
@@ -26,11 +26,13 @@ import (
     "golang.org/x/sync/errgroup"

     "github.com/ollama/ollama/api"
+    "github.com/ollama/ollama/build"
     "github.com/ollama/ollama/envconfig"
     "github.com/ollama/ollama/gpu"
     "github.com/ollama/ollama/llm"
     "github.com/ollama/ollama/openai"
     "github.com/ollama/ollama/parser"
+    "github.com/ollama/ollama/runners"
     "github.com/ollama/ollama/template"
     "github.com/ollama/ollama/types/errtypes"
     "github.com/ollama/ollama/types/model"
@@ -117,6 +119,32 @@ func (s *Server) GenerateHandler(c *gin.Context) {
         return
     }
+
+    // expire the runner
+    if req.Prompt == "" && req.KeepAlive != nil && int(req.KeepAlive.Seconds()) == 0 {
+        model, err := GetModel(req.Model)
+        if err != nil {
+            switch {
+            case os.IsNotExist(err):
+                c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
+            case err.Error() == "invalid model name":
+                c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+            default:
+                c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+            }
+            return
+        }
+        s.sched.expireRunner(model)
+
+        c.JSON(http.StatusOK, api.GenerateResponse{
+            Model:      req.Model,
+            CreatedAt:  time.Now().UTC(),
+            Response:   "",
+            Done:       true,
+            DoneReason: "unload",
+        })
+        return
+    }

     if req.Format != "" && req.Format != "json" {
         c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be empty or \"json\""})
         return
|
||||||
srvr.Close()
|
srvr.Close()
|
||||||
schedDone()
|
schedDone()
|
||||||
sched.unloadAllRunners()
|
sched.unloadAllRunners()
|
||||||
gpu.Cleanup()
|
runners.Cleanup(build.EmbedFS)
|
||||||
done()
|
done()
|
||||||
}()
|
}()
|
||||||
|
|
||||||
if err := llm.Init(); err != nil {
|
if _, err := runners.Refresh(build.EmbedFS); err != nil {
|
||||||
return fmt.Errorf("unable to initialize llm library %w", err)
|
return fmt.Errorf("unable to initialize llm runners %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
s.sched.Run(schedCtx)
|
s.sched.Run(schedCtx)
|
||||||
|
@ -1322,6 +1350,32 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// expire the runner
|
||||||
|
if len(req.Messages) == 0 && req.KeepAlive != nil && int(req.KeepAlive.Seconds()) == 0 {
|
||||||
|
model, err := GetModel(req.Model)
|
||||||
|
if err != nil {
|
||||||
|
switch {
|
||||||
|
case os.IsNotExist(err):
|
||||||
|
c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
|
||||||
|
case err.Error() == "invalid model name":
|
||||||
|
c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
|
||||||
|
default:
|
||||||
|
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.sched.expireRunner(model)
|
||||||
|
|
||||||
|
c.JSON(http.StatusOK, api.ChatResponse{
|
||||||
|
Model: req.Model,
|
||||||
|
CreatedAt: time.Now().UTC(),
|
||||||
|
Message: api.Message{Role: "assistant"},
|
||||||
|
Done: true,
|
||||||
|
DoneReason: "unload",
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
caps := []Capability{CapabilityCompletion}
|
caps := []Capability{CapabilityCompletion}
|
||||||
if len(req.Tools) > 0 {
|
if len(req.Tools) > 0 {
|
||||||
caps = append(caps, CapabilityTools)
|
caps = append(caps, CapabilityTools)
|
||||||
|
|
|
@@ -360,7 +360,6 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
             slog.Debug("runner expired event received", "modelPath", runner.modelPath)
             runner.refMu.Lock()
             if runner.refCount > 0 {
-                // Shouldn't happen, but safeguard to ensure no leaked runners
                 slog.Debug("expired event with positive ref count, retrying", "modelPath", runner.modelPath, "refCount", runner.refCount)
                 go func(runner *runnerRef) {
                     // We can't unload yet, but want to as soon as the current request completes
@@ -802,6 +801,25 @@ func (s *Scheduler) unloadAllRunners() {
     }
 }

+func (s *Scheduler) expireRunner(model *Model) {
+    s.loadedMu.Lock()
+    defer s.loadedMu.Unlock()
+    runner, ok := s.loaded[model.ModelPath]
+    if ok {
+        runner.refMu.Lock()
+        runner.expiresAt = time.Now()
+        if runner.expireTimer != nil {
+            runner.expireTimer.Stop()
+            runner.expireTimer = nil
+        }
+        runner.sessionDuration = 0
+        if runner.refCount <= 0 {
+            s.expiredCh <- runner
+        }
+        runner.refMu.Unlock()
+    }
+}
+
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
 func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
@@ -406,6 +406,52 @@ func TestGetRunner(t *testing.T) {
     b.ctxDone()
 }

+func TestExpireRunner(t *testing.T) {
+    ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
+    defer done()
+    s := InitScheduler(ctx)
+    req := &LlmRequest{
+        ctx:             ctx,
+        model:           &Model{ModelPath: "foo"},
+        opts:            api.DefaultOptions(),
+        successCh:       make(chan *runnerRef, 1),
+        errCh:           make(chan error, 1),
+        sessionDuration: &api.Duration{Duration: 2 * time.Minute},
+    }
+
+    var ggml *llm.GGML
+    gpus := gpu.GpuInfoList{}
+    server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
+    s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+        return server, nil
+    }
+    s.load(req, ggml, gpus, 0)
+
+    select {
+    case err := <-req.errCh:
+        if err != nil {
+            t.Fatalf("expected no errors when loading, got '%s'", err.Error())
+        }
+    case resp := <-req.successCh:
+        s.loadedMu.Lock()
+        if resp.refCount != uint(1) || len(s.loaded) != 1 {
+            t.Fatalf("expected a model to be loaded")
+        }
+        s.loadedMu.Unlock()
+    }
+
+    s.expireRunner(&Model{ModelPath: "foo"})
+
+    s.finishedReqCh <- req
+    s.processCompleted(ctx)
+
+    s.loadedMu.Lock()
+    if len(s.loaded) != 0 {
+        t.Fatalf("expected model to be unloaded")
+    }
+    s.loadedMu.Unlock()
+}
+
 // TODO - add one scenario that triggers the bogus finished event with positive ref count
 func TestPrematureExpired(t *testing.T) {
     ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)