From 30c8f201cc5981163a7b12dc6d74657b11446f7b Mon Sep 17 00:00:00 2001 From: RAPID ARCHITECT <126218667+rapidarchitect@users.noreply.github.com> Date: Sun, 8 Sep 2024 02:35:59 -0500 Subject: [PATCH 01/18] readme: add crewAI with mesop to community integrations --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d17b9723..c2c6f795 100644 --- a/README.md +++ b/README.md @@ -312,6 +312,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Cherry Studio](https://github.com/kangfenmao/cherry-studio) (Desktop client with Ollama support) - [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption) - [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library) +- [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama) ### Terminal From bb6a086d63640f57012dac92eef1da87035466b0 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 8 Sep 2024 00:36:24 -0700 Subject: [PATCH 02/18] readme: add crewAI to community integrations (#6699) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c2c6f795..3ffbb26d 100644 --- a/README.md +++ b/README.md @@ -359,6 +359,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa) - [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama) +- [crewAI](https://github.com/crewAIInc/crewAI) - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example) - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java) - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs) From 84b84ce2db23ff5e8db274237155d3639e20970c Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Mon, 9 Sep 2024 17:18:54 -0700 Subject: [PATCH 03/18] catch when model vocab size is set correctly (#6714) --- convert/convert.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/convert/convert.go b/convert/convert.go index 8c7b0943..44783b6e 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -208,14 +208,18 @@ func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error { return err } - if vocabSize := int(p.VocabSize); vocabSize > len(t.Vocabulary.Tokens) { - slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", p.VocabSize, "actual", len(t.Vocabulary.Tokens)) + vocabSize := int(p.VocabSize) + switch { + case vocabSize > len(t.Vocabulary.Tokens): + slog.Warn("vocabulary is smaller than expected, padding with dummy tokens", "expect", vocabSize, "actual", len(t.Vocabulary.Tokens)) for i := range vocabSize - len(t.Vocabulary.Tokens) { t.Vocabulary.Tokens = append(t.Vocabulary.Tokens, fmt.Sprintf("[PAD%d]", i)) t.Vocabulary.Scores = append(t.Vocabulary.Scores, -1) t.Vocabulary.Types = append(t.Vocabulary.Types, tokenTypeUserDefined) } - } else { + case vocabSize < len(t.Vocabulary.Tokens): + 
return fmt.Errorf("vocabulary is larger than expected '%d' instead of '%d'", len(t.Vocabulary.Tokens), vocabSize) + default: slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens)) } From 4a8069f9c4c8cb761cd6c10ca5f4be6af21fa0ae Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 9 Sep 2024 17:22:20 -0700 Subject: [PATCH 04/18] Quiet down dockers new lint warnings (#6716) * Quiet down dockers new lint warnings Docker has recently added lint warnings to build. This cleans up those warnings. * Fix go lint regression --- Dockerfile | 46 +++++++++++++++++++++++----------------------- llm/server.go | 2 +- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6743866a..655f1081 100644 --- a/Dockerfile +++ b/Dockerfile @@ -16,12 +16,12 @@ FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-1 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh -ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH +ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ARG CUDA_V11_ARCHITECTURES -ENV GOARCH amd64 +ENV GOARCH=amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ @@ -33,12 +33,12 @@ FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-1 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh -ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH +ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ARG CUDA_V12_ARCHITECTURES -ENV GOARCH amd64 +ENV GOARCH=amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ @@ -51,12 +51,12 @@ FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cu ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh -ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH +ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ARG CUDA_V11_ARCHITECTURES -ENV GOARCH arm64 +ENV GOARCH=arm64 RUN OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ @@ -67,12 +67,12 @@ FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cu ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh -ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH +ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ARG CUDA_V12_ARCHITECTURES -ENV GOARCH arm64 +ENV GOARCH=arm64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ @@ -86,13 +86,13 @@ FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-b ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh -ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH -ENV LIBRARY_PATH /opt/amdgpu/lib64 +ENV 
PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH +ENV LIBRARY_PATH=/opt/amdgpu/lib64 COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ARG AMDGPU_TARGETS -ENV GOARCH amd64 +ENV GOARCH=amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \ @@ -103,11 +103,11 @@ ARG CMAKE_VERSION ARG GOLANG_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh -ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH +ENV PATH=/opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ ARG OLLAMA_CUSTOM_CPU_DEFS ARG CGO_CFLAGS -ENV GOARCH amd64 +ENV GOARCH=amd64 WORKDIR /go/src/github.com/ollama/ollama/llm/generate FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64 @@ -128,11 +128,11 @@ ARG CMAKE_VERSION ARG GOLANG_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh -ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH +ENV PATH=/opt/rh/gcc-toolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ ARG OLLAMA_CUSTOM_CPU_DEFS ARG CGO_CFLAGS -ENV GOARCH arm64 +ENV GOARCH=arm64 WORKDIR /go/src/github.com/ollama/ollama/llm/generate FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64 @@ -145,7 +145,7 @@ RUN --mount=type=cache,target=/root/.ccache \ # Intermediate stage used for ./scripts/build_linux.sh FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64 -ENV CGO_ENABLED 1 +ENV CGO_ENABLED=1 WORKDIR /go/src/github.com/ollama/ollama COPY . . COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ @@ -164,7 +164,7 @@ RUN --mount=type=cache,target=/root/.ccache \ # Intermediate stage used for ./scripts/build_linux.sh FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64 -ENV CGO_ENABLED 1 +ENV CGO_ENABLED=1 ARG GOLANG_VERSION WORKDIR /go/src/github.com/ollama/ollama COPY . . @@ -179,37 +179,37 @@ RUN --mount=type=cache,target=/root/.ccache \ go build -trimpath -o dist/linux-arm64/bin/ollama . 
# Strip out ROCm dependencies to keep the primary image lean -FROM --platform=linux/amd64 ubuntu:22.04 as amd64-libs-without-rocm +FROM --platform=linux/amd64 ubuntu:22.04 AS amd64-libs-without-rocm COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/ RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa* # Runtime stages -FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64 +FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64 COPY --from=amd64-libs-without-rocm /scratch/ /lib/ RUN apt-get update && apt-get install -y ca-certificates && \ apt-get clean && rm -rf /var/lib/apt/lists/* COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ -FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64 +FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64 COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ RUN apt-get update && apt-get install -y ca-certificates && \ apt-get clean && rm -rf /var/lib/apt/lists/* COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/ # Radeon images are much larger so we keep it distinct from the CPU/CUDA image -FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm +FROM rocm/dev-centos-7:${ROCM_VERSION}-complete AS runtime-rocm RUN update-pciids COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ RUN ln -s /opt/rocm/lib /lib/ollama EXPOSE 11434 -ENV OLLAMA_HOST 0.0.0.0 +ENV OLLAMA_HOST=0.0.0.0 ENTRYPOINT ["/bin/ollama"] CMD ["serve"] FROM runtime-$TARGETARCH EXPOSE 11434 -ENV OLLAMA_HOST 0.0.0.0 +ENV OLLAMA_HOST=0.0.0.0 ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility diff --git a/llm/server.go b/llm/server.go index 28eb8d6f..5d5b8c4f 100644 --- a/llm/server.go +++ b/llm/server.go @@ -274,7 +274,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr params = append(params, "--tensor-split", estimate.TensorSplit) } - for i := range len(servers) { + for i := range servers { dir := availableServers[servers[i]] if dir == "" { // Shouldn't happen From 83a9b5271a68c7d1f8443f91c8d8b7d24ab581a9 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Mon, 9 Sep 2024 22:47:16 -0700 Subject: [PATCH 05/18] docs: update examples to use llama3.1 (#6718) --- docs/api.md | 48 +++++++++++++++++++++++------------------------ docs/faq.md | 6 +++--- docs/modelfile.md | 10 +++++----- docs/openai.md | 22 +++++++++++----------- docs/template.md | 2 +- docs/windows.md | 2 +- 6 files changed, 45 insertions(+), 45 deletions(-) diff --git a/docs/api.md b/docs/api.md index aed2b69f..1ae60dc7 100644 --- a/docs/api.md +++ b/docs/api.md @@ -69,7 +69,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3", + "model": "llama3.1", "prompt": "Why is the sky blue?" 
}' ``` @@ -80,7 +80,7 @@ A stream of JSON objects is returned: ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-08-04T08:52:19.385406455-07:00", "response": "The", "done": false @@ -102,7 +102,7 @@ To calculate how fast the response is generated in tokens per second (token/s), ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-08-04T19:22:45.499127Z", "response": "", "done": true, @@ -124,7 +124,7 @@ A response can be received in one reply when streaming is off. ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3", + "model": "llama3.1", "prompt": "Why is the sky blue?", "stream": false }' @@ -136,7 +136,7 @@ If `stream` is set to `false`, the response will be a single JSON object: ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-08-04T19:22:45.499127Z", "response": "The sky is blue because it is the color of the sky.", "done": true, @@ -194,7 +194,7 @@ curl http://localhost:11434/api/generate -d '{ ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3", + "model": "llama3.1", "prompt": "What color is the sky at different times of the day? Respond using JSON", "format": "json", "stream": false @@ -205,7 +205,7 @@ curl http://localhost:11434/api/generate -d '{ ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-11-09T21:07:55.186497Z", "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n", "done": true, @@ -327,7 +327,7 @@ If you want to set custom options for the model at runtime rather than in the Mo ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3", + "model": "llama3.1", "prompt": "Why is the sky blue?", "stream": false, "options": { @@ -368,7 +368,7 @@ curl http://localhost:11434/api/generate -d '{ ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-08-04T19:22:45.499127Z", "response": "The sky is blue because it is the color of the sky.", "done": true, @@ -390,7 +390,7 @@ If an empty prompt is provided, the model will be loaded into memory. ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3" + "model": "llama3.1" }' ``` @@ -400,7 +400,7 @@ A single JSON object is returned: ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-12-18T19:52:07.071755Z", "response": "", "done": true @@ -445,7 +445,7 @@ Send a chat message with a streaming response. ```shell curl http://localhost:11434/api/chat -d '{ - "model": "llama3", + "model": "llama3.1", "messages": [ { "role": "user", @@ -461,7 +461,7 @@ A stream of JSON objects is returned: ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-08-04T08:52:19.385406455-07:00", "message": { "role": "assistant", @@ -476,7 +476,7 @@ Final response: ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-08-04T19:22:45.499127Z", "done": true, "total_duration": 4883583458, @@ -494,7 +494,7 @@ Final response: ```shell curl http://localhost:11434/api/chat -d '{ - "model": "llama3", + "model": "llama3.1", "messages": [ { "role": "user", @@ -509,7 +509,7 @@ curl http://localhost:11434/api/chat -d '{ ```json { - "model": "registry.ollama.ai/library/llama3:latest", + "model": "llama3.1", "created_at": "2023-12-12T14:13:43.416799Z", "message": { "role": "assistant", @@ -533,7 +533,7 @@ Send a chat message with a conversation history. 
You can use this same approach ```shell curl http://localhost:11434/api/chat -d '{ - "model": "llama3", + "model": "llama3.1", "messages": [ { "role": "user", @@ -557,7 +557,7 @@ A stream of JSON objects is returned: ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-08-04T08:52:19.385406455-07:00", "message": { "role": "assistant", @@ -571,7 +571,7 @@ Final response: ```json { - "model": "llama3", + "model": "llama3.1", "created_at": "2023-08-04T19:22:45.499127Z", "done": true, "total_duration": 8113331500, @@ -629,7 +629,7 @@ curl http://localhost:11434/api/chat -d '{ ```shell curl http://localhost:11434/api/chat -d '{ - "model": "llama3", + "model": "llama3.1", "messages": [ { "role": "user", @@ -647,7 +647,7 @@ curl http://localhost:11434/api/chat -d '{ ```json { - "model": "registry.ollama.ai/library/llama3:latest", + "model": "llama3.1", "created_at": "2023-12-12T14:13:43.416799Z", "message": { "role": "assistant", @@ -904,7 +904,7 @@ Show information about a model including details, modelfile, template, parameter ```shell curl http://localhost:11434/api/show -d '{ - "name": "llama3" + "name": "llama3.1" }' ``` @@ -965,7 +965,7 @@ Copy a model. Creates a model with another name from an existing model. ```shell curl http://localhost:11434/api/copy -d '{ - "source": "llama3", + "source": "llama3.1", "destination": "llama3-backup" }' ``` @@ -1020,7 +1020,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where ```shell curl http://localhost:11434/api/pull -d '{ - "name": "llama3" + "name": "llama3.1" }' ``` diff --git a/docs/faq.md b/docs/faq.md index 356d5105..6267ad2b 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter: ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3", + "model": "llama3.1", "prompt": "Why is the sky blue?", "options": { "num_ctx": 4096 @@ -247,12 +247,12 @@ The `keep_alive` parameter can be set to: For example, to preload a model and leave it in memory use: ```shell -curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": -1}' +curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": -1}' ``` To unload the model and free up memory use: ```shell -curl http://localhost:11434/api/generate -d '{"model": "llama3", "keep_alive": 0}' +curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": 0}' ``` Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable. diff --git a/docs/modelfile.md b/docs/modelfile.md index 92df22ef..a33f180b 100644 --- a/docs/modelfile.md +++ b/docs/modelfile.md @@ -11,7 +11,7 @@ A model file is the blueprint to create and share models with Ollama. 
- [Examples](#examples) - [Instructions](#instructions) - [FROM (Required)](#from-required) - - [Build from llama3.1](#build-from-llama31) + - [Build from existing model](#build-from-existing-model) - [Build from a Safetensors model](#build-from-a-safetensors-model) - [Build from a GGUF file](#build-from-a-gguf-file) - [PARAMETER](#parameter) @@ -50,7 +50,7 @@ INSTRUCTION arguments An example of a `Modelfile` creating a mario blueprint: ```modelfile -FROM llama3 +FROM llama3.1 # sets the temperature to 1 [higher is more creative, lower is more coherent] PARAMETER temperature 1 # sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token @@ -72,10 +72,10 @@ More examples are available in the [examples directory](../examples). To view the Modelfile of a given model, use the `ollama show --modelfile` command. ```bash - > ollama show --modelfile llama3 + > ollama show --modelfile llama3.1 # Modelfile generated by "ollama show" # To build a new Modelfile based on this one, replace the FROM line with: - # FROM llama3:latest + # FROM llama3.1:latest FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29 TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|> @@ -100,7 +100,7 @@ The `FROM` instruction defines the base model to use when creating a model. FROM : ``` -#### Build from llama3.1 +#### Build from existing model ```modelfile FROM llama3.1 diff --git a/docs/openai.md b/docs/openai.md index 0cbea6cc..c6df0fec 100644 --- a/docs/openai.md +++ b/docs/openai.md @@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create( 'content': 'Say this is a test', } ], - model='llama3', + model='llama3.1', ) response = client.chat.completions.create( @@ -46,13 +46,13 @@ response = client.chat.completions.create( ) completion = client.completions.create( - model="llama3", + model="llama3.1", prompt="Say this is a test", ) list_completion = client.models.list() -model = client.models.retrieve("llama3") +model = client.models.retrieve("llama3.1") embeddings = client.embeddings.create( model="all-minilm", @@ -74,7 +74,7 @@ const openai = new OpenAI({ const chatCompletion = await openai.chat.completions.create({ messages: [{ role: 'user', content: 'Say this is a test' }], - model: 'llama3', + model: 'llama3.1', }) const response = await openai.chat.completions.create({ @@ -94,13 +94,13 @@ const response = await openai.chat.completions.create({ }) const completion = await openai.completions.create({ - model: "llama3", + model: "llama3.1", prompt: "Say this is a test.", }) const listCompletion = await openai.models.list() -const model = await openai.models.retrieve("llama3") +const model = await openai.models.retrieve("llama3.1") const embedding = await openai.embeddings.create({ model: "all-minilm", @@ -114,7 +114,7 @@ const embedding = await openai.embeddings.create({ curl http://localhost:11434/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "llama3", + "model": "llama3.1", "messages": [ { "role": "system", @@ -154,13 +154,13 @@ curl http://localhost:11434/v1/chat/completions \ curl http://localhost:11434/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "llama3", + "model": "llama3.1", "prompt": "Say this is a test" }' curl http://localhost:11434/v1/models -curl http://localhost:11434/v1/models/llama3 +curl http://localhost:11434/v1/models/llama3.1 curl http://localhost:11434/v1/embeddings \ -H "Content-Type: 
application/json" \ @@ -274,7 +274,7 @@ curl http://localhost:11434/v1/embeddings \ Before using a model, pull it locally `ollama pull`: ```shell -ollama pull llama3 +ollama pull llama3.1 ``` ### Default model names @@ -282,7 +282,7 @@ ollama pull llama3 For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name: ``` -ollama cp llama3 gpt-3.5-turbo +ollama cp llama3.1 gpt-3.5-turbo ``` Afterwards, this new model name can be specified the `model` field: diff --git a/docs/template.md b/docs/template.md index 1d7104de..192d878d 100644 --- a/docs/template.md +++ b/docs/template.md @@ -33,7 +33,7 @@ Omitting a template in these models puts the responsibility of correctly templat To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3. ```dockerfile -FROM llama3 +FROM llama3.1 TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|> diff --git a/docs/windows.md b/docs/windows.md index f681ffac..372a35aa 100644 --- a/docs/windows.md +++ b/docs/windows.md @@ -29,7 +29,7 @@ Ollama uses unicode characters for progress indication, which may render as unkn Here's a quick example showing API access from `powershell` ```powershell -(Invoke-WebRequest -method POST -Body '{"model":"llama3", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json +(Invoke-WebRequest -method POST -Body '{"model":"llama3.1", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json ``` ## Troubleshooting From dddb72e08451f18ff94bb4c74bf6ba2fd7894eda Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Tue, 10 Sep 2024 09:36:42 -0700 Subject: [PATCH 06/18] add *_proxy for debugging --- envconfig/config.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/envconfig/config.go b/envconfig/config.go index 14e3cb0c..2c4393fe 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -293,7 +293,20 @@ func AsMap() map[string]EnvVar { "OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"}, "OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"}, + + // Informational + "HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"}, + "HTTPS_PROXY": {"HTTPS_PROXY", String("HTTPS_PROXY")(), "HTTPS proxy"}, + "NO_PROXY": {"NO_PROXY", String("NO_PROXY")(), "No proxy"}, } + + if runtime.GOOS != "windows" { + // Windows environment variables are case-insensitive so there's no need to duplicate them + ret["http_proxy"] = EnvVar{"http_proxy", String("http_proxy")(), "HTTP proxy"} + ret["https_proxy"] = EnvVar{"https_proxy", String("https_proxy")(), "HTTPS proxy"} + ret["no_proxy"] = EnvVar{"no_proxy", String("no_proxy")(), "No proxy"} + } + if runtime.GOOS != "darwin" { ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"} ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"} @@ -302,6 +315,7 @@ func AsMap() map[string]EnvVar { ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"} ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable 
experimental Intel GPU detection"} } + return ret } From 9246e6dd150524307b854db3dfe774d7c1099636 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 11 Sep 2024 11:38:25 -0700 Subject: [PATCH 07/18] Verify permissions for AMD GPU (#6736) This adds back a check which was lost many releases back to verify /dev/kfd permissions which when lacking, can lead to confusing failure modes of: "rocBLAS error: Could not initialize Tensile host: No devices found" This implementation does not hard fail the serve command but instead will fall back to CPU with an error log. In the future we can include this in the GPU discovery UX to show detected but unsupported devices we discovered. --- docs/troubleshooting.md | 11 +++++++++++ gpu/amd_linux.go | 21 +++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 589061a8..0a89b87f 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -91,6 +91,17 @@ If none of those resolve the problem, gather additional information and file an - Check dmesg for any errors `sudo dmesg | grep -i nvrm` and `sudo dmesg | grep -i nvidia` +## AMD GPU Discovery + +On linux, AMD GPU access typically requires `video` and/or `render` group membership to access the `/dev/kfd` device. If permissions are not set up correctly, Ollama will detect this and report an error in the server log. + +When running in a container, in some Linux distributions and container runtimes, the ollama process may be unable to access the GPU. Use `ls -ld /dev/kfd /dev/dri /dev/dri/*` on the host system to determine the group assignments on your system, and pass additional `--group-add ...` arguments to the container so it can access the required devices. + +If you are experiencing problems getting Ollama to correctly discover or use your GPU for inference, the following may help isolate the failure. +- `AMD_LOG_LEVEL=3` Enable info log levels in the AMD HIP/ROCm libraries. This can help show more detailed error codes that can help troubleshoot problems +- `OLLAMA_DEBUG=1` During GPU discovery additional information will be reported +- Check dmesg for any errors from amdgpu or kfd drivers `sudo dmesg | grep -i amdgpu` and `sudo dmesg | grep -i kfd` + ## Windows Terminal Errors Older versions of Windows 10 (e.g., 21H1) are known to have a bug where the standard terminal program does not display control characters correctly. This can result in a long string of strings like `←[?25h←[?25l` being displayed, sometimes erroring with `The parameter is incorrect` To resolve this problem, please update to Win 10 22H1 or newer. diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go index aab67efe..d3f5b9fc 100644 --- a/gpu/amd_linux.go +++ b/gpu/amd_linux.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "io" + "io/fs" "log/slog" "os" "path/filepath" @@ -359,6 +360,10 @@ func AMDGetGPUInfo() []RocmGPUInfo { if len(resp) == 0 { slog.Info("no compatible amdgpu devices detected") } + if err := verifyKFDDriverAccess(); err != nil { + slog.Error("amdgpu devices detected but permission problems block access", "error", err) + return nil + } return resp } @@ -455,3 +460,19 @@ func getFreeMemory(usedFile string) (uint64, error) { } return usedMemory, nil } + +func verifyKFDDriverAccess() error { + // Verify we have permissions - either running as root, or we have group access to the driver + fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0o666) + if err != nil { + if errors.Is(err, fs.ErrPermission) { + return fmt.Errorf("permissions not set up properly. 
Either run ollama as root, or add you user account to the render group. %w", err) + } else if errors.Is(err, fs.ErrNotExist) { + // Container runtime failure? + return fmt.Errorf("kfd driver not loaded. If running in a container, remember to include '--device /dev/kfd --device /dev/dri'") + } + return fmt.Errorf("failed to check permission on /dev/kfd: %w", err) + } + fd.Close() + return nil +} From 7d6900827ded82172bd25d3a504662267c6984bb Mon Sep 17 00:00:00 2001 From: Petr Mironychev <9195189+Palm1r@users.noreply.github.com> Date: Wed, 11 Sep 2024 22:19:49 +0200 Subject: [PATCH 08/18] readme: add QodeAssist to community integrations (#6754) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3ffbb26d..b76f08a6 100644 --- a/README.md +++ b/README.md @@ -429,6 +429,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [Headless Ollama](https://github.com/nischalj10/headless-ollama) (Scripts to automatically install ollama client & models on any OS for apps that depends on ollama server) - [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links) - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality) +- [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator) ### Supported backends From ecab6f1cc582a5ce8ee2bfbc780cb9990115a3da Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 11 Sep 2024 11:01:30 -0700 Subject: [PATCH 09/18] refactor show ouput fixes line wrapping on long texts --- cmd/cmd.go | 174 +++++++++++++++----------------------- cmd/cmd_test.go | 206 +++++++++++++++++++++++++++++++++++++++++++++ cmd/interactive.go | 2 +- 3 files changed, 277 insertions(+), 105 deletions(-) create mode 100644 cmd/cmd_test.go diff --git a/cmd/cmd.go b/cmd/cmd.go index 5de1ed1b..1fb721e7 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -2,6 +2,7 @@ package cmd import ( "archive/zip" + "bufio" "bytes" "context" "crypto/ed25519" @@ -21,6 +22,7 @@ import ( "regexp" "runtime" "slices" + "strconv" "strings" "sync/atomic" "syscall" @@ -578,7 +580,7 @@ func ListHandler(cmd *cobra.Command, args []string) error { table.SetHeaderLine(false) table.SetBorder(false) table.SetNoWhiteSpace(true) - table.SetTablePadding("\t") + table.SetTablePadding(" ") table.AppendBulk(data) table.Render() @@ -624,7 +626,7 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error { table.SetHeaderLine(false) table.SetBorder(false) table.SetNoWhiteSpace(true) - table.SetTablePadding("\t") + table.SetTablePadding(" ") table.AppendBulk(data) table.Render() @@ -720,125 +722,89 @@ func ShowHandler(cmd *cobra.Command, args []string) error { return nil } - showInfo(resp) - - return nil + return showInfo(resp, os.Stdout) } -func showInfo(resp *api.ShowResponse) { - modelData := [][]string{ - {"parameters", resp.Details.ParameterSize}, - {"quantization", resp.Details.QuantizationLevel}, - } - if resp.ModelInfo != nil { - arch := resp.ModelInfo["general.architecture"].(string) - modelData = append(modelData, - []string{"arch", arch}, - []string{"context length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64))}, - []string{"embedding length", fmt.Sprintf("%v", resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64))}, - ) +func showInfo(resp *api.ShowResponse, w io.Writer) error { + tableRender := func(header string, rows func() [][]string) { + fmt.Fprintln(w, " ", header) + 
table := tablewriter.NewWriter(w) + table.SetAlignment(tablewriter.ALIGN_LEFT) + table.SetBorder(false) + table.SetNoWhiteSpace(true) + table.SetTablePadding(" ") + + switch header { + case "Template", "System", "License": + table.SetColWidth(100) + } + + table.AppendBulk(rows()) + table.Render() + fmt.Fprintln(w) } - mainTableData := [][]string{ - {"Model"}, - {renderSubTable(modelData, false)}, - } + tableRender("Model", func() (rows [][]string) { + if resp.ModelInfo != nil { + arch := resp.ModelInfo["general.architecture"].(string) + rows = append(rows, []string{"", "architecture", arch}) + rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ModelInfo["general.parameter_count"].(float64)))}) + rows = append(rows, []string{"", "context length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.context_length", arch)].(float64), 'f', -1, 64)}) + rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ModelInfo[fmt.Sprintf("%s.embedding_length", arch)].(float64), 'f', -1, 64)}) + } else { + rows = append(rows, []string{"", "architecture", resp.Details.Family}) + rows = append(rows, []string{"", "parameters", resp.Details.ParameterSize}) + } + rows = append(rows, []string{"", "quantization", resp.Details.QuantizationLevel}) + return + }) if resp.ProjectorInfo != nil { - projectorData := [][]string{ - {"arch", "clip"}, - {"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))}, - } - - if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok { - projectorData = append(projectorData, []string{"projector type", projectorType.(string)}) - } - - projectorData = append(projectorData, - []string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))}, - []string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))}, - ) - - mainTableData = append(mainTableData, - []string{"Projector"}, - []string{renderSubTable(projectorData, false)}, - ) + tableRender("Projector", func() (rows [][]string) { + arch := resp.ProjectorInfo["general.architecture"].(string) + rows = append(rows, []string{"", "architecture", arch}) + rows = append(rows, []string{"", "parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))}) + rows = append(rows, []string{"", "embedding length", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.embedding_length", arch)].(float64), 'f', -1, 64)}) + rows = append(rows, []string{"", "dimensions", strconv.FormatFloat(resp.ProjectorInfo[fmt.Sprintf("%s.vision.projection_dim", arch)].(float64), 'f', -1, 64)}) + return + }) } if resp.Parameters != "" { - mainTableData = append(mainTableData, []string{"Parameters"}, []string{formatParams(resp.Parameters)}) + tableRender("Parameters", func() (rows [][]string) { + scanner := bufio.NewScanner(strings.NewReader(resp.Parameters)) + for scanner.Scan() { + if text := scanner.Text(); text != "" { + rows = append(rows, append([]string{""}, strings.Fields(text)...)) + } + } + return + }) + } + + head := func(s string, n int) (rows [][]string) { + scanner := bufio.NewScanner(strings.NewReader(s)) + for scanner.Scan() && (len(rows) < n || n < 0) { + if text := scanner.Text(); text != "" { + rows = append(rows, []string{"", strings.TrimSpace(text)}) + } + } + return } if resp.System != "" { - mainTableData = append(mainTableData, []string{"System"}, []string{renderSubTable(twoLines(resp.System), true)}) + 
tableRender("System", func() [][]string { + return head(resp.System, 2) + }) } if resp.License != "" { - mainTableData = append(mainTableData, []string{"License"}, []string{renderSubTable(twoLines(resp.License), true)}) + tableRender("License", func() [][]string { + return head(resp.License, 2) + }) } - table := tablewriter.NewWriter(os.Stdout) - table.SetAutoWrapText(false) - table.SetBorder(false) - table.SetAlignment(tablewriter.ALIGN_LEFT) - - for _, v := range mainTableData { - table.Append(v) - } - - table.Render() -} - -func renderSubTable(data [][]string, file bool) string { - var buf bytes.Buffer - table := tablewriter.NewWriter(&buf) - table.SetAutoWrapText(!file) - table.SetBorder(false) - table.SetNoWhiteSpace(true) - table.SetTablePadding("\t") - table.SetAlignment(tablewriter.ALIGN_LEFT) - - for _, v := range data { - table.Append(v) - } - - table.Render() - - renderedTable := buf.String() - lines := strings.Split(renderedTable, "\n") - for i, line := range lines { - lines[i] = "\t" + line - } - - return strings.Join(lines, "\n") -} - -func twoLines(s string) [][]string { - lines := strings.Split(s, "\n") - res := [][]string{} - - count := 0 - for _, line := range lines { - line = strings.TrimSpace(line) - if line != "" { - count++ - res = append(res, []string{line}) - if count == 2 { - return res - } - } - } - return res -} - -func formatParams(s string) string { - lines := strings.Split(s, "\n") - table := [][]string{} - - for _, line := range lines { - table = append(table, strings.Fields(line)) - } - return renderSubTable(table, false) + return nil } func CopyHandler(cmd *cobra.Command, args []string) error { diff --git a/cmd/cmd_test.go b/cmd/cmd_test.go new file mode 100644 index 00000000..0f8863cc --- /dev/null +++ b/cmd/cmd_test.go @@ -0,0 +1,206 @@ +package cmd + +import ( + "bytes" + "os" + "path/filepath" + "testing" + + "github.com/google/go-cmp/cmp" + + "github.com/ollama/ollama/api" +) + +func TestShowInfo(t *testing.T) { + t.Run("bare details", func(t *testing.T) { + var b bytes.Buffer + if err := showInfo(&api.ShowResponse{ + Details: api.ModelDetails{ + Family: "test", + ParameterSize: "7B", + QuantizationLevel: "FP16", + }, + }, &b); err != nil { + t.Fatal(err) + } + + expect := ` Model + architecture test + parameters 7B + quantization FP16 + +` + + if diff := cmp.Diff(expect, b.String()); diff != "" { + t.Errorf("unexpected output (-want +got):\n%s", diff) + } + }) + + t.Run("bare model info", func(t *testing.T) { + var b bytes.Buffer + if err := showInfo(&api.ShowResponse{ + ModelInfo: map[string]any{ + "general.architecture": "test", + "general.parameter_count": float64(7_000_000_000), + "test.context_length": float64(0), + "test.embedding_length": float64(0), + }, + Details: api.ModelDetails{ + Family: "test", + ParameterSize: "7B", + QuantizationLevel: "FP16", + }, + }, &b); err != nil { + t.Fatal(err) + } + + expect := ` Model + architecture test + parameters 7B + context length 0 + embedding length 0 + quantization FP16 + +` + if diff := cmp.Diff(expect, b.String()); diff != "" { + t.Errorf("unexpected output (-want +got):\n%s", diff) + } + }) + + t.Run("parameters", func(t *testing.T) { + var b bytes.Buffer + if err := showInfo(&api.ShowResponse{ + Details: api.ModelDetails{ + Family: "test", + ParameterSize: "7B", + QuantizationLevel: "FP16", + }, + Parameters: ` + stop never + stop gonna + stop give + stop you + stop up + temperature 99`, + }, &b); err != nil { + t.Fatal(err) + } + + expect := ` Model + architecture test + parameters 7B + 
quantization FP16 + + Parameters + stop never + stop gonna + stop give + stop you + stop up + temperature 99 + +` + if diff := cmp.Diff(expect, b.String()); diff != "" { + t.Errorf("unexpected output (-want +got):\n%s", diff) + } + }) + + t.Run("project info", func(t *testing.T) { + var b bytes.Buffer + if err := showInfo(&api.ShowResponse{ + Details: api.ModelDetails{ + Family: "test", + ParameterSize: "7B", + QuantizationLevel: "FP16", + }, + ProjectorInfo: map[string]any{ + "general.architecture": "clip", + "general.parameter_count": float64(133_700_000), + "clip.vision.embedding_length": float64(0), + "clip.vision.projection_dim": float64(0), + }, + }, &b); err != nil { + t.Fatal(err) + } + + expect := ` Model + architecture test + parameters 7B + quantization FP16 + + Projector + architecture clip + parameters 133.70M + embedding length 0 + dimensions 0 + +` + if diff := cmp.Diff(expect, b.String()); diff != "" { + t.Errorf("unexpected output (-want +got):\n%s", diff) + } + }) + + t.Run("system", func(t *testing.T) { + var b bytes.Buffer + if err := showInfo(&api.ShowResponse{ + Details: api.ModelDetails{ + Family: "test", + ParameterSize: "7B", + QuantizationLevel: "FP16", + }, + System: `You are a pirate! +Ahoy, matey! +Weigh anchor! + `, + }, &b); err != nil { + t.Fatal(err) + } + + expect := ` Model + architecture test + parameters 7B + quantization FP16 + + System + You are a pirate! + Ahoy, matey! + +` + if diff := cmp.Diff(expect, b.String()); diff != "" { + t.Errorf("unexpected output (-want +got):\n%s", diff) + } + }) + + t.Run("license", func(t *testing.T) { + var b bytes.Buffer + license, err := os.ReadFile(filepath.Join("..", "LICENSE")) + if err != nil { + t.Fatal(err) + } + + if err := showInfo(&api.ShowResponse{ + Details: api.ModelDetails{ + Family: "test", + ParameterSize: "7B", + QuantizationLevel: "FP16", + }, + License: string(license), + }, &b); err != nil { + t.Fatal(err) + } + + expect := ` Model + architecture test + parameters 7B + quantization FP16 + + License + MIT License + Copyright (c) Ollama + +` + if diff := cmp.Diff(expect, b.String()); diff != "" { + t.Errorf("unexpected output (-want +got):\n%s", diff) + } + }) +} diff --git a/cmd/interactive.go b/cmd/interactive.go index 4462cf29..9fe1ed4c 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -371,7 +371,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { switch args[1] { case "info": - showInfo(resp) + _ = showInfo(resp, os.Stderr) case "license": if resp.License == "" { fmt.Println("No license was specified for this model.") From abed273de3a6183d734f0f3f0f129d7bd08ac4b4 Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Wed, 11 Sep 2024 16:36:21 -0700 Subject: [PATCH 10/18] add "stop" command (#6739) --- cmd/cmd.go | 56 ++++++++++++++++++++++++++++++++++++++++++-- cmd/interactive.go | 23 +----------------- server/routes.go | 52 ++++++++++++++++++++++++++++++++++++++++ server/sched.go | 20 +++++++++++++++- server/sched_test.go | 46 ++++++++++++++++++++++++++++++++++++ 5 files changed, 172 insertions(+), 25 deletions(-) diff --git a/cmd/cmd.go b/cmd/cmd.go index 1fb721e7..3bb8b06e 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -346,6 +346,39 @@ func (w *progressWriter) Write(p []byte) (n int, err error) { return len(p), nil } +func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error { + p := progress.NewProgress(os.Stderr) + defer p.StopAndClear() + + spinner := progress.NewSpinner("") + p.Add("", spinner) + + client, err := api.ClientFromEnvironment() + if err 
!= nil { + return err + } + + req := &api.GenerateRequest{ + Model: opts.Model, + KeepAlive: opts.KeepAlive, + } + + return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil }) +} + +func StopHandler(cmd *cobra.Command, args []string) error { + opts := &runOptions{ + Model: args[0], + KeepAlive: &api.Duration{Duration: 0}, + } + if err := loadOrUnloadModel(cmd, opts); err != nil { + if strings.Contains(err.Error(), "not found") { + return fmt.Errorf("couldn't find model \"%s\" to stop", args[0]) + } + } + return nil +} + func RunHandler(cmd *cobra.Command, args []string) error { interactive := true @@ -424,7 +457,7 @@ func RunHandler(cmd *cobra.Command, args []string) error { opts.ParentModel = info.Details.ParentModel if interactive { - if err := loadModel(cmd, &opts); err != nil { + if err := loadOrUnloadModel(cmd, &opts); err != nil { return err } @@ -615,7 +648,15 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error { cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100) procStr = fmt.Sprintf("%d%%/%d%% CPU/GPU", int(cpuPercent), int(100-cpuPercent)) } - data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, format.HumanTime(m.ExpiresAt, "Never")}) + + var until string + delta := time.Since(m.ExpiresAt) + if delta > 0 { + until = "Stopping..." + } else { + until = format.HumanTime(m.ExpiresAt, "Never") + } + data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, until}) } } @@ -1294,6 +1335,15 @@ func NewCLI() *cobra.Command { runCmd.Flags().Bool("insecure", false, "Use an insecure registry") runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically") runCmd.Flags().String("format", "", "Response format (e.g. 
json)") + + stopCmd := &cobra.Command{ + Use: "stop MODEL", + Short: "Stop a running model", + Args: cobra.ExactArgs(1), + PreRunE: checkServerHeartbeat, + RunE: StopHandler, + } + serveCmd := &cobra.Command{ Use: "serve", Aliases: []string{"start"}, @@ -1361,6 +1411,7 @@ func NewCLI() *cobra.Command { createCmd, showCmd, runCmd, + stopCmd, pullCmd, pushCmd, listCmd, @@ -1400,6 +1451,7 @@ func NewCLI() *cobra.Command { createCmd, showCmd, runCmd, + stopCmd, pullCmd, pushCmd, listCmd, diff --git a/cmd/interactive.go b/cmd/interactive.go index 9fe1ed4c..94578f11 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -18,7 +18,6 @@ import ( "github.com/ollama/ollama/api" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/parser" - "github.com/ollama/ollama/progress" "github.com/ollama/ollama/readline" "github.com/ollama/ollama/types/errtypes" ) @@ -31,26 +30,6 @@ const ( MultilineSystem ) -func loadModel(cmd *cobra.Command, opts *runOptions) error { - p := progress.NewProgress(os.Stderr) - defer p.StopAndClear() - - spinner := progress.NewSpinner("") - p.Add("", spinner) - - client, err := api.ClientFromEnvironment() - if err != nil { - return err - } - - chatReq := &api.ChatRequest{ - Model: opts.Model, - KeepAlive: opts.KeepAlive, - } - - return client.Chat(cmd.Context(), chatReq, func(api.ChatResponse) error { return nil }) -} - func generateInteractive(cmd *cobra.Command, opts runOptions) error { usage := func() { fmt.Fprintln(os.Stderr, "Available Commands:") @@ -217,7 +196,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { opts.Model = args[1] opts.Messages = []api.Message{} fmt.Printf("Loading model '%s'\n", opts.Model) - if err := loadModel(cmd, &opts); err != nil { + if err := loadOrUnloadModel(cmd, &opts); err != nil { return err } continue diff --git a/server/routes.go b/server/routes.go index 5e9f51e1..f202973e 100644 --- a/server/routes.go +++ b/server/routes.go @@ -117,6 +117,32 @@ func (s *Server) GenerateHandler(c *gin.Context) { return } + // expire the runner + if req.Prompt == "" && req.KeepAlive != nil && int(req.KeepAlive.Seconds()) == 0 { + model, err := GetModel(req.Model) + if err != nil { + switch { + case os.IsNotExist(err): + c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)}) + case err.Error() == "invalid model name": + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + default: + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + } + return + } + s.sched.expireRunner(model) + + c.JSON(http.StatusOK, api.GenerateResponse{ + Model: req.Model, + CreatedAt: time.Now().UTC(), + Response: "", + Done: true, + DoneReason: "unload", + }) + return + } + if req.Format != "" && req.Format != "json" { c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be empty or \"json\""}) return @@ -1322,6 +1348,32 @@ func (s *Server) ChatHandler(c *gin.Context) { return } + // expire the runner + if len(req.Messages) == 0 && req.KeepAlive != nil && int(req.KeepAlive.Seconds()) == 0 { + model, err := GetModel(req.Model) + if err != nil { + switch { + case os.IsNotExist(err): + c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)}) + case err.Error() == "invalid model name": + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + default: + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + } + return + } + s.sched.expireRunner(model) + + c.JSON(http.StatusOK, api.ChatResponse{ + Model: req.Model, + 
CreatedAt: time.Now().UTC(), + Message: api.Message{Role: "assistant"}, + Done: true, + DoneReason: "unload", + }) + return + } + caps := []Capability{CapabilityCompletion} if len(req.Tools) > 0 { caps = append(caps, CapabilityTools) diff --git a/server/sched.go b/server/sched.go index 58071bf0..3c8656ad 100644 --- a/server/sched.go +++ b/server/sched.go @@ -360,7 +360,6 @@ func (s *Scheduler) processCompleted(ctx context.Context) { slog.Debug("runner expired event received", "modelPath", runner.modelPath) runner.refMu.Lock() if runner.refCount > 0 { - // Shouldn't happen, but safeguard to ensure no leaked runners slog.Debug("expired event with positive ref count, retrying", "modelPath", runner.modelPath, "refCount", runner.refCount) go func(runner *runnerRef) { // We can't unload yet, but want to as soon as the current request completes @@ -802,6 +801,25 @@ func (s *Scheduler) unloadAllRunners() { } } +func (s *Scheduler) expireRunner(model *Model) { + s.loadedMu.Lock() + defer s.loadedMu.Unlock() + runner, ok := s.loaded[model.ModelPath] + if ok { + runner.refMu.Lock() + runner.expiresAt = time.Now() + if runner.expireTimer != nil { + runner.expireTimer.Stop() + runner.expireTimer = nil + } + runner.sessionDuration = 0 + if runner.refCount <= 0 { + s.expiredCh <- runner + } + runner.refMu.Unlock() + } +} + // If other runners are loaded, make sure the pending request will fit in system memory // If not, pick a runner to unload, else return nil and the request can be loaded func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef { diff --git a/server/sched_test.go b/server/sched_test.go index fb049574..be32065a 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -406,6 +406,52 @@ func TestGetRunner(t *testing.T) { b.ctxDone() } +func TestExpireRunner(t *testing.T) { + ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond) + defer done() + s := InitScheduler(ctx) + req := &LlmRequest{ + ctx: ctx, + model: &Model{ModelPath: "foo"}, + opts: api.DefaultOptions(), + successCh: make(chan *runnerRef, 1), + errCh: make(chan error, 1), + sessionDuration: &api.Duration{Duration: 2 * time.Minute}, + } + + var ggml *llm.GGML + gpus := gpu.GpuInfoList{} + server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}} + s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { + return server, nil + } + s.load(req, ggml, gpus, 0) + + select { + case err := <-req.errCh: + if err != nil { + t.Fatalf("expected no errors when loading, got '%s'", err.Error()) + } + case resp := <-req.successCh: + s.loadedMu.Lock() + if resp.refCount != uint(1) || len(s.loaded) != 1 { + t.Fatalf("expected a model to be loaded") + } + s.loadedMu.Unlock() + } + + s.expireRunner(&Model{ModelPath: "foo"}) + + s.finishedReqCh <- req + s.processCompleted(ctx) + + s.loadedMu.Lock() + if len(s.loaded) != 0 { + t.Fatalf("expected model to be unloaded") + } + s.loadedMu.Unlock() +} + // TODO - add one scenario that triggers the bogus finished event with positive ref count func TestPrematureExpired(t *testing.T) { ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond) From 93ac3760cb4abdd0f54ad755ed15ec4254026282 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 11 Sep 2024 14:00:20 -0700 Subject: [PATCH 11/18] runner: Flush pending responses before returning If there are any pending 
responses (such as from potential stop tokens) then we should send them back
before ending the sequence. Otherwise, we can be missing tokens at the end of
a response.

Fixes #6707
---
 llm/ext_server/server.cpp | 60 +++++++++++++++++++++------------------
 1 file changed, 33 insertions(+), 27 deletions(-)

diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index fc673c47..6ce457ae 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -913,7 +913,9 @@ struct llama_server_context
         slot.sampled = result.tok;
 
         // search stop word and delete it
-        slot.generated_text += token_str;
+        if (!llama_token_is_eog(model, result.tok))
+            slot.generated_text += token_str;
+
         slot.has_next_token = true;
 
         if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
@@ -954,30 +956,36 @@ struct llama_server_context
         if (!incomplete)
         {
             size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            const std::string str_test = slot.generated_text.substr(pos);
-            bool is_stop_full = false;
-            size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
-            if (stop_pos != std::string::npos)
-            {
-                is_stop_full = true;
-                slot.generated_text.erase(
-                    slot.generated_text.begin() + pos + stop_pos,
-                    slot.generated_text.end());
-                pos = std::min(slot.n_sent_text, slot.generated_text.size());
-            }
-            else
-            {
-                is_stop_full = false;
-                stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
-            }
 
-            // check if there is any token to predict
-            if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
-            {
-                // no send the stop word in the response
-                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
-                slot.n_sent_text += result.text_to_send.size();
-                // add the token to slot queue and cache
+            if (!llama_token_is_eog(model, result.tok)) {
+                const std::string str_test = slot.generated_text.substr(pos);
+                bool is_stop_full = false;
+                size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
+                if (stop_pos != std::string::npos)
+                {
+                    is_stop_full = true;
+                    slot.generated_text.erase(
+                        slot.generated_text.begin() + pos + stop_pos,
+                        slot.generated_text.end());
+                    pos = std::min(slot.n_sent_text, slot.generated_text.size());
+                }
+                else
+                {
+                    is_stop_full = false;
+                    stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_PARTIAL, slot);
+                }
+
+                // check if there is any token to predict
+                if (stop_pos == std::string::npos || (!slot.has_next_token && !is_stop_full && stop_pos > 0))
+                {
+                    // no send the stop word in the response
+                    result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                    slot.n_sent_text += result.text_to_send.size();
+                    // add the token to slot queue and cache
+                }
+            } else {
+                result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
+                slot.n_sent_text += result.text_to_send.size();
             }
 
             if (slot.params.stream)
@@ -1117,9 +1125,7 @@ struct llama_server_context
             {"multimodal", multimodal}
         };
 
-        if (!llama_token_is_eog(model, tkn.tok)) {
-            res.result_json["content"] = tkn.text_to_send;
-        }
+        res.result_json["content"] = tkn.text_to_send;
 
         if (slot.sparams.n_probs > 0)
         {

From 5a00dc9fc9cb95936299106ce31c338355e3261e Mon Sep 17 00:00:00 2001
From: RAPID ARCHITECT <126218667+rapidarchitect@users.noreply.github.com>
Date: Wed, 11 Sep 2024 20:36:26 -0500
Subject: [PATCH 12/18] readme: add ollama_moe to community integrations
 (#6752)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md 
b/README.md index b76f08a6..ecbb7023 100644 --- a/README.md +++ b/README.md @@ -337,6 +337,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [podman-ollama](https://github.com/ericcurtin/podman-ollama) - [gollama](https://github.com/sammcj/gollama) - [Ollama eBook Summary](https://github.com/cognitivetech/ollama-ebook-summary/) +- [Ollama Mixture of Experts (MOE) in 50 lines of code](https://github.com/rapidarchitect/ollama_moe) ### Apple Vision Pro - [Enchanted](https://github.com/AugustDev/enchanted) From d066d9b8e0995bbc2107791068892b61b81789cf Mon Sep 17 00:00:00 2001 From: Adrian Cole <64215+codefromthecrypt@users.noreply.github.com> Date: Thu, 12 Sep 2024 09:37:37 +0800 Subject: [PATCH 13/18] examples: polish loganalyzer example (#6744) --- examples/python-loganalysis/Modelfile | 2 +- examples/python-loganalysis/readme.md | 2 ++ examples/python-loganalysis/requirements.txt | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/python-loganalysis/Modelfile b/examples/python-loganalysis/Modelfile index 5237cb6e..b28aa0c0 100644 --- a/examples/python-loganalysis/Modelfile +++ b/examples/python-loganalysis/Modelfile @@ -4,5 +4,5 @@ SYSTEM """ You are a log file analyzer. You will receive a set of lines from a log file for some software application, find the errors and other interesting aspects of the logs, and explain them so a new user can understand what they mean. If there are any steps they can do to resolve them, list the steps in your answer. """ -PARAMETER TEMPERATURE 0.3 +PARAMETER temperature 0.3 diff --git a/examples/python-loganalysis/readme.md b/examples/python-loganalysis/readme.md index 4be0baaa..03bab672 100644 --- a/examples/python-loganalysis/readme.md +++ b/examples/python-loganalysis/readme.md @@ -21,6 +21,8 @@ You can try this with the `logtest.logfile` file included in this directory. 2. Install the Python Requirements. 
```bash + python3 -m venv .venv + source .venv/bin/activate pip install -r requirements.txt ``` diff --git a/examples/python-loganalysis/requirements.txt b/examples/python-loganalysis/requirements.txt index 9688b8ec..e7cb17ef 100644 --- a/examples/python-loganalysis/requirements.txt +++ b/examples/python-loganalysis/requirements.txt @@ -1 +1 @@ -Requests==2.31.0 +Requests>=2.32.3 From fef257c5c50347943bb1e5e06ebb5e22fd9b69a0 Mon Sep 17 00:00:00 2001 From: dcasota <14890243+dcasota@users.noreply.github.com> Date: Thu, 12 Sep 2024 03:56:56 +0200 Subject: [PATCH 14/18] examples: updated requirements.txt for privategpt example --- examples/langchain-python-rag-privategpt/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/langchain-python-rag-privategpt/requirements.txt b/examples/langchain-python-rag-privategpt/requirements.txt index 0aad1fe5..4f2cee25 100644 --- a/examples/langchain-python-rag-privategpt/requirements.txt +++ b/examples/langchain-python-rag-privategpt/requirements.txt @@ -1,6 +1,6 @@ langchain==0.0.274 gpt4all==1.0.8 -chromadb==0.4.7 +chromadb==0.5.0 llama-cpp-python==0.1.81 urllib3==2.0.4 PyMuPDF==1.23.5 @@ -12,4 +12,4 @@ pandoc==2.3 pypandoc==1.11 tqdm==4.66.1 sentence_transformers==2.2.2 -numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability \ No newline at end of file +numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability From cd5c8f6471abf32965289f0226016a78f0c5c938 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 12 Sep 2024 12:10:30 -0700 Subject: [PATCH 15/18] Optimize container images for startup (#6547) * Optimize container images for startup This change adjusts how to handle runner payloads to support container builds where we keep them extracted in the filesystem. This makes it easier to optimize the cpu/cuda vs cpu/rocm images for size, and should result in faster startup times for container images. 
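For context, the buildx-based flow this change converges on can be exercised locally with something along these lines (a rough sketch only; the image name, tag, platforms, and GOFLAGS value here are illustrative assumptions, not taken from the helper scripts):

```bash
# Illustrative multi-arch build with docker buildx (names and tags are placeholders).
docker buildx create --name ollama-builder --use
docker buildx build \
  --platform linux/amd64,linux/arm64 \
  --build-arg GOFLAGS="-ldflags=-w -s" \
  -t ollama/ollama:latest \
  --push .
```

The release workflow below instead pushes each platform image by digest and assembles the manifest list afterwards with `docker buildx imagetools create`, which keeps the per-architecture builds independent.
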
* Refactor payload logic and add buildx support for faster builds * Move payloads around * Review comments * Converge to buildx based helper scripts * Use docker buildx action for release --- .dockerignore | 2 + .github/workflows/release.yaml | 209 +++++++++-- .github/workflows/test.yaml | 43 +-- .gitignore | 3 + Dockerfile | 103 ++++-- build/darwin/amd64/placeholder | 1 + build/darwin/arm64/placeholder | 1 + build/embed_darwin_amd64.go | 8 + build/embed_darwin_arm64.go | 8 + build/embed_linux.go | 6 + build/embed_unused.go | 8 + build/linux/amd64/placeholder | 1 + build/linux/arm64/placeholder | 1 + envconfig/config.go | 48 --- gpu/assets.go | 148 -------- gpu/gpu.go | 7 +- llm/generate/gen_common.sh | 32 +- llm/generate/gen_darwin.sh | 12 +- llm/generate/gen_linux.sh | 35 +- llm/{llm_darwin_arm64.go => llm_darwin.go} | 4 - llm/llm_darwin_amd64.go | 11 - llm/llm_linux.go | 4 - llm/llm_windows.go | 4 - llm/payload.go | 233 ------------- llm/server.go | 29 +- runners/common.go | 384 +++++++++++++++++++++ runners/runners_test.go | 50 +++ scripts/build_darwin.sh | 3 +- scripts/build_docker.sh | 82 ++--- scripts/build_linux.sh | 48 ++- scripts/env.sh | 14 + server/routes.go | 8 +- 32 files changed, 861 insertions(+), 689 deletions(-) create mode 100644 build/darwin/amd64/placeholder create mode 100644 build/darwin/arm64/placeholder create mode 100644 build/embed_darwin_amd64.go create mode 100644 build/embed_darwin_arm64.go create mode 100644 build/embed_linux.go create mode 100644 build/embed_unused.go create mode 100644 build/linux/amd64/placeholder create mode 100644 build/linux/arm64/placeholder delete mode 100644 gpu/assets.go rename llm/{llm_darwin_arm64.go => llm_darwin.go} (55%) delete mode 100644 llm/llm_darwin_amd64.go delete mode 100644 llm/payload.go create mode 100644 runners/common.go create mode 100644 runners/runners_test.go create mode 100644 scripts/env.sh diff --git a/.dockerignore b/.dockerignore index 43f2e07d..fada7a9b 100644 --- a/.dockerignore +++ b/.dockerignore @@ -7,3 +7,5 @@ llm/llama.cpp .env .cache test_data +llm/build +llama/build diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 9c1e3e13..02b5f8e6 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -102,8 +102,8 @@ jobs: with: name: generate-windows-cpu path: | - llm/build/**/bin/* - llm/build/**/*.a + build/**/* + build/**/*.a dist/windows-amd64/** # ROCm generation step @@ -176,7 +176,7 @@ jobs: with: name: generate-windows-rocm path: | - llm/build/**/bin/* + build/**/* dist/windows-amd64/** - uses: actions/upload-artifact@v4 with: @@ -265,7 +265,7 @@ jobs: with: name: generate-windows-cuda-${{ matrix.cuda.version }} path: | - llm/build/**/bin/* + build/**/* dist/windows-amd64/** - uses: actions/upload-artifact@v4 with: @@ -338,7 +338,7 @@ jobs: - uses: actions/download-artifact@v4 with: name: generate-windows-rocm - - run: dir llm/build + - run: dir build - run: | $gopath=(get-command go).source | split-path -parent & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" @@ -359,9 +359,7 @@ jobs: environment: release runs-on: linux env: - OLLAMA_SKIP_MANIFEST_CREATE: '1' BUILD_ARCH: amd64 - PUSH: '1' steps: - uses: actions/checkout@v4 with: @@ -369,14 +367,8 @@ jobs: - name: Set Version shell: bash run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ vars.DOCKER_USER }} - password: ${{ secrets.DOCKER_ACCESS_TOKEN 
}} - run: | ./scripts/build_linux.sh - ./scripts/build_docker.sh - uses: actions/upload-artifact@v4 with: name: dist-linux-amd64 @@ -390,9 +382,7 @@ jobs: environment: release runs-on: linux-arm64 env: - OLLAMA_SKIP_MANIFEST_CREATE: '1' BUILD_ARCH: arm64 - PUSH: '1' steps: - uses: actions/checkout@v4 with: @@ -421,14 +411,8 @@ jobs: sudo usermod -aG docker $USER sudo apt-get install acl sudo setfacl --modify user:$USER:rw /var/run/docker.sock - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ vars.DOCKER_USER }} - password: ${{ secrets.DOCKER_ACCESS_TOKEN }} - run: | ./scripts/build_linux.sh - ./scripts/build_docker.sh - uses: actions/upload-artifact@v4 with: name: dist-linux-arm64 @@ -436,6 +420,181 @@ jobs: dist/*linux* !dist/*-cov + # Container image build + build-linux: + environment: release + strategy: + matrix: + runner: + - linux + - linux-arm64 + runs-on: ${{ matrix.runner }} + env: + FINAL_IMAGE_REPO: ollama/ollama + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - name: 'Install Docker' + if: ${{ startsWith(matrix.runner, 'linux-arm64') }} + run: | + sudo apt-get update + sudo apt-get install -y ca-certificates curl + sudo install -m 0755 -d /etc/apt/keyrings + sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc + sudo chmod a+r /etc/apt/keyrings/docker.asc + echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \ + $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ + sudo tee /etc/apt/sources.list.d/docker.list > /dev/null + sudo apt-get update + sudo apt-get install -y docker-ce docker-ce-cli containerd.io + sudo usermod -aG docker $USER + sudo apt-get install acl + sudo setfacl --modify user:$USER:rw /var/run/docker.sock + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.FINAL_IMAGE_REPO }} + flavor: | + latest=false + tags: | + type=ref,event=tag + type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr + type=semver,pattern={{version}} + - name: Set Version + shell: bash + run: | + machine=$(uname -m) + case ${machine} in + x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;; + aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;; + esac >>$GITHUB_ENV + echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ vars.DOCKER_USER }} + password: ${{ secrets.DOCKER_ACCESS_TOKEN }} + - name: Build and push by digest + id: build + uses: docker/build-push-action@v6 + with: + context: "." 
+ platforms: linux/${{ env.ARCH }} + build-args: | + GOFLAGS + outputs: type=image,name=${{ env.FINAL_IMAGE_REPO }},push-by-digest=true,name-canonical=true,push=true + - name: Export digest + run: | + mkdir -p /tmp/digests + digest="${{ steps.build.outputs.digest }}" + touch "/tmp/digests/${digest#sha256:}" + - name: Upload digest + uses: actions/upload-artifact@v4 + with: + name: digests-${{ env.PLATFORM_PAIR }} + path: /tmp/digests/* + if-no-files-found: error + retention-days: 1 + merge: + environment: release + runs-on: linux + needs: + - build-linux + env: + FINAL_IMAGE_REPO: ollama/ollama + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - name: Download digests + uses: actions/download-artifact@v4 + with: + path: /tmp/digests + pattern: digests-* + merge-multiple: true + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.FINAL_IMAGE_REPO }} + flavor: | + latest=false + tags: | + type=ref,event=tag + type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr + type=semver,pattern={{version}} + - name: Set Version + shell: bash + run: | + machine=$(uname -m) + case ${machine} in + x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;; + aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;; + esac >>$GITHUB_ENV + echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ vars.DOCKER_USER }} + password: ${{ secrets.DOCKER_ACCESS_TOKEN }} + - name: Create manifest list and push + working-directory: /tmp/digests + run: | + docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ + $(printf '${{ env.FINAL_IMAGE_REPO }}@sha256:%s ' *) + - name: Inspect image + run: | + docker buildx imagetools inspect ${{ env.FINAL_IMAGE_REPO }}:${{ steps.meta.outputs.version }} + build-linux-rocm: + environment: release + runs-on: linux + env: + FINAL_IMAGE_REPO: ollama/ollama + ARCH: amd64 + PLATFORM_PAIR: linux-amd64 + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.FINAL_IMAGE_REPO }} + flavor: | + latest=false + tags: | + type=ref,event=tag + type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr + type=semver,pattern={{version}} + - name: Set Version + shell: bash + run: | + echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ vars.DOCKER_USER }} + password: ${{ secrets.DOCKER_ACCESS_TOKEN }} + - name: Build and push by digest + id: build + uses: docker/build-push-action@v6 + with: + context: "." 
+ target: runtime-rocm + build-args: | + GOFLAGS + tags: ${{ env.FINAL_IMAGE_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION}}-rocm,${{ env.FINAL_IMAGE_REPO }}:rocm + push: true + # Aggregate all the assets and ship a release release: needs: @@ -448,8 +607,6 @@ jobs: permissions: contents: write env: - OLLAMA_SKIP_IMAGE_BUILD: '1' - PUSH: '1' GH_TOKEN: ${{ github.token }} steps: - uses: actions/checkout@v4 @@ -458,12 +615,6 @@ jobs: run: | echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ vars.DOCKER_USER }} - password: ${{ secrets.DOCKER_ACCESS_TOKEN }} - - run: ./scripts/build_docker.sh - name: Retrieve built artifact uses: actions/download-artifact@v4 with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3d58fa3e..26dc732a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -81,12 +81,6 @@ jobs: if: ${{ ! startsWith(matrix.os, 'windows-') }} name: 'Unix Go Generate' - run: go build . - - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.os }}-${{ matrix.arch }}-libraries - path: | - llm/build/**/bin/* - llm/build/**/*.a generate-cuda: needs: [changes] if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }} @@ -114,12 +108,6 @@ jobs: go generate -x ./... env: OLLAMA_SKIP_CPU_GENERATE: '1' - - uses: actions/upload-artifact@v4 - with: - name: cuda-${{ matrix.cuda-version }}-libraries - path: | - llm/build/**/bin/* - dist/windows-amd64/** generate-rocm: needs: [changes] if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }} @@ -147,12 +135,6 @@ jobs: go generate -x ./... env: OLLAMA_SKIP_CPU_GENERATE: '1' - - uses: actions/upload-artifact@v4 - with: - name: rocm-${{ matrix.rocm-version }}-libraries - path: | - llm/build/**/bin/* - dist/windows-amd64/** # ROCm generation step generate-windows-rocm: @@ -189,7 +171,6 @@ jobs: name: go generate env: OLLAMA_SKIP_CPU_GENERATE: '1' - # TODO - do we need any artifacts? # CUDA generation step generate-windows-cuda: @@ -231,7 +212,6 @@ jobs: go generate -x ./... env: OLLAMA_SKIP_CPU_GENERATE: '1' - # TODO - do we need any artifacts? lint: strategy: @@ -263,14 +243,6 @@ jobs: arm64) echo ARCH=arm64 ;; esac >>$GITHUB_ENV shell: bash - - run: | - mkdir -p llm/build/linux/$ARCH/stub/bin - touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server - if: ${{ startsWith(matrix.os, 'ubuntu-') }} - - run: | - mkdir -p llm/build/darwin/$ARCH/stub/bin - touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server - if: ${{ startsWith(matrix.os, 'macos-') }} - uses: golangci/golangci-lint-action@v6 with: args: --timeout 8m0s -v @@ -301,23 +273,10 @@ jobs: cache: true - run: | case ${{ matrix.arch }} in - amd64) echo ARCH=x86_64 ;; + amd64) echo ARCH=amd64 ;; arm64) echo ARCH=arm64 ;; esac >>$GITHUB_ENV shell: bash - - run: | - mkdir -p llm/build/linux/$ARCH/stub/bin - touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server - if: ${{ startsWith(matrix.os, 'ubuntu-') }} - - run: | - mkdir -p llm/build/darwin/$ARCH/stub/bin - touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server - if: ${{ startsWith(matrix.os, 'macos-') }} - shell: bash - run: go generate ./... - run: go build - run: go test -v ./... 
- - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.os }}-binaries - path: ollama diff --git a/.gitignore b/.gitignore index 0d826ab6..87f8b007 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,7 @@ ggml-metal.metal test_data *.crt llm/build +build/*/*/* +!build/**/placeholder +llama/build __debug_bin* \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 655f1081..0f43e618 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,7 +47,7 @@ RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ bash gen_linux.sh -FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64 +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-runner-arm64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -63,7 +63,7 @@ RUN OLLAMA_SKIP_STATIC_GENERATE=1 \ CUDA_VARIANT="_v11" \ bash gen_linux.sh -FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-server-arm64 +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-runner-arm64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -143,64 +143,103 @@ RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh -# Intermediate stage used for ./scripts/build_linux.sh +# Intermediate stages used for ./scripts/build_linux.sh FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64 ENV CGO_ENABLED=1 WORKDIR /go/src/github.com/ollama/ollama COPY . . -COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/ +COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/build/ build/ +COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/build/ build/ COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/build/ build/ COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/build/ build/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/build/ build/ ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ go build -trimpath -o dist/linux-amd64/bin/ollama . +RUN cd dist/linux-$GOARCH && \ + tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz +RUN cd dist/linux-$GOARCH-rocm && \ + tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz -# Intermediate stage used for ./scripts/build_linux.sh FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64 ENV CGO_ENABLED=1 ARG GOLANG_VERSION WORKDIR /go/src/github.com/ollama/ollama COPY . . 
-COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/ +COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/ +COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/ +ARG GOFLAGS +ARG CGO_CFLAGS +RUN --mount=type=cache,target=/root/.ccache \ + go build -trimpath -o dist/linux-arm64/bin/ollama . +RUN cd dist/linux-$GOARCH && \ + tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz + +FROM --platform=linux/amd64 scratch AS dist-amd64 +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz / +FROM --platform=linux/arm64 scratch AS dist-arm64 +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz / +FROM dist-$TARGETARCH as dist + + +# Optimized container images do not cary nested payloads +FROM --platform=linux/amd64 static-build-amd64 AS container-build-amd64 +WORKDIR /go/src/github.com/ollama/ollama +COPY . . +ARG GOFLAGS +ARG CGO_CFLAGS +RUN --mount=type=cache,target=/root/.ccache \ + go build -trimpath -o dist/linux-amd64/bin/ollama . + +FROM --platform=linux/arm64 static-build-arm64 AS container-build-arm64 +WORKDIR /go/src/github.com/ollama/ollama +COPY . . ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ go build -trimpath -o dist/linux-arm64/bin/ollama . 
-# Strip out ROCm dependencies to keep the primary image lean -FROM --platform=linux/amd64 ubuntu:22.04 AS amd64-libs-without-rocm -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/ -RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa* - -# Runtime stages FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64 -COPY --from=amd64-libs-without-rocm /scratch/ /lib/ -RUN apt-get update && apt-get install -y ca-certificates && \ +RUN apt-get update && \ + apt-get install -y ca-certificates && \ apt-get clean && rm -rf /var/lib/apt/lists/* -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ +COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ +COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ +COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ +COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ +COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ +COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64 -COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ -RUN apt-get update && apt-get install -y ca-certificates && \ +RUN apt-get update && \ + apt-get install -y ca-certificates && \ apt-get clean && rm -rf /var/lib/apt/lists/* -COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/ +COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/ +COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ +COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ +COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ -# Radeon images are much larger so we keep it distinct from the CPU/CUDA image -FROM rocm/dev-centos-7:${ROCM_VERSION}-complete AS runtime-rocm -RUN update-pciids -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ -RUN ln -s /opt/rocm/lib /lib/ollama +# ROCm libraries larger so we keep it distinct from the CPU/CUDA image +FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm +# Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer +# across releases +COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/ +RUN apt-get update && \ + apt-get install -y ca-certificates && \ + apt-get clean && rm -rf /var/lib/apt/lists/* +COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ +COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ +COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ +COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ +COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/ EXPOSE 11434 ENV OLLAMA_HOST=0.0.0.0 diff --git a/build/darwin/amd64/placeholder b/build/darwin/amd64/placeholder new file mode 100644 index 00000000..87dc2738 --- /dev/null +++ b/build/darwin/amd64/placeholder @@ -0,0 +1 @@ +This is here to make sure the build/ directory exists for 
the go:embed command diff --git a/build/darwin/arm64/placeholder b/build/darwin/arm64/placeholder new file mode 100644 index 00000000..87dc2738 --- /dev/null +++ b/build/darwin/arm64/placeholder @@ -0,0 +1 @@ +This is here to make sure the build/ directory exists for the go:embed command diff --git a/build/embed_darwin_amd64.go b/build/embed_darwin_amd64.go new file mode 100644 index 00000000..af1458ea --- /dev/null +++ b/build/embed_darwin_amd64.go @@ -0,0 +1,8 @@ +package build + +import "embed" + +// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling + +//go:embed darwin/amd64/* +var EmbedFS embed.FS diff --git a/build/embed_darwin_arm64.go b/build/embed_darwin_arm64.go new file mode 100644 index 00000000..d885365d --- /dev/null +++ b/build/embed_darwin_arm64.go @@ -0,0 +1,8 @@ +package build + +import "embed" + +// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling + +//go:embed darwin/arm64/* +var EmbedFS embed.FS diff --git a/build/embed_linux.go b/build/embed_linux.go new file mode 100644 index 00000000..4cf7be4c --- /dev/null +++ b/build/embed_linux.go @@ -0,0 +1,6 @@ +package build + +import "embed" + +//go:embed linux/* +var EmbedFS embed.FS diff --git a/build/embed_unused.go b/build/embed_unused.go new file mode 100644 index 00000000..00fbe02e --- /dev/null +++ b/build/embed_unused.go @@ -0,0 +1,8 @@ +//go:build !linux && !darwin + +package build + +import "embed" + +// unused on windows +var EmbedFS embed.FS diff --git a/build/linux/amd64/placeholder b/build/linux/amd64/placeholder new file mode 100644 index 00000000..87dc2738 --- /dev/null +++ b/build/linux/amd64/placeholder @@ -0,0 +1 @@ +This is here to make sure the build/ directory exists for the go:embed command diff --git a/build/linux/arm64/placeholder b/build/linux/arm64/placeholder new file mode 100644 index 00000000..87dc2738 --- /dev/null +++ b/build/linux/arm64/placeholder @@ -0,0 +1 @@ +This is here to make sure the build/ directory exists for the go:embed command diff --git a/envconfig/config.go b/envconfig/config.go index 2c4393fe..9c1490a9 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -179,53 +179,6 @@ var ( HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION") ) -func RunnersDir() (p string) { - if p := Var("OLLAMA_RUNNERS_DIR"); p != "" { - return p - } - - if runtime.GOOS != "windows" { - return - } - - defer func() { - if p == "" { - slog.Error("unable to locate llm runner directory. 
Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'") - } - }() - - // On Windows we do not carry the payloads inside the main executable - exe, err := os.Executable() - if err != nil { - return - } - - cwd, err := os.Getwd() - if err != nil { - return - } - - var paths []string - for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), LibRelativeToExe()), cwd} { - paths = append(paths, - root, - filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH), - filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH), - ) - } - - // Try a few variations to improve developer experience when building from source in the local tree - for _, path := range paths { - candidate := filepath.Join(path, "lib", "ollama", "runners") - if _, err := os.Stat(candidate); err == nil { - p = candidate - break - } - } - - return p -} - func Uint(key string, defaultValue uint) func() uint { return func() uint { if s := Var(key); s != "" { @@ -290,7 +243,6 @@ func AsMap() map[string]EnvVar { "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"}, "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"}, "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"}, - "OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"}, "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"}, "OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"}, diff --git a/gpu/assets.go b/gpu/assets.go deleted file mode 100644 index 6d62d0dc..00000000 --- a/gpu/assets.go +++ /dev/null @@ -1,148 +0,0 @@ -package gpu - -import ( - "errors" - "fmt" - "log/slog" - "os" - "path/filepath" - "runtime" - "strconv" - "strings" - "sync" - "syscall" - "time" - - "github.com/ollama/ollama/envconfig" -) - -var ( - lock sync.Mutex - payloadsDir = "" -) - -func PayloadsDir() (string, error) { - lock.Lock() - defer lock.Unlock() - var err error - if payloadsDir == "" { - runnersDir := envconfig.RunnersDir() - - if runnersDir != "" { - payloadsDir = runnersDir - return payloadsDir, nil - } - - // The remainder only applies on non-windows where we still carry payloads in the main executable - cleanupTmpDirs() - tmpDir := envconfig.TmpDir() - if tmpDir == "" { - tmpDir, err = os.MkdirTemp("", "ollama") - if err != nil { - return "", fmt.Errorf("failed to generate tmp dir: %w", err) - } - } else { - err = os.MkdirAll(tmpDir, 0o755) - if err != nil { - return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err) - } - } - - // Track our pid so we can clean up orphaned tmpdirs - n := filepath.Join(tmpDir, "ollama.pid") - if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil { - return "", fmt.Errorf("failed to write pid file %s: %w", n, err) - } - - // We create a distinct subdirectory for payloads within the tmpdir - // This will typically look like /tmp/ollama3208993108/runners on linux - payloadsDir = filepath.Join(tmpDir, "runners") - } - return payloadsDir, nil -} - -// Best effort to clean up prior tmpdirs -func cleanupTmpDirs() { - matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid")) - if err != nil { - return - } - - for _, match := range matches { - raw, err := os.ReadFile(match) - if errors.Is(err, os.ErrNotExist) { - slog.Debug("not a ollama runtime directory, skipping", "path", match) - continue - } else if err != nil { - slog.Warn("could not read 
ollama.pid, skipping", "path", match, "error", err) - continue - } - - pid, err := strconv.Atoi(string(raw)) - if err != nil { - slog.Warn("invalid pid, skipping", "path", match, "error", err) - continue - } - - p, err := os.FindProcess(pid) - if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) { - slog.Warn("process still running, skipping", "pid", pid, "path", match) - continue - } - - if err := os.Remove(match); err != nil { - slog.Warn("could not cleanup stale pidfile", "path", match, "error", err) - } - - runners := filepath.Join(filepath.Dir(match), "runners") - if err := os.RemoveAll(runners); err != nil { - slog.Warn("could not cleanup stale runners", "path", runners, "error", err) - } - - if err := os.Remove(filepath.Dir(match)); err != nil { - slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err) - } - } -} - -func Cleanup() { - lock.Lock() - defer lock.Unlock() - runnersDir := envconfig.RunnersDir() - if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" { - // We want to fully clean up the tmpdir parent of the payloads dir - tmpDir := filepath.Clean(filepath.Join(payloadsDir, "..")) - slog.Debug("cleaning up", "dir", tmpDir) - err := os.RemoveAll(tmpDir) - if err != nil { - // On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove - time.Sleep(1000 * time.Millisecond) - err = os.RemoveAll(tmpDir) - if err != nil { - slog.Warn("failed to clean up", "dir", tmpDir, "err", err) - } - } - } -} - -func UpdatePath(dir string) { - if runtime.GOOS == "windows" { - tmpDir := filepath.Dir(dir) - pathComponents := strings.Split(os.Getenv("PATH"), ";") - i := 0 - for _, comp := range pathComponents { - if strings.EqualFold(comp, dir) { - return - } - // Remove any other prior paths to our temp dir - if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) { - pathComponents[i] = comp - i++ - } - } - newPath := strings.Join(append([]string{dir}, pathComponents...), ";") - slog.Info("updating", "PATH", newPath) - os.Setenv("PATH", newPath) - } - // linux and darwin rely on rpath -} diff --git a/gpu/gpu.go b/gpu/gpu.go index 3de93f7f..1fa941dd 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -93,10 +93,9 @@ func initCudaHandles() *cudaHandles { localAppData := os.Getenv("LOCALAPPDATA") cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)} } - tmpDir, _ := PayloadsDir() - if tmpDir != "" { - // TODO - add "payloads" for subprocess - cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)} + libDir := LibraryDir() + if libDir != "" { + cudartMgmtPatterns = []string{filepath.Join(libDir, CudartMgmtName)} } cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...) 
diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index cef68ea1..9fe47529 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -31,6 +31,7 @@ init_vars() { NO_WHOLE_ARCHIVE="" GCC_ARCH="-arch ${ARCH}" DIST_BASE=../../dist/darwin-${GOARCH}/ + PAYLOAD_BASE=../../build/darwin/${GOARCH} ;; "Linux") LIB_EXT="so" @@ -40,6 +41,7 @@ init_vars() { # Cross compiling not supported on linux - Use docker GCC_ARCH="" DIST_BASE=../../dist/linux-${GOARCH}/ + PAYLOAD_BASE=../../build/linux/${GOARCH} ;; *) ;; @@ -47,7 +49,8 @@ init_vars() { if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80" fi - GZIP=$(which pigz 2>/dev/null || echo "gzip") + GZIP=$(command -v pigz 2>/dev/null || echo "gzip") + RUNNER_BASE="${DIST_BASE}/lib/ollama/runners" } git_module_setup() { @@ -91,17 +94,34 @@ build() { rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal } -compress() { - echo "Compressing payloads to reduce overall binary size..." - rm -rf ${BUILD_DIR}/bin/*.gz +dist() { + [ -z "${RUNNER}" ] && exit 1 + mkdir -p ${RUNNER_BASE}/${RUNNER}/ for f in ${BUILD_DIR}/bin/* ; do - ${GZIP} -n --best -f ${f} & + cp ${f} ${RUNNER_BASE}/${RUNNER}/ + done + # check for lib directory + if [ -d ${BUILD_DIR}/lib ]; then + for f in ${BUILD_DIR}/lib/* ; do + cp ${f} ${RUNNER_BASE}/${RUNNER}/ + done + fi +} + +# Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir +compress() { + [ -z "${RUNNER}" ] && exit 1 + echo "Compressing payloads with ${GZIP} to reduce overall binary size..." + rm -rf "${PAYLOAD_BASE}/${RUNNER}/" + mkdir -p "${PAYLOAD_BASE}/${RUNNER}/" + for f in ${BUILD_DIR}/bin/* ; do + ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" & compress_pids+=" $!" done # check for lib directory if [ -d ${BUILD_DIR}/lib ]; then for f in ${BUILD_DIR}/lib/* ; do - ${GZIP} -n --best -f ${f} & + ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" & compress_pids+=" $!" 
done fi diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index acea9c8d..49c67125 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -39,7 +39,8 @@ case "${GOARCH}" in # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="../build/darwin/${ARCH}/cpu" + RUNNER=cpu + BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}" echo "Building LCD CPU" build sign ${BUILD_DIR}/bin/ollama_llama_server @@ -51,7 +52,8 @@ case "${GOARCH}" in # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="../build/darwin/${ARCH}/cpu_avx" + RUNNER=cpu_avx + BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}" echo "Building AVX CPU" build sign ${BUILD_DIR}/bin/ollama_llama_server @@ -63,7 +65,8 @@ case "${GOARCH}" in # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}" - BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2" + RUNNER=cpu_avx2 + BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}" echo "Building AVX2 CPU" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation" build @@ -84,7 +87,8 @@ case "${GOARCH}" in if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then init_vars CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}" - BUILD_DIR="../build/darwin/${ARCH}/metal" + RUNNER="metal" + BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders" build sign ${BUILD_DIR}/bin/ollama_llama_server diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 1f702ca2..eb7fa786 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -79,10 +79,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then init_vars echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" - BUILD_DIR="../build/linux/${ARCH}/cpu" + RUNNER="cpu" + BUILD_DIR="../build/linux/${ARCH}/${RUNNER}" echo "Building custom CPU" build install + dist compress else # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512 @@ -102,10 +104,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="../build/linux/${ARCH}/cpu" + RUNNER=cpu + BUILD_DIR="../build/linux/${ARCH}/${RUNNER}" echo "Building LCD CPU" build install + dist compress fi @@ -120,10 +124,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" - BUILD_DIR="../build/linux/${ARCH}/cpu_avx" + RUNNER=cpu_avx + BUILD_DIR="../build/linux/${ARCH}/${RUNNER}" echo "Building AVX CPU" build install + dist compress fi @@ -134,10 +140,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}" - BUILD_DIR="../build/linux/${ARCH}/cpu_avx2" + RUNNER=cpu_avx2 + 
BUILD_DIR="../build/linux/${ARCH}/${RUNNER}" echo "Building AVX2 CPU" build install + dist compress fi fi @@ -187,11 +195,13 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then fi export CUDAFLAGS="-t8" CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" - BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" + RUNNER=cuda${CUDA_VARIANT} + BUILD_DIR="../build/linux/${ARCH}/${RUNNER}" export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}" build install + dist echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}" mkdir -p "${CUDA_DIST_DIR}" for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do @@ -212,7 +222,8 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI CC=icx CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF" - BUILD_DIR="../build/linux/${ARCH}/oneapi" + RUNNER=oneapi + BUILD_DIR="../build/linux/${ARCH}/${RUNNER}" ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama" export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it @@ -231,6 +242,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}" cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}" install + dist compress fi @@ -259,7 +271,8 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}" echo "Building custom ROCM GPU" fi - BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" + RUNNER=rocm${ROCM_VARIANT} + BUILD_DIR="../build/linux/${ARCH}/${RUNNER}" # ROCm dependencies are too large to fit into a unified bundle ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama" # TODO figure out how to disable runpath (rpath) @@ -269,13 +282,17 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then # copy the ROCM dependencies mkdir -p "${ROCM_DIST_DIR}" - for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do + for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do cp -a "${dep}"* "${ROCM_DIST_DIR}" + if [ $(readlink -f "${dep}") != "${dep}" ] ; then + cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}" + fi done install + dist compress fi cleanup wait_for_compress -echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)" +echo "go generate completed. 
LLM runners: $(cd ${PAYLOAD_BASE}; echo *)" diff --git a/llm/llm_darwin_arm64.go b/llm/llm_darwin.go similarity index 55% rename from llm/llm_darwin_arm64.go rename to llm/llm_darwin.go index 20ce8552..60837ed0 100644 --- a/llm/llm_darwin_arm64.go +++ b/llm/llm_darwin.go @@ -1,11 +1,7 @@ package llm import ( - "embed" "syscall" ) -//go:embed build/darwin/arm64/*/bin/* -var libEmbed embed.FS - var LlamaServerSysProcAttr = &syscall.SysProcAttr{} diff --git a/llm/llm_darwin_amd64.go b/llm/llm_darwin_amd64.go deleted file mode 100644 index 60eed719..00000000 --- a/llm/llm_darwin_amd64.go +++ /dev/null @@ -1,11 +0,0 @@ -package llm - -import ( - "embed" - "syscall" -) - -//go:embed build/darwin/x86_64/*/bin/* -var libEmbed embed.FS - -var LlamaServerSysProcAttr = &syscall.SysProcAttr{} diff --git a/llm/llm_linux.go b/llm/llm_linux.go index 928b4e79..60837ed0 100644 --- a/llm/llm_linux.go +++ b/llm/llm_linux.go @@ -1,11 +1,7 @@ package llm import ( - "embed" "syscall" ) -//go:embed build/linux/*/*/bin/* -var libEmbed embed.FS - var LlamaServerSysProcAttr = &syscall.SysProcAttr{} diff --git a/llm/llm_windows.go b/llm/llm_windows.go index 763cccf9..74a735c2 100644 --- a/llm/llm_windows.go +++ b/llm/llm_windows.go @@ -1,13 +1,9 @@ package llm import ( - "embed" "syscall" ) -// unused on windows -var libEmbed embed.FS - const CREATE_DEFAULT_ERROR_MODE = 0x04000000 var LlamaServerSysProcAttr = &syscall.SysProcAttr{ diff --git a/llm/payload.go b/llm/payload.go deleted file mode 100644 index 963b3295..00000000 --- a/llm/payload.go +++ /dev/null @@ -1,233 +0,0 @@ -package llm - -import ( - "compress/gzip" - "errors" - "fmt" - "io" - "io/fs" - "log/slog" - "os" - "path/filepath" - "runtime" - "slices" - "strings" - - "golang.org/x/sync/errgroup" - - "github.com/ollama/ollama/gpu" -) - -var errPayloadMissing = errors.New("expected payloads not included in this build of ollama") - -func Init() error { - payloadsDir, err := gpu.PayloadsDir() - if err != nil { - return err - } - - if runtime.GOOS != "windows" { - slog.Info("extracting embedded files", "dir", payloadsDir) - binGlob := "build/*/*/*/bin/*" - - // extract server libraries - err = extractFiles(payloadsDir, binGlob) - if err != nil { - return fmt.Errorf("extract binaries: %v", err) - } - } - - var variants []string - for v := range getAvailableServers() { - variants = append(variants, v) - } - slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants)) - slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY") - - return nil -} - -// binary names may contain an optional variant separated by '_' -// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2" -// Any library without a variant is the lowest common denominator -func getAvailableServers() map[string]string { - payloadsDir, err := gpu.PayloadsDir() - if err != nil { - slog.Error("payload lookup error", "error", err) - return nil - } - - // glob payloadsDir for files that start with ollama_ - pattern := filepath.Join(payloadsDir, "*", "ollama_*") - - files, err := filepath.Glob(pattern) - if err != nil { - slog.Debug("could not glob", "pattern", pattern, "error", err) - return nil - } - - servers := make(map[string]string) - for _, file := range files { - slog.Debug("availableServers : found", "file", file) - servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file) - } - - return servers -} - -// serversForGpu returns a list of compatible servers give the provided GPU -// info, ordered by performance. 
assumes Init() has been called -// TODO - switch to metadata based mapping -func serversForGpu(info gpu.GpuInfo) []string { - // glob workDir for files that start with ollama_ - availableServers := getAvailableServers() - requested := info.Library - if info.Variant != gpu.CPUCapabilityNone.String() { - requested += "_" + info.Variant - } - - servers := []string{} - - // exact match first - for a := range availableServers { - if a == requested { - servers = []string{a} - - if a == "metal" { - return servers - } - - break - } - } - - alt := []string{} - - // Then for GPUs load alternates and sort the list for consistent load ordering - if info.Library != "cpu" { - for a := range availableServers { - if info.Library == strings.Split(a, "_")[0] && a != requested { - alt = append(alt, a) - } - } - - slices.Sort(alt) - servers = append(servers, alt...) - } - - if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") { - // Load up the best CPU variant if not primary requested - if info.Library != "cpu" { - variant := gpu.GetCPUCapability() - // If no variant, then we fall back to default - // If we have a variant, try that if we find an exact match - // Attempting to run the wrong CPU instructions will panic the - // process - if variant != gpu.CPUCapabilityNone { - for cmp := range availableServers { - if cmp == "cpu_"+variant.String() { - servers = append(servers, cmp) - break - } - } - } else { - servers = append(servers, "cpu") - } - } - - if len(servers) == 0 { - servers = []string{"cpu"} - } - } - - return servers -} - -// Return the optimal server for this CPU architecture -func serverForCpu() string { - if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" { - return "metal" - } - variant := gpu.GetCPUCapability() - availableServers := getAvailableServers() - if variant != gpu.CPUCapabilityNone { - for cmp := range availableServers { - if cmp == "cpu_"+variant.String() { - return cmp - } - } - } - return "cpu" -} - -// extract extracts the embedded files to the target directory -func extractFiles(targetDir string, glob string) error { - files, err := fs.Glob(libEmbed, glob) - if err != nil || len(files) == 0 { - return errPayloadMissing - } - - if err := os.MkdirAll(targetDir, 0o755); err != nil { - return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err) - } - - g := new(errgroup.Group) - - // build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE - for _, file := range files { - filename := file - - variant := filepath.Base(filepath.Dir(filepath.Dir(filename))) - - slog.Debug("extracting", "variant", variant, "file", filename) - - g.Go(func() error { - srcf, err := libEmbed.Open(filename) - if err != nil { - return err - } - defer srcf.Close() - - src := io.Reader(srcf) - if strings.HasSuffix(filename, ".gz") { - src, err = gzip.NewReader(src) - if err != nil { - return fmt.Errorf("decompress payload %s: %v", filename, err) - } - filename = strings.TrimSuffix(filename, ".gz") - } - - variantDir := filepath.Join(targetDir, variant) - if err := os.MkdirAll(variantDir, 0o755); err != nil { - return fmt.Errorf("extractFiles could not mkdir %s: %v", variantDir, err) - } - - base := filepath.Base(filename) - destFilename := filepath.Join(variantDir, base) - - _, err = os.Stat(destFilename) - switch { - case errors.Is(err, os.ErrNotExist): - destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) - if err != nil { - return fmt.Errorf("write payload %s: %v", filename, err) - } - defer destFile.Close() - if _, err := io.Copy(destFile, src); err != nil { - 
return fmt.Errorf("copy payload %s: %v", filename, err) - } - case err != nil: - return fmt.Errorf("stat payload %s: %v", filename, err) - } - return nil - }) - } - - err = g.Wait() - if err != nil { - // If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted - gpu.Cleanup() - return err - } - return nil -} diff --git a/llm/server.go b/llm/server.go index 5d5b8c4f..6c504f14 100644 --- a/llm/server.go +++ b/llm/server.go @@ -24,9 +24,11 @@ import ( "golang.org/x/sync/semaphore" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/build" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/format" "github.com/ollama/ollama/gpu" + "github.com/ollama/ollama/runners" ) type LlamaServer interface { @@ -106,7 +108,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr gpus = gpu.GetCPUInfo() } if len(gpus) == 1 && gpus[0].Library == "cpu" { - cpuRunner = serverForCpu() + cpuRunner = runners.ServerForCpu() estimate = EstimateGPULayers(gpus, ggml, projectors, opts) } else { estimate = EstimateGPULayers(gpus, ggml, projectors, opts) @@ -118,7 +120,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr opts.NumGPU = 0 case gpus[0].Library != "metal" && estimate.Layers == 0: // Don't bother loading into the GPU if no layers can fit - cpuRunner = serverForCpu() + cpuRunner = runners.ServerForCpu() gpus = gpu.GetCPUInfo() case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu": opts.NumGPU = estimate.Layers @@ -145,25 +147,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr return nil, errors.New("ollama supports only one lora adapter, but multiple were provided") } - availableServers := getAvailableServers() + rDir, err := runners.Refresh(build.EmbedFS) + if err != nil { + return nil, err + } + + availableServers := runners.GetAvailableServers(rDir) if len(availableServers) == 0 { - if runtime.GOOS != "windows" { - slog.Warn("llama server binary disappeared, reinitializing payloads") - err = Init() - if err != nil { - slog.Warn("failed to reinitialize payloads", "error", err) - return nil, err - } - availableServers = getAvailableServers() - } else { - return nil, finalErr - } + return nil, finalErr } var servers []string if cpuRunner != "" { servers = []string{cpuRunner} } else { - servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant + servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant } demandLib := envconfig.LLMLibrary() if demandLib != "" { @@ -330,7 +327,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr _, err := os.Stat(server) if errors.Is(err, os.ErrNotExist) { slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err) - err = Init() + _, err = runners.Refresh(build.EmbedFS) if err != nil { slog.Warn("failed to reinitialize payloads", "error", err) return nil, err diff --git a/runners/common.go b/runners/common.go new file mode 100644 index 00000000..681c397b --- /dev/null +++ b/runners/common.go @@ -0,0 +1,384 @@ +package runners + +import ( + "compress/gzip" + "errors" + "fmt" + "io" + "io/fs" + "log/slog" + "os" + "path/filepath" + "runtime" + "slices" + "strconv" + "strings" + "sync" + "syscall" + + "golang.org/x/sync/errgroup" + + "github.com/ollama/ollama/envconfig" + "github.com/ollama/ollama/gpu" +) + +const ( + binGlob = "*/*/*/*" +) + +var ( + lock sync.Mutex + runnersDir = "" 
+) + +// Return the location where runners are stored +// If runners are payloads, this will either extract them +// or refresh them if any have disappeared due to tmp cleaners +func Refresh(payloadFS fs.FS) (string, error) { + lock.Lock() + defer lock.Unlock() + var err error + + // Wire up extra logging on our first load + if runnersDir == "" { + defer func() { + var runners []string + for v := range GetAvailableServers(runnersDir) { + runners = append(runners, v) + } + slog.Info("Dynamic LLM libraries", "runners", runners) + slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY") + }() + } + + if hasPayloads(payloadFS) { + if runnersDir == "" { + runnersDir, err = extractRunners(payloadFS) + } else { + err = refreshRunners(payloadFS, runnersDir) + } + } else if runnersDir == "" { + runnersDir, err = locateRunners() + } + + return runnersDir, err +} + +func Cleanup(payloadFS fs.FS) { + lock.Lock() + defer lock.Unlock() + if hasPayloads(payloadFS) && runnersDir != "" { + // We want to fully clean up the tmpdir parent of the payloads dir + tmpDir := filepath.Clean(filepath.Join(runnersDir, "..")) + slog.Debug("cleaning up", "dir", tmpDir) + err := os.RemoveAll(tmpDir) + if err != nil { + slog.Warn("failed to clean up", "dir", tmpDir, "err", err) + } + } +} + +func locateRunners() (string, error) { + exe, err := os.Executable() + if err != nil { + return "", err + } + + cwd, err := os.Getwd() + if err != nil { + return "", err + } + + var paths []string + for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe()), cwd} { + paths = append(paths, + root, + filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH), + filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH), + ) + } + + // Try a few variations to improve developer experience when building from source in the local tree + for _, path := range paths { + candidate := filepath.Join(path, "lib", "ollama", "runners") + if _, err := os.Stat(candidate); err == nil { + return candidate, nil + } + } + return "", fmt.Errorf("unable to locate runners in any search path %v", paths) +} + +// Return true if we're carying nested payloads for the runners +func hasPayloads(payloadFS fs.FS) bool { + files, err := fs.Glob(payloadFS, binGlob) + if err != nil || len(files) == 0 || (len(files) == 1 && strings.Contains(files[0], "placeholder")) { + return false + } + return true +} + +func extractRunners(payloadFS fs.FS) (string, error) { + cleanupTmpDirs() + tmpDir, err := os.MkdirTemp(envconfig.TmpDir(), "ollama") + if err != nil { + return "", fmt.Errorf("failed to generate tmp dir: %w", err) + } + // Track our pid so we can clean up orphaned tmpdirs + n := filepath.Join(tmpDir, "ollama.pid") + if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil { + slog.Warn("failed to write pid file", "file", n, "error", err) + } + // We create a distinct subdirectory for payloads within the tmpdir + // This will typically look like /tmp/ollama3208993108/runners on linux + rDir := filepath.Join(tmpDir, "runners") + + slog.Info("extracting embedded files", "dir", rDir) + return rDir, refreshRunners(payloadFS, rDir) +} + +func refreshRunners(payloadFS fs.FS, rDir string) error { + // extract or refresh server libraries + err := extractFiles(payloadFS, rDir, binGlob) + if err != nil { + return fmt.Errorf("extract binaries: %v", err) + } + return nil +} + +// extract extracts the embedded files to the target directory +func extractFiles(payloadFS fs.FS, targetDir string, glob 
string) error { + files, err := fs.Glob(payloadFS, glob) + if err != nil || len(files) == 0 { + // Should not happen + return fmt.Errorf("extractFiles called without payload present") + } + + if err := os.MkdirAll(targetDir, 0o755); err != nil { + return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err) + } + + g := new(errgroup.Group) + + // $OS/$GOARCH/$RUNNER/$FILE + for _, file := range files { + filename := file + + runner := filepath.Base(filepath.Dir(filename)) + + slog.Debug("extracting", "runner", runner, "payload", filename) + + g.Go(func() error { + srcf, err := payloadFS.Open(filename) + if err != nil { + return err + } + defer srcf.Close() + + src := io.Reader(srcf) + if strings.HasSuffix(filename, ".gz") { + src, err = gzip.NewReader(src) + if err != nil { + return fmt.Errorf("decompress payload %s: %v", filename, err) + } + filename = strings.TrimSuffix(filename, ".gz") + } + + runnerDir := filepath.Join(targetDir, runner) + if err := os.MkdirAll(runnerDir, 0o755); err != nil { + return fmt.Errorf("extractFiles could not mkdir %s: %v", runnerDir, err) + } + + base := filepath.Base(filename) + destFilename := filepath.Join(runnerDir, base) + + _, err = os.Stat(destFilename) + switch { + case errors.Is(err, os.ErrNotExist): + destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755) + if err != nil { + return fmt.Errorf("write payload %s: %v", filename, err) + } + defer destFile.Close() + if _, err := io.Copy(destFile, src); err != nil { + return fmt.Errorf("copy payload %s: %v", filename, err) + } + case err != nil: + return fmt.Errorf("stat payload %s: %v", filename, err) + } + return nil + }) + } + + err = g.Wait() + if err != nil { + slog.Error("failed to extract files", "error", err) + // If we fail to extract, the payload dir is most likely unusable, so cleanup whatever we extracted + err := os.RemoveAll(targetDir) + if err != nil { + slog.Warn("failed to cleanup incomplete payload dir", "dir", targetDir, "error", err) + } + return err + } + return nil +} + +// Best effort to clean up prior tmpdirs +func cleanupTmpDirs() { + tmpDir := envconfig.TmpDir() + if tmpDir == "" { + tmpDir = os.TempDir() + } + matches, err := filepath.Glob(filepath.Join(tmpDir, "ollama*", "ollama.pid")) + if err != nil { + return + } + + for _, match := range matches { + raw, err := os.ReadFile(match) + if errors.Is(err, os.ErrNotExist) { + slog.Debug("not a ollama runtime directory, skipping", "path", match) + continue + } else if err != nil { + slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err) + continue + } + + pid, err := strconv.Atoi(string(raw)) + if err != nil { + slog.Warn("invalid pid, skipping", "path", match, "error", err) + continue + } + + p, err := os.FindProcess(pid) + if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) { + slog.Warn("process still running, skipping", "pid", pid, "path", match) + continue + } + + if err := os.Remove(match); err != nil { + slog.Warn("could not cleanup stale pidfile", "path", match, "error", err) + } + + runners := filepath.Join(filepath.Dir(match), "runners") + if err := os.RemoveAll(runners); err != nil { + slog.Warn("could not cleanup stale runners", "path", runners, "error", err) + } + + if err := os.Remove(filepath.Dir(match)); err != nil { + slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err) + } + } +} + +// directory names are the name of the runner and may contain an optional +// variant prefixed with '_' as the 
separator. For example, "cuda_v11" and
+// "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the
+// lowest common denominator
+func GetAvailableServers(payloadsDir string) map[string]string {
+	if payloadsDir == "" {
+		slog.Error("empty runner dir")
+		return nil
+	}
+
+	// glob payloadsDir for files that start with ollama_
+	pattern := filepath.Join(payloadsDir, "*", "ollama_*")
+
+	files, err := filepath.Glob(pattern)
+	if err != nil {
+		slog.Debug("could not glob", "pattern", pattern, "error", err)
+		return nil
+	}
+
+	servers := make(map[string]string)
+	for _, file := range files {
+		slog.Debug("availableServers : found", "file", file)
+		servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
+	}
+
+	return servers
+}
+
+// ServersForGpu returns a list of compatible servers given the provided GPU
+// info, ordered by performance. Assumes Init() has been called.
+// TODO - switch to metadata based mapping
+func ServersForGpu(info gpu.GpuInfo) []string {
+	// glob workDir for files that start with ollama_
+	availableServers := GetAvailableServers(runnersDir)
+	requested := info.Library
+	if info.Variant != gpu.CPUCapabilityNone.String() {
+		requested += "_" + info.Variant
+	}
+
+	servers := []string{}
+
+	// exact match first
+	for a := range availableServers {
+		if a == requested {
+			servers = []string{a}
+
+			if a == "metal" {
+				return servers
+			}
+
+			break
+		}
+	}
+
+	alt := []string{}
+
+	// Then for GPUs load alternates and sort the list for consistent load ordering
+	if info.Library != "cpu" {
+		for a := range availableServers {
+			if info.Library == strings.Split(a, "_")[0] && a != requested {
+				alt = append(alt, a)
+			}
+		}
+
+		slices.Sort(alt)
+		servers = append(servers, alt...)
+	}
+
+	if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
+		// Load up the best CPU variant if not primary requested
+		if info.Library != "cpu" {
+			variant := gpu.GetCPUCapability()
+			// If no variant, then we fall back to default
+			// If we have a variant, try that if we find an exact match
+			// Attempting to run the wrong CPU instructions will panic the
+			// process
+			if variant != gpu.CPUCapabilityNone {
+				for cmp := range availableServers {
+					if cmp == "cpu_"+variant.String() {
+						servers = append(servers, cmp)
+						break
+					}
+				}
+			} else {
+				servers = append(servers, "cpu")
+			}
+		}
+
+		if len(servers) == 0 {
+			servers = []string{"cpu"}
+		}
+	}
+
+	return servers
+}
+
+// ServerForCpu returns the optimal server for this CPU architecture
+func ServerForCpu() string {
+	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
+		return "metal"
+	}
+	variant := gpu.GetCPUCapability()
+	availableServers := GetAvailableServers(runnersDir)
+	if variant != gpu.CPUCapabilityNone {
+		for cmp := range availableServers {
+			if cmp == "cpu_"+variant.String() {
+				return cmp
+			}
+		}
+	}
+	return "cpu"
+}
diff --git a/runners/runners_test.go b/runners/runners_test.go
new file mode 100644
index 00000000..e6439448
--- /dev/null
+++ b/runners/runners_test.go
@@ -0,0 +1,50 @@
+package runners
+
+import (
+	"log/slog"
+	"os"
+	"path"
+	"runtime"
+	"strings"
+	"testing"
+	"testing/fstest"
+)
+
+func TestRefreshRunners(t *testing.T) {
+	slog.SetLogLoggerLevel(slog.LevelDebug)
+
+	payloadFS := fstest.MapFS{
+		path.Join(runtime.GOOS, runtime.GOARCH, "foo", "ollama_llama_server"): {Data: []byte("hello, world\n")},
+	}
+	tmpDir, err := os.MkdirTemp("", "testing")
+	if err != nil {
+		t.Fatalf("failed to make tmp dir %s", err)
+	}
+	t.Setenv("OLLAMA_TMPDIR", tmpDir)
+	rDir, err := Refresh(payloadFS)
+	if err != 
nil { + t.Fatalf("failed to extract to %s %s", tmpDir, err) + } + if !strings.Contains(rDir, tmpDir) { + t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir) + } + + // spot check results + servers := GetAvailableServers(rDir) + if len(servers) < 1 { + t.Fatalf("expected at least 1 server") + } + + // Refresh contents + rDir, err = extractRunners(payloadFS) + if err != nil { + t.Fatalf("failed to extract to %s %s", tmpDir, err) + } + if !strings.Contains(rDir, tmpDir) { + t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir) + } + + cleanupTmpDirs() + + Cleanup(payloadFS) +} diff --git a/scripts/build_darwin.sh b/scripts/build_darwin.sh index a2f76af2..17ac0b94 100755 --- a/scripts/build_darwin.sh +++ b/scripts/build_darwin.sh @@ -2,8 +2,7 @@ set -e -export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} -export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'" +. $(dirname $0)/env.sh mkdir -p dist diff --git a/scripts/build_docker.sh b/scripts/build_docker.sh index e91c56ed..567eb7c7 100755 --- a/scripts/build_docker.sh +++ b/scripts/build_docker.sh @@ -2,76 +2,34 @@ set -eu -export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} -export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'" - -# We use 2 different image repositories to handle combining architecture images into multiarch manifest -# (The ROCm image is x86 only and is not a multiarch manifest) -# For developers, you can override the DOCKER_ORG to generate multiarch manifests -# DOCKER_ORG=jdoe PUSH=1 ./scripts/build_docker.sh -DOCKER_ORG=${DOCKER_ORG:-"ollama"} -RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"} -FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"} - -BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"} +. $(dirname $0)/env.sh # Set PUSH to a non-empty string to trigger push instead of load PUSH=${PUSH:-""} -# In CI mode, we break things down -OLLAMA_SKIP_MANIFEST_CREATE=${OLLAMA_SKIP_MANIFEST_CREATE:-""} -OLLAMA_SKIP_IMAGE_BUILD=${OLLAMA_SKIP_IMAGE_BUILD:-""} - if [ -z "${PUSH}" ] ; then + echo "Building ${FINAL_IMAGE_REPO}:$VERSION locally. set PUSH=1 to push" LOAD_OR_PUSH="--load" else - echo "Will be pushing ${RELEASE_IMAGE_REPO}:$VERSION for ${BUILD_ARCH}" + echo "Will be pushing ${FINAL_IMAGE_REPO}:$VERSION" LOAD_OR_PUSH="--push" fi -if [ -z "${OLLAMA_SKIP_IMAGE_BUILD}" ]; then - for TARGETARCH in ${BUILD_ARCH}; do - docker build \ - ${LOAD_OR_PUSH} \ - --platform=linux/${TARGETARCH} \ - --build-arg=VERSION \ - --build-arg=GOFLAGS \ - -f Dockerfile \ - -t ${RELEASE_IMAGE_REPO}:$VERSION-${TARGETARCH} \ - . - done +docker buildx build \ + ${LOAD_OR_PUSH} \ + --platform=${PLATFORM} \ + ${OLLAMA_COMMON_BUILD_ARGS} \ + -f Dockerfile \ + -t ${FINAL_IMAGE_REPO}:$VERSION \ + . - if echo ${BUILD_ARCH} | grep "amd64" > /dev/null; then - docker build \ - ${LOAD_OR_PUSH} \ - --platform=linux/amd64 \ - --build-arg=VERSION \ - --build-arg=GOFLAGS \ - --target runtime-rocm \ - -f Dockerfile \ - -t ${RELEASE_IMAGE_REPO}:$VERSION-rocm \ - . 
- fi -fi - -if [ -z "${OLLAMA_SKIP_MANIFEST_CREATE}" ]; then - if [ -n "${PUSH}" ]; then - docker manifest create ${FINAL_IMAGE_REPO}:$VERSION \ - ${RELEASE_IMAGE_REPO}:$VERSION-amd64 \ - ${RELEASE_IMAGE_REPO}:$VERSION-arm64 - docker manifest push ${FINAL_IMAGE_REPO}:$VERSION - - # For symmetry, tag/push the rocm image - if [ "${RELEASE_IMAGE_REPO}" != "${FINAL_IMAGE_REPO}" ]; then - echo "Tagging and pushing rocm image" - docker pull ${RELEASE_IMAGE_REPO}:$VERSION-rocm - docker tag ${RELEASE_IMAGE_REPO}:$VERSION-rocm ${FINAL_IMAGE_REPO}:$VERSION-rocm - docker push ${FINAL_IMAGE_REPO}:$VERSION-rocm - fi - else - echo "Skipping manifest generation when not pushing images are available locally as " - echo " ${RELEASE_IMAGE_REPO}:$VERSION-amd64" - echo " ${RELEASE_IMAGE_REPO}:$VERSION-arm64" - echo " ${RELEASE_IMAGE_REPO}:$VERSION-rocm" - fi -fi +if echo $PLATFORM | grep "amd64" > /dev/null; then + docker buildx build \ + ${LOAD_OR_PUSH} \ + --platform=linux/amd64 \ + ${OLLAMA_COMMON_BUILD_ARGS} \ + --target runtime-rocm \ + -f Dockerfile \ + -t ${FINAL_IMAGE_REPO}:$VERSION-rocm \ + . +fi \ No newline at end of file diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 6cb0d0cd..894d9dd2 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -1,37 +1,29 @@ #!/bin/sh +# +# Mac ARM users, rosetta can be flaky, so to use a remote x86 builder +# +# docker context create amd64 --docker host=ssh://mybuildhost +# docker buildx create --name mybuilder amd64 --platform linux/amd64 +# docker buildx create --name mybuilder --append desktop-linux --platform linux/arm64 +# docker buildx use mybuilder + set -eu -export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} -export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'" -GZIP=$(which pigz 2>/dev/null || echo "gzip") +. $(dirname $0)/env.sh -BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"} -export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""} mkdir -p dist -for TARGETARCH in ${BUILD_ARCH}; do - docker build \ - --platform=linux/$TARGETARCH \ - --build-arg=GOFLAGS \ - --build-arg=CGO_CFLAGS \ - --build-arg=OLLAMA_CUSTOM_CPU_DEFS \ - --build-arg=AMDGPU_TARGETS \ - --target build-$TARGETARCH \ +docker buildx build \ + --output type=local,dest=./dist/ \ + --platform=${PLATFORM} \ + ${OLLAMA_COMMON_BUILD_ARGS} \ + --target dist \ -f Dockerfile \ - -t builder:$TARGETARCH \ . - docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH - rm -rf ./dist/linux-$TARGETARCH - docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist - if echo ${TARGETARCH} | grep "amd64" > /dev/null; then - docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist - fi - docker rm builder-$TARGETARCH - echo "Compressing final linux bundle..." - rm -f ./dist/ollama-linux-$TARGETARCH.tgz - (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz ) - if [ -d dist/linux-$TARGETARCH-rocm ]; then - (cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz ) - fi -done + +# buildx behavior changes for single vs. 
multiplatform +if echo $PLATFORM | grep "," > /dev/null ; then + mv -f ./dist/linux_*64/ollama* ./dist/ + rmdir ./dist/linux_*64 +fi \ No newline at end of file diff --git a/scripts/env.sh b/scripts/env.sh new file mode 100644 index 00000000..d3ca05d7 --- /dev/null +++ b/scripts/env.sh @@ -0,0 +1,14 @@ +# Common environment setup across build*.sh scripts + +export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")} +export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'" +# TODO - consider `docker buildx ls --format=json` to autodiscover platform capability +PLATFORM=${PLATFORM:-"linux/arm64,linux/amd64"} +DOCKER_ORG=${DOCKER_ORG:-"ollama"} +RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"} +FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"} +OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION --build-arg=GOFLAGS --build-arg=OLLAMA_CUSTOM_CPU_DEFS --build-arg=AMDGPU_TARGETS" + +echo "Building Ollama" +echo "VERSION=$VERSION" +echo "PLATFORM=$PLATFORM" \ No newline at end of file diff --git a/server/routes.go b/server/routes.go index f202973e..6bd3a93f 100644 --- a/server/routes.go +++ b/server/routes.go @@ -26,11 +26,13 @@ import ( "golang.org/x/sync/errgroup" "github.com/ollama/ollama/api" + "github.com/ollama/ollama/build" "github.com/ollama/ollama/envconfig" "github.com/ollama/ollama/gpu" "github.com/ollama/ollama/llm" "github.com/ollama/ollama/openai" "github.com/ollama/ollama/parser" + "github.com/ollama/ollama/runners" "github.com/ollama/ollama/template" "github.com/ollama/ollama/types/errtypes" "github.com/ollama/ollama/types/model" @@ -1216,12 +1218,12 @@ func Serve(ln net.Listener) error { srvr.Close() schedDone() sched.unloadAllRunners() - gpu.Cleanup() + runners.Cleanup(build.EmbedFS) done() }() - if err := llm.Init(); err != nil { - return fmt.Errorf("unable to initialize llm library %w", err) + if _, err := runners.Refresh(build.EmbedFS); err != nil { + return fmt.Errorf("unable to initialize llm runners %w", err) } s.sched.Run(schedCtx) From fda0d3be5224b59a4b4b031e18c89adca71657ed Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 12 Sep 2024 16:38:05 -0700 Subject: [PATCH 16/18] Use GOARCH for build dirs (#6779) Corrects x86_64 vs amd64 discrepancy --- llm/generate/gen_darwin.sh | 8 ++++---- llm/generate/gen_linux.sh | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index 49c67125..c37366f3 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -40,7 +40,7 @@ case "${GOARCH}" in init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" RUNNER=cpu - BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}" + BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}" echo "Building LCD CPU" build sign ${BUILD_DIR}/bin/ollama_llama_server @@ -53,7 +53,7 @@ case "${GOARCH}" in init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" RUNNER=cpu_avx - BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}" + BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}" echo "Building AVX CPU" build sign ${BUILD_DIR}/bin/ollama_llama_server @@ -66,7 +66,7 @@ case "${GOARCH}" in init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} 
-DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}" RUNNER=cpu_avx2 - BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}" + BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}" echo "Building AVX2 CPU" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation" build @@ -88,7 +88,7 @@ case "${GOARCH}" in init_vars CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}" RUNNER="metal" - BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}" + BUILD_DIR="../build/darwin/${GOARCH}/${RUNNER}" EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders" build sign ${BUILD_DIR}/bin/ollama_llama_server diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index eb7fa786..48d08fd0 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -80,7 +80,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" RUNNER="cpu" - BUILD_DIR="../build/linux/${ARCH}/${RUNNER}" + BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}" echo "Building custom CPU" build install @@ -105,7 +105,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" RUNNER=cpu - BUILD_DIR="../build/linux/${ARCH}/${RUNNER}" + BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}" echo "Building LCD CPU" build install @@ -125,7 +125,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" RUNNER=cpu_avx - BUILD_DIR="../build/linux/${ARCH}/${RUNNER}" + BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}" echo "Building AVX CPU" build install @@ -141,7 +141,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then init_vars CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}" RUNNER=cpu_avx2 - BUILD_DIR="../build/linux/${ARCH}/${RUNNER}" + BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}" echo "Building AVX2 CPU" build install @@ -196,7 +196,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then export CUDAFLAGS="-t8" CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" RUNNER=cuda${CUDA_VARIANT} - BUILD_DIR="../build/linux/${ARCH}/${RUNNER}" + BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}" export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}" build @@ -223,7 +223,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then CC=icx CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF" RUNNER=oneapi - BUILD_DIR="../build/linux/${ARCH}/${RUNNER}" + BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}" ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama" export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it @@ -272,7 +272,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then 
echo "Building custom ROCM GPU" fi RUNNER=rocm${ROCM_VARIANT} - BUILD_DIR="../build/linux/${ARCH}/${RUNNER}" + BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}" # ROCm dependencies are too large to fit into a unified bundle ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama" # TODO figure out how to disable runpath (rpath) @@ -282,7 +282,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then # copy the ROCM dependencies mkdir -p "${ROCM_DIST_DIR}" - for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do + for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${GOARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do cp -a "${dep}"* "${ROCM_DIST_DIR}" if [ $(readlink -f "${dep}") != "${dep}" ] ; then cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}" From 56b9af336a049dae37f7cc62246121ac9d18576e Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 13 Sep 2024 08:24:08 -0700 Subject: [PATCH 17/18] Fix incremental builds on linux (#6780) scripts: fix incremental builds on linux or similar --- llm/generate/gen_common.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index 9fe47529..ab5d7612 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -137,7 +137,7 @@ wait_for_compress() { install() { echo "Installing libraries to bin dir ${BUILD_DIR}/bin/" - for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do + for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT} | grep -v "${BUILD_DIR}/bin/" ); do rm -f "${BUILD_DIR}/bin/$(basename ${lib})" cp -af "${lib}" "${BUILD_DIR}/bin/" done From d889c6fd07533859502f9571a8f80ec1d4322d0e Mon Sep 17 00:00:00 2001 From: Edward Cui <37892357+ECuiDev@users.noreply.github.com> Date: Sat, 14 Sep 2024 20:52:37 -0700 Subject: [PATCH 18/18] readme: add Obsidian Quiz Generator plugin to community integrations (#6789) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ecbb7023..81e7583e 100644 --- a/README.md +++ b/README.md @@ -431,6 +431,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [vnc-lm](https://github.com/jk011ru/vnc-lm) (A containerized Discord bot with support for attachments and web links) - [LSP-AI](https://github.com/SilasMarvin/lsp-ai) (Open-source language server for AI-powered functionality) - [QodeAssist](https://github.com/Palm1r/QodeAssist) (AI-powered coding assistant plugin for Qt Creator) +- [Obsidian Quiz Generator plugin](https://github.com/ECuiDev/obsidian-quiz-generator) ### Supported backends