Compare commits
No commits in common. "f564d9cbc15a09873e3aad086c7dfa9cf4610369" and "1d125ce9b72ff6784968a0cbdaf6f25ae93a0cf3" have entirely different histories.
f564d9cbc1
...
1d125ce9b7
93 changed files with 326 additions and 1567 deletions
10
.github/workflows/release.yaml
vendored
10
.github/workflows/release.yaml
vendored
|
@ -31,7 +31,7 @@ jobs:
|
||||||
security set-keychain-settings -lut 3600 build.keychain
|
security set-keychain-settings -lut 3600 build.keychain
|
||||||
- uses: actions/setup-go@v5
|
- uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: "stable"
|
go-version-file: go.mod
|
||||||
cache: true
|
cache: true
|
||||||
- name: Build Darwin
|
- name: Build Darwin
|
||||||
env:
|
env:
|
||||||
|
@ -87,7 +87,7 @@ jobs:
|
||||||
write-host "plugin installed"
|
write-host "plugin installed"
|
||||||
- uses: actions/setup-go@v5
|
- uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: "stable"
|
go-version-file: go.mod
|
||||||
cache: true
|
cache: true
|
||||||
- run: go get ./...
|
- run: go get ./...
|
||||||
- run: |
|
- run: |
|
||||||
|
@ -141,7 +141,7 @@ jobs:
|
||||||
write-host "plugin installed"
|
write-host "plugin installed"
|
||||||
- uses: actions/setup-go@v5
|
- uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: "stable"
|
go-version-file: go.mod
|
||||||
cache: true
|
cache: true
|
||||||
- name: 'Install ROCm'
|
- name: 'Install ROCm'
|
||||||
run: |
|
run: |
|
||||||
|
@ -218,7 +218,7 @@ jobs:
|
||||||
write-host "plugin installed"
|
write-host "plugin installed"
|
||||||
- uses: actions/setup-go@v5
|
- uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: "stable"
|
go-version-file: go.mod
|
||||||
cache: true
|
cache: true
|
||||||
- name: 'Install CUDA'
|
- name: 'Install CUDA'
|
||||||
run: |
|
run: |
|
||||||
|
@ -306,7 +306,7 @@ jobs:
|
||||||
write-host "plugin installed"
|
write-host "plugin installed"
|
||||||
- uses: actions/setup-go@v5
|
- uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: "stable"
|
go-version-file: go.mod
|
||||||
cache: true
|
cache: true
|
||||||
- run: go get
|
- run: go get
|
||||||
- uses: actions/download-artifact@v4
|
- uses: actions/download-artifact@v4
|
||||||
|
|
10
.github/workflows/test.yaml
vendored
10
.github/workflows/test.yaml
vendored
|
@ -63,7 +63,7 @@ jobs:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
- uses: actions/setup-go@v5
|
- uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: "stable"
|
go-version-file: go.mod
|
||||||
cache: true
|
cache: true
|
||||||
- run: go get ./...
|
- run: go get ./...
|
||||||
- run: |
|
- run: |
|
||||||
|
@ -163,7 +163,7 @@ jobs:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
- uses: actions/setup-go@v5
|
- uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: "stable"
|
go-version-file: go.mod
|
||||||
cache: true
|
cache: true
|
||||||
- name: 'Install ROCm'
|
- name: 'Install ROCm'
|
||||||
run: |
|
run: |
|
||||||
|
@ -200,7 +200,7 @@ jobs:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
- uses: actions/setup-go@v5
|
- uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: "stable"
|
go-version-file: go.mod
|
||||||
cache: true
|
cache: true
|
||||||
- name: 'Install CUDA'
|
- name: 'Install CUDA'
|
||||||
run: |
|
run: |
|
||||||
|
@ -255,7 +255,7 @@ jobs:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
- uses: actions/setup-go@v5
|
- uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: "stable"
|
go-version-file: go.mod
|
||||||
cache: false
|
cache: false
|
||||||
- run: |
|
- run: |
|
||||||
case ${{ matrix.arch }} in
|
case ${{ matrix.arch }} in
|
||||||
|
@ -297,7 +297,7 @@ jobs:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
- uses: actions/setup-go@v5
|
- uses: actions/setup-go@v5
|
||||||
with:
|
with:
|
||||||
go-version: "stable"
|
go-version-file: go.mod
|
||||||
cache: true
|
cache: true
|
||||||
- run: |
|
- run: |
|
||||||
case ${{ matrix.arch }} in
|
case ${{ matrix.arch }} in
|
||||||
|
|
38
README.md
38
README.md
|
@ -35,10 +35,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla
|
||||||
|
|
||||||
## Quickstart
|
## Quickstart
|
||||||
|
|
||||||
To run and chat with [Llama 3.1](https://ollama.com/library/llama3.1):
|
To run and chat with [Llama 3](https://ollama.com/library/llama3):
|
||||||
|
|
||||||
```
|
```
|
||||||
ollama run llama3.1
|
ollama run llama3
|
||||||
```
|
```
|
||||||
|
|
||||||
## Model library
|
## Model library
|
||||||
|
@ -49,9 +49,8 @@ Here are some example models that can be downloaded:
|
||||||
|
|
||||||
| Model | Parameters | Size | Download |
|
| Model | Parameters | Size | Download |
|
||||||
| ------------------ | ---------- | ----- | ------------------------------ |
|
| ------------------ | ---------- | ----- | ------------------------------ |
|
||||||
| Llama 3.1 | 8B | 4.7GB | `ollama run llama3.1` |
|
| Llama 3 | 8B | 4.7GB | `ollama run llama3` |
|
||||||
| Llama 3.1 | 70B | 40GB | `ollama run llama3.1:70b` |
|
| Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
|
||||||
| Llama 3.1 | 405B | 231GB | `ollama run llama3.1:405b` |
|
|
||||||
| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
|
| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
|
||||||
| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
|
| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
|
||||||
| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
|
| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
|
||||||
|
@ -65,8 +64,7 @@ Here are some example models that can be downloaded:
|
||||||
| LLaVA | 7B | 4.5GB | `ollama run llava` |
|
| LLaVA | 7B | 4.5GB | `ollama run llava` |
|
||||||
| Solar | 10.7B | 6.1GB | `ollama run solar` |
|
| Solar | 10.7B | 6.1GB | `ollama run solar` |
|
||||||
|
|
||||||
> [!NOTE]
|
> Note: You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
|
||||||
> You should have at least 8 GB of RAM available to run the 7B models, 16 GB to run the 13B models, and 32 GB to run the 33B models.
|
|
||||||
|
|
||||||
## Customize a model
|
## Customize a model
|
||||||
|
|
||||||
|
@ -98,16 +96,16 @@ See the [guide](docs/import.md) on importing models for more information.
|
||||||
|
|
||||||
### Customize a prompt
|
### Customize a prompt
|
||||||
|
|
||||||
Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3.1` model:
|
Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3` model:
|
||||||
|
|
||||||
```
|
```
|
||||||
ollama pull llama3.1
|
ollama pull llama3
|
||||||
```
|
```
|
||||||
|
|
||||||
Create a `Modelfile`:
|
Create a `Modelfile`:
|
||||||
|
|
||||||
```
|
```
|
||||||
FROM llama3.1
|
FROM llama3
|
||||||
|
|
||||||
# set the temperature to 1 [higher is more creative, lower is more coherent]
|
# set the temperature to 1 [higher is more creative, lower is more coherent]
|
||||||
PARAMETER temperature 1
|
PARAMETER temperature 1
|
||||||
|
@ -142,7 +140,7 @@ ollama create mymodel -f ./Modelfile
|
||||||
### Pull a model
|
### Pull a model
|
||||||
|
|
||||||
```
|
```
|
||||||
ollama pull llama3.1
|
ollama pull llama3
|
||||||
```
|
```
|
||||||
|
|
||||||
> This command can also be used to update a local model. Only the diff will be pulled.
|
> This command can also be used to update a local model. Only the diff will be pulled.
|
||||||
|
@ -150,13 +148,13 @@ ollama pull llama3.1
|
||||||
### Remove a model
|
### Remove a model
|
||||||
|
|
||||||
```
|
```
|
||||||
ollama rm llama3.1
|
ollama rm llama3
|
||||||
```
|
```
|
||||||
|
|
||||||
### Copy a model
|
### Copy a model
|
||||||
|
|
||||||
```
|
```
|
||||||
ollama cp llama3.1 my-model
|
ollama cp llama3 my-model
|
||||||
```
|
```
|
||||||
|
|
||||||
### Multiline input
|
### Multiline input
|
||||||
|
@ -173,21 +171,21 @@ I'm a basic program that prints the famous "Hello, world!" message to the consol
|
||||||
### Multimodal models
|
### Multimodal models
|
||||||
|
|
||||||
```
|
```
|
||||||
ollama run llava "What's in this image? /Users/jmorgan/Desktop/smile.png"
|
>>> What's in this image? /Users/jmorgan/Desktop/smile.png
|
||||||
The image features a yellow smiley face, which is likely the central focus of the picture.
|
The image features a yellow smiley face, which is likely the central focus of the picture.
|
||||||
```
|
```
|
||||||
|
|
||||||
### Pass the prompt as an argument
|
### Pass the prompt as an argument
|
||||||
|
|
||||||
```
|
```
|
||||||
$ ollama run llama3.1 "Summarize this file: $(cat README.md)"
|
$ ollama run llama3 "Summarize this file: $(cat README.md)"
|
||||||
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
|
Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications.
|
||||||
```
|
```
|
||||||
|
|
||||||
### Show model information
|
### Show model information
|
||||||
|
|
||||||
```
|
```
|
||||||
ollama show llama3.1
|
ollama show llama3
|
||||||
```
|
```
|
||||||
|
|
||||||
### List models on your computer
|
### List models on your computer
|
||||||
|
@ -215,7 +213,7 @@ Next, start the server:
|
||||||
Finally, in a separate shell, run a model:
|
Finally, in a separate shell, run a model:
|
||||||
|
|
||||||
```
|
```
|
||||||
./ollama run llama3.1
|
./ollama run llama3
|
||||||
```
|
```
|
||||||
|
|
||||||
## REST API
|
## REST API
|
||||||
|
@ -226,7 +224,7 @@ Ollama has a REST API for running and managing models.
|
||||||
|
|
||||||
```
|
```
|
||||||
curl http://localhost:11434/api/generate -d '{
|
curl http://localhost:11434/api/generate -d '{
|
||||||
"model": "llama3.1",
|
"model": "llama3",
|
||||||
"prompt":"Why is the sky blue?"
|
"prompt":"Why is the sky blue?"
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
@ -235,7 +233,7 @@ curl http://localhost:11434/api/generate -d '{
|
||||||
|
|
||||||
```
|
```
|
||||||
curl http://localhost:11434/api/chat -d '{
|
curl http://localhost:11434/api/chat -d '{
|
||||||
"model": "llama3.1",
|
"model": "llama3",
|
||||||
"messages": [
|
"messages": [
|
||||||
{ "role": "user", "content": "why is the sky blue?" }
|
{ "role": "user", "content": "why is the sky blue?" }
|
||||||
]
|
]
|
||||||
|
@ -298,7 +296,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||||
- [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
|
- [Kerlig AI](https://www.kerlig.com/) (AI writing assistant for macOS)
|
||||||
- [AI Studio](https://github.com/MindWorkAI/AI-Studio)
|
- [AI Studio](https://github.com/MindWorkAI/AI-Studio)
|
||||||
- [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)
|
- [Sidellama](https://github.com/gyopak/sidellama) (browser-based LLM client)
|
||||||
- [LLMStack](https://github.com/trypromptly/LLMStack) (No-code multi-agent framework to build LLM agents and workflows)
|
|
||||||
|
|
||||||
### Terminal
|
### Terminal
|
||||||
|
|
||||||
|
@ -337,7 +334,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||||
### Libraries
|
### Libraries
|
||||||
|
|
||||||
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
|
- [LangChain](https://python.langchain.com/docs/integrations/llms/ollama) and [LangChain.js](https://js.langchain.com/docs/modules/model_io/models/llms/integrations/ollama) with [example](https://js.langchain.com/docs/use_cases/question_answering/local_retrieval_qa)
|
||||||
- [Firebase Genkit](https://firebase.google.com/docs/genkit/plugins/ollama)
|
|
||||||
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
|
- [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example)
|
||||||
- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
|
- [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java)
|
||||||
- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
|
- [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs)
|
||||||
|
|
25
SECURITY.md
25
SECURITY.md
|
@ -1,25 +0,0 @@
|
||||||
# Security
|
|
||||||
|
|
||||||
The Ollama maintainer team takes security seriously and will actively work to resolve security issues.
|
|
||||||
|
|
||||||
## Reporting a vulnerability
|
|
||||||
|
|
||||||
If you discover a security vulnerability, please do not open a public issue. Instead, please report it by emailing hello@ollama.com. We ask that you give us sufficient time to investigate and address the vulnerability before disclosing it publicly.
|
|
||||||
|
|
||||||
Please include the following details in your report:
|
|
||||||
- A description of the vulnerability
|
|
||||||
- Steps to reproduce the issue
|
|
||||||
- Your assessment of the potential impact
|
|
||||||
- Any possible mitigations
|
|
||||||
|
|
||||||
## Security best practices
|
|
||||||
|
|
||||||
While the maintainer team does their best to secure Ollama, users are encouraged to implement their own security best practices, such as:
|
|
||||||
|
|
||||||
- Regularly updating to the latest version of Ollama
|
|
||||||
- Securing access to hosted instances of Ollama
|
|
||||||
- Monitoring systems for unusual activity
|
|
||||||
|
|
||||||
## Contact
|
|
||||||
|
|
||||||
For any other questions or concerns related to security, please contact us at hello@ollama.com
|
|
10
api/types.go
10
api/types.go
|
@ -114,11 +114,6 @@ func (t Tools) String() string {
|
||||||
return string(bts)
|
return string(bts)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t Tool) String() string {
|
|
||||||
bts, _ := json.Marshal(t)
|
|
||||||
return string(bts)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Message is a single message in a chat sequence. The message contains the
|
// Message is a single message in a chat sequence. The message contains the
|
||||||
// role ("system", "user", or "assistant"), the content and an optional list
|
// role ("system", "user", or "assistant"), the content and an optional list
|
||||||
// of images.
|
// of images.
|
||||||
|
@ -214,7 +209,6 @@ type Options struct {
|
||||||
NumPredict int `json:"num_predict,omitempty"`
|
NumPredict int `json:"num_predict,omitempty"`
|
||||||
TopK int `json:"top_k,omitempty"`
|
TopK int `json:"top_k,omitempty"`
|
||||||
TopP float32 `json:"top_p,omitempty"`
|
TopP float32 `json:"top_p,omitempty"`
|
||||||
MinP float32 `json:"min_p,omitempty"`
|
|
||||||
TFSZ float32 `json:"tfs_z,omitempty"`
|
TFSZ float32 `json:"tfs_z,omitempty"`
|
||||||
TypicalP float32 `json:"typical_p,omitempty"`
|
TypicalP float32 `json:"typical_p,omitempty"`
|
||||||
RepeatLastN int `json:"repeat_last_n,omitempty"`
|
RepeatLastN int `json:"repeat_last_n,omitempty"`
|
||||||
|
@ -267,10 +261,6 @@ type EmbedRequest struct {
|
||||||
type EmbedResponse struct {
|
type EmbedResponse struct {
|
||||||
Model string `json:"model"`
|
Model string `json:"model"`
|
||||||
Embeddings [][]float32 `json:"embeddings"`
|
Embeddings [][]float32 `json:"embeddings"`
|
||||||
|
|
||||||
TotalDuration time.Duration `json:"total_duration,omitempty"`
|
|
||||||
LoadDuration time.Duration `json:"load_duration,omitempty"`
|
|
||||||
PromptEvalCount int `json:"prompt_eval_count,omitempty"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// EmbeddingRequest is the request passed to [Client.Embeddings].
|
// EmbeddingRequest is the request passed to [Client.Embeddings].
|
||||||
|
|
|
@ -138,7 +138,7 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi
|
||||||
|
|
||||||
|
|
||||||
;FinishedHeadingLabel=Run your first model
|
;FinishedHeadingLabel=Run your first model
|
||||||
;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama3.1
|
;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama3
|
||||||
;ClickFinish=%n
|
;ClickFinish=%n
|
||||||
|
|
||||||
[Registry]
|
[Registry]
|
||||||
|
|
|
@ -4,5 +4,5 @@ write-host "Welcome to Ollama!"
|
||||||
write-host ""
|
write-host ""
|
||||||
write-host "Run your first model:"
|
write-host "Run your first model:"
|
||||||
write-host ""
|
write-host ""
|
||||||
write-host "`tollama run llama3.1"
|
write-host "`tollama run llama3"
|
||||||
write-host ""
|
write-host ""
|
|
@ -1341,10 +1341,10 @@ func NewCLI() *cobra.Command {
|
||||||
envVars["OLLAMA_NUM_PARALLEL"],
|
envVars["OLLAMA_NUM_PARALLEL"],
|
||||||
envVars["OLLAMA_NOPRUNE"],
|
envVars["OLLAMA_NOPRUNE"],
|
||||||
envVars["OLLAMA_ORIGINS"],
|
envVars["OLLAMA_ORIGINS"],
|
||||||
envVars["OLLAMA_SCHED_SPREAD"],
|
|
||||||
envVars["OLLAMA_TMPDIR"],
|
envVars["OLLAMA_TMPDIR"],
|
||||||
envVars["OLLAMA_FLASH_ATTENTION"],
|
envVars["OLLAMA_FLASH_ATTENTION"],
|
||||||
envVars["OLLAMA_LLM_LIBRARY"],
|
envVars["OLLAMA_LLM_LIBRARY"],
|
||||||
|
envVars["OLLAMA_MAX_VRAM"],
|
||||||
})
|
})
|
||||||
default:
|
default:
|
||||||
appendEnvDocs(cmd, envs)
|
appendEnvDocs(cmd, envs)
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
package cmd
|
package cmd
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"cmp"
|
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
|
@ -10,14 +9,13 @@ import (
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
"slices"
|
"slices"
|
||||||
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/spf13/cobra"
|
"github.com/spf13/cobra"
|
||||||
"golang.org/x/exp/maps"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
"github.com/ollama/ollama/envconfig"
|
"github.com/ollama/ollama/envconfig"
|
||||||
"github.com/ollama/ollama/parser"
|
|
||||||
"github.com/ollama/ollama/progress"
|
"github.com/ollama/ollama/progress"
|
||||||
"github.com/ollama/ollama/readline"
|
"github.com/ollama/ollama/readline"
|
||||||
"github.com/ollama/ollama/types/errtypes"
|
"github.com/ollama/ollama/types/errtypes"
|
||||||
|
@ -140,7 +138,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||||
fmt.Fprintln(os.Stderr, " /set parameter num_predict <int> Max number of tokens to predict")
|
fmt.Fprintln(os.Stderr, " /set parameter num_predict <int> Max number of tokens to predict")
|
||||||
fmt.Fprintln(os.Stderr, " /set parameter top_k <int> Pick from top k num of tokens")
|
fmt.Fprintln(os.Stderr, " /set parameter top_k <int> Pick from top k num of tokens")
|
||||||
fmt.Fprintln(os.Stderr, " /set parameter top_p <float> Pick token based on sum of probabilities")
|
fmt.Fprintln(os.Stderr, " /set parameter top_p <float> Pick token based on sum of probabilities")
|
||||||
fmt.Fprintln(os.Stderr, " /set parameter min_p <float> Pick token based on top token probability * min_p")
|
|
||||||
fmt.Fprintln(os.Stderr, " /set parameter num_ctx <int> Set the context size")
|
fmt.Fprintln(os.Stderr, " /set parameter num_ctx <int> Set the context size")
|
||||||
fmt.Fprintln(os.Stderr, " /set parameter temperature <float> Set creativity level")
|
fmt.Fprintln(os.Stderr, " /set parameter temperature <float> Set creativity level")
|
||||||
fmt.Fprintln(os.Stderr, " /set parameter repeat_penalty <float> How strongly to penalize repetitions")
|
fmt.Fprintln(os.Stderr, " /set parameter repeat_penalty <float> How strongly to penalize repetitions")
|
||||||
|
@ -509,35 +506,31 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildModelfile(opts runOptions) string {
|
func buildModelfile(opts runOptions) string {
|
||||||
var f parser.File
|
var mf strings.Builder
|
||||||
f.Commands = append(f.Commands, parser.Command{Name: "model", Args: cmp.Or(opts.ParentModel, opts.Model)})
|
model := opts.ParentModel
|
||||||
|
if model == "" {
|
||||||
|
model = opts.Model
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&mf, "FROM %s\n", model)
|
||||||
if opts.System != "" {
|
if opts.System != "" {
|
||||||
f.Commands = append(f.Commands, parser.Command{Name: "system", Args: opts.System})
|
fmt.Fprintf(&mf, "SYSTEM \"\"\"%s\"\"\"\n", opts.System)
|
||||||
}
|
}
|
||||||
|
|
||||||
keys := maps.Keys(opts.Options)
|
keys := make([]string, 0)
|
||||||
slices.Sort(keys)
|
for k := range opts.Options {
|
||||||
|
keys = append(keys, k)
|
||||||
|
}
|
||||||
|
sort.Strings(keys)
|
||||||
for _, k := range keys {
|
for _, k := range keys {
|
||||||
v := opts.Options[k]
|
fmt.Fprintf(&mf, "PARAMETER %s %v\n", k, opts.Options[k])
|
||||||
var cmds []parser.Command
|
|
||||||
switch t := v.(type) {
|
|
||||||
case []string:
|
|
||||||
for _, s := range t {
|
|
||||||
cmds = append(cmds, parser.Command{Name: k, Args: s})
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
cmds = append(cmds, parser.Command{Name: k, Args: fmt.Sprintf("%v", t)})
|
|
||||||
}
|
|
||||||
|
|
||||||
f.Commands = append(f.Commands, cmds...)
|
|
||||||
}
|
}
|
||||||
|
fmt.Fprintln(&mf)
|
||||||
|
|
||||||
for _, msg := range opts.Messages {
|
for _, msg := range opts.Messages {
|
||||||
f.Commands = append(f.Commands, parser.Command{Name: "message", Args: fmt.Sprintf("%s: %s", msg.Role, msg.Content)})
|
fmt.Fprintf(&mf, "MESSAGE %s \"\"\"%s\"\"\"\n", msg.Role, msg.Content)
|
||||||
}
|
}
|
||||||
|
|
||||||
return f.String()
|
return mf.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
func normalizeFilePath(fp string) string {
|
func normalizeFilePath(fp string) string {
|
||||||
|
|
|
@ -1,10 +1,12 @@
|
||||||
package cmd
|
package cmd
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"testing"
|
"testing"
|
||||||
|
"text/template"
|
||||||
|
|
||||||
"github.com/google/go-cmp/cmp"
|
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
)
|
)
|
||||||
|
@ -61,47 +63,52 @@ func TestModelfileBuilder(t *testing.T) {
|
||||||
{Role: "user", Content: "Hey there hork!"},
|
{Role: "user", Content: "Hey there hork!"},
|
||||||
{Role: "assistant", Content: "Yes it is true, I am half horse, half shark."},
|
{Role: "assistant", Content: "Yes it is true, I am half horse, half shark."},
|
||||||
},
|
},
|
||||||
Options: map[string]any{
|
Options: map[string]interface{}{},
|
||||||
"temperature": 0.9,
|
|
||||||
"seed": 42,
|
|
||||||
"penalize_newline": false,
|
|
||||||
"stop": []string{"hi", "there"},
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
t.Run("model", func(t *testing.T) {
|
opts.Options["temperature"] = 0.9
|
||||||
expect := `FROM hork
|
opts.Options["seed"] = 42
|
||||||
SYSTEM You are part horse and part shark, but all hork. Do horklike things
|
opts.Options["penalize_newline"] = false
|
||||||
|
opts.Options["stop"] = []string{"hi", "there"}
|
||||||
|
|
||||||
|
mf := buildModelfile(opts)
|
||||||
|
expectedModelfile := `FROM {{.Model}}
|
||||||
|
SYSTEM """{{.System}}"""
|
||||||
PARAMETER penalize_newline false
|
PARAMETER penalize_newline false
|
||||||
PARAMETER seed 42
|
PARAMETER seed 42
|
||||||
PARAMETER stop hi
|
PARAMETER stop [hi there]
|
||||||
PARAMETER stop there
|
|
||||||
PARAMETER temperature 0.9
|
PARAMETER temperature 0.9
|
||||||
MESSAGE user Hey there hork!
|
|
||||||
MESSAGE assistant Yes it is true, I am half horse, half shark.
|
MESSAGE user """Hey there hork!"""
|
||||||
|
MESSAGE assistant """Yes it is true, I am half horse, half shark."""
|
||||||
`
|
`
|
||||||
|
|
||||||
actual := buildModelfile(opts)
|
tmpl, err := template.New("").Parse(expectedModelfile)
|
||||||
if diff := cmp.Diff(expect, actual); diff != "" {
|
require.NoError(t, err)
|
||||||
t.Errorf("mismatch (-want +got):\n%s", diff)
|
|
||||||
}
|
var buf bytes.Buffer
|
||||||
})
|
err = tmpl.Execute(&buf, opts)
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.Equal(t, buf.String(), mf)
|
||||||
|
|
||||||
t.Run("parent model", func(t *testing.T) {
|
|
||||||
opts.ParentModel = "horseshark"
|
opts.ParentModel = "horseshark"
|
||||||
expect := `FROM horseshark
|
mf = buildModelfile(opts)
|
||||||
SYSTEM You are part horse and part shark, but all hork. Do horklike things
|
expectedModelfile = `FROM {{.ParentModel}}
|
||||||
|
SYSTEM """{{.System}}"""
|
||||||
PARAMETER penalize_newline false
|
PARAMETER penalize_newline false
|
||||||
PARAMETER seed 42
|
PARAMETER seed 42
|
||||||
PARAMETER stop hi
|
PARAMETER stop [hi there]
|
||||||
PARAMETER stop there
|
|
||||||
PARAMETER temperature 0.9
|
PARAMETER temperature 0.9
|
||||||
MESSAGE user Hey there hork!
|
|
||||||
MESSAGE assistant Yes it is true, I am half horse, half shark.
|
MESSAGE user """Hey there hork!"""
|
||||||
|
MESSAGE assistant """Yes it is true, I am half horse, half shark."""
|
||||||
`
|
`
|
||||||
actual := buildModelfile(opts)
|
|
||||||
if diff := cmp.Diff(expect, actual); diff != "" {
|
tmpl, err = template.New("").Parse(expectedModelfile)
|
||||||
t.Errorf("mismatch (-want +got):\n%s", diff)
|
require.NoError(t, err)
|
||||||
}
|
|
||||||
})
|
var parentBuf bytes.Buffer
|
||||||
|
err = tmpl.Execute(&parentBuf, opts)
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.Equal(t, parentBuf.String(), mf)
|
||||||
}
|
}
|
||||||
|
|
|
@ -71,11 +71,6 @@ func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error {
|
||||||
"tokenizer.ggml.unknown_token_id": uint32(0),
|
"tokenizer.ggml.unknown_token_id": uint32(0),
|
||||||
}
|
}
|
||||||
|
|
||||||
if m.Params.HeadDimension > 0 {
|
|
||||||
kv["llama.attention.key_length"] = uint32(m.Params.HeadDimension)
|
|
||||||
kv["llama.attention.value_length"] = uint32(m.Params.HeadDimension)
|
|
||||||
}
|
|
||||||
|
|
||||||
return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
|
return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
204
docs/api.md
204
docs/api.md
|
@ -40,7 +40,6 @@ Generate a response for a given prompt with a provided model. This is a streamin
|
||||||
|
|
||||||
- `model`: (required) the [model name](#model-names)
|
- `model`: (required) the [model name](#model-names)
|
||||||
- `prompt`: the prompt to generate a response for
|
- `prompt`: the prompt to generate a response for
|
||||||
- `suffix`: the text after the model response
|
|
||||||
- `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)
|
- `images`: (optional) a list of base64-encoded images (for multimodal models such as `llava`)
|
||||||
|
|
||||||
Advanced parameters (optional):
|
Advanced parameters (optional):
|
||||||
|
@ -58,8 +57,7 @@ Advanced parameters (optional):
|
||||||
|
|
||||||
Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below.
|
Enable JSON mode by setting the `format` parameter to `json`. This will structure the response as a valid JSON object. See the JSON mode [example](#request-json-mode) below.
|
||||||
|
|
||||||
> [!IMPORTANT]
|
> Note: it's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts whitespace.
|
||||||
> It's important to instruct the model to use JSON in the `prompt`. Otherwise, the model may generate large amounts whitespace.
|
|
||||||
|
|
||||||
### Examples
|
### Examples
|
||||||
|
|
||||||
|
@ -150,44 +148,8 @@ If `stream` is set to `false`, the response will be a single JSON object:
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Request (with suffix)
|
|
||||||
|
|
||||||
##### Request
|
|
||||||
|
|
||||||
```shell
|
|
||||||
curl http://localhost:11434/api/generate -d '{
|
|
||||||
"model": "codellama:code",
|
|
||||||
"prompt": "def compute_gcd(a, b):",
|
|
||||||
"suffix": " return result",
|
|
||||||
"options": {
|
|
||||||
"temperature": 0
|
|
||||||
},
|
|
||||||
"stream": false
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Response
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"model": "codellama:code",
|
|
||||||
"created_at": "2024-07-22T20:47:51.147561Z",
|
|
||||||
"response": "\n if a == 0:\n return b\n else:\n return compute_gcd(b % a, a)\n\ndef compute_lcm(a, b):\n result = (a * b) / compute_gcd(a, b)\n",
|
|
||||||
"done": true,
|
|
||||||
"done_reason": "stop",
|
|
||||||
"context": [...],
|
|
||||||
"total_duration": 1162761250,
|
|
||||||
"load_duration": 6683708,
|
|
||||||
"prompt_eval_count": 17,
|
|
||||||
"prompt_eval_duration": 201222000,
|
|
||||||
"eval_count": 63,
|
|
||||||
"eval_duration": 953997000
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Request (JSON mode)
|
#### Request (JSON mode)
|
||||||
|
|
||||||
> [!IMPORTANT]
|
|
||||||
> When `format` is set to `json`, the output will always be a well-formed JSON object. It's important to also instruct the model to respond in JSON.
|
> When `format` is set to `json`, the output will always be a well-formed JSON object. It's important to also instruct the model to respond in JSON.
|
||||||
|
|
||||||
##### Request
|
##### Request
|
||||||
|
@ -336,7 +298,6 @@ curl http://localhost:11434/api/generate -d '{
|
||||||
"num_predict": 100,
|
"num_predict": 100,
|
||||||
"top_k": 20,
|
"top_k": 20,
|
||||||
"top_p": 0.9,
|
"top_p": 0.9,
|
||||||
"min_p": 0.0,
|
|
||||||
"tfs_z": 0.5,
|
"tfs_z": 0.5,
|
||||||
"typical_p": 0.7,
|
"typical_p": 0.7,
|
||||||
"repeat_last_n": 33,
|
"repeat_last_n": 33,
|
||||||
|
@ -419,14 +380,12 @@ Generate the next message in a chat with a provided model. This is a streaming e
|
||||||
|
|
||||||
- `model`: (required) the [model name](#model-names)
|
- `model`: (required) the [model name](#model-names)
|
||||||
- `messages`: the messages of the chat, this can be used to keep a chat memory
|
- `messages`: the messages of the chat, this can be used to keep a chat memory
|
||||||
- `tools`: tools for the model to use if supported. Requires `stream` to be set to `false`
|
|
||||||
|
|
||||||
The `message` object has the following fields:
|
The `message` object has the following fields:
|
||||||
|
|
||||||
- `role`: the role of the message, either `system`, `user`, `assistant`, or `tool`
|
- `role`: the role of the message, either `system`, `user` or `assistant`
|
||||||
- `content`: the content of the message
|
- `content`: the content of the message
|
||||||
- `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
|
- `images` (optional): a list of images to include in the message (for multimodal models such as `llava`)
|
||||||
- `tool_calls` (optional): a list of tools the model wants to use
|
|
||||||
|
|
||||||
Advanced parameters (optional):
|
Advanced parameters (optional):
|
||||||
|
|
||||||
|
@ -587,7 +546,7 @@ Final response:
|
||||||
|
|
||||||
##### Request
|
##### Request
|
||||||
|
|
||||||
Send a chat message with images. The images should be provided as an array, with the individual images encoded in Base64.
|
Send a chat message with a conversation history.
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/chat -d '{
|
curl http://localhost:11434/api/chat -d '{
|
||||||
|
@ -663,79 +622,6 @@ curl http://localhost:11434/api/chat -d '{
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Chat request (with tools)
|
|
||||||
|
|
||||||
##### Request
|
|
||||||
|
|
||||||
```
|
|
||||||
curl http://localhost:11434/api/chat -d '{
|
|
||||||
"model": "mistral",
|
|
||||||
"messages": [
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": "What is the weather today in Paris?"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"stream": false,
|
|
||||||
"tools": [
|
|
||||||
{
|
|
||||||
"type": "function",
|
|
||||||
"function": {
|
|
||||||
"name": "get_current_weather",
|
|
||||||
"description": "Get the current weather for a location",
|
|
||||||
"parameters": {
|
|
||||||
"type": "object",
|
|
||||||
"properties": {
|
|
||||||
"location": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "The location to get the weather for, e.g. San Francisco, CA"
|
|
||||||
},
|
|
||||||
"format": {
|
|
||||||
"type": "string",
|
|
||||||
"description": "The format to return the weather in, e.g. 'celsius' or 'fahrenheit'",
|
|
||||||
"enum": ["celsius", "fahrenheit"]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"required": ["location", "format"]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Response
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"model": "mistral:7b-instruct-v0.3-q4_K_M",
|
|
||||||
"created_at": "2024-07-22T20:33:28.123648Z",
|
|
||||||
"message": {
|
|
||||||
"role": "assistant",
|
|
||||||
"content": "",
|
|
||||||
"tool_calls": [
|
|
||||||
{
|
|
||||||
"function": {
|
|
||||||
"name": "get_current_weather",
|
|
||||||
"arguments": {
|
|
||||||
"format": "celsius",
|
|
||||||
"location": "Paris, FR"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"done_reason": "stop",
|
|
||||||
"done": true,
|
|
||||||
"total_duration": 885095291,
|
|
||||||
"load_duration": 3753500,
|
|
||||||
"prompt_eval_count": 122,
|
|
||||||
"prompt_eval_duration": 328493000,
|
|
||||||
"eval_count": 33,
|
|
||||||
"eval_duration": 552222000
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Create a Model
|
## Create a Model
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
|
@ -1140,7 +1026,7 @@ If `stream` is set to `false`, then the response is a single JSON object:
|
||||||
## Generate Embeddings
|
## Generate Embeddings
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
POST /api/embed
|
POST /api/embeddings
|
||||||
```
|
```
|
||||||
|
|
||||||
Generate embeddings from a model
|
Generate embeddings from a model
|
||||||
|
@ -1148,11 +1034,10 @@ Generate embeddings from a model
|
||||||
### Parameters
|
### Parameters
|
||||||
|
|
||||||
- `model`: name of model to generate embeddings from
|
- `model`: name of model to generate embeddings from
|
||||||
- `input`: text or list of text to generate embeddings for
|
- `prompt`: text to generate embeddings for
|
||||||
|
|
||||||
Advanced parameters:
|
Advanced parameters:
|
||||||
|
|
||||||
- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
|
|
||||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||||
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
|
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
|
||||||
|
|
||||||
|
@ -1161,9 +1046,9 @@ Advanced parameters:
|
||||||
#### Request
|
#### Request
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/embed -d '{
|
curl http://localhost:11434/api/embeddings -d '{
|
||||||
"model": "all-minilm",
|
"model": "all-minilm",
|
||||||
"input": "Why is the sky blue?"
|
"prompt": "Here is an article about llamas..."
|
||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -1171,35 +1056,10 @@ curl http://localhost:11434/api/embed -d '{
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"model": "all-minilm",
|
"embedding": [
|
||||||
"embeddings": [[
|
0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313,
|
||||||
0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814,
|
0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281
|
||||||
0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348
|
]
|
||||||
]]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Request (Multiple input)
|
|
||||||
|
|
||||||
```shell
|
|
||||||
curl http://localhost:11434/api/embed -d '{
|
|
||||||
"model": "all-minilm",
|
|
||||||
"input": ["Why is the sky blue?", "Why is the grass green?"]
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Response
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"model": "all-minilm",
|
|
||||||
"embeddings": [[
|
|
||||||
0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814,
|
|
||||||
0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348
|
|
||||||
],[
|
|
||||||
-0.0098027075, 0.06042469, 0.025257962, -0.006364387, 0.07272725,
|
|
||||||
0.017194884, 0.09032035, -0.051705178, 0.09951512, 0.09072481
|
|
||||||
]]
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -1246,45 +1106,3 @@ A single JSON object will be returned.
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
## Generate Embedding
|
|
||||||
|
|
||||||
> Note: this endpoint has been superseded by `/api/embed`
|
|
||||||
|
|
||||||
```shell
|
|
||||||
POST /api/embeddings
|
|
||||||
```
|
|
||||||
|
|
||||||
Generate embeddings from a model
|
|
||||||
|
|
||||||
### Parameters
|
|
||||||
|
|
||||||
- `model`: name of model to generate embeddings from
|
|
||||||
- `prompt`: text to generate embeddings for
|
|
||||||
|
|
||||||
Advanced parameters:
|
|
||||||
|
|
||||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
|
||||||
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
|
|
||||||
|
|
||||||
### Examples
|
|
||||||
|
|
||||||
#### Request
|
|
||||||
|
|
||||||
```shell
|
|
||||||
curl http://localhost:11434/api/embeddings -d '{
|
|
||||||
"model": "all-minilm",
|
|
||||||
"prompt": "Here is an article about llamas..."
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Response
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"embedding": [
|
|
||||||
0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313,
|
|
||||||
0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281
|
|
||||||
]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
|
@ -63,7 +63,7 @@ docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 114
|
||||||
Now you can run a model:
|
Now you can run a model:
|
||||||
|
|
||||||
```
|
```
|
||||||
docker exec -it ollama ollama run llama3.1
|
docker exec -it ollama ollama run llama3
|
||||||
```
|
```
|
||||||
|
|
||||||
### Try different models
|
### Try different models
|
||||||
|
|
|
@ -227,7 +227,7 @@ curl http://localhost:11434/api/chat -d '{"model": "mistral"}'
|
||||||
|
|
||||||
To preload a model using the CLI, use the command:
|
To preload a model using the CLI, use the command:
|
||||||
```shell
|
```shell
|
||||||
ollama run llama3.1 ""
|
ollama run llama3 ""
|
||||||
```
|
```
|
||||||
|
|
||||||
## How do I keep a model loaded in memory or make it unload immediately?
|
## How do I keep a model loaded in memory or make it unload immediately?
|
||||||
|
@ -273,7 +273,3 @@ The following server settings may be used to adjust how Ollama handles concurren
|
||||||
- `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512
|
- `OLLAMA_MAX_QUEUE` - The maximum number of requests Ollama will queue when busy before rejecting additional requests. The default is 512
|
||||||
|
|
||||||
Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs' VRAM.
|
Note: Windows with Radeon GPUs currently default to 1 model maximum due to limitations in ROCm v5.7 for available VRAM reporting. Once ROCm v6.2 is available, Windows Radeon will follow the defaults above. You may enable concurrent model loads on Radeon on Windows, but ensure you don't load more models than will fit into your GPUs' VRAM.
|
||||||
|
|
||||||
## How does Ollama load models on multiple GPUs?
|
|
||||||
|
|
||||||
Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
|
|
|
@ -1,7 +1,6 @@
|
||||||
# Ollama Model File
|
# Ollama Model File
|
||||||
|
|
||||||
> [!NOTE]
|
> Note: `Modelfile` syntax is in development
|
||||||
> `Modelfile` syntax is in development
|
|
||||||
|
|
||||||
A model file is the blueprint to create and share models with Ollama.
|
A model file is the blueprint to create and share models with Ollama.
|
||||||
|
|
||||||
|
@ -141,7 +140,6 @@ PARAMETER <parameter> <parametervalue>
|
||||||
| num_predict | Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context) | int | num_predict 42 |
|
| num_predict | Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context) | int | num_predict 42 |
|
||||||
| top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 |
|
| top_k | Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40) | int | top_k 40 |
|
||||||
| top_p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) | float | top_p 0.9 |
|
| top_p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9) | float | top_p 0.9 |
|
||||||
| min_p | Alternative to the top_p, and aims to ensure a balance of quality and variety. The parameter *p* represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with *p*=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. (Default: 0.0) | float | min_p 0.05 |
|
|
||||||
|
|
||||||
### TEMPLATE
|
### TEMPLATE
|
||||||
|
|
||||||
|
|
|
@ -78,8 +78,8 @@ curl http://localhost:11434/v1/chat/completions \
|
||||||
- [x] Streaming
|
- [x] Streaming
|
||||||
- [x] JSON mode
|
- [x] JSON mode
|
||||||
- [x] Reproducible outputs
|
- [x] Reproducible outputs
|
||||||
- [x] Tools (streaming support coming soon)
|
|
||||||
- [ ] Vision
|
- [ ] Vision
|
||||||
|
- [ ] Function calling
|
||||||
- [ ] Logprobs
|
- [ ] Logprobs
|
||||||
|
|
||||||
#### Supported request fields
|
#### Supported request fields
|
||||||
|
@ -97,9 +97,9 @@ curl http://localhost:11434/v1/chat/completions \
|
||||||
- [x] `temperature`
|
- [x] `temperature`
|
||||||
- [x] `top_p`
|
- [x] `top_p`
|
||||||
- [x] `max_tokens`
|
- [x] `max_tokens`
|
||||||
- [x] `tools`
|
|
||||||
- [ ] `tool_choice`
|
|
||||||
- [ ] `logit_bias`
|
- [ ] `logit_bias`
|
||||||
|
- [ ] `tools`
|
||||||
|
- [ ] `tool_choice`
|
||||||
- [ ] `user`
|
- [ ] `user`
|
||||||
- [ ] `n`
|
- [ ] `n`
|
||||||
|
|
||||||
|
|
173
docs/template.md
173
docs/template.md
|
@ -1,173 +0,0 @@
|
||||||
# Template
|
|
||||||
|
|
||||||
Ollama provides a powerful templating engine backed by Go's built-in templating engine to construct prompts for your large language model. This feature is a valuable tool to get the most out of your models.
|
|
||||||
|
|
||||||
## Basic Template Structure
|
|
||||||
|
|
||||||
A basic Go template consists of three main parts:
|
|
||||||
|
|
||||||
* **Layout**: The overall structure of the template.
|
|
||||||
* **Variables**: Placeholders for dynamic data that will be replaced with actual values when the template is rendered.
|
|
||||||
* **Functions**: Custom functions or logic that can be used to manipulate the template's content.
|
|
||||||
|
|
||||||
Here's an example of a simple chat template:
|
|
||||||
|
|
||||||
```gotmpl
|
|
||||||
{{- range .Messages }}
|
|
||||||
{{ .Role }}: {{ .Content }}
|
|
||||||
{{- end }}
|
|
||||||
```
|
|
||||||
|
|
||||||
In this example, we have:
|
|
||||||
|
|
||||||
* A basic messages structure (layout)
|
|
||||||
* Three variables: `Messages`, `Role`, and `Content` (variables)
|
|
||||||
* A custom function (action) that iterates over an array of items (`range .Messages`) and displays each item
|
|
||||||
|
|
||||||
## Adding templates to your model
|
|
||||||
|
|
||||||
By default, models imported into Ollama have a default template of `{{ .Prompt }}`, i.e. user inputs are sent verbatim to the LLM. This is appropriate for text or code completion models but lacks essential markers for chat or instruction models.
|
|
||||||
|
|
||||||
Omitting a template in these models puts the responsibility of correctly templating input onto the user. Adding a template allows users to easily get the best results from the model.
|
|
||||||
|
|
||||||
To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3.
|
|
||||||
|
|
||||||
```dockerfile
|
|
||||||
FROM llama3
|
|
||||||
|
|
||||||
TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|>
|
|
||||||
|
|
||||||
{{ .System }}<|eot_id|>
|
|
||||||
{{- end }}
|
|
||||||
{{- range .Messages }}<|start_header_id|>{{ .Role }}<|end_header_id|>
|
|
||||||
|
|
||||||
{{ .Content }}<|eot_id|>
|
|
||||||
{{- end }}<|start_header_id|>assistant<|end_header_id|>
|
|
||||||
|
|
||||||
"""
|
|
||||||
```
|
|
||||||
|
|
||||||
## Variables
|
|
||||||
|
|
||||||
`System` (string): system prompt
|
|
||||||
|
|
||||||
`Prompt` (string): user prompt
|
|
||||||
|
|
||||||
`Response` (string): assistant response
|
|
||||||
|
|
||||||
`Suffix` (string): text inserted after the assistant's response
|
|
||||||
|
|
||||||
`Messages` (list): list of messages
|
|
||||||
|
|
||||||
`Messages[].Role` (string): role which can be one of `system`, `user`, `assistant`, or `tool`
|
|
||||||
|
|
||||||
`Messages[].Content` (string): message content
|
|
||||||
|
|
||||||
`Messages[].ToolCalls` (list): list of tools the model wants to call
|
|
||||||
|
|
||||||
`Messages[].ToolCalls[].Function` (object): function to call
|
|
||||||
|
|
||||||
`Messages[].ToolCalls[].Function.Name` (string): function name
|
|
||||||
|
|
||||||
`Messages[].ToolCalls[].Function.Arguments` (map): mapping of argument name to argument value
|
|
||||||
|
|
||||||
`Tools` (list): list of tools the model can access
|
|
||||||
|
|
||||||
`Tools[].Type` (string): schema type. `type` is always `function`
|
|
||||||
|
|
||||||
`Tools[].Function` (object): function definition
|
|
||||||
|
|
||||||
`Tools[].Function.Name` (string): function name
|
|
||||||
|
|
||||||
`Tools[].Function.Description` (string): function description
|
|
||||||
|
|
||||||
`Tools[].Function.Parameters` (object): function parameters
|
|
||||||
|
|
||||||
`Tools[].Function.Parameters.Type` (string): schema type. `type` is always `object`
|
|
||||||
|
|
||||||
`Tools[].Function.Parameters.Required` (list): list of required properties
|
|
||||||
|
|
||||||
`Tools[].Function.Parameters.Properties` (map): mapping of property name to property definition
|
|
||||||
|
|
||||||
`Tools[].Function.Parameters.Properties[].Type` (string): property type
|
|
||||||
|
|
||||||
`Tools[].Function.Parameters.Properties[].Description` (string): property description
|
|
||||||
|
|
||||||
`Tools[].Function.Parameters.Properties[].Enum` (list): list of valid values
|
|
||||||
|
|
||||||
## Tips and Best Practices
|
|
||||||
|
|
||||||
Keep the following tips and best practices in mind when working with Go templates:
|
|
||||||
|
|
||||||
* **Be mindful of dot**: Control flow structures like `range` and `with` change the value of `.`
|
|
||||||
* **Out-of-scope variables**: Use `$.` to reference variables not currently in scope, starting from the root
|
|
||||||
* **Whitespace control**: Use `-` to trim leading (`{{-`) and trailing (`-}}`) whitespace
|
|
||||||
|
|
||||||
## Examples
|
|
||||||
|
|
||||||
### Example Messages
|
|
||||||
|
|
||||||
#### ChatML
|
|
||||||
|
|
||||||
ChatML is a popular template format. It can be used for models such as Databricks' DBRX, Intel's Neural Chat, and Microsoft's Orca 2.
|
|
||||||
|
|
||||||
```gotmpl
|
|
||||||
{{- if .System }}<|im_start|>system
|
|
||||||
{{ .System }}<|im_end|>
|
|
||||||
{{ end }}
|
|
||||||
{{- range .Messages }}<|im_start|>{{ .Role }}
|
|
||||||
{{ .Content }}<|im_end|>
|
|
||||||
{{ end }}<|im_start|>assistant
|
|
||||||
{{ else }}
|
|
||||||
{{ if .System }}<|im_start|>system
|
|
||||||
{{ .System }}<|im_end|>
|
|
||||||
```
|
|
||||||
|
|
||||||
### Example Tools
|
|
||||||
|
|
||||||
Tools support can be added to a model by adding a `{{ .Tools }}` node to the template. This feature is useful for models trained to call external tools and can be a powerful tool for retrieving real-time data or performing complex tasks.
|
|
||||||
|
|
||||||
#### Mistral
|
|
||||||
|
|
||||||
Mistral v0.3 and Mixtral 8x22B support tool calling.
|
|
||||||
|
|
||||||
```gotmpl
|
|
||||||
{{- range $index, $_ := .Messages }}
|
|
||||||
{{- if eq .Role "user" }}
|
|
||||||
{{- if and (le (len (slice $.Messages $index)) 2) $.Tools }}[AVAILABLE_TOOLS] {{ json $.Tools }}[/AVAILABLE_TOOLS]
|
|
||||||
{{- end }}[INST] {{ if and (eq (len (slice $.Messages $index)) 1) $.System }}{{ $.System }}
|
|
||||||
|
|
||||||
{{ end }}{{ .Content }}[/INST]
|
|
||||||
{{- else if eq .Role "assistant" }}
|
|
||||||
{{- if .Content }} {{ .Content }}</s>
|
|
||||||
{{- else if .ToolCalls }}[TOOL_CALLS] [
|
|
||||||
{{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ json .Function.Arguments }}}
|
|
||||||
{{- end }}]</s>
|
|
||||||
{{- end }}
|
|
||||||
{{- else if eq .Role "tool" }}[TOOL_RESULTS] {"content": {{ .Content }}}[/TOOL_RESULTS]
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Example Fill-in-Middle
|
|
||||||
|
|
||||||
Fill-in-middle support can be added to a model by adding a `{{ .Suffix }}` node to the template. This feature is useful for models that are trained to generate text in the middle of user input, such as code completion models.
|
|
||||||
|
|
||||||
#### CodeLlama
|
|
||||||
|
|
||||||
CodeLlama [7B](https://ollama.com/library/codellama:7b-code) and [13B](https://ollama.com/library/codellama:13b-code) code completion models support fill-in-middle.
|
|
||||||
|
|
||||||
```gotmpl
|
|
||||||
<PRE> {{ .Prompt }} <SUF>{{ .Suffix }} <MID>
|
|
||||||
```
|
|
||||||
|
|
||||||
> [!NOTE]
|
|
||||||
> CodeLlama 34B and 70B code completion and all instruct and Python fine-tuned models do not support fill-in-middle.
|
|
||||||
|
|
||||||
#### Codestral
|
|
||||||
|
|
||||||
Codestral [22B](https://ollama.com/library/codestral:22b) supports fill-in-middle.
|
|
||||||
|
|
||||||
```gotmpl
|
|
||||||
[SUFFIX]{{ .Suffix }}[PREFIX] {{ .Prompt }}
|
|
||||||
```
|
|
|
@ -15,7 +15,7 @@ import { Ollama } from "@langchain/community/llms/ollama";
|
||||||
|
|
||||||
const ollama = new Ollama({
|
const ollama = new Ollama({
|
||||||
baseUrl: "http://localhost:11434",
|
baseUrl: "http://localhost:11434",
|
||||||
model: "llama3.1",
|
model: "llama3",
|
||||||
});
|
});
|
||||||
|
|
||||||
const answer = await ollama.invoke(`why is the sky blue?`);
|
const answer = await ollama.invoke(`why is the sky blue?`);
|
||||||
|
@ -23,7 +23,7 @@ const answer = await ollama.invoke(`why is the sky blue?`);
|
||||||
console.log(answer);
|
console.log(answer);
|
||||||
```
|
```
|
||||||
|
|
||||||
That will get us the same thing as if we ran `ollama run llama3.1 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
|
That will get us the same thing as if we ran `ollama run llama3 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
npm install cheerio
|
npm install cheerio
|
||||||
|
|
|
@ -23,8 +23,6 @@ Logs will often be helpful in diagnosing the problem (see
|
||||||
* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
|
* NVIDIA 452.39 or newer Drivers if you have an NVIDIA card
|
||||||
* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
|
* AMD Radeon Driver https://www.amd.com/en/support if you have a Radeon card
|
||||||
|
|
||||||
Ollama uses Unicode characters for progress indication, which may render as unknown squares in some older terminal fonts in Windows 10. If you see this, try changing your terminal font settings.
|
|
||||||
|
|
||||||
## API Access
|
## API Access
|
||||||
|
|
||||||
Here's a quick example showing API access from `powershell`
|
Here's a quick example showing API access from `powershell`
|
||||||
|
|
|
@ -43,6 +43,8 @@ var (
|
||||||
MaxRunners int
|
MaxRunners int
|
||||||
// Set via OLLAMA_MAX_QUEUE in the environment
|
// Set via OLLAMA_MAX_QUEUE in the environment
|
||||||
MaxQueuedRequests int
|
MaxQueuedRequests int
|
||||||
|
// Set via OLLAMA_MAX_VRAM in the environment
|
||||||
|
MaxVRAM uint64
|
||||||
// Set via OLLAMA_MODELS in the environment
|
// Set via OLLAMA_MODELS in the environment
|
||||||
ModelsDir string
|
ModelsDir string
|
||||||
// Set via OLLAMA_NOHISTORY in the environment
|
// Set via OLLAMA_NOHISTORY in the environment
|
||||||
|
@ -87,6 +89,7 @@ func AsMap() map[string]EnvVar {
|
||||||
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
|
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
|
||||||
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
|
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
|
||||||
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
|
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
|
||||||
|
"OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"},
|
||||||
"OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
|
"OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
|
||||||
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
|
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
|
||||||
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
|
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
|
||||||
|
@ -191,6 +194,16 @@ func LoadConfig() {
|
||||||
|
|
||||||
TmpDir = clean("OLLAMA_TMPDIR")
|
TmpDir = clean("OLLAMA_TMPDIR")
|
||||||
|
|
||||||
|
userLimit := clean("OLLAMA_MAX_VRAM")
|
||||||
|
if userLimit != "" {
|
||||||
|
avail, err := strconv.ParseUint(userLimit, 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
|
||||||
|
} else {
|
||||||
|
MaxVRAM = avail
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
LLMLibrary = clean("OLLAMA_LLM_LIBRARY")
|
LLMLibrary = clean("OLLAMA_LLM_LIBRARY")
|
||||||
|
|
||||||
if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
|
if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
|
||||||
|
|
|
@ -35,7 +35,7 @@ func main() {
|
||||||
|
|
||||||
ctx := context.Background()
|
ctx := context.Background()
|
||||||
req := &api.ChatRequest{
|
req := &api.ChatRequest{
|
||||||
Model: "llama3.1",
|
Model: "llama3",
|
||||||
Messages: messages,
|
Messages: messages,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -16,7 +16,7 @@ func main() {
|
||||||
|
|
||||||
// By default, GenerateRequest is streaming.
|
// By default, GenerateRequest is streaming.
|
||||||
req := &api.GenerateRequest{
|
req := &api.GenerateRequest{
|
||||||
Model: "gemma2",
|
Model: "gemma",
|
||||||
Prompt: "how many planets are there?",
|
Prompt: "how many planets are there?",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,7 @@ func main() {
|
||||||
}
|
}
|
||||||
|
|
||||||
req := &api.GenerateRequest{
|
req := &api.GenerateRequest{
|
||||||
Model: "gemma2",
|
Model: "gemma",
|
||||||
Prompt: "how many planets are there?",
|
Prompt: "how many planets are there?",
|
||||||
|
|
||||||
// set streaming to false
|
// set streaming to false
|
||||||
|
|
0
examples/go-http-generate/README.md
Normal file
0
examples/go-http-generate/README.md
Normal file
|
@ -4,14 +4,6 @@ This example provides an interface for asking questions to a PDF document.
|
||||||
|
|
||||||
## Setup
|
## Setup
|
||||||
|
|
||||||
1. Ensure you have the `llama3.1` model installed:
|
|
||||||
|
|
||||||
```
|
|
||||||
ollama pull llama3.1
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Install the Python Requirements.
|
|
||||||
|
|
||||||
```
|
```
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
|
@ -51,7 +51,7 @@ while True:
|
||||||
template=template,
|
template=template,
|
||||||
)
|
)
|
||||||
|
|
||||||
llm = Ollama(model="llama3.1", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
|
llm = Ollama(model="llama3:8b", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
|
||||||
qa_chain = RetrievalQA.from_chain_type(
|
qa_chain = RetrievalQA.from_chain_type(
|
||||||
llm,
|
llm,
|
||||||
retriever=vectorstore.as_retriever(),
|
retriever=vectorstore.as_retriever(),
|
||||||
|
|
|
@ -4,10 +4,10 @@ This example summarizes the website, [https://ollama.com/blog/run-llama2-uncenso
|
||||||
|
|
||||||
## Running the Example
|
## Running the Example
|
||||||
|
|
||||||
1. Ensure you have the `llama3.1` model installed:
|
1. Ensure you have the `llama2` model installed:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
ollama pull llama3.1
|
ollama pull llama2
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Install the Python Requirements.
|
2. Install the Python Requirements.
|
||||||
|
|
|
@ -5,7 +5,7 @@ from langchain.chains.summarize import load_summarize_chain
|
||||||
loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
|
loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally")
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
|
|
||||||
llm = Ollama(model="llama3.1")
|
llm = Ollama(model="llama3")
|
||||||
chain = load_summarize_chain(llm, chain_type="stuff")
|
chain = load_summarize_chain(llm, chain_type="stuff")
|
||||||
|
|
||||||
result = chain.invoke(docs)
|
result = chain.invoke(docs)
|
||||||
|
|
|
@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama.
|
||||||
|
|
||||||
## Running the Example
|
## Running the Example
|
||||||
|
|
||||||
1. Ensure you have the `llama3.1` model installed:
|
1. Ensure you have the `llama3` model installed:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
ollama pull llama3.1
|
ollama pull llama3
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Install the Python Requirements.
|
2. Install the Python Requirements.
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
from langchain.llms import Ollama
|
from langchain.llms import Ollama
|
||||||
|
|
||||||
input = input("What is your question?")
|
input = input("What is your question?")
|
||||||
llm = Ollama(model="llama3.1")
|
llm = Ollama(model="llama3")
|
||||||
res = llm.predict(input)
|
res = llm.predict(input)
|
||||||
print (res)
|
print (res)
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
FROM llama3.1
|
FROM llama3
|
||||||
PARAMETER temperature 1
|
PARAMETER temperature 1
|
||||||
SYSTEM """
|
SYSTEM """
|
||||||
You are Mario from super mario bros, acting as an assistant.
|
You are Mario from super mario bros, acting as an assistant.
|
||||||
|
|
|
@ -2,12 +2,12 @@
|
||||||
|
|
||||||
# Example character: Mario
|
# Example character: Mario
|
||||||
|
|
||||||
This example shows how to create a basic character using Llama3.1 as the base model.
|
This example shows how to create a basic character using Llama3 as the base model.
|
||||||
|
|
||||||
To run this example:
|
To run this example:
|
||||||
|
|
||||||
1. Download the Modelfile
|
1. Download the Modelfile
|
||||||
2. `ollama pull llama3.1` to get the base model used in the model file.
|
2. `ollama pull llama3` to get the base model used in the model file.
|
||||||
3. `ollama create NAME -f ./Modelfile`
|
3. `ollama create NAME -f ./Modelfile`
|
||||||
4. `ollama run NAME`
|
4. `ollama run NAME`
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?"
|
||||||
What the model file looks like:
|
What the model file looks like:
|
||||||
|
|
||||||
```
|
```
|
||||||
FROM llama3.1
|
FROM llama3
|
||||||
PARAMETER temperature 1
|
PARAMETER temperature 1
|
||||||
SYSTEM """
|
SYSTEM """
|
||||||
You are Mario from Super Mario Bros, acting as an assistant.
|
You are Mario from Super Mario Bros, acting as an assistant.
|
||||||
|
|
|
@ -4,7 +4,7 @@ imageName = input("Enter the name of the image: ")
|
||||||
client = docker.from_env()
|
client = docker.from_env()
|
||||||
s = requests.Session()
|
s = requests.Session()
|
||||||
output=""
|
output=""
|
||||||
with s.post('http://localhost:11434/api/generate', json={'model': 'mattw/dockerit', 'prompt': inputDescription}, stream=True) as r:
|
with s.post('http://localhost:11434/api/generate', json={'model': 'dockerit', 'prompt': inputDescription}, stream=True) as r:
|
||||||
for line in r.iter_lines():
|
for line in r.iter_lines():
|
||||||
if line:
|
if line:
|
||||||
j = json.loads(line)
|
j = json.loads(line)
|
||||||
|
|
|
@ -2,7 +2,7 @@ import requests
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
|
|
||||||
model = "llama3.1"
|
model = "llama3"
|
||||||
template = {
|
template = {
|
||||||
"firstName": "",
|
"firstName": "",
|
||||||
"lastName": "",
|
"lastName": "",
|
||||||
|
|
|
@ -12,7 +12,7 @@ countries = [
|
||||||
"France",
|
"France",
|
||||||
]
|
]
|
||||||
country = random.choice(countries)
|
country = random.choice(countries)
|
||||||
model = "llama3.1"
|
model = "llama3"
|
||||||
|
|
||||||
prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."
|
prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. Key names should have no backslashes, values should use plain ascii with no special characters."
|
||||||
|
|
||||||
|
|
|
@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran
|
||||||
|
|
||||||
## Running the Example
|
## Running the Example
|
||||||
|
|
||||||
1. Ensure you have the `llama3.1` model installed:
|
1. Ensure you have the `llama3` model installed:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
ollama pull llama3.1
|
ollama pull llama3
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Install the Python Requirements.
|
2. Install the Python Requirements.
|
||||||
|
|
|
@ -2,7 +2,7 @@ import json
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
|
# NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve`
|
||||||
model = "llama3.1" # TODO: update this for whatever model you wish to use
|
model = "llama3" # TODO: update this for whatever model you wish to use
|
||||||
|
|
||||||
|
|
||||||
def chat(messages):
|
def chat(messages):
|
||||||
|
|
|
@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam
|
||||||
|
|
||||||
## Running the Example
|
## Running the Example
|
||||||
|
|
||||||
1. Ensure you have the `llama3.1` model installed:
|
1. Ensure you have the `llama3` model installed:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
ollama pull llama3.1
|
ollama pull llama3
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Install the Python Requirements.
|
2. Install the Python Requirements.
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import * as readline from "readline";
|
import * as readline from "readline";
|
||||||
|
|
||||||
const model = "llama3.1";
|
const model = "llama3";
|
||||||
type Message = {
|
type Message = {
|
||||||
role: "assistant" | "user" | "system";
|
role: "assistant" | "user" | "system";
|
||||||
content: string;
|
content: string;
|
||||||
|
|
|
@ -10,7 +10,6 @@ import (
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
"slices"
|
"slices"
|
||||||
"sort"
|
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
@ -83,20 +82,6 @@ func AMDGetGPUInfo() []RocmGPUInfo {
|
||||||
// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
|
// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
|
||||||
// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
|
// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
|
||||||
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
|
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
|
||||||
sort.Slice(matches, func(i, j int) bool {
|
|
||||||
// /sys/class/kfd/kfd/topology/nodes/<number>/properties
|
|
||||||
a, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[i])), 10, 64)
|
|
||||||
if err != nil {
|
|
||||||
slog.Debug("parse err", "error", err, "match", matches[i])
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
b, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[j])), 10, 64)
|
|
||||||
if err != nil {
|
|
||||||
slog.Debug("parse err", "error", err, "match", matches[i])
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
return a < b
|
|
||||||
})
|
|
||||||
cpuCount := 0
|
cpuCount := 0
|
||||||
for _, match := range matches {
|
for _, match := range matches {
|
||||||
slog.Debug("evaluating amdgpu node " + match)
|
slog.Debug("evaluating amdgpu node " + match)
|
||||||
|
|
|
@ -69,7 +69,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
|
||||||
reqLimit := len(req)
|
reqLimit := len(req)
|
||||||
iterLimit := 5
|
iterLimit := 5
|
||||||
|
|
||||||
vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
|
vram := os.Getenv("OLLAMA_MAX_VRAM")
|
||||||
if vram != "" {
|
if vram != "" {
|
||||||
max, err := strconv.ParseUint(vram, 10, 64)
|
max, err := strconv.ParseUint(vram, 10, 64)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
@ -106,7 +106,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
|
||||||
|
|
||||||
// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
|
// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
|
||||||
func TestMultiModelStress(t *testing.T) {
|
func TestMultiModelStress(t *testing.T) {
|
||||||
vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
|
vram := os.Getenv("OLLAMA_MAX_VRAM")
|
||||||
if vram == "" {
|
if vram == "" {
|
||||||
t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
|
t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,45 +4,12 @@ package integration
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"math"
|
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/ollama/ollama/api"
|
"github.com/ollama/ollama/api"
|
||||||
)
|
)
|
||||||
|
|
||||||
func floatsEqual32(a, b float32) bool {
|
|
||||||
return math.Abs(float64(a-b)) <= 1e-4
|
|
||||||
}
|
|
||||||
|
|
||||||
func floatsEqual64(a, b float64) bool {
|
|
||||||
return math.Abs(a-b) <= 1e-4
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestAllMiniLMEmbeddings(t *testing.T) {
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
req := api.EmbeddingRequest{
|
|
||||||
Model: "all-minilm",
|
|
||||||
Prompt: "why is the sky blue?",
|
|
||||||
}
|
|
||||||
|
|
||||||
res, err := embeddingTestHelper(ctx, t, req)
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("error: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(res.Embedding) != 384 {
|
|
||||||
t.Fatalf("expected 384 floats, got %d", len(res.Embedding))
|
|
||||||
}
|
|
||||||
|
|
||||||
if !floatsEqual64(res.Embedding[0], 0.06642947345972061) {
|
|
||||||
t.Fatalf("expected 0.06642947345972061, got %.16f", res.Embedding[0])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestAllMiniLMEmbed(t *testing.T) {
|
func TestAllMiniLMEmbed(t *testing.T) {
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
@ -66,12 +33,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
|
||||||
t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
|
t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
|
||||||
}
|
}
|
||||||
|
|
||||||
if !floatsEqual32(res.Embeddings[0][0], 0.010071031) {
|
if res.Embeddings[0][0] != 0.010071031 {
|
||||||
t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
|
t.Fatalf("expected 0.010071031, got %f", res.Embeddings[0][0])
|
||||||
}
|
|
||||||
|
|
||||||
if res.PromptEvalCount != 8 {
|
|
||||||
t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -98,16 +61,12 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
|
||||||
t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
|
t.Fatalf("expected 384 floats, got %d", len(res.Embeddings[0]))
|
||||||
}
|
}
|
||||||
|
|
||||||
if !floatsEqual32(res.Embeddings[0][0], 0.010071031) || !floatsEqual32(res.Embeddings[1][0], -0.009802706) {
|
if res.Embeddings[0][0] != 0.010071031 || res.Embeddings[1][0] != -0.009802706 {
|
||||||
t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
|
t.Fatalf("expected 0.010071031 and -0.009802706, got %f and %f", res.Embeddings[0][0], res.Embeddings[1][0])
|
||||||
}
|
|
||||||
|
|
||||||
if res.PromptEvalCount != 16 {
|
|
||||||
t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
func TestAllMiniLmEmbedTruncate(t *testing.T) {
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
|
@ -176,22 +135,6 @@ func TestAllMiniLMEmbedTruncate(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func embeddingTestHelper(ctx context.Context, t *testing.T, req api.EmbeddingRequest) (*api.EmbeddingResponse, error) {
|
|
||||||
client, _, cleanup := InitServerConnection(ctx, t)
|
|
||||||
defer cleanup()
|
|
||||||
if err := PullIfMissing(ctx, client, req.Model); err != nil {
|
|
||||||
t.Fatalf("failed to pull model %s: %v", req.Model, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
response, err := client.Embeddings(ctx, &req)
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return response, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
|
func embedTestHelper(ctx context.Context, t *testing.T, req api.EmbedRequest) (*api.EmbedResponse, error) {
|
||||||
client, _, cleanup := InitServerConnection(ctx, t)
|
client, _, cleanup := InitServerConnection(ctx, t)
|
||||||
defer cleanup()
|
defer cleanup()
|
||||||
|
|
20
llm/ext_server/server.cpp
vendored
20
llm/ext_server/server.cpp
vendored
|
@ -41,7 +41,6 @@
|
||||||
|
|
||||||
#if defined(_WIN32)
|
#if defined(_WIN32)
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
#include <errhandlingapi.h>
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
|
@ -1221,7 +1220,6 @@ struct llama_server_context
|
||||||
res.result_json = json
|
res.result_json = json
|
||||||
{
|
{
|
||||||
{"embedding", std::vector<float>(embd, embd + n_embd)},
|
{"embedding", std::vector<float>(embd, embd + n_embd)},
|
||||||
{"timings", slot.get_formated_timings()},
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2439,6 +2437,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
|
||||||
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
|
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
|
||||||
params.use_mmap = false;
|
params.use_mmap = false;
|
||||||
}
|
}
|
||||||
|
else if (arg == "--lora-base")
|
||||||
|
{
|
||||||
|
if (++i >= argc)
|
||||||
|
{
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.lora_base = argv[i];
|
||||||
|
}
|
||||||
else if (arg == "-v" || arg == "--verbose")
|
else if (arg == "-v" || arg == "--verbose")
|
||||||
{
|
{
|
||||||
server_verbose = true;
|
server_verbose = true;
|
||||||
|
@ -2730,9 +2737,6 @@ int wmain(int argc, wchar_t **wargv) {
|
||||||
for (int i = 0; i < argc; ++i) {
|
for (int i = 0; i < argc; ++i) {
|
||||||
argv[i] = wchar_to_char(wargv[i]);
|
argv[i] = wchar_to_char(wargv[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Adjust error mode to avoid error dialog after we start.
|
|
||||||
SetErrorMode(SEM_FAILCRITICALERRORS);
|
|
||||||
#else
|
#else
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
#endif
|
#endif
|
||||||
|
@ -3204,15 +3208,11 @@ int main(int argc, char **argv) {
|
||||||
|
|
||||||
responses = result.result_json.value("results", std::vector<json>{result.result_json});
|
responses = result.result_json.value("results", std::vector<json>{result.result_json});
|
||||||
json embeddings = json::array();
|
json embeddings = json::array();
|
||||||
|
|
||||||
int prompt_n = 0;
|
|
||||||
for (auto & elem : responses) {
|
for (auto & elem : responses) {
|
||||||
embeddings.push_back(elem.at("embedding"));
|
embeddings.push_back(elem.at("embedding"));
|
||||||
prompt_n += elem.at("timings").at("prompt_n").get<int>();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// send the result
|
// send the result
|
||||||
json embedding_res = json{{"embedding", embeddings}, {"prompt_n", prompt_n}};
|
json embedding_res = json{{"embedding", embeddings}};
|
||||||
return res.set_content(embedding_res.dump(), "application/json; charset=utf-8");
|
return res.set_content(embedding_res.dump(), "application/json; charset=utf-8");
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
Subproject commit 6eeaeba126ff701f3e8f79f246805b7023709972
|
Subproject commit a8db2a9ce64cd4417f6a312ab61858f17f0f8584
|
|
@ -2,10 +2,7 @@ package llm
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"embed"
|
"embed"
|
||||||
"syscall"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
//go:embed build/darwin/x86_64/*/bin/*
|
//go:embed build/darwin/x86_64/*/bin/*
|
||||||
var libEmbed embed.FS
|
var libEmbed embed.FS
|
||||||
|
|
||||||
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
|
|
||||||
|
|
|
@ -2,10 +2,7 @@ package llm
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"embed"
|
"embed"
|
||||||
"syscall"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
//go:embed build/darwin/arm64/*/bin/*
|
//go:embed build/darwin/arm64/*/bin/*
|
||||||
var libEmbed embed.FS
|
var libEmbed embed.FS
|
||||||
|
|
||||||
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
|
|
||||||
|
|
|
@ -1,11 +1,6 @@
|
||||||
package llm
|
package llm
|
||||||
|
|
||||||
import (
|
import "embed"
|
||||||
"embed"
|
|
||||||
"syscall"
|
|
||||||
)
|
|
||||||
|
|
||||||
//go:embed build/linux/*/*/bin/*
|
//go:embed build/linux/*/*/bin/*
|
||||||
var libEmbed embed.FS
|
var libEmbed embed.FS
|
||||||
|
|
||||||
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
|
|
||||||
|
|
|
@ -1,20 +1,6 @@
|
||||||
package llm
|
package llm
|
||||||
|
|
||||||
import (
|
import "embed"
|
||||||
"embed"
|
|
||||||
"syscall"
|
|
||||||
)
|
|
||||||
|
|
||||||
// unused on windows
|
// unused on windows
|
||||||
var libEmbed embed.FS
|
var libEmbed embed.FS
|
||||||
|
|
||||||
const CREATE_DEFAULT_ERROR_MODE = 0x04000000
|
|
||||||
|
|
||||||
var LlamaServerSysProcAttr = &syscall.SysProcAttr{
|
|
||||||
// Wire up the default error handling logic If for some reason a DLL is
|
|
||||||
// missing in the path this will pop up a GUI Dialog explaining the fault so
|
|
||||||
// the user can either fix their PATH, or report a bug. Without this
|
|
||||||
// setting, the process exits immediately with a generic exit status but no
|
|
||||||
// way to (easily) figure out what the actual missing DLL was.
|
|
||||||
CreationFlags: CREATE_DEFAULT_ERROR_MODE,
|
|
||||||
}
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||||
index a207451f..2ddf431d 100644
|
index 2b9ace28..172640e2 100644
|
||||||
--- a/src/llama.cpp
|
--- a/src/llama.cpp
|
||||||
+++ b/src/llama.cpp
|
+++ b/src/llama.cpp
|
||||||
@@ -5347,16 +5347,7 @@ static void llm_load_vocab(
|
@@ -5357,16 +5357,7 @@ static void llm_load_vocab(
|
||||||
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
|
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
|
||||||
vocab.tokenizer_add_space_prefix = false;
|
vocab.tokenizer_add_space_prefix = false;
|
||||||
vocab.tokenizer_clean_spaces = true;
|
vocab.tokenizer_clean_spaces = true;
|
||||||
|
@ -20,9 +20,9 @@ index a207451f..2ddf431d 100644
|
||||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||||
} else if (
|
} else if (
|
||||||
tokenizer_pre == "llama3" ||
|
tokenizer_pre == "llama3" ||
|
||||||
@@ -5443,7 +5434,8 @@ static void llm_load_vocab(
|
@@ -5439,7 +5430,8 @@ static void llm_load_vocab(
|
||||||
tokenizer_pre == "codeshell") {
|
tokenizer_pre == "jais") {
|
||||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
|
||||||
} else {
|
} else {
|
||||||
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||||
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
|
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
|
||||||
|
|
13
llm/patches/06-qwen2.diff
Normal file
13
llm/patches/06-qwen2.diff
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||||
|
index 40d2ec2c..f34eb79a 100644
|
||||||
|
--- a/src/llama.cpp
|
||||||
|
+++ b/src/llama.cpp
|
||||||
|
@@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||||
|
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
||||||
|
cb(kq, "kq", il);
|
||||||
|
|
||||||
|
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
|
||||||
|
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
|
||||||
|
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
||||||
|
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
||||||
|
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
|
@ -1,358 +0,0 @@
|
||||||
diff --git a/common/common.cpp b/common/common.cpp
|
|
||||||
index dbb724fb..c26fe6ee 100644
|
|
||||||
--- a/common/common.cpp
|
|
||||||
+++ b/common/common.cpp
|
|
||||||
@@ -2087,14 +2087,27 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
|
||||||
for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
|
|
||||||
const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
|
|
||||||
float lora_scale = std::get<1>(params.lora_adapter[i]);
|
|
||||||
+
|
|
||||||
+ // try to load as gguf
|
|
||||||
auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
|
|
||||||
if (adapter == nullptr) {
|
|
||||||
- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
|
||||||
- llama_free(lctx);
|
|
||||||
- llama_free_model(model);
|
|
||||||
- return std::make_tuple(nullptr, nullptr);
|
|
||||||
+ fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__);
|
|
||||||
+
|
|
||||||
+ // if that fails, try loading as ggla for compatibility
|
|
||||||
+ int err = llama_model_apply_lora_from_file(model,
|
|
||||||
+ lora_adapter.c_str(),
|
|
||||||
+ lora_scale,
|
|
||||||
+ nullptr,
|
|
||||||
+ params.n_threads);
|
|
||||||
+ if (err != 0) {
|
|
||||||
+ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
|
||||||
+ llama_free(lctx);
|
|
||||||
+ llama_free_model(model);
|
|
||||||
+ return std::make_tuple(nullptr, nullptr);
|
|
||||||
+ }
|
|
||||||
+ } else {
|
|
||||||
+ llama_lora_adapter_set(lctx, adapter, lora_scale);
|
|
||||||
}
|
|
||||||
- llama_lora_adapter_set(lctx, adapter, lora_scale);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.ignore_eos) {
|
|
||||||
diff --git a/include/llama.h b/include/llama.h
|
|
||||||
index 93fd77ca..b0fb37a6 100644
|
|
||||||
--- a/include/llama.h
|
|
||||||
+++ b/include/llama.h
|
|
||||||
@@ -1160,6 +1160,20 @@ extern "C" {
|
|
||||||
|
|
||||||
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
|
|
||||||
|
|
||||||
+ // Apply a LoRA adapter to a loaded model
|
|
||||||
+ // path_base_model is the path to a higher quality model to use as a base for
|
|
||||||
+ // the layers modified by the adapter. Can be NULL to use the current loaded model.
|
|
||||||
+ // The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
|
||||||
+ // will be applied on top of the previous one
|
|
||||||
+ // Returns 0 on success
|
|
||||||
+ LLAMA_API int32_t llama_model_apply_lora_from_file(
|
|
||||||
+ const struct llama_model * model,
|
|
||||||
+ const char * path_lora,
|
|
||||||
+ float scale,
|
|
||||||
+ const char * path_base_model,
|
|
||||||
+ int32_t n_threads);
|
|
||||||
+
|
|
||||||
+
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
|
||||||
index 80a0dd0f..9d7b0e17 100644
|
|
||||||
--- a/src/llama.cpp
|
|
||||||
+++ b/src/llama.cpp
|
|
||||||
@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text,
|
|
||||||
fputs(text, stderr);
|
|
||||||
fflush(stderr);
|
|
||||||
}
|
|
||||||
+
|
|
||||||
+static int llama_apply_lora_from_file_internal(
|
|
||||||
+ const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
|
|
||||||
+) {
|
|
||||||
+ LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
|
||||||
+
|
|
||||||
+ const int64_t t_start_lora_us = ggml_time_us();
|
|
||||||
+
|
|
||||||
+ llama_file fin(path_lora, "rb");
|
|
||||||
+
|
|
||||||
+ // verify magic and version
|
|
||||||
+ {
|
|
||||||
+ uint32_t magic = fin.read_u32();
|
|
||||||
+ if (magic != LLAMA_FILE_MAGIC_GGLA) {
|
|
||||||
+ LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
|
|
||||||
+ return 1;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ uint32_t format_version = fin.read_u32();
|
|
||||||
+ if (format_version != 1) {
|
|
||||||
+ LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
|
|
||||||
+ return 1;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ int32_t lora_r = fin.read_u32();
|
|
||||||
+ int32_t lora_alpha = fin.read_u32();
|
|
||||||
+ float scaling = scale * (float)lora_alpha / (float)lora_r;
|
|
||||||
+
|
|
||||||
+ LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
|
||||||
+
|
|
||||||
+ // load base model
|
|
||||||
+ std::unique_ptr<llama_model_loader> ml;
|
|
||||||
+ if (path_base_model) {
|
|
||||||
+ LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
|
||||||
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
|
|
||||||
+ ml->init_mappings(/*prefetch*/ false); // no prefetching
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ struct tensor_meta {
|
|
||||||
+ std::string name;
|
|
||||||
+ ggml_type type;
|
|
||||||
+ int32_t ne[2];
|
|
||||||
+ size_t offset;
|
|
||||||
+ };
|
|
||||||
+ std::map<std::string, tensor_meta> tensor_meta_map;
|
|
||||||
+
|
|
||||||
+ // load all tensor meta
|
|
||||||
+ while (true) {
|
|
||||||
+ if (fin.tell() == fin.size) {
|
|
||||||
+ // eof
|
|
||||||
+ break;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ int32_t n_dims;
|
|
||||||
+ int32_t name_len;
|
|
||||||
+ int32_t ftype;
|
|
||||||
+
|
|
||||||
+ fin.read_raw(&n_dims, sizeof(n_dims));
|
|
||||||
+ fin.read_raw(&name_len, sizeof(name_len));
|
|
||||||
+ fin.read_raw(&ftype, sizeof(ftype));
|
|
||||||
+
|
|
||||||
+ if (n_dims != 1 && n_dims != 2) {
|
|
||||||
+ LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
|
||||||
+ return 1;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ int32_t ne[2] = { 1, 1 };
|
|
||||||
+ for (int i = 0; i < n_dims; ++i) {
|
|
||||||
+ fin.read_raw(&ne[i], sizeof(ne[i]));
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ std::string name;
|
|
||||||
+ {
|
|
||||||
+ GGML_ASSERT(name_len < GGML_MAX_NAME);
|
|
||||||
+ char buf[GGML_MAX_NAME];
|
|
||||||
+ fin.read_raw(buf, name_len);
|
|
||||||
+ name = std::string(buf, name_len);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ // check for lora suffix
|
|
||||||
+ std::string lora_suffix;
|
|
||||||
+ if (name.length() > 6) {
|
|
||||||
+ lora_suffix = name.substr(name.length() - 6);
|
|
||||||
+ }
|
|
||||||
+ if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
|
|
||||||
+ LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
|
|
||||||
+ return 1;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ // tensor type
|
|
||||||
+ ggml_type wtype;
|
|
||||||
+ switch (ftype) {
|
|
||||||
+ case 0: wtype = GGML_TYPE_F32; break;
|
|
||||||
+ case 1: wtype = GGML_TYPE_F16; break;
|
|
||||||
+ default:
|
|
||||||
+ {
|
|
||||||
+ LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
|
|
||||||
+ __func__, ftype);
|
|
||||||
+ return 1;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ // data offset
|
|
||||||
+ size_t offset = fin.tell();
|
|
||||||
+ offset = (offset + 31) & -32;
|
|
||||||
+
|
|
||||||
+ // skip tensor data
|
|
||||||
+ fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
|
|
||||||
+
|
|
||||||
+ tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ bool warned = false;
|
|
||||||
+ int n_tensors = 0;
|
|
||||||
+
|
|
||||||
+ // apply
|
|
||||||
+ ggml_backend_t backend_cpu = ggml_backend_cpu_init();
|
|
||||||
+ if (backend_cpu == nullptr) {
|
|
||||||
+ LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
|
|
||||||
+ return 1;
|
|
||||||
+ }
|
|
||||||
+ ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
|
|
||||||
+
|
|
||||||
+ std::vector<no_init<uint8_t>> read_buf;
|
|
||||||
+ for (const auto & it : model.tensors_by_name) {
|
|
||||||
+ const std::string & base_name = it.first;
|
|
||||||
+ ggml_tensor * model_t = it.second;
|
|
||||||
+
|
|
||||||
+ if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
|
|
||||||
+ tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
|
|
||||||
+ continue;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
|
|
||||||
+ tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
|
|
||||||
+
|
|
||||||
+ ggml_init_params lora_init_params = {
|
|
||||||
+ /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
|
|
||||||
+ /* .mem_buffer */ nullptr,
|
|
||||||
+ /* .no_alloc */ true,
|
|
||||||
+ };
|
|
||||||
+ ggml_context * lora_ctx = ggml_init(lora_init_params);
|
|
||||||
+ if (lora_ctx == nullptr) {
|
|
||||||
+ LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
|
|
||||||
+ ggml_backend_free(backend_cpu);
|
|
||||||
+ return 1;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ // create tensors
|
|
||||||
+ ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
|
|
||||||
+ ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
|
|
||||||
+ ggml_set_name(loraA, metaA.name.c_str());
|
|
||||||
+ ggml_set_name(loraB, metaB.name.c_str());
|
|
||||||
+
|
|
||||||
+ ggml_tensor * base_t;
|
|
||||||
+ if (ml) {
|
|
||||||
+ if (!ml->get_tensor_meta(base_name.c_str())) {
|
|
||||||
+ LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
|
||||||
+ return 1;
|
|
||||||
+ }
|
|
||||||
+ base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
|
|
||||||
+ } else {
|
|
||||||
+ base_t = ggml_dup_tensor(lora_ctx, model_t);
|
|
||||||
+ }
|
|
||||||
+ ggml_set_name(base_t, base_name.c_str());
|
|
||||||
+
|
|
||||||
+ // allocate in backend buffer
|
|
||||||
+ ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
|
|
||||||
+ if (lora_buf == nullptr) {
|
|
||||||
+ LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
|
|
||||||
+ return 1;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ // load tensor data
|
|
||||||
+ auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
|
|
||||||
+ read_buf.resize(ggml_nbytes(tensor));
|
|
||||||
+ fin.seek(tensor_meta.offset, SEEK_SET);
|
|
||||||
+ fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
|
|
||||||
+ ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
|
|
||||||
+ };
|
|
||||||
+ load_tensor(metaA, loraA);
|
|
||||||
+ load_tensor(metaB, loraB);
|
|
||||||
+
|
|
||||||
+ // load base model tensor data
|
|
||||||
+ if (ml) {
|
|
||||||
+ ml->load_data_for(base_t);
|
|
||||||
+ } else {
|
|
||||||
+ ggml_backend_tensor_copy(model_t, base_t);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ if (ggml_is_quantized(base_t->type) && !warned) {
|
|
||||||
+ LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
|
|
||||||
+ "use a f16 or f32 base model with --lora-base\n", __func__);
|
|
||||||
+ warned = true;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
|
|
||||||
+ LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
|
|
||||||
+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
|
|
||||||
+ ggml_free(lora_ctx);
|
|
||||||
+ ggml_backend_buffer_free(lora_buf);
|
|
||||||
+ ggml_backend_free(backend_cpu);
|
|
||||||
+ return 1;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ auto build_lora_graph = [&]() {
|
|
||||||
+ // w = w + BA*s
|
|
||||||
+ ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
|
||||||
+ ggml_set_name(BA, "BA");
|
|
||||||
+
|
|
||||||
+ if (scaling != 1.0f) {
|
|
||||||
+ BA = ggml_scale(lora_ctx, BA, scaling);
|
|
||||||
+ ggml_set_name(BA, "BA_scaled");
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ ggml_tensor * r;
|
|
||||||
+ r = ggml_add_inplace(lora_ctx, base_t, BA);
|
|
||||||
+ ggml_set_name(r, "r_add");
|
|
||||||
+
|
|
||||||
+ if (base_t->type != model_t->type) {
|
|
||||||
+ // convert the result to the model type
|
|
||||||
+ r = ggml_cast(lora_ctx, r, model_t->type);
|
|
||||||
+ ggml_set_name(r, "r_cast");
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return r;
|
|
||||||
+ };
|
|
||||||
+
|
|
||||||
+ ggml_cgraph * gf = ggml_new_graph(lora_ctx);
|
|
||||||
+ ggml_tensor * r = build_lora_graph();
|
|
||||||
+ ggml_build_forward_expand(gf, r);
|
|
||||||
+
|
|
||||||
+ ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
|
|
||||||
+ if (graph_buf == nullptr) {
|
|
||||||
+ LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
|
|
||||||
+ ggml_free(lora_ctx);
|
|
||||||
+ ggml_backend_buffer_free(lora_buf);
|
|
||||||
+ ggml_backend_free(backend_cpu);
|
|
||||||
+ return 1;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ ggml_backend_graph_compute(backend_cpu, gf);
|
|
||||||
+
|
|
||||||
+ ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
|
|
||||||
+
|
|
||||||
+#if 0
|
|
||||||
+ // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
|
|
||||||
+ //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
|
|
||||||
+
|
|
||||||
+ // sched compute
|
|
||||||
+ ggml_build_forward_expand(gf, build_graph());
|
|
||||||
+ ggml_backend_sched_init_measure(sched, gf);
|
|
||||||
+
|
|
||||||
+ // create the graph again, since the previous one was destroyed by the measure
|
|
||||||
+ ggml_graph_clear(gf);
|
|
||||||
+ ggml_build_forward_expand(gf, build_graph());
|
|
||||||
+ ggml_backend_sched_graph_compute(sched, gf);
|
|
||||||
+ ggml_backend_sched_free(sched);
|
|
||||||
+#endif
|
|
||||||
+
|
|
||||||
+ ggml_backend_buffer_free(lora_buf);
|
|
||||||
+ ggml_backend_buffer_free(graph_buf);
|
|
||||||
+ ggml_free(lora_ctx);
|
|
||||||
+
|
|
||||||
+ n_tensors++;
|
|
||||||
+ if (n_tensors % 4 == 0) {
|
|
||||||
+ LLAMA_LOG_INFO(".");
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ ggml_backend_free(backend_cpu);
|
|
||||||
+
|
|
||||||
+ const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
|
||||||
+ LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
|
|
||||||
+
|
|
||||||
+ return 0;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
|
|
||||||
+ try {
|
|
||||||
+ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
|
|
||||||
+ } catch (const std::exception & err) {
|
|
||||||
+ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
|
||||||
+ return 1;
|
|
||||||
+ }
|
|
||||||
+}
|
|
||||||
\ No newline at end of file
|
|
|
@ -1,20 +0,0 @@
|
||||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
|
||||||
index a207451f..fba6b175 100644
|
|
||||||
--- a/src/llama.cpp
|
|
||||||
+++ b/src/llama.cpp
|
|
||||||
@@ -4969,6 +4969,7 @@ static void llm_load_hparams(
|
|
||||||
hparams.attn_soft_cap = true;
|
|
||||||
|
|
||||||
switch (hparams.n_layer) {
|
|
||||||
+ case 26: model.type = e_model::MODEL_2B; break;
|
|
||||||
case 42: model.type = e_model::MODEL_9B; break;
|
|
||||||
case 46: model.type = e_model::MODEL_27B; break;
|
|
||||||
default: model.type = e_model::MODEL_UNKNOWN;
|
|
||||||
@@ -11736,6 +11737,7 @@ struct llm_build_context {
|
|
||||||
|
|
||||||
// ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
|
|
||||||
switch (model.type) {
|
|
||||||
+ case e_model::MODEL_2B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
|
|
||||||
case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
|
|
||||||
case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
|
|
||||||
default: GGML_ABORT("fatal error");
|
|
43
llm/patches/10-tekken.diff
Normal file
43
llm/patches/10-tekken.diff
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
diff --git a/include/llama.h b/include/llama.h
|
||||||
|
index bb4b05ba..a92174e0 100644
|
||||||
|
--- a/include/llama.h
|
||||||
|
+++ b/include/llama.h
|
||||||
|
@@ -92,6 +92,7 @@ extern "C" {
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
||||||
|
+ LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
||||||
|
};
|
||||||
|
|
||||||
|
// note: these values should be synchronized with ggml_rope
|
||||||
|
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||||
|
index 18364976..435b6fe5 100644
|
||||||
|
--- a/src/llama.cpp
|
||||||
|
+++ b/src/llama.cpp
|
||||||
|
@@ -5429,6 +5429,12 @@ static void llm_load_vocab(
|
||||||
|
} else if (
|
||||||
|
tokenizer_pre == "jais") {
|
||||||
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
|
||||||
|
+ } else if (
|
||||||
|
+ tokenizer_pre == "tekken") {
|
||||||
|
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
|
||||||
|
+ vocab.tokenizer_clean_spaces = false;
|
||||||
|
+ vocab.tokenizer_ignore_merges = true;
|
||||||
|
+ vocab.tokenizer_add_bos = true;
|
||||||
|
} else {
|
||||||
|
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
|
||||||
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||||
|
@@ -15448,6 +15454,13 @@ struct llm_tokenizer_bpe {
|
||||||
|
" ?[^(\\s|.,!?…。,、।۔،)]+",
|
||||||
|
};
|
||||||
|
break;
|
||||||
|
+ case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
|
||||||
|
+ // original regex from tokenizer.json
|
||||||
|
+ // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
||||||
|
+ regex_exprs = {
|
||||||
|
+ "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||||
|
+ };
|
||||||
|
+ break;
|
||||||
|
default:
|
||||||
|
// default regex for BPE tokenization pre-processing
|
||||||
|
regex_exprs = {
|
19
llm/patches/11-embd_kv.diff
Normal file
19
llm/patches/11-embd_kv.diff
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||||
|
index 2b9ace28..e60d3d8d 100644
|
||||||
|
--- a/src/llama.cpp
|
||||||
|
+++ b/src/llama.cpp
|
||||||
|
@@ -6052,10 +6052,10 @@ static bool llm_load_tensors(
|
||||||
|
|
||||||
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||||
|
|
||||||
|
- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
||||||
|
- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
||||||
|
- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
||||||
|
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||||
|
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
|
||||||
|
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
|
||||||
|
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
|
||||||
|
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
|
||||||
|
|
||||||
|
// optional bias tensors
|
||||||
|
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
@ -33,7 +33,7 @@ type LlamaServer interface {
|
||||||
Ping(ctx context.Context) error
|
Ping(ctx context.Context) error
|
||||||
WaitUntilRunning(ctx context.Context) error
|
WaitUntilRunning(ctx context.Context) error
|
||||||
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
|
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
|
||||||
Embed(ctx context.Context, input []string) (*EmbedResponse, error)
|
Embed(ctx context.Context, input []string) ([][]float32, error)
|
||||||
Tokenize(ctx context.Context, content string) ([]int, error)
|
Tokenize(ctx context.Context, content string) ([]int, error)
|
||||||
Detokenize(ctx context.Context, tokens []int) (string, error)
|
Detokenize(ctx context.Context, tokens []int) (string, error)
|
||||||
Close() error
|
Close() error
|
||||||
|
@ -346,7 +346,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||||
s.cmd.Env = os.Environ()
|
s.cmd.Env = os.Environ()
|
||||||
s.cmd.Stdout = os.Stdout
|
s.cmd.Stdout = os.Stdout
|
||||||
s.cmd.Stderr = s.status
|
s.cmd.Stderr = s.status
|
||||||
s.cmd.SysProcAttr = LlamaServerSysProcAttr
|
|
||||||
|
|
||||||
envWorkarounds := [][2]string{}
|
envWorkarounds := [][2]string{}
|
||||||
for _, gpu := range gpus {
|
for _, gpu := range gpus {
|
||||||
|
@ -418,17 +417,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||||
|
|
||||||
// reap subprocess when it exits
|
// reap subprocess when it exits
|
||||||
go func() {
|
go func() {
|
||||||
err := s.cmd.Wait()
|
s.done <- s.cmd.Wait()
|
||||||
// Favor a more detailed message over the process exit status
|
|
||||||
if err != nil && s.status != nil && s.status.LastErrMsg != "" {
|
|
||||||
slog.Debug("llama runner terminated", "error", err)
|
|
||||||
if strings.Contains(s.status.LastErrMsg, "unknown model") {
|
|
||||||
s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
|
|
||||||
}
|
|
||||||
s.done <- fmt.Errorf(s.status.LastErrMsg)
|
|
||||||
} else {
|
|
||||||
s.done <- err
|
|
||||||
}
|
|
||||||
}()
|
}()
|
||||||
|
|
||||||
return s, nil
|
return s, nil
|
||||||
|
@ -591,7 +580,14 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
|
||||||
slog.Warn("client connection closed before server finished loading, aborting load")
|
slog.Warn("client connection closed before server finished loading, aborting load")
|
||||||
return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err())
|
return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err())
|
||||||
case err := <-s.done:
|
case err := <-s.done:
|
||||||
return fmt.Errorf("llama runner process has terminated: %w", err)
|
msg := ""
|
||||||
|
if s.status != nil && s.status.LastErrMsg != "" {
|
||||||
|
msg = s.status.LastErrMsg
|
||||||
|
}
|
||||||
|
if strings.Contains(msg, "unknown model") {
|
||||||
|
return fmt.Errorf("this model is not supported by your version of Ollama. You may need to upgrade")
|
||||||
|
}
|
||||||
|
return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
|
||||||
default:
|
default:
|
||||||
}
|
}
|
||||||
if time.Now().After(stallTimer) {
|
if time.Now().After(stallTimer) {
|
||||||
|
@ -727,7 +723,6 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
|
||||||
"temperature": req.Options.Temperature,
|
"temperature": req.Options.Temperature,
|
||||||
"top_k": req.Options.TopK,
|
"top_k": req.Options.TopK,
|
||||||
"top_p": req.Options.TopP,
|
"top_p": req.Options.TopP,
|
||||||
"min_p": req.Options.MinP,
|
|
||||||
"tfs_z": req.Options.TFSZ,
|
"tfs_z": req.Options.TFSZ,
|
||||||
"typical_p": req.Options.TypicalP,
|
"typical_p": req.Options.TypicalP,
|
||||||
"repeat_last_n": req.Options.RepeatLastN,
|
"repeat_last_n": req.Options.RepeatLastN,
|
||||||
|
@ -880,10 +875,9 @@ type EmbedRequest struct {
|
||||||
|
|
||||||
type EmbedResponse struct {
|
type EmbedResponse struct {
|
||||||
Embedding [][]float32 `json:"embedding"`
|
Embedding [][]float32 `json:"embedding"`
|
||||||
PromptEvalCount int `json:"prompt_n"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) {
|
func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, error) {
|
||||||
if err := s.sem.Acquire(ctx, 1); err != nil {
|
if err := s.sem.Acquire(ctx, 1); err != nil {
|
||||||
slog.Error("Failed to acquire semaphore", "error", err)
|
slog.Error("Failed to acquire semaphore", "error", err)
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -925,12 +919,12 @@ func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse,
|
||||||
return nil, fmt.Errorf("%s", body)
|
return nil, fmt.Errorf("%s", body)
|
||||||
}
|
}
|
||||||
|
|
||||||
var e EmbedResponse
|
var embedding EmbedResponse
|
||||||
if err := json.Unmarshal(body, &e); err != nil {
|
if err := json.Unmarshal(body, &embedding); err != nil {
|
||||||
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
|
return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
return &e, nil
|
return embedding.Embedding, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type TokenizeRequest struct {
|
type TokenizeRequest struct {
|
||||||
|
|
|
@ -19,7 +19,7 @@ export default function () {
|
||||||
const [step, setStep] = useState<Step>(Step.WELCOME)
|
const [step, setStep] = useState<Step>(Step.WELCOME)
|
||||||
const [commandCopied, setCommandCopied] = useState<boolean>(false)
|
const [commandCopied, setCommandCopied] = useState<boolean>(false)
|
||||||
|
|
||||||
const command = 'ollama run llama3.1'
|
const command = 'ollama run llama3'
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className='drag'>
|
<div className='drag'>
|
||||||
|
|
|
@ -218,9 +218,6 @@ func toChatCompletion(id string, r api.ChatResponse) ChatCompletion {
|
||||||
Index: 0,
|
Index: 0,
|
||||||
Message: Message{Role: r.Message.Role, Content: r.Message.Content, ToolCalls: toolCalls},
|
Message: Message{Role: r.Message.Role, Content: r.Message.Content, ToolCalls: toolCalls},
|
||||||
FinishReason: func(reason string) *string {
|
FinishReason: func(reason string) *string {
|
||||||
if len(toolCalls) > 0 {
|
|
||||||
reason = "tool_calls"
|
|
||||||
}
|
|
||||||
if len(reason) > 0 {
|
if len(reason) > 0 {
|
||||||
return &reason
|
return &reason
|
||||||
}
|
}
|
||||||
|
|
|
@ -451,7 +451,6 @@ func TestParseFileParameters(t *testing.T) {
|
||||||
"num_predict 1": {"num_predict", "1"},
|
"num_predict 1": {"num_predict", "1"},
|
||||||
"top_k 1": {"top_k", "1"},
|
"top_k 1": {"top_k", "1"},
|
||||||
"top_p 1.0": {"top_p", "1.0"},
|
"top_p 1.0": {"top_p", "1.0"},
|
||||||
"min_p 0.05": {"min_p", "0.05"},
|
|
||||||
"tfs_z 1.0": {"tfs_z", "1.0"},
|
"tfs_z 1.0": {"tfs_z", "1.0"},
|
||||||
"typical_p 1.0": {"typical_p", "1.0"},
|
"typical_p 1.0": {"typical_p", "1.0"},
|
||||||
"repeat_last_n 1": {"repeat_last_n", "1"},
|
"repeat_last_n 1": {"repeat_last_n", "1"},
|
||||||
|
|
|
@ -198,29 +198,19 @@ if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
CUDA_REPO_ERR_MSG="NVIDIA GPU detected, but your OS and Architecture are not supported by NVIDIA. Please install the CUDA driver manually https://docs.nvidia.com/cuda/cuda-installation-guide-linux/"
|
|
||||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-7-centos-7
|
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-7-centos-7
|
||||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-8-rocky-8
|
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-8-rocky-8
|
||||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-9-rocky-9
|
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#rhel-9-rocky-9
|
||||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#fedora
|
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#fedora
|
||||||
install_cuda_driver_yum() {
|
install_cuda_driver_yum() {
|
||||||
status 'Installing NVIDIA repository...'
|
status 'Installing NVIDIA repository...'
|
||||||
|
|
||||||
case $PACKAGE_MANAGER in
|
case $PACKAGE_MANAGER in
|
||||||
yum)
|
yum)
|
||||||
$SUDO $PACKAGE_MANAGER -y install yum-utils
|
$SUDO $PACKAGE_MANAGER -y install yum-utils
|
||||||
if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then
|
|
||||||
$SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
|
$SUDO $PACKAGE_MANAGER-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
|
||||||
else
|
|
||||||
error $CUDA_REPO_ERR_MSG
|
|
||||||
fi
|
|
||||||
;;
|
;;
|
||||||
dnf)
|
dnf)
|
||||||
if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo" >/dev/null ; then
|
|
||||||
$SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
|
$SUDO $PACKAGE_MANAGER config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-$1$2.repo
|
||||||
else
|
|
||||||
error $CUDA_REPO_ERR_MSG
|
|
||||||
fi
|
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
@ -245,11 +235,7 @@ install_cuda_driver_yum() {
|
||||||
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian
|
# ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#debian
|
||||||
install_cuda_driver_apt() {
|
install_cuda_driver_apt() {
|
||||||
status 'Installing NVIDIA repository...'
|
status 'Installing NVIDIA repository...'
|
||||||
if curl -I --silent --fail --location "https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb" >/dev/null ; then
|
|
||||||
curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb
|
curl -fsSL -o $TEMP_DIR/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/$1$2/$(uname -m)/cuda-keyring_1.1-1_all.deb
|
||||||
else
|
|
||||||
error $CUDA_REPO_ERR_MSG
|
|
||||||
fi
|
|
||||||
|
|
||||||
case $1 in
|
case $1 in
|
||||||
debian)
|
debian)
|
||||||
|
|
|
@ -67,7 +67,7 @@ func getAuthorizationToken(ctx context.Context, challenge registryChallenge) (st
|
||||||
|
|
||||||
headers.Add("Authorization", signature)
|
headers.Add("Authorization", signature)
|
||||||
|
|
||||||
response, err := makeRequest(ctx, http.MethodGet, redirectURL, headers, nil, &registryOptions{})
|
response, err := makeRequest(ctx, http.MethodGet, redirectURL, headers, nil, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,7 +8,6 @@ import (
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"math"
|
"math"
|
||||||
"math/rand/v2"
|
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
|
@ -44,7 +43,7 @@ type blobDownload struct {
|
||||||
|
|
||||||
context.CancelFunc
|
context.CancelFunc
|
||||||
|
|
||||||
done chan struct{}
|
done bool
|
||||||
err error
|
err error
|
||||||
references atomic.Int32
|
references atomic.Int32
|
||||||
}
|
}
|
||||||
|
@ -53,9 +52,7 @@ type blobDownloadPart struct {
|
||||||
N int
|
N int
|
||||||
Offset int64
|
Offset int64
|
||||||
Size int64
|
Size int64
|
||||||
Completed atomic.Int64
|
Completed int64
|
||||||
|
|
||||||
lastUpdatedMu sync.Mutex
|
|
||||||
lastUpdated time.Time
|
lastUpdated time.Time
|
||||||
|
|
||||||
*blobDownload `json:"-"`
|
*blobDownload `json:"-"`
|
||||||
|
@ -74,7 +71,7 @@ func (p *blobDownloadPart) Name() string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *blobDownloadPart) StartsAt() int64 {
|
func (p *blobDownloadPart) StartsAt() int64 {
|
||||||
return p.Offset + p.Completed.Load()
|
return p.Offset + p.Completed
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *blobDownloadPart) StopsAt() int64 {
|
func (p *blobDownloadPart) StopsAt() int64 {
|
||||||
|
@ -84,9 +81,7 @@ func (p *blobDownloadPart) StopsAt() int64 {
|
||||||
func (p *blobDownloadPart) Write(b []byte) (n int, err error) {
|
func (p *blobDownloadPart) Write(b []byte) (n int, err error) {
|
||||||
n = len(b)
|
n = len(b)
|
||||||
p.blobDownload.Completed.Add(int64(n))
|
p.blobDownload.Completed.Add(int64(n))
|
||||||
p.lastUpdatedMu.Lock()
|
|
||||||
p.lastUpdated = time.Now()
|
p.lastUpdated = time.Now()
|
||||||
p.lastUpdatedMu.Unlock()
|
|
||||||
return n, nil
|
return n, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -96,8 +91,6 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
b.done = make(chan struct{})
|
|
||||||
|
|
||||||
for _, partFilePath := range partFilePaths {
|
for _, partFilePath := range partFilePaths {
|
||||||
part, err := b.readPart(partFilePath)
|
part, err := b.readPart(partFilePath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -105,7 +98,7 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r
|
||||||
}
|
}
|
||||||
|
|
||||||
b.Total += part.Size
|
b.Total += part.Size
|
||||||
b.Completed.Add(part.Completed.Load())
|
b.Completed.Add(part.Completed)
|
||||||
b.Parts = append(b.Parts, part)
|
b.Parts = append(b.Parts, part)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -145,36 +138,9 @@ func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *r
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b *blobDownload) Run(ctx context.Context, requestURL *url.URL, opts *registryOptions) {
|
func (b *blobDownload) Run(ctx context.Context, requestURL *url.URL, opts *registryOptions) {
|
||||||
defer close(b.done)
|
|
||||||
b.err = b.run(ctx, requestURL, opts)
|
b.err = b.run(ctx, requestURL, opts)
|
||||||
}
|
}
|
||||||
|
|
||||||
func newBackoff(maxBackoff time.Duration) func(ctx context.Context) error {
|
|
||||||
var n int
|
|
||||||
return func(ctx context.Context) error {
|
|
||||||
if ctx.Err() != nil {
|
|
||||||
return ctx.Err()
|
|
||||||
}
|
|
||||||
|
|
||||||
n++
|
|
||||||
|
|
||||||
// n^2 backoff timer is a little smoother than the
|
|
||||||
// common choice of 2^n.
|
|
||||||
d := min(time.Duration(n*n)*10*time.Millisecond, maxBackoff)
|
|
||||||
// Randomize the delay between 0.5-1.5 x msec, in order
|
|
||||||
// to prevent accidental "thundering herd" problems.
|
|
||||||
d = time.Duration(float64(d) * (rand.Float64() + 0.5))
|
|
||||||
t := time.NewTimer(d)
|
|
||||||
defer t.Stop()
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
return ctx.Err()
|
|
||||||
case <-t.C:
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *registryOptions) error {
|
func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *registryOptions) error {
|
||||||
defer blobDownloadManager.Delete(b.Digest)
|
defer blobDownloadManager.Delete(b.Digest)
|
||||||
ctx, b.CancelFunc = context.WithCancel(ctx)
|
ctx, b.CancelFunc = context.WithCancel(ctx)
|
||||||
|
@ -187,57 +153,11 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
|
||||||
|
|
||||||
_ = file.Truncate(b.Total)
|
_ = file.Truncate(b.Total)
|
||||||
|
|
||||||
directURL, err := func() (*url.URL, error) {
|
|
||||||
ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
backoff := newBackoff(10 * time.Second)
|
|
||||||
for {
|
|
||||||
// shallow clone opts to be used in the closure
|
|
||||||
// without affecting the outer opts.
|
|
||||||
newOpts := new(registryOptions)
|
|
||||||
*newOpts = *opts
|
|
||||||
|
|
||||||
newOpts.CheckRedirect = func(req *http.Request, via []*http.Request) error {
|
|
||||||
if len(via) > 10 {
|
|
||||||
return errors.New("maxium redirects exceeded (10) for directURL")
|
|
||||||
}
|
|
||||||
|
|
||||||
// if the hostname is the same, allow the redirect
|
|
||||||
if req.URL.Hostname() == requestURL.Hostname() {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// stop at the first redirect that is not
|
|
||||||
// the same hostname as the original
|
|
||||||
// request.
|
|
||||||
return http.ErrUseLastResponse
|
|
||||||
}
|
|
||||||
|
|
||||||
resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, nil, nil, newOpts)
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("failed to get direct URL; backing off and retrying", "err", err)
|
|
||||||
if err := backoff(ctx); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
defer resp.Body.Close()
|
|
||||||
if resp.StatusCode != http.StatusTemporaryRedirect {
|
|
||||||
return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode)
|
|
||||||
}
|
|
||||||
return resp.Location()
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
g, inner := errgroup.WithContext(ctx)
|
g, inner := errgroup.WithContext(ctx)
|
||||||
g.SetLimit(numDownloadParts)
|
g.SetLimit(numDownloadParts)
|
||||||
for i := range b.Parts {
|
for i := range b.Parts {
|
||||||
part := b.Parts[i]
|
part := b.Parts[i]
|
||||||
if part.Completed.Load() == part.Size {
|
if part.Completed == part.Size {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -245,7 +165,7 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
|
||||||
var err error
|
var err error
|
||||||
for try := 0; try < maxRetries; try++ {
|
for try := 0; try < maxRetries; try++ {
|
||||||
w := io.NewOffsetWriter(file, part.StartsAt())
|
w := io.NewOffsetWriter(file, part.StartsAt())
|
||||||
err = b.downloadChunk(inner, directURL, w, part)
|
err = b.downloadChunk(inner, requestURL, w, part, opts)
|
||||||
switch {
|
switch {
|
||||||
case errors.Is(err, context.Canceled), errors.Is(err, syscall.ENOSPC):
|
case errors.Is(err, context.Canceled), errors.Is(err, syscall.ENOSPC):
|
||||||
// return immediately if the context is canceled or the device is out of space
|
// return immediately if the context is canceled or the device is out of space
|
||||||
|
@ -286,31 +206,29 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *regis
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
b.done = true
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w io.Writer, part *blobDownloadPart) error {
|
func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w io.Writer, part *blobDownloadPart, opts *registryOptions) error {
|
||||||
g, ctx := errgroup.WithContext(ctx)
|
g, ctx := errgroup.WithContext(ctx)
|
||||||
g.Go(func() error {
|
g.Go(func() error {
|
||||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, requestURL.String(), nil)
|
headers := make(http.Header)
|
||||||
if err != nil {
|
headers.Set("Range", fmt.Sprintf("bytes=%d-%d", part.StartsAt(), part.StopsAt()-1))
|
||||||
return err
|
resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, headers, nil, opts)
|
||||||
}
|
|
||||||
req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", part.StartsAt(), part.StopsAt()-1))
|
|
||||||
resp, err := http.DefaultClient.Do(req)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
|
|
||||||
n, err := io.CopyN(w, io.TeeReader(resp.Body, part), part.Size-part.Completed.Load())
|
n, err := io.CopyN(w, io.TeeReader(resp.Body, part), part.Size-part.Completed)
|
||||||
if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.ErrUnexpectedEOF) {
|
if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.ErrUnexpectedEOF) {
|
||||||
// rollback progress
|
// rollback progress
|
||||||
b.Completed.Add(-n)
|
b.Completed.Add(-n)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
part.Completed.Add(n)
|
part.Completed += n
|
||||||
if err := b.writePart(part.Name(), part); err != nil {
|
if err := b.writePart(part.Name(), part); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
@ -324,21 +242,15 @@ func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-ticker.C:
|
case <-ticker.C:
|
||||||
if part.Completed.Load() >= part.Size {
|
if part.Completed >= part.Size {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
part.lastUpdatedMu.Lock()
|
if !part.lastUpdated.IsZero() && time.Since(part.lastUpdated) > 5*time.Second {
|
||||||
lastUpdated := part.lastUpdated
|
|
||||||
part.lastUpdatedMu.Unlock()
|
|
||||||
|
|
||||||
if !lastUpdated.IsZero() && time.Since(lastUpdated) > 5*time.Second {
|
|
||||||
const msg = "%s part %d stalled; retrying. If this persists, press ctrl-c to exit, then 'ollama pull' to find a faster connection."
|
const msg = "%s part %d stalled; retrying. If this persists, press ctrl-c to exit, then 'ollama pull' to find a faster connection."
|
||||||
slog.Info(fmt.Sprintf(msg, b.Digest[7:19], part.N))
|
slog.Info(fmt.Sprintf(msg, b.Digest[7:19], part.N))
|
||||||
// reset last updated
|
// reset last updated
|
||||||
part.lastUpdatedMu.Lock()
|
|
||||||
part.lastUpdated = time.Time{}
|
part.lastUpdated = time.Time{}
|
||||||
part.lastUpdatedMu.Unlock()
|
|
||||||
return errPartStalled
|
return errPartStalled
|
||||||
}
|
}
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
|
@ -403,8 +315,6 @@ func (b *blobDownload) Wait(ctx context.Context, fn func(api.ProgressResponse))
|
||||||
ticker := time.NewTicker(60 * time.Millisecond)
|
ticker := time.NewTicker(60 * time.Millisecond)
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-b.done:
|
|
||||||
return b.err
|
|
||||||
case <-ticker.C:
|
case <-ticker.C:
|
||||||
fn(api.ProgressResponse{
|
fn(api.ProgressResponse{
|
||||||
Status: fmt.Sprintf("pulling %s", b.Digest[7:19]),
|
Status: fmt.Sprintf("pulling %s", b.Digest[7:19]),
|
||||||
|
@ -412,6 +322,10 @@ func (b *blobDownload) Wait(ctx context.Context, fn func(api.ProgressResponse))
|
||||||
Total: b.Total,
|
Total: b.Total,
|
||||||
Completed: b.Completed.Load(),
|
Completed: b.Completed.Load(),
|
||||||
})
|
})
|
||||||
|
|
||||||
|
if b.done || b.err != nil {
|
||||||
|
return b.err
|
||||||
|
}
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
return ctx.Err()
|
return ctx.Err()
|
||||||
}
|
}
|
||||||
|
|
|
@ -54,8 +54,6 @@ type registryOptions struct {
|
||||||
Username string
|
Username string
|
||||||
Password string
|
Password string
|
||||||
Token string
|
Token string
|
||||||
|
|
||||||
CheckRedirect func(req *http.Request, via []*http.Request) error
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type Model struct {
|
type Model struct {
|
||||||
|
@ -1133,9 +1131,7 @@ func makeRequest(ctx context.Context, method string, requestURL *url.URL, header
|
||||||
req.ContentLength = contentLength
|
req.ContentLength = contentLength
|
||||||
}
|
}
|
||||||
|
|
||||||
resp, err := (&http.Client{
|
resp, err := http.DefaultClient.Do(req)
|
||||||
CheckRedirect: regOpts.CheckRedirect,
|
|
||||||
}).Do(req)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
|
@ -263,27 +263,13 @@ func detectChatTemplate(layers []*layerGGML) ([]*layerGGML, error) {
|
||||||
if t, err := template.Named(s); err != nil {
|
if t, err := template.Named(s); err != nil {
|
||||||
slog.Debug("template detection", "error", err)
|
slog.Debug("template detection", "error", err)
|
||||||
} else {
|
} else {
|
||||||
layer, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template")
|
tmpl, err := NewLayer(t.Reader(), "application/vnd.ollama.image.template")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
layer.status = fmt.Sprintf("using autodetected template %s", t.Name)
|
tmpl.status = fmt.Sprintf("using autodetected template %s", t.Name)
|
||||||
layers = append(layers, &layerGGML{layer, nil})
|
layers = append(layers, &layerGGML{tmpl, nil})
|
||||||
|
|
||||||
if t.Parameters != nil {
|
|
||||||
var b bytes.Buffer
|
|
||||||
if err := json.NewEncoder(&b).Encode(t.Parameters); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
layer, err := NewLayer(&b, "application/vnd.ollama.image.params")
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
layers = append(layers, &layerGGML{layer, nil})
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -358,10 +344,6 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if name == "" || arguments == "" {
|
|
||||||
return nil, false
|
|
||||||
}
|
|
||||||
|
|
||||||
var objs []map[string]any
|
var objs []map[string]any
|
||||||
for offset := 0; offset < len(s); {
|
for offset := 0; offset < len(s); {
|
||||||
var obj map[string]any
|
var obj map[string]any
|
||||||
|
@ -379,41 +361,24 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
|
||||||
return nil, false
|
return nil, false
|
||||||
} else {
|
} else {
|
||||||
offset += int(decoder.InputOffset())
|
offset += int(decoder.InputOffset())
|
||||||
|
objs = append(objs, obj)
|
||||||
// collect all nested objects
|
|
||||||
var collect func(any) []map[string]any
|
|
||||||
collect = func(obj any) (all []map[string]any) {
|
|
||||||
switch o := obj.(type) {
|
|
||||||
case map[string]any:
|
|
||||||
all = append(all, o)
|
|
||||||
for _, v := range o {
|
|
||||||
all = append(all, collect(v)...)
|
|
||||||
}
|
|
||||||
case []any:
|
|
||||||
for _, v := range o {
|
|
||||||
all = append(all, collect(v)...)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return all
|
|
||||||
}
|
|
||||||
objs = append(objs, collect(obj)...)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var toolCalls []api.ToolCall
|
var toolCalls []api.ToolCall
|
||||||
for _, kv := range objs {
|
for _, kv := range objs {
|
||||||
n, nok := kv[name].(string)
|
var call api.ToolCall
|
||||||
a, aok := kv[arguments].(map[string]any)
|
for k, v := range kv {
|
||||||
if nok && aok {
|
switch k {
|
||||||
toolCalls = append(toolCalls, api.ToolCall{
|
case name:
|
||||||
Function: api.ToolCallFunction{
|
call.Function.Name = v.(string)
|
||||||
Name: n,
|
case arguments:
|
||||||
Arguments: a,
|
call.Function.Arguments = v.(map[string]any)
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
toolCalls = append(toolCalls, call)
|
||||||
|
}
|
||||||
|
|
||||||
return toolCalls, len(toolCalls) > 0
|
return toolCalls, len(toolCalls) > 0
|
||||||
}
|
}
|
||||||
|
|
|
@ -166,7 +166,6 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`,
|
||||||
{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}
|
{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}
|
||||||
{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}
|
{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}
|
||||||
</tool_call>`, true},
|
</tool_call>`, true},
|
||||||
{"xlam", `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`, true},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var tools []api.Tool
|
var tools []api.Tool
|
||||||
|
|
|
@ -284,7 +284,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Server) EmbedHandler(c *gin.Context) {
|
func (s *Server) EmbedHandler(c *gin.Context) {
|
||||||
checkpointStart := time.Now()
|
|
||||||
var req api.EmbedRequest
|
var req api.EmbedRequest
|
||||||
err := c.ShouldBindJSON(&req)
|
err := c.ShouldBindJSON(&req)
|
||||||
switch {
|
switch {
|
||||||
|
@ -333,8 +332,6 @@ func (s *Server) EmbedHandler(c *gin.Context) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
checkpointLoaded := time.Now()
|
|
||||||
|
|
||||||
kvData, err := getKVData(m.ModelPath, false)
|
kvData, err := getKVData(m.ModelPath, false)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||||
|
@ -373,16 +370,13 @@ func (s *Server) EmbedHandler(c *gin.Context) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
for i, e := range embeddings.Embedding {
|
for i, e := range embeddings {
|
||||||
embeddings.Embedding[i] = normalize(e)
|
embeddings[i] = normalize(e)
|
||||||
}
|
}
|
||||||
|
|
||||||
resp := api.EmbedResponse{
|
resp := api.EmbedResponse{
|
||||||
Model: req.Model,
|
Model: req.Model,
|
||||||
Embeddings: embeddings.Embedding,
|
Embeddings: embeddings,
|
||||||
TotalDuration: time.Since(checkpointStart),
|
|
||||||
LoadDuration: checkpointLoaded.Sub(checkpointStart),
|
|
||||||
PromptEvalCount: embeddings.PromptEvalCount,
|
|
||||||
}
|
}
|
||||||
c.JSON(http.StatusOK, resp)
|
c.JSON(http.StatusOK, resp)
|
||||||
}
|
}
|
||||||
|
@ -434,9 +428,9 @@ func (s *Server) EmbeddingsHandler(c *gin.Context) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
embedding := make([]float64, len(embeddings.Embedding[0]))
|
embedding := make([]float64, len(embeddings[0]))
|
||||||
|
|
||||||
for i, v := range embeddings.Embedding[0] {
|
for i, v := range embeddings[0] {
|
||||||
embedding[i] = float64(v)
|
embedding[i] = float64(v)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -615,9 +609,10 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
quantization := cmp.Or(r.Quantize, r.Quantization)
|
quantization := cmp.Or(r.Quantize, r.Quantization)
|
||||||
if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); errors.Is(err, errBadTemplate) {
|
if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); err != nil {
|
||||||
|
if errors.Is(err, errBadTemplate) {
|
||||||
ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest}
|
ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest}
|
||||||
} else if err != nil {
|
}
|
||||||
ch <- gin.H{"error": err.Error()}
|
ch <- gin.H{"error": err.Error()}
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
|
@ -599,10 +599,9 @@ func TestCreateDetectTemplate(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{
|
checkFileExists(t, filepath.Join(p, "blobs", "*"), []string{
|
||||||
filepath.Join(p, "blobs", "sha256-0d79f567714c62c048378f2107fb332dabee0135d080c302d884317da9433cc5"),
|
|
||||||
filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"),
|
filepath.Join(p, "blobs", "sha256-553c4a3f747b3d22a4946875f1cc8ed011c2930d83f864a0c7265f9ec0a20413"),
|
||||||
filepath.Join(p, "blobs", "sha256-c608dc615584cd20d9d830363dabf8a4783ae5d34245c3d8c115edb3bc7b28e4"),
|
filepath.Join(p, "blobs", "sha256-c608dc615584cd20d9d830363dabf8a4783ae5d34245c3d8c115edb3bc7b28e4"),
|
||||||
filepath.Join(p, "blobs", "sha256-ea34c57ba5b78b740aafe2aeb74dc6507fc3ad14170b64c26a04fb9e36c88d75"),
|
filepath.Join(p, "blobs", "sha256-f836ee110db21567f826332e4cedd746c06d10664fd5a9ea3659e3683a944510"),
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
|
@ -212,12 +212,9 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
||||||
} else if loadedCount == 0 {
|
} else if loadedCount == 0 {
|
||||||
// No models loaded. Load the model but prefer the best fit.
|
// No models loaded. Load the model but prefer the best fit.
|
||||||
slog.Debug("loading first model", "model", pending.model.ModelPath)
|
slog.Debug("loading first model", "model", pending.model.ModelPath)
|
||||||
g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
|
g := pickBestFitGPUs(pending, ggml, gpus, &numParallel)
|
||||||
if g != nil {
|
if g != nil {
|
||||||
gpus = g
|
gpus = g
|
||||||
} else {
|
|
||||||
// Only allow partial loads when this is the first model
|
|
||||||
gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
|
|
||||||
}
|
}
|
||||||
s.loadFn(pending, ggml, gpus, numParallel)
|
s.loadFn(pending, ggml, gpus, numParallel)
|
||||||
break
|
break
|
||||||
|
@ -234,7 +231,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
||||||
|
|
||||||
// Update free memory from currently loaded models
|
// Update free memory from currently loaded models
|
||||||
s.updateFreeSpace(availGpus)
|
s.updateFreeSpace(availGpus)
|
||||||
fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
|
fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel)
|
||||||
if fitGpus != nil {
|
if fitGpus != nil {
|
||||||
slog.Debug("new model fits with existing models, loading")
|
slog.Debug("new model fits with existing models, loading")
|
||||||
s.loadFn(pending, ggml, fitGpus, numParallel)
|
s.loadFn(pending, ggml, fitGpus, numParallel)
|
||||||
|
@ -671,12 +668,11 @@ func (a ByDuration) Less(i, j int) bool {
|
||||||
// func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
// func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
||||||
// func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }
|
// func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }
|
||||||
|
|
||||||
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
|
// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
|
||||||
// The list of GPUs returned will always be the same brand (library)
|
|
||||||
// If the model can not be fit fully within the available GPU(s) nil is returned
|
// If the model can not be fit fully within the available GPU(s) nil is returned
|
||||||
// If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
|
// If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
|
||||||
// opts.NumCtx accordingly
|
// opts.NumCtx accordingly
|
||||||
func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
|
func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
|
||||||
var estimatedVRAM uint64
|
var estimatedVRAM uint64
|
||||||
|
|
||||||
var numParallelToTry []int
|
var numParallelToTry []int
|
||||||
|
@ -727,25 +723,6 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
|
|
||||||
func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
|
|
||||||
*numParallel = 1
|
|
||||||
byLibrary := gpus.ByLibrary()
|
|
||||||
if len(byLibrary) <= 1 {
|
|
||||||
return gpus
|
|
||||||
}
|
|
||||||
var bestEstimate uint64
|
|
||||||
var bestFit int
|
|
||||||
for i, gl := range byLibrary {
|
|
||||||
_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
|
|
||||||
if estimatedVRAM > bestEstimate {
|
|
||||||
bestEstimate = estimatedVRAM
|
|
||||||
bestFit = i
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return byLibrary[bestFit]
|
|
||||||
}
|
|
||||||
|
|
||||||
// findRunnerToUnload finds a runner to unload to make room for a new model
|
// findRunnerToUnload finds a runner to unload to make room for a new model
|
||||||
func (s *Scheduler) findRunnerToUnload() *runnerRef {
|
func (s *Scheduler) findRunnerToUnload() *runnerRef {
|
||||||
s.loadedMu.Lock()
|
s.loadedMu.Lock()
|
||||||
|
|
|
@ -666,50 +666,11 @@ func TestAlreadyCanceled(t *testing.T) {
|
||||||
require.Empty(t, scenario1a.req.successCh)
|
require.Empty(t, scenario1a.req.successCh)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestHomogeneousGPUs(t *testing.T) {
|
|
||||||
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
|
||||||
defer done()
|
|
||||||
s := InitScheduler(ctx)
|
|
||||||
|
|
||||||
s.getGpuFn = func() gpu.GpuInfoList {
|
|
||||||
// Set memory values to require the model to be spread
|
|
||||||
gpus := []gpu.GpuInfo{
|
|
||||||
{Library: "cuda"},
|
|
||||||
{Library: "rocm"},
|
|
||||||
}
|
|
||||||
gpus[0].TotalMemory = 1 * format.GibiByte
|
|
||||||
gpus[0].FreeMemory = 256 * format.MebiByte
|
|
||||||
gpus[1].TotalMemory = 1 * format.GibiByte
|
|
||||||
gpus[1].FreeMemory = 256 * format.MebiByte
|
|
||||||
return gpus
|
|
||||||
}
|
|
||||||
s.getCpuFn = getCpuFn
|
|
||||||
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
|
|
||||||
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
|
||||||
require.Len(t, gpus, 1)
|
|
||||||
return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
|
|
||||||
}
|
|
||||||
slog.Info("a")
|
|
||||||
s.pendingReqCh <- a.req
|
|
||||||
require.Len(t, s.pendingReqCh, 1)
|
|
||||||
s.Run(ctx)
|
|
||||||
select {
|
|
||||||
case resp := <-a.req.successCh:
|
|
||||||
require.Equal(t, resp.llama, a.srv)
|
|
||||||
require.Empty(t, s.pendingReqCh)
|
|
||||||
require.Empty(t, a.req.errCh)
|
|
||||||
case err := <-a.req.errCh:
|
|
||||||
t.Fatal(err.Error())
|
|
||||||
case <-ctx.Done():
|
|
||||||
t.Fatal("timeout")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
type mockLlm struct {
|
type mockLlm struct {
|
||||||
pingResp error
|
pingResp error
|
||||||
waitResp error
|
waitResp error
|
||||||
completionResp error
|
completionResp error
|
||||||
embedResp *llm.EmbedResponse
|
embedResp [][]float32
|
||||||
embedRespErr error
|
embedRespErr error
|
||||||
tokenizeResp []int
|
tokenizeResp []int
|
||||||
tokenizeRespErr error
|
tokenizeRespErr error
|
||||||
|
@ -727,7 +688,7 @@ func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitRes
|
||||||
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
|
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
|
||||||
return s.completionResp
|
return s.completionResp
|
||||||
}
|
}
|
||||||
func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) {
|
func (s *mockLlm) Embed(ctx context.Context, input []string) ([][]float32, error) {
|
||||||
return s.embedResp, s.embedRespErr
|
return s.embedResp, s.embedRespErr
|
||||||
}
|
}
|
||||||
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
|
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
|
||||||
|
|
45
server/testdata/tools/xlam.gotmpl
vendored
45
server/testdata/tools/xlam.gotmpl
vendored
|
@ -1,45 +0,0 @@
|
||||||
{{- if .System }}{{ .System }}
|
|
||||||
{{ end }}
|
|
||||||
{{- range $i, $_ := .Messages }}
|
|
||||||
{{- if eq .Role "user" }}### Instruction:
|
|
||||||
{{- if and $.Tools (le (len (slice $.Messages $i)) 2) }}
|
|
||||||
[BEGIN OF TASK INSTRUCTION]
|
|
||||||
You are an expert in composing functions. You are given a question and a set of possible functions.
|
|
||||||
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
|
|
||||||
If none of the functions can be used, point it out and refuse to answer.
|
|
||||||
If the given question lacks the parameters required by the function, also point it out.
|
|
||||||
[END OF TASK INSTRUCTION]
|
|
||||||
|
|
||||||
[BEGIN OF AVAILABLE TOOLS]
|
|
||||||
{{ $.Tools }}
|
|
||||||
[END OF AVAILABLE TOOLS]
|
|
||||||
|
|
||||||
[BEGIN OF FORMAT INSTRUCTION]
|
|
||||||
The output MUST strictly adhere to the following JSON format, and NO other text MUST be included.
|
|
||||||
The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'.
|
|
||||||
```
|
|
||||||
{
|
|
||||||
"tool_calls": [
|
|
||||||
{"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},
|
|
||||||
... (more tool calls as required)
|
|
||||||
]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
[END OF FORMAT INSTRUCTION]
|
|
||||||
|
|
||||||
[BEGIN OF QUERY]
|
|
||||||
{{ .Content }}
|
|
||||||
[END OF QUERY]
|
|
||||||
|
|
||||||
|
|
||||||
{{ else }}
|
|
||||||
{{ .Content }}
|
|
||||||
{{ end }}
|
|
||||||
{{- else if .ToolCalls }}### Response:
|
|
||||||
{"tool_calls": [{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}{{ end }}]}
|
|
||||||
<|EOT|>
|
|
||||||
{{ else if eq .Role "assistant" }}### Response:
|
|
||||||
{{ .Content }}
|
|
||||||
<|EOT|>
|
|
||||||
{{ end }}
|
|
||||||
{{- end }}### Response:
|
|
40
server/testdata/tools/xlam.out
vendored
40
server/testdata/tools/xlam.out
vendored
|
@ -1,40 +0,0 @@
|
||||||
You are a knowledgable assistant. You can answer questions and perform tasks.
|
|
||||||
### Instruction:
|
|
||||||
What's the weather like today in Paris?
|
|
||||||
### Response:
|
|
||||||
{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}]}
|
|
||||||
<|EOT|>
|
|
||||||
### Response:
|
|
||||||
The current temperature in Paris, France is 22 degrees Celsius.
|
|
||||||
<|EOT|>
|
|
||||||
### Instruction:
|
|
||||||
[BEGIN OF TASK INSTRUCTION]
|
|
||||||
You are an expert in composing functions. You are given a question and a set of possible functions.
|
|
||||||
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
|
|
||||||
If none of the functions can be used, point it out and refuse to answer.
|
|
||||||
If the given question lacks the parameters required by the function, also point it out.
|
|
||||||
[END OF TASK INSTRUCTION]
|
|
||||||
|
|
||||||
[BEGIN OF AVAILABLE TOOLS]
|
|
||||||
[{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the users location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}]
|
|
||||||
[END OF AVAILABLE TOOLS]
|
|
||||||
|
|
||||||
[BEGIN OF FORMAT INSTRUCTION]
|
|
||||||
The output MUST strictly adhere to the following JSON format, and NO other text MUST be included.
|
|
||||||
The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'.
|
|
||||||
```
|
|
||||||
{
|
|
||||||
"tool_calls": [
|
|
||||||
{"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},
|
|
||||||
... (more tool calls as required)
|
|
||||||
]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
[END OF FORMAT INSTRUCTION]
|
|
||||||
|
|
||||||
[BEGIN OF QUERY]
|
|
||||||
What's the weather like today in San Francisco and Toronto?
|
|
||||||
[END OF QUERY]
|
|
||||||
|
|
||||||
|
|
||||||
### Response:
|
|
|
@ -254,7 +254,7 @@ func (b *blobUpload) uploadPart(ctx context.Context, method string, requestURL *
|
||||||
|
|
||||||
// retry uploading to the redirect URL
|
// retry uploading to the redirect URL
|
||||||
for try := range maxRetries {
|
for try := range maxRetries {
|
||||||
err = b.uploadPart(ctx, http.MethodPut, redirectURL, part, ®istryOptions{})
|
err = b.uploadPart(ctx, http.MethodPut, redirectURL, part, nil)
|
||||||
switch {
|
switch {
|
||||||
case errors.Is(err, context.Canceled):
|
case errors.Is(err, context.Canceled):
|
||||||
return err
|
return err
|
||||||
|
|
|
@ -1,8 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"<start_system>",
|
|
||||||
"<end_message>",
|
|
||||||
"<start_user>",
|
|
||||||
"<start_assistant>"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,6 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"### Instruction:",
|
|
||||||
"### Response"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,6 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"<|im_start|>",
|
|
||||||
"<|im_end|>"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,8 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"System:",
|
|
||||||
"User:",
|
|
||||||
"Assistant:",
|
|
||||||
"<|begin_of_text|>"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,7 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"Source:",
|
|
||||||
"Destination:",
|
|
||||||
"<step>"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,6 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"User:",
|
|
||||||
"Assistant:"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,6 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"<start_of_turn>",
|
|
||||||
"<end_of_turn>"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,7 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"System:",
|
|
||||||
"Question:",
|
|
||||||
"Answer:"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,8 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"[INST]",
|
|
||||||
"[/INST]",
|
|
||||||
"<<SYS>>",
|
|
||||||
"<</SYS>>"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,7 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"<|start_header_id|>",
|
|
||||||
"<|end_header_id|>",
|
|
||||||
"<|eot_id|>"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,6 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"@@ Instruction",
|
|
||||||
"@@ Response"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,6 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"<|im_start|>",
|
|
||||||
"<|im_end|>"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,5 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"<|end_of_turn|>"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,8 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"<|end|>",
|
|
||||||
"<|system|>",
|
|
||||||
"<|user|>",
|
|
||||||
"<|assistant|>"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,7 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"### System:",
|
|
||||||
"### User:",
|
|
||||||
"### Assistant"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,7 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"### Instruction",
|
|
||||||
"### Response",
|
|
||||||
"<|endoftext|>"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -23,7 +23,6 @@ import (
|
||||||
var indexBytes []byte
|
var indexBytes []byte
|
||||||
|
|
||||||
//go:embed *.gotmpl
|
//go:embed *.gotmpl
|
||||||
//go:embed *.json
|
|
||||||
var templatesFS embed.FS
|
var templatesFS embed.FS
|
||||||
|
|
||||||
var templatesOnce = sync.OnceValues(func() ([]*named, error) {
|
var templatesOnce = sync.OnceValues(func() ([]*named, error) {
|
||||||
|
@ -40,15 +39,6 @@ var templatesOnce = sync.OnceValues(func() ([]*named, error) {
|
||||||
|
|
||||||
// normalize line endings
|
// normalize line endings
|
||||||
t.Bytes = bytes.ReplaceAll(bts, []byte("\r\n"), []byte("\n"))
|
t.Bytes = bytes.ReplaceAll(bts, []byte("\r\n"), []byte("\n"))
|
||||||
|
|
||||||
params, err := templatesFS.ReadFile(t.Name + ".json")
|
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := json.Unmarshal(params, &t.Parameters); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return templates, nil
|
return templates, nil
|
||||||
|
@ -58,10 +48,6 @@ type named struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Template string `json:"template"`
|
Template string `json:"template"`
|
||||||
Bytes []byte
|
Bytes []byte
|
||||||
|
|
||||||
Parameters *struct {
|
|
||||||
Stop []string `json:"stop"`
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t named) Reader() io.Reader {
|
func (t named) Reader() io.Reader {
|
||||||
|
|
|
@ -1,6 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"USER:",
|
|
||||||
"ASSISTANT:"
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,8 +0,0 @@
|
||||||
{
|
|
||||||
"stop": [
|
|
||||||
"<|system|>",
|
|
||||||
"</s>",
|
|
||||||
"<|user|>",
|
|
||||||
"<|assistant|>"
|
|
||||||
]
|
|
||||||
}
|
|
Loading…
Reference in a new issue