Merge branch 'ollama:main' into mannix-server

This commit is contained in:
ManniX-ITA 2024-04-18 18:45:15 +02:00 committed by GitHub
commit c496967e56
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 148 additions and 204 deletions

View file

@ -0,0 +1,60 @@
name: Bug report
labels: [bug]
description: Something isn't working right.
body:
- type: textarea
id: description
attributes:
label: What is the issue?
description: What happened? What did you expect to happen?
validations:
required: true
- type: dropdown
id: os
attributes:
label: OS
description: Which operating system are you using?
multiple: true
options:
- Linux
- macOS
- Windows
- Docker
- WSL2
validations:
required: false
- type: dropdown
id: gpu
attributes:
label: GPU
description: Which GPU are you using?
multiple: true
options:
- Nvidia
- AMD
- Intel
- Apple
- Other
validations:
required: false
- type: dropdown
id: cpu
attributes:
label: CPU
description: Which CPU are you using?
multiple: true
options:
- Intel
- AMD
- Apple
- Other
validations:
required: false
- type: input
id: version
attributes:
label: Ollama version
description: What version of Ollama are you using? (`ollama --version`)
placeholder: e.g., 0.1.32
validations:
required: false

View file

@ -1,18 +0,0 @@
name: Model request
description: Request a new model for the library
labels: [mr]
body:
- type: markdown
attributes:
value: |
Please check if your Model request is [already available](https://ollama.com/search) or that you cannot [import it](https://github.com/ollama/ollama/blob/main/docs/import.md#import-a-model) yourself.
Tell us about which Model you'd like to see in the library!
- type: textarea
id: problem
attributes:
label: What model would you like?
description: Please provide a link to the model.
- type: markdown
attributes:
value: |
Thanks for filing a model request!

View file

@ -0,0 +1,6 @@
---
name: Feature request
about: Request a new feature
labels: feature request
---

View file

@ -1,41 +0,0 @@
name: Feature request
description: Propose a new feature
labels: [needs-triage, fr]
body:
- type: markdown
attributes:
value: |
Please check if your feature request is [already filed](https://github.com/ollama/ollama/issues).
Tell us about your idea!
- type: textarea
id: problem
attributes:
label: What are you trying to do?
description: Tell us about the problem you're trying to solve.
validations:
required: false
- type: textarea
id: solution
attributes:
label: How should we solve this?
description: If you have an idea of how you'd like to see this feature work, let us know.
validations:
required: false
- type: textarea
id: alternative
attributes:
label: What is the impact of not solving this?
description: (How) Are you currently working around the issue?
validations:
required: false
- type: textarea
id: context
attributes:
label: Anything else?
description: Any additional context to share, e.g., links
validations:
required: false
- type: markdown
attributes:
value: |
Thanks for filing a feature request!

View file

@ -0,0 +1,5 @@
---
name: Model request
about: Request support for a new model to be added to Ollama
labels: model request
---

View file

@ -1,125 +0,0 @@
name: Bug report
description: File a bug report. If you need help, please join our Discord server.
labels: [needs-triage, bug]
body:
- type: markdown
attributes:
value: |
Please check if your bug is [already filed](https://github.com/ollama/ollama/issues) before filing a new one.
- type: textarea
id: what-happened
attributes:
label: What is the issue?
description: What happened? What did you expect to happen?
validations:
required: true
- type: textarea
id: what-was-expected
attributes:
label: What did you expect to see?
description: What did you expect to see/happen instead?
validations:
required: false
- type: textarea
id: steps
attributes:
label: Steps to reproduce
description: What are the steps you took that hit this issue?
validations:
required: false
- type: textarea
id: changes
attributes:
label: Are there any recent changes that introduced the issue?
description: If so, what are those changes?
validations:
required: false
- type: dropdown
id: os
attributes:
label: OS
description: What OS are you using? You may select more than one.
multiple: true
options:
- Linux
- macOS
- Windows
- Other
validations:
required: false
- type: dropdown
id: architecture
attributes:
label: Architecture
description: What architecture are you using? You may select more than one.
multiple: true
options:
- arm64
- amd64
- x86
- Other
- type: dropdown
id: platform
attributes:
label: Platform
description: What platform are you using? You may select more than one.
multiple: true
options:
- Docker
- WSL
- WSL2
validations:
required: false
- type: input
id: ollama-version
attributes:
label: Ollama version
description: What Ollama version are you using? (`ollama --version`)
placeholder: e.g., 1.14.4
validations:
required: false
- type: dropdown
id: gpu
attributes:
label: GPU
description: What GPU, if any, are you using? You may select more than one.
multiple: true
options:
- Nvidia
- AMD
- Intel
- Apple
- Other
validations:
required: false
- type: textarea
id: gpu-info
attributes:
label: GPU info
description: What GPU info do you have? (`nvidia-smi`, `rocminfo`, `system_profiler SPDisplaysDataType`, etc.)
validations:
required: false
- type: dropdown
id: cpu
attributes:
label: CPU
description: What CPU are you using? You may select more than one.
multiple: true
options:
- Intel
- AMD
- Apple
- Other
validations:
required: false
- type: textarea
id: other-software
attributes:
label: Other software
description: What other software are you using that might be related to this issue?
validations:
required: false
- type: markdown
attributes:
value: |
Thanks for filing a bug report!

View file

@ -60,7 +60,6 @@ Here are some example models that can be downloaded:
| Llama 2 13B | 13B | 7.3GB | `ollama run llama2:13b` |
| Llama 2 70B | 70B | 39GB | `ollama run llama2:70b` |
| Orca Mini | 3B | 1.9GB | `ollama run orca-mini` |
| Vicuna | 7B | 3.8GB | `ollama run vicuna` |
| LLaVA | 7B | 4.5GB | `ollama run llava` |
| Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
| Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
@ -378,3 +377,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
- [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
- [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
### Supported backends
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.

View file

@ -164,7 +164,8 @@ func (ts Tensors) Layers() map[string]Layer {
for _, t := range ts {
parts := strings.Split(t.Name, ".")
if parts[0] == "blk" {
parts = parts[1:]
// join first and second part, e.g. blk.%d
parts = append([]string{fmt.Sprintf("%s.%s", parts[0], parts[1])}, parts[2:]...)
}
if _, ok := layers[parts[0]]; !ok {
@ -380,6 +381,12 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
)
partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
case "stablelm":
fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
partialOffload = max(
4*batch*(vocab+2*embedding),
fullOffload,
)
}
return

View file

@ -248,13 +248,17 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
}
padding := llm.padding(offset, int64(alignment))
if _, err := rs.Seek(padding-offset, io.SeekCurrent); err != nil {
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err
}
for _, tensor := range llm.tensors {
padded := (int64(tensor.size()) + int64(alignment) - 1) & ^(int64(alignment) - 1)
if _, err := rs.Seek(padded, io.SeekCurrent); err != nil {
if _, err := rs.Seek(int64(tensor.size()), io.SeekCurrent); err != nil {
return err
}
padding := llm.padding(int64(tensor.size()), int64(alignment))
if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
return err
}
}
@ -623,8 +627,9 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
return err
}
padding := llm.padding(offset, 32)
if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding-offset))); err != nil {
var alignment int64 = 32
padding := llm.padding(offset, alignment)
if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding))); err != nil {
return err
}
@ -638,8 +643,8 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
return err
}
padding := llm.padding(offset, 32)
if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding-offset))); err != nil {
padding := llm.padding(offset, alignment)
if err := binary.Write(ws, llm.ByteOrder, bytes.Repeat([]byte{0}, int(padding))); err != nil {
return err
}
}
@ -648,5 +653,5 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
}
func (gguf) padding(offset, align int64) int64 {
return (offset + align - 1) / align * align
return (align - offset%align) % align
}

View file

@ -97,7 +97,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
var layerCount int
layers := ggml.Tensors().Layers()
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
memoryLayer := layers[fmt.Sprintf("%d", i)].size()
memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
// KV is proportional to the number of layers
memoryLayer += kv / ggml.KV().BlockCount()
@ -109,7 +109,13 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
}
}
memoryLayerOutput := layers["output"].size()
var memoryLayerOutput uint64
for k, v := range layers {
if !strings.HasPrefix(k, "blk.") {
memoryLayerOutput += v.size()
}
}
memoryRequiredTotal += memoryLayerOutput
if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
@ -124,16 +130,47 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
opts.NumGPU = layerCount
}
memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
slog.Info(
"offload to gpu",
"reallayers", opts.NumGPU,
"layers", layerCount,
"required", format.HumanBytes2(memoryRequiredTotal),
"used", format.HumanBytes2(memoryRequiredPartial),
"available", format.HumanBytes2(memoryAvailable),
"kv", format.HumanBytes2(kv),
"fulloffload", format.HumanBytes2(graphFullOffload),
"partialoffload", format.HumanBytes2(graphPartialOffload),
slog.Group(
"layers",
// actual number of layers offloaded
"real", opts.NumGPU,
// estimated number of layers that can be offloaded
"estimate", layerCount,
),
slog.Group(
"memory",
// memory available for offloading
"available", format.HumanBytes2(memoryAvailable),
slog.Group(
"required",
// memory required for full offloading
"full", format.HumanBytes2(memoryRequiredTotal),
// memory required to offload layers.estimate layers
"partial", format.HumanBytes2(memoryRequiredPartial),
// memory of KV cache
"kv", format.HumanBytes2(kv),
),
slog.Group(
"weights",
// memory of the weights
"total", format.HumanBytes2(memoryWeights),
// memory of repeating layers
"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
// memory of non-repeating layers
"nonrepeating", format.HumanBytes2(memoryLayerOutput),
),
slog.Group(
"graph",
// memory of graph when fully offloaded
"full", format.HumanBytes2(graphFullOffload),
// memory of graph when not fully offloaded
"partial", format.HumanBytes2(graphPartialOffload),
),
),
)
if len(adapters) > 1 {

View file

@ -521,6 +521,8 @@ func parts(s string) iter_Seq2[PartKind, string] {
return
}
state, j, partLen = PartModel, i, 0
case PartHost:
// noop: support for host:port
default:
yield(PartExtraneous, s[i+1:j])
return
@ -678,6 +680,9 @@ func isValidByteFor(kind PartKind, c byte) bool {
if kind == PartNamespace && c == '.' {
return false
}
if kind == PartHost && c == ':' {
return true
}
if c == '.' || c == '-' {
return true
}

View file

@ -40,6 +40,7 @@ var testNames = map[string]fields{
"user/model": {namespace: "user", model: "model"},
"example.com/ns/mistral:7b+Q4_0": {host: "example.com", namespace: "ns", model: "mistral", tag: "7b", build: "Q4_0"},
"example.com/ns/mistral:7b+X": {host: "example.com", namespace: "ns", model: "mistral", tag: "7b", build: "X"},
"localhost:5000/ns/mistral": {host: "localhost:5000", namespace: "ns", model: "mistral"},
// invalid digest
"mistral:latest@invalid256-": {},