Merge branch 'ollama:main' into arm64static

commit ea4c284a48

8 changed files with 125 additions and 188 deletions
.github/ISSUE_TEMPLATE/10_bug_report.yml (vendored, new file, 60 lines)
@@ -0,0 +1,60 @@
+name: Bug report
+labels: [bug]
+description: Something isn't working right.
+body:
+  - type: textarea
+    id: description
+    attributes:
+      label: What is the issue?
+      description: What happened? What did you expect to happen?
+    validations:
+      required: true
+  - type: dropdown
+    id: os
+    attributes:
+      label: OS
+      description: Which operating system are you using?
+      multiple: true
+      options:
+        - Linux
+        - macOS
+        - Windows
+        - Docker
+        - WSL2
+    validations:
+      required: false
+  - type: dropdown
+    id: gpu
+    attributes:
+      label: GPU
+      description: Which GPU are you using?
+      multiple: true
+      options:
+        - Nvidia
+        - AMD
+        - Intel
+        - Apple
+        - Other
+    validations:
+      required: false
+  - type: dropdown
+    id: cpu
+    attributes:
+      label: CPU
+      description: Which CPU are you using?
+      multiple: true
+      options:
+        - Intel
+        - AMD
+        - Apple
+        - Other
+    validations:
+      required: false
+  - type: input
+    id: version
+    attributes:
+      label: Ollama version
+      description: What version of Ollama are you using? (`ollama --version`)
+      placeholder: e.g., 0.1.32
+    validations:
+      required: false
.github/ISSUE_TEMPLATE/10_model_request.yml (vendored, deleted, 18 lines)
@@ -1,18 +0,0 @@
-name: Model request
-description: Request a new model for the library
-labels: [mr]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Please check if your Model request is [already available](https://ollama.com/search) or that you cannot [import it](https://github.com/ollama/ollama/blob/main/docs/import.md#import-a-model) yourself.
-        Tell us about which Model you'd like to see in the library!
-  - type: textarea
-    id: problem
-    attributes:
-      label: What model would you like?
-      description: Please provide a link to the model.
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for filing a model request!
.github/ISSUE_TEMPLATE/20_feature_request.yml (vendored, 36 changes)
@@ -1,41 +1,11 @@
 name: Feature request
-description: Request a new feature.
-labels: [needs-triage, fr]
+description: Propose a new feature
+labels: ['feature request']
 body:
-  - type: markdown
-    attributes:
-      value: |
-        Please check if your feature request is [already filed](https://github.com/ollama/ollama/issues).
-        Tell us about your idea!
   - type: textarea
     id: problem
     attributes:
-      label: What are you trying to do?
+      label: What new feature would you like to see?
       description: Tell us about the problem you're trying to solve.
     validations:
       required: false
-  - type: textarea
-    id: solution
-    attributes:
-      label: How should we solve this?
-      description: If you have an idea of how you'd like to see this feature work, let us know.
-    validations:
-      required: false
-  - type: textarea
-    id: alternative
-    attributes:
-      label: What is the impact of not solving this?
-      description: (How) Are you currently working around the issue?
-    validations:
-      required: false
-  - type: textarea
-    id: context
-    attributes:
-      label: Anything else?
-      description: Any additional context to share, e.g., links
-    validations:
-      required: false
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for filing a feature request!
.github/ISSUE_TEMPLATE/30_model_request.yml (vendored, new file, 9 lines)
@@ -0,0 +1,9 @@
+name: Model request
+labels: ['model request']
+description: Request a new model.
+body:
+  - type: textarea
+    id: problem
+    attributes:
+      label: What model would you like?
+      description: Please provide a link to the model.
.github/ISSUE_TEMPLATE/90_bug_report.yml (vendored, deleted, 125 lines)
@@ -1,125 +0,0 @@
-name: Bug report
-description: File a bug report. If you need help, please join our Discord server.
-labels: [needs-triage, bug]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Please check if your bug is [already filed](https://github.com/ollama/ollama/issues) before filing a new one.
-  - type: textarea
-    id: what-happened
-    attributes:
-      label: What is the issue?
-      description: What happened? What did you expect to happen?
-    validations:
-      required: true
-  - type: textarea
-    id: what-was-expected
-    attributes:
-      label: What did you expect to see?
-      description: What did you expect to see/happen instead?
-    validations:
-      required: false
-  - type: textarea
-    id: steps
-    attributes:
-      label: Steps to reproduce
-      description: What are the steps you took that hit this issue?
-    validations:
-      required: false
-  - type: textarea
-    id: changes
-    attributes:
-      label: Are there any recent changes that introduced the issue?
-      description: If so, what are those changes?
-    validations:
-      required: false
-  - type: dropdown
-    id: os
-    attributes:
-      label: OS
-      description: What OS are you using? You may select more than one.
-      multiple: true
-      options:
-        - Linux
-        - macOS
-        - Windows
-        - Other
-    validations:
-      required: false
-  - type: dropdown
-    id: architecture
-    attributes:
-      label: Architecture
-      description: What architecture are you using? You may select more than one.
-      multiple: true
-      options:
-        - arm64
-        - amd64
-        - x86
-        - Other
-  - type: dropdown
-    id: platform
-    attributes:
-      label: Platform
-      description: What platform are you using? You may select more than one.
-      multiple: true
-      options:
-        - Docker
-        - WSL
-        - WSL2
-    validations:
-      required: false
-  - type: input
-    id: ollama-version
-    attributes:
-      label: Ollama version
-      description: What Ollama version are you using? (`ollama --version`)
-      placeholder: e.g., 1.14.4
-    validations:
-      required: false
-  - type: dropdown
-    id: gpu
-    attributes:
-      label: GPU
-      description: What GPU, if any, are you using? You may select more than one.
-      multiple: true
-      options:
-        - Nvidia
-        - AMD
-        - Intel
-        - Apple
-        - Other
-    validations:
-      required: false
-  - type: textarea
-    id: gpu-info
-    attributes:
-      label: GPU info
-      description: What GPU info do you have? (`nvidia-smi`, `rocminfo`, `system_profiler SPDisplaysDataType`, etc.)
-    validations:
-      required: false
-  - type: dropdown
-    id: cpu
-    attributes:
-      label: CPU
-      description: What CPU are you using? You may select more than one.
-      multiple: true
-      options:
-        - Intel
-        - AMD
-        - Apple
-        - Other
-    validations:
-      required: false
-  - type: textarea
-    id: other-software
-    attributes:
-      label: Other software
-      description: What other software are you using that might be related to this issue?
-    validations:
-      required: false
-  - type: markdown
-    attributes:
-      value: |
-        Thanks for filing a bug report!
@@ -60,7 +60,6 @@ Here are some example models that can be downloaded:
 | Llama 2 13B | 13B | 7.3GB | `ollama run llama2:13b` |
 | Llama 2 70B | 70B | 39GB | `ollama run llama2:70b` |
 | Orca Mini | 3B | 1.9GB | `ollama run orca-mini` |
-| Vicuna | 7B | 3.8GB | `ollama run vicuna` |
 | LLaVA | 7B | 4.5GB | `ollama run llava` |
 | Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
 | Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
@@ -378,3 +377,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Page Assist](https://github.com/n4ze3m/page-assist) (Chrome Extension)
 - [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
 - [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
+
+### Supported backends
+- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.
@@ -164,7 +164,8 @@ func (ts Tensors) Layers() map[string]Layer {
 	for _, t := range ts {
 		parts := strings.Split(t.Name, ".")
 		if parts[0] == "blk" {
-			parts = parts[1:]
+			// join first and second part, e.g. blk.%d
+			parts = append([]string{fmt.Sprintf("%s.%s", parts[0], parts[1])}, parts[2:]...)
 		}

 		if _, ok := layers[parts[0]]; !ok {
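The effect of the hunk above: repeating block tensors are now grouped per block under composite keys like `blk.0`, instead of all collapsing into a single `blk` bucket. A minimal sketch of the new grouping behaviour, using made-up tensor names and a plain string map standing in for the real Tensors/Layer types:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical GGUF tensor names for illustration.
	names := []string{
		"blk.0.attn_norm.weight",
		"blk.0.ffn_down.weight",
		"blk.1.attn_norm.weight",
		"output.weight",
		"token_embd.weight",
	}

	layers := map[string][]string{}
	for _, name := range names {
		parts := strings.Split(name, ".")
		if parts[0] == "blk" {
			// join first and second part, e.g. blk.%d
			parts = append([]string{fmt.Sprintf("%s.%s", parts[0], parts[1])}, parts[2:]...)
		}
		layers[parts[0]] = append(layers[parts[0]], name)
	}

	// Expected grouping: blk.0 -> 2, blk.1 -> 1, output -> 1, token_embd -> 1
	for key, ts := range layers {
		fmt.Println(key, len(ts))
	}
}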
@@ -97,7 +97,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 	var layerCount int
 	layers := ggml.Tensors().Layers()
 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		memoryLayer := layers[fmt.Sprintf("%d", i)].size()
+		memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()

 		// KV is proportional to the number of layers
 		memoryLayer += kv / ggml.KV().BlockCount()
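Since `Layers()` now keys repeating blocks as `blk.%d`, the per-block lookup has to use the same format; the old bare `"%d"` key would miss, and a Go map lookup that misses returns the zero value, silently under-counting layer memory. A toy illustration (the sizes map is hypothetical):

package main

import "fmt"

func main() {
	// Hypothetical per-layer sizes keyed the new way.
	layers := map[string]uint64{"blk.0": 100, "blk.1": 100}

	// A missing key yields the zero value, not an error.
	fmt.Println(layers[fmt.Sprintf("%d", 0)])     // 0 (old key, misses)
	fmt.Println(layers[fmt.Sprintf("blk.%d", 0)]) // 100 (new key, matches)
}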
@@ -109,7 +109,14 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 		}
 	}

-	memoryLayerOutput := layers["output"].size()
+	var memoryLayerOutput uint64
+	for k, v := range layers {
+		if !strings.HasPrefix(k, "blk.") {
+			slog.Info("aaa", "name", k, "size", format.HumanBytes2(v.size()))
+			memoryLayerOutput += v.size()
+		}
+	}

 	memoryRequiredTotal += memoryLayerOutput

 	if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
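Rather than assuming the only non-repeating weights live under `output`, the new loop sums every group that is not a `blk.` layer (e.g. `output`, `token_embd`); the `slog.Info("aaa", ...)` line appears to be leftover debug logging in the commit. A rough sketch of the same accounting over a hypothetical size map:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical layer-group sizes in bytes.
	layers := map[string]uint64{
		"blk.0":      512,
		"blk.1":      512,
		"output":     256,
		"token_embd": 128,
	}

	// Sum everything that is not a repeating "blk." layer.
	var memoryLayerOutput uint64
	for k, v := range layers {
		if !strings.HasPrefix(k, "blk.") {
			memoryLayerOutput += v
		}
	}
	fmt.Println(memoryLayerOutput) // 384: output + token_embd
}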
@@ -124,16 +131,47 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 		opts.NumGPU = layerCount
 	}

+	memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
+
 	slog.Info(
 		"offload to gpu",
-		"reallayers", opts.NumGPU,
-		"layers", layerCount,
-		"required", format.HumanBytes2(memoryRequiredTotal),
-		"used", format.HumanBytes2(memoryRequiredPartial),
-		"available", format.HumanBytes2(memoryAvailable),
-		"kv", format.HumanBytes2(kv),
-		"fulloffload", format.HumanBytes2(graphFullOffload),
-		"partialoffload", format.HumanBytes2(graphPartialOffload),
+		slog.Group(
+			"layers",
+			// actual number of layers offloaded
+			"real", opts.NumGPU,
+			// estimated number of layers that can be offloaded
+			"estimate", layerCount,
+		),
+		slog.Group(
+			"memory",
+			// memory available for offloading
+			"available", format.HumanBytes2(memoryAvailable),
+			slog.Group(
+				"required",
+				// memory required for full offloading
+				"full", format.HumanBytes2(memoryRequiredTotal),
+				// memory required to offload layers.estimate layers
+				"partial", format.HumanBytes2(memoryRequiredPartial),
+				// memory of KV cache
+				"kv", format.HumanBytes2(kv),
+			),
+			slog.Group(
+				"weights",
+				// memory of the weights
+				"total", format.HumanBytes2(memoryWeights),
+				// memory of repeating layers
+				"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
+				// memory of non-repeating layers
+				"nonrepeating", format.HumanBytes2(memoryLayerOutput),
+			),
+			slog.Group(
+				"graph",
+				// memory of graph when fully offloaded
+				"full", format.HumanBytes2(graphFullOffload),
+				// memory of graph when not fully offloaded
+				"partial", format.HumanBytes2(graphPartialOffload),
+			),
+		),
 	)

 	if len(adapters) > 1 {
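The restructured log statement relies on `log/slog` groups: `slog.Group` nests key-value pairs (and other groups) under one key, so a structured handler renders them as nested objects rather than a flat attribute list. A standalone sketch with made-up values:

package main

import (
	"log/slog"
	"os"
)

func main() {
	logger := slog.New(slog.NewJSONHandler(os.Stderr, nil))

	// Grouped attributes render as nested JSON objects.
	logger.Info(
		"offload to gpu",
		slog.Group("layers",
			"real", 32,
			"estimate", 33,
		),
		slog.Group("memory",
			"available", "24.0 GiB",
			slog.Group("required",
				"full", "11.3 GiB",
				"partial", "11.3 GiB",
			),
		),
	)
}

With the JSON handler this prints something like {"msg":"offload to gpu","layers":{"real":32,"estimate":33},"memory":{"available":"24.0 GiB","required":{...}}}, which is easier to scan and parse than the old flat key list.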