Compare commits
No commits in common. "77dfd8ba16ed8e7569cb0763c926bfeb1d5da656" and "1134287d3833ce98a02202d6a5174fe9832ab6e9" have entirely different histories.
77dfd8ba16
...
1134287d38
341 changed files with 11562 additions and 19591 deletions
2
.github/workflows/test.yaml
vendored
2
.github/workflows/test.yaml
vendored
|
@ -333,4 +333,4 @@ jobs:
|
||||||
submodules: recursive
|
submodules: recursive
|
||||||
- name: Verify patches carry all the changes
|
- name: Verify patches carry all the changes
|
||||||
run: |
|
run: |
|
||||||
cd llama && make apply-patches sync && git diff --compact-summary --exit-code .
|
cd llama && ./sync.sh && git diff --compact-summary --exit-code .
|
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -15,4 +15,3 @@ build/*/*/*
|
||||||
!build/**/placeholder
|
!build/**/placeholder
|
||||||
llama/build
|
llama/build
|
||||||
__debug_bin*
|
__debug_bin*
|
||||||
llama/vendor
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
|
||||||
|
2 KiB
|
||||||
|
Docker
|
||||||
# Build stage
|
# Build stage
|
||||||
FROM golang:1.22-bookworm as build
|
FROM golang:1.22-bookworm as build
|
||||||
|
|
||||||
|
|
|
@ -330,7 +330,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||||
- [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface)
|
- [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface)
|
||||||
- [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
|
- [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
|
||||||
- [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
|
- [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
|
||||||
- [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
|
|
||||||
|
|
||||||
### Terminal
|
### Terminal
|
||||||
|
|
||||||
|
@ -412,13 +411,11 @@ See the [API documentation](./docs/api.md) for all endpoints.
|
||||||
- [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun)
|
- [High-level function abstraction in Go](https://gitlab.com/tozd/go/fun)
|
||||||
- [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)
|
- [Ollama PHP](https://github.com/ArdaGnsrn/ollama-php)
|
||||||
- [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama)
|
- [Agents-Flex for Java](https://github.com/agents-flex/agents-flex) with [example](https://github.com/agents-flex/agents-flex/tree/main/agents-flex-llm/agents-flex-llm-ollama/src/test/java/com/agentsflex/llm/ollama)
|
||||||
- [Ollama for Swift](https://github.com/mattt/ollama-swift)
|
|
||||||
|
|
||||||
### Mobile
|
### Mobile
|
||||||
|
|
||||||
- [Enchanted](https://github.com/AugustDev/enchanted)
|
- [Enchanted](https://github.com/AugustDev/enchanted)
|
||||||
- [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
|
- [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
|
||||||
- [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
|
|
||||||
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
|
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
|
||||||
|
|
||||||
### Extensions & Plugins
|
### Extensions & Plugins
|
||||||
|
|
68
cmd/cmd.go
68
cmd/cmd.go
|
@ -21,6 +21,7 @@ import (
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
"runtime"
|
"runtime"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
|
@ -46,58 +47,28 @@ import (
|
||||||
"github.com/ollama/ollama/version"
|
"github.com/ollama/ollama/version"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
|
||||||
errModelNotFound = errors.New("no Modelfile or safetensors files found")
|
|
||||||
errModelfileNotFound = errors.New("specified Modelfile wasn't found")
|
|
||||||
)
|
|
||||||
|
|
||||||
func getModelfileName(cmd *cobra.Command) (string, error) {
|
|
||||||
fn, _ := cmd.Flags().GetString("file")
|
|
||||||
|
|
||||||
filename := fn
|
|
||||||
if filename == "" {
|
|
||||||
filename = "Modelfile"
|
|
||||||
}
|
|
||||||
|
|
||||||
absName, err := filepath.Abs(filename)
|
|
||||||
if err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
|
|
||||||
_, err = os.Stat(absName)
|
|
||||||
if err != nil {
|
|
||||||
return fn, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return absName, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func CreateHandler(cmd *cobra.Command, args []string) error {
|
func CreateHandler(cmd *cobra.Command, args []string) error {
|
||||||
|
filename, _ := cmd.Flags().GetString("file")
|
||||||
|
filename, err := filepath.Abs(filename)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
client, err := api.ClientFromEnvironment()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
p := progress.NewProgress(os.Stderr)
|
p := progress.NewProgress(os.Stderr)
|
||||||
defer p.Stop()
|
defer p.Stop()
|
||||||
|
|
||||||
var reader io.Reader
|
|
||||||
|
|
||||||
filename, err := getModelfileName(cmd)
|
|
||||||
if os.IsNotExist(err) {
|
|
||||||
if filename == "" {
|
|
||||||
reader = strings.NewReader("FROM .\n")
|
|
||||||
} else {
|
|
||||||
return errModelfileNotFound
|
|
||||||
}
|
|
||||||
} else if err != nil {
|
|
||||||
return err
|
|
||||||
} else {
|
|
||||||
f, err := os.Open(filename)
|
f, err := os.Open(filename)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
reader = f
|
|
||||||
defer f.Close()
|
defer f.Close()
|
||||||
}
|
|
||||||
|
|
||||||
modelfile, err := parser.ParseFile(reader)
|
modelfile, err := parser.ParseFile(f)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
@ -112,11 +83,6 @@ func CreateHandler(cmd *cobra.Command, args []string) error {
|
||||||
p.Add(status, spinner)
|
p.Add(status, spinner)
|
||||||
defer p.Stop()
|
defer p.Stop()
|
||||||
|
|
||||||
client, err := api.ClientFromEnvironment()
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
for i := range modelfile.Commands {
|
for i := range modelfile.Commands {
|
||||||
switch modelfile.Commands[i].Name {
|
switch modelfile.Commands[i].Name {
|
||||||
case "model", "adapter":
|
case "model", "adapter":
|
||||||
|
@ -255,7 +221,7 @@ func tempZipFiles(path string) (string, error) {
|
||||||
// covers consolidated.x.pth, consolidated.pth
|
// covers consolidated.x.pth, consolidated.pth
|
||||||
files = append(files, pt...)
|
files = append(files, pt...)
|
||||||
} else {
|
} else {
|
||||||
return "", errModelNotFound
|
return "", errors.New("no safetensors or torch files found")
|
||||||
}
|
}
|
||||||
|
|
||||||
// add configuration files, json files are detected as text/plain
|
// add configuration files, json files are detected as text/plain
|
||||||
|
@ -487,7 +453,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
opts.MultiModal = len(info.ProjectorInfo) != 0
|
opts.MultiModal = slices.Contains(info.Details.Families, "clip")
|
||||||
opts.ParentModel = info.Details.ParentModel
|
opts.ParentModel = info.Details.ParentModel
|
||||||
|
|
||||||
if interactive {
|
if interactive {
|
||||||
|
@ -1318,7 +1284,7 @@ func NewCLI() *cobra.Command {
|
||||||
log.SetFlags(log.LstdFlags | log.Lshortfile)
|
log.SetFlags(log.LstdFlags | log.Lshortfile)
|
||||||
cobra.EnableCommandSorting = false
|
cobra.EnableCommandSorting = false
|
||||||
|
|
||||||
if runtime.GOOS == "windows" && term.IsTerminal(int(os.Stdout.Fd())) {
|
if runtime.GOOS == "windows" {
|
||||||
console.ConsoleFromFile(os.Stdin) //nolint:errcheck
|
console.ConsoleFromFile(os.Stdin) //nolint:errcheck
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1350,7 +1316,7 @@ func NewCLI() *cobra.Command {
|
||||||
RunE: CreateHandler,
|
RunE: CreateHandler,
|
||||||
}
|
}
|
||||||
|
|
||||||
createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"")
|
createCmd.Flags().StringP("file", "f", "Modelfile", "Name of the Modelfile")
|
||||||
createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")
|
createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")
|
||||||
|
|
||||||
showCmd := &cobra.Command{
|
showCmd := &cobra.Command{
|
||||||
|
|
|
@ -270,102 +270,3 @@ func TestDeleteHandler(t *testing.T) {
|
||||||
t.Fatalf("DeleteHandler failed: expected error about stopping non-existent model, got %v", err)
|
t.Fatalf("DeleteHandler failed: expected error about stopping non-existent model, got %v", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestGetModelfileName(t *testing.T) {
|
|
||||||
tests := []struct {
|
|
||||||
name string
|
|
||||||
modelfileName string
|
|
||||||
fileExists bool
|
|
||||||
expectedName string
|
|
||||||
expectedErr error
|
|
||||||
}{
|
|
||||||
{
|
|
||||||
name: "no modelfile specified, no modelfile exists",
|
|
||||||
modelfileName: "",
|
|
||||||
fileExists: false,
|
|
||||||
expectedName: "",
|
|
||||||
expectedErr: os.ErrNotExist,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "no modelfile specified, modelfile exists",
|
|
||||||
modelfileName: "",
|
|
||||||
fileExists: true,
|
|
||||||
expectedName: "Modelfile",
|
|
||||||
expectedErr: nil,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "modelfile specified, no modelfile exists",
|
|
||||||
modelfileName: "crazyfile",
|
|
||||||
fileExists: false,
|
|
||||||
expectedName: "crazyfile",
|
|
||||||
expectedErr: os.ErrNotExist,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "modelfile specified, modelfile exists",
|
|
||||||
modelfileName: "anotherfile",
|
|
||||||
fileExists: true,
|
|
||||||
expectedName: "anotherfile",
|
|
||||||
expectedErr: nil,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, tt := range tests {
|
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
|
||||||
cmd := &cobra.Command{
|
|
||||||
Use: "fakecmd",
|
|
||||||
}
|
|
||||||
cmd.Flags().String("file", "", "path to modelfile")
|
|
||||||
|
|
||||||
var expectedFilename string
|
|
||||||
|
|
||||||
if tt.fileExists {
|
|
||||||
tempDir, err := os.MkdirTemp("", "modelfiledir")
|
|
||||||
defer os.RemoveAll(tempDir)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("temp modelfile dir creation failed: %v", err)
|
|
||||||
}
|
|
||||||
var fn string
|
|
||||||
if tt.modelfileName != "" {
|
|
||||||
fn = tt.modelfileName
|
|
||||||
} else {
|
|
||||||
fn = "Modelfile"
|
|
||||||
}
|
|
||||||
|
|
||||||
tempFile, err := os.CreateTemp(tempDir, fn)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("temp modelfile creation failed: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
expectedFilename = tempFile.Name()
|
|
||||||
err = cmd.Flags().Set("file", expectedFilename)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("couldn't set file flag: %v", err)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if tt.modelfileName != "" {
|
|
||||||
expectedFilename = tt.modelfileName
|
|
||||||
err := cmd.Flags().Set("file", tt.modelfileName)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("couldn't set file flag: %v", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
actualFilename, actualErr := getModelfileName(cmd)
|
|
||||||
|
|
||||||
if actualFilename != expectedFilename {
|
|
||||||
t.Errorf("expected filename: '%s' actual filename: '%s'", expectedFilename, actualFilename)
|
|
||||||
}
|
|
||||||
|
|
||||||
if tt.expectedErr != os.ErrNotExist {
|
|
||||||
if actualErr != tt.expectedErr {
|
|
||||||
t.Errorf("expected err: %v actual err: %v", tt.expectedErr, actualErr)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if !os.IsNotExist(actualErr) {
|
|
||||||
t.Errorf("expected err: %v actual err: %v", tt.expectedErr, actualErr)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
@ -494,22 +494,28 @@ func buildModelfile(opts runOptions) string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func normalizeFilePath(fp string) string {
|
func normalizeFilePath(fp string) string {
|
||||||
return strings.NewReplacer(
|
// Define a map of escaped characters and their replacements
|
||||||
"\\ ", " ", // Escaped space
|
replacements := map[string]string{
|
||||||
"\\(", "(", // Escaped left parenthesis
|
"\\ ": " ", // Escaped space
|
||||||
"\\)", ")", // Escaped right parenthesis
|
"\\(": "(", // Escaped left parenthesis
|
||||||
"\\[", "[", // Escaped left square bracket
|
"\\)": ")", // Escaped right parenthesis
|
||||||
"\\]", "]", // Escaped right square bracket
|
"\\[": "[", // Escaped left square bracket
|
||||||
"\\{", "{", // Escaped left curly brace
|
"\\]": "]", // Escaped right square bracket
|
||||||
"\\}", "}", // Escaped right curly brace
|
"\\{": "{", // Escaped left curly brace
|
||||||
"\\$", "$", // Escaped dollar sign
|
"\\}": "}", // Escaped right curly brace
|
||||||
"\\&", "&", // Escaped ampersand
|
"\\$": "$", // Escaped dollar sign
|
||||||
"\\;", ";", // Escaped semicolon
|
"\\&": "&", // Escaped ampersand
|
||||||
"\\'", "'", // Escaped single quote
|
"\\;": ";", // Escaped semicolon
|
||||||
"\\\\", "\\", // Escaped backslash
|
"\\'": "'", // Escaped single quote
|
||||||
"\\*", "*", // Escaped asterisk
|
"\\\\": "\\", // Escaped backslash
|
||||||
"\\?", "?", // Escaped question mark
|
"\\*": "*", // Escaped asterisk
|
||||||
).Replace(fp)
|
"\\?": "?", // Escaped question mark
|
||||||
|
}
|
||||||
|
|
||||||
|
for escaped, actual := range replacements {
|
||||||
|
fp = strings.ReplaceAll(fp, escaped, actual)
|
||||||
|
}
|
||||||
|
return fp
|
||||||
}
|
}
|
||||||
|
|
||||||
func extractFileNames(input string) []string {
|
func extractFileNames(input string) []string {
|
||||||
|
@ -529,9 +535,10 @@ func extractFileData(input string) (string, []api.ImageData, error) {
|
||||||
for _, fp := range filePaths {
|
for _, fp := range filePaths {
|
||||||
nfp := normalizeFilePath(fp)
|
nfp := normalizeFilePath(fp)
|
||||||
data, err := getImageData(nfp)
|
data, err := getImageData(nfp)
|
||||||
if errors.Is(err, os.ErrNotExist) {
|
if err != nil {
|
||||||
|
if os.IsNotExist(err) {
|
||||||
continue
|
continue
|
||||||
} else if err != nil {
|
}
|
||||||
fmt.Fprintf(os.Stderr, "Couldn't process image: %q\n", err)
|
fmt.Fprintf(os.Stderr, "Couldn't process image: %q\n", err)
|
||||||
return "", imgs, err
|
return "", imgs, err
|
||||||
}
|
}
|
||||||
|
@ -539,7 +546,7 @@ func extractFileData(input string) (string, []api.ImageData, error) {
|
||||||
input = strings.ReplaceAll(input, fp, "")
|
input = strings.ReplaceAll(input, fp, "")
|
||||||
imgs = append(imgs, data)
|
imgs = append(imgs, data)
|
||||||
}
|
}
|
||||||
return strings.TrimSpace(input), imgs, nil
|
return input, imgs, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func getImageData(filePath string) ([]byte, error) {
|
func getImageData(filePath string) ([]byte, error) {
|
||||||
|
|
|
@ -29,7 +29,7 @@ type tensorData struct {
|
||||||
Shape []int `json:"shape"`
|
Shape []int `json:"shape"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
|
func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) {
|
||||||
t.Helper()
|
t.Helper()
|
||||||
|
|
||||||
f, err := os.CreateTemp(t.TempDir(), "f16")
|
f, err := os.CreateTemp(t.TempDir(), "f16")
|
||||||
|
@ -60,7 +60,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
|
||||||
return r, m.KV(), m.Tensors()
|
return r, m.KV(), m.Tensors()
|
||||||
}
|
}
|
||||||
|
|
||||||
func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tensors) map[string]string {
|
func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors llm.Tensors) map[string]string {
|
||||||
actual := make(map[string]string)
|
actual := make(map[string]string)
|
||||||
for k, v := range kv {
|
for k, v := range kv {
|
||||||
if s, ok := v.(json.Marshaler); !ok {
|
if s, ok := v.(json.Marshaler); !ok {
|
||||||
|
|
|
@ -1,186 +0,0 @@
|
||||||
package discover
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bufio"
|
|
||||||
"fmt"
|
|
||||||
"os"
|
|
||||||
"reflect"
|
|
||||||
"regexp"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/format"
|
|
||||||
)
|
|
||||||
|
|
||||||
var CudartGlobs = []string{
|
|
||||||
"/usr/local/cuda/lib64/libcudart.so*",
|
|
||||||
"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
|
|
||||||
"/usr/lib/x86_64-linux-gnu/libcudart.so*",
|
|
||||||
"/usr/lib/wsl/lib/libcudart.so*",
|
|
||||||
"/usr/lib/wsl/drivers/*/libcudart.so*",
|
|
||||||
"/opt/cuda/lib64/libcudart.so*",
|
|
||||||
"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
|
|
||||||
"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
|
|
||||||
"/usr/lib/aarch64-linux-gnu/libcudart.so*",
|
|
||||||
"/usr/local/cuda/lib*/libcudart.so*",
|
|
||||||
"/usr/lib*/libcudart.so*",
|
|
||||||
"/usr/local/lib*/libcudart.so*",
|
|
||||||
}
|
|
||||||
|
|
||||||
var NvmlGlobs = []string{}
|
|
||||||
|
|
||||||
var NvcudaGlobs = []string{
|
|
||||||
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
|
|
||||||
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
|
|
||||||
"/usr/lib/*-linux-gnu/libcuda.so*",
|
|
||||||
"/usr/lib/wsl/lib/libcuda.so*",
|
|
||||||
"/usr/lib/wsl/drivers/*/libcuda.so*",
|
|
||||||
"/opt/cuda/lib*/libcuda.so*",
|
|
||||||
"/usr/local/cuda/lib*/libcuda.so*",
|
|
||||||
"/usr/lib*/libcuda.so*",
|
|
||||||
"/usr/local/lib*/libcuda.so*",
|
|
||||||
}
|
|
||||||
|
|
||||||
var OneapiGlobs = []string{
|
|
||||||
"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
|
|
||||||
"/usr/lib*/libze_intel_gpu.so*",
|
|
||||||
}
|
|
||||||
|
|
||||||
var (
|
|
||||||
CudartMgmtName = "libcudart.so*"
|
|
||||||
NvcudaMgmtName = "libcuda.so*"
|
|
||||||
NvmlMgmtName = "" // not currently wired on linux
|
|
||||||
OneapiMgmtName = "libze_intel_gpu.so*"
|
|
||||||
)
|
|
||||||
|
|
||||||
func GetCPUMem() (memInfo, error) {
|
|
||||||
var mem memInfo
|
|
||||||
var total, available, free, buffers, cached, freeSwap uint64
|
|
||||||
f, err := os.Open("/proc/meminfo")
|
|
||||||
if err != nil {
|
|
||||||
return mem, err
|
|
||||||
}
|
|
||||||
defer f.Close()
|
|
||||||
s := bufio.NewScanner(f)
|
|
||||||
for s.Scan() {
|
|
||||||
line := s.Text()
|
|
||||||
switch {
|
|
||||||
case strings.HasPrefix(line, "MemTotal:"):
|
|
||||||
_, err = fmt.Sscanf(line, "MemTotal:%d", &total)
|
|
||||||
case strings.HasPrefix(line, "MemAvailable:"):
|
|
||||||
_, err = fmt.Sscanf(line, "MemAvailable:%d", &available)
|
|
||||||
case strings.HasPrefix(line, "MemFree:"):
|
|
||||||
_, err = fmt.Sscanf(line, "MemFree:%d", &free)
|
|
||||||
case strings.HasPrefix(line, "Buffers:"):
|
|
||||||
_, err = fmt.Sscanf(line, "Buffers:%d", &buffers)
|
|
||||||
case strings.HasPrefix(line, "Cached:"):
|
|
||||||
_, err = fmt.Sscanf(line, "Cached:%d", &cached)
|
|
||||||
case strings.HasPrefix(line, "SwapFree:"):
|
|
||||||
_, err = fmt.Sscanf(line, "SwapFree:%d", &freeSwap)
|
|
||||||
default:
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if err != nil {
|
|
||||||
return mem, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
mem.TotalMemory = total * format.KibiByte
|
|
||||||
mem.FreeSwap = freeSwap * format.KibiByte
|
|
||||||
if available > 0 {
|
|
||||||
mem.FreeMemory = available * format.KibiByte
|
|
||||||
} else {
|
|
||||||
mem.FreeMemory = (free + buffers + cached) * format.KibiByte
|
|
||||||
}
|
|
||||||
return mem, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
const CpuInfoFilename = "/proc/cpuinfo"
|
|
||||||
|
|
||||||
type linuxCpuInfo struct {
|
|
||||||
ID string `cpuinfo:"processor"`
|
|
||||||
VendorID string `cpuinfo:"vendor_id"`
|
|
||||||
ModelName string `cpuinfo:"model name"`
|
|
||||||
PhysicalID string `cpuinfo:"physical id"`
|
|
||||||
Siblings string `cpuinfo:"siblings"`
|
|
||||||
CoreID string `cpuinfo:"core id"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func GetCPUDetails() ([]CPU, error) {
|
|
||||||
file, err := os.Open(CpuInfoFilename)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
reColumns := regexp.MustCompile("\t+: ")
|
|
||||||
scanner := bufio.NewScanner(file)
|
|
||||||
cpuInfos := []linuxCpuInfo{}
|
|
||||||
cpu := &linuxCpuInfo{}
|
|
||||||
for scanner.Scan() {
|
|
||||||
line := scanner.Text()
|
|
||||||
if sl := reColumns.Split(line, 2); len(sl) > 1 {
|
|
||||||
t := reflect.TypeOf(cpu).Elem()
|
|
||||||
s := reflect.ValueOf(cpu).Elem()
|
|
||||||
for i := range t.NumField() {
|
|
||||||
field := t.Field(i)
|
|
||||||
tag := field.Tag.Get("cpuinfo")
|
|
||||||
if tag == sl[0] {
|
|
||||||
s.FieldByName(field.Name).SetString(sl[1])
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if strings.TrimSpace(line) == "" && cpu.ID != "" {
|
|
||||||
cpuInfos = append(cpuInfos, *cpu)
|
|
||||||
cpu = &linuxCpuInfo{}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process the sockets/cores/threads
|
|
||||||
socketByID := map[string]*CPU{}
|
|
||||||
coreBySocket := map[string]map[string]struct{}{}
|
|
||||||
threadsByCoreBySocket := map[string]map[string]int{}
|
|
||||||
for _, c := range cpuInfos {
|
|
||||||
if _, found := socketByID[c.PhysicalID]; !found {
|
|
||||||
socketByID[c.PhysicalID] = &CPU{
|
|
||||||
ID: c.PhysicalID,
|
|
||||||
VendorID: c.VendorID,
|
|
||||||
ModelName: c.ModelName,
|
|
||||||
}
|
|
||||||
coreBySocket[c.PhysicalID] = map[string]struct{}{}
|
|
||||||
threadsByCoreBySocket[c.PhysicalID] = map[string]int{}
|
|
||||||
}
|
|
||||||
if c.CoreID != "" {
|
|
||||||
coreBySocket[c.PhysicalID][c.PhysicalID+":"+c.CoreID] = struct{}{}
|
|
||||||
threadsByCoreBySocket[c.PhysicalID][c.PhysicalID+":"+c.CoreID]++
|
|
||||||
} else {
|
|
||||||
coreBySocket[c.PhysicalID][c.PhysicalID+":"+c.ID] = struct{}{}
|
|
||||||
threadsByCoreBySocket[c.PhysicalID][c.PhysicalID+":"+c.ID]++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tally up the values from the tracking maps
|
|
||||||
for id, s := range socketByID {
|
|
||||||
s.CoreCount = len(coreBySocket[id])
|
|
||||||
s.ThreadCount = 0
|
|
||||||
for _, tc := range threadsByCoreBySocket[id] {
|
|
||||||
s.ThreadCount += tc
|
|
||||||
}
|
|
||||||
|
|
||||||
// This only works if HT is enabled, consider a more reliable model, maybe cache size comparisons?
|
|
||||||
efficiencyCoreCount := 0
|
|
||||||
for _, threads := range threadsByCoreBySocket[id] {
|
|
||||||
if threads == 1 {
|
|
||||||
efficiencyCoreCount++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if efficiencyCoreCount == s.CoreCount {
|
|
||||||
// 1:1 mapping means they're not actually efficiency cores, but regular cores
|
|
||||||
s.EfficiencyCoreCount = 0
|
|
||||||
} else {
|
|
||||||
s.EfficiencyCoreCount = efficiencyCoreCount
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
result := []CPU{}
|
|
||||||
for _, c := range socketByID {
|
|
||||||
result = append(result, *c)
|
|
||||||
}
|
|
||||||
return result, nil
|
|
||||||
}
|
|
|
@ -1,234 +0,0 @@
|
||||||
package discover
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"log/slog"
|
|
||||||
"syscall"
|
|
||||||
"unsafe"
|
|
||||||
)
|
|
||||||
|
|
||||||
type MEMORYSTATUSEX struct {
|
|
||||||
length uint32
|
|
||||||
MemoryLoad uint32
|
|
||||||
TotalPhys uint64
|
|
||||||
AvailPhys uint64
|
|
||||||
TotalPageFile uint64
|
|
||||||
AvailPageFile uint64
|
|
||||||
TotalVirtual uint64
|
|
||||||
AvailVirtual uint64
|
|
||||||
AvailExtendedVirtual uint64
|
|
||||||
}
|
|
||||||
|
|
||||||
var (
|
|
||||||
k32 = syscall.NewLazyDLL("kernel32.dll")
|
|
||||||
globalMemoryStatusExProc = k32.NewProc("GlobalMemoryStatusEx")
|
|
||||||
sizeofMemoryStatusEx = uint32(unsafe.Sizeof(MEMORYSTATUSEX{}))
|
|
||||||
GetLogicalProcessorInformationEx = k32.NewProc("GetLogicalProcessorInformationEx")
|
|
||||||
)
|
|
||||||
|
|
||||||
var CudartGlobs = []string{
|
|
||||||
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
|
|
||||||
}
|
|
||||||
|
|
||||||
var NvmlGlobs = []string{
|
|
||||||
"c:\\Windows\\System32\\nvml.dll",
|
|
||||||
}
|
|
||||||
|
|
||||||
var NvcudaGlobs = []string{
|
|
||||||
"c:\\windows\\system*\\nvcuda.dll",
|
|
||||||
}
|
|
||||||
|
|
||||||
var OneapiGlobs = []string{
|
|
||||||
"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
|
|
||||||
}
|
|
||||||
|
|
||||||
var (
|
|
||||||
CudartMgmtName = "cudart64_*.dll"
|
|
||||||
NvcudaMgmtName = "nvcuda.dll"
|
|
||||||
NvmlMgmtName = "nvml.dll"
|
|
||||||
OneapiMgmtName = "ze_intel_gpu64.dll"
|
|
||||||
)
|
|
||||||
|
|
||||||
func GetCPUMem() (memInfo, error) {
|
|
||||||
memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
|
|
||||||
r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
|
|
||||||
if r1 == 0 {
|
|
||||||
return memInfo{}, fmt.Errorf("GlobalMemoryStatusEx failed: %w", err)
|
|
||||||
}
|
|
||||||
return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys, FreeSwap: memStatus.AvailPageFile}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
type LOGICAL_PROCESSOR_RELATIONSHIP uint32
|
|
||||||
|
|
||||||
const (
|
|
||||||
RelationProcessorCore LOGICAL_PROCESSOR_RELATIONSHIP = iota
|
|
||||||
RelationNumaNode
|
|
||||||
RelationCache
|
|
||||||
RelationProcessorPackage
|
|
||||||
RelationGroup
|
|
||||||
RelationProcessorDie
|
|
||||||
RelationNumaNodeEx
|
|
||||||
RelationProcessorModule
|
|
||||||
)
|
|
||||||
const RelationAll LOGICAL_PROCESSOR_RELATIONSHIP = 0xffff
|
|
||||||
|
|
||||||
type GROUP_AFFINITY struct {
|
|
||||||
Mask uintptr // KAFFINITY
|
|
||||||
Group uint16
|
|
||||||
Reserved [3]uint16
|
|
||||||
}
|
|
||||||
|
|
||||||
type PROCESSOR_RELATIONSHIP struct {
|
|
||||||
Flags byte
|
|
||||||
EfficiencyClass byte
|
|
||||||
Reserved [20]byte
|
|
||||||
GroupCount uint16
|
|
||||||
GroupMask [1]GROUP_AFFINITY // len GroupCount
|
|
||||||
}
|
|
||||||
|
|
||||||
// Omitted unused structs: NUMA_NODE_RELATIONSHIP CACHE_RELATIONSHIP GROUP_RELATIONSHIP
|
|
||||||
|
|
||||||
type SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX struct {
|
|
||||||
Relationship LOGICAL_PROCESSOR_RELATIONSHIP
|
|
||||||
Size uint32
|
|
||||||
U [1]byte // Union len Size
|
|
||||||
// PROCESSOR_RELATIONSHIP
|
|
||||||
// NUMA_NODE_RELATIONSHIP
|
|
||||||
// CACHE_RELATIONSHIP
|
|
||||||
// GROUP_RELATIONSHIP
|
|
||||||
}
|
|
||||||
|
|
||||||
func (group *GROUP_AFFINITY) IsMember(target *GROUP_AFFINITY) bool {
|
|
||||||
if group == nil || target == nil {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
return group.Mask&target.Mask != 0
|
|
||||||
}
|
|
||||||
|
|
||||||
type winPackage struct {
|
|
||||||
groups []*GROUP_AFFINITY
|
|
||||||
coreCount int // performance cores = coreCount - efficiencyCoreCount
|
|
||||||
efficiencyCoreCount int
|
|
||||||
threadCount int
|
|
||||||
}
|
|
||||||
|
|
||||||
func (pkg *winPackage) IsMember(target *GROUP_AFFINITY) bool {
|
|
||||||
for _, group := range pkg.groups {
|
|
||||||
if group.IsMember(target) {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
func getLogicalProcessorInformationEx() ([]byte, error) {
|
|
||||||
buf := make([]byte, 1)
|
|
||||||
bufSize := len(buf)
|
|
||||||
ret, _, err := GetLogicalProcessorInformationEx.Call(
|
|
||||||
uintptr(RelationAll),
|
|
||||||
uintptr(unsafe.Pointer(&buf[0])),
|
|
||||||
uintptr(unsafe.Pointer(&bufSize)),
|
|
||||||
)
|
|
||||||
if ret != 0 {
|
|
||||||
return nil, fmt.Errorf("failed to determine size info ret:%d %w", ret, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
buf = make([]byte, bufSize)
|
|
||||||
ret, _, err = GetLogicalProcessorInformationEx.Call(
|
|
||||||
uintptr(RelationAll),
|
|
||||||
uintptr(unsafe.Pointer(&buf[0])),
|
|
||||||
uintptr(unsafe.Pointer(&bufSize)),
|
|
||||||
)
|
|
||||||
if ret == 0 {
|
|
||||||
return nil, fmt.Errorf("failed to gather processor information ret:%d buflen:%d %w", ret, bufSize, err)
|
|
||||||
}
|
|
||||||
return buf, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func processSystemLogicalProcessorInforationList(buf []byte) []*winPackage {
|
|
||||||
var slpi *SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX
|
|
||||||
// Find all the packages first
|
|
||||||
packages := []*winPackage{}
|
|
||||||
for bufOffset := 0; bufOffset < len(buf); bufOffset += int(slpi.Size) {
|
|
||||||
slpi = (*SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)(unsafe.Pointer(&buf[bufOffset]))
|
|
||||||
if slpi.Relationship != RelationProcessorPackage {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
pr := (*PROCESSOR_RELATIONSHIP)(unsafe.Pointer(&slpi.U[0]))
|
|
||||||
pkg := &winPackage{}
|
|
||||||
ga0 := unsafe.Pointer(&pr.GroupMask[0])
|
|
||||||
for j := range pr.GroupCount {
|
|
||||||
gm := (*GROUP_AFFINITY)(unsafe.Pointer(uintptr(ga0) + uintptr(j)*unsafe.Sizeof(GROUP_AFFINITY{})))
|
|
||||||
pkg.groups = append(pkg.groups, gm)
|
|
||||||
}
|
|
||||||
packages = append(packages, pkg)
|
|
||||||
}
|
|
||||||
|
|
||||||
slog.Info("packages", "count", len(packages))
|
|
||||||
|
|
||||||
// To identify efficiency cores we have to compare the relative values
|
|
||||||
// Larger values are "less efficient" (aka, more performant)
|
|
||||||
var maxEfficiencyClass byte
|
|
||||||
for bufOffset := 0; bufOffset < len(buf); bufOffset += int(slpi.Size) {
|
|
||||||
slpi = (*SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)(unsafe.Pointer(&buf[bufOffset]))
|
|
||||||
if slpi.Relationship != RelationProcessorCore {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
pr := (*PROCESSOR_RELATIONSHIP)(unsafe.Pointer(&slpi.U[0]))
|
|
||||||
if pr.EfficiencyClass > maxEfficiencyClass {
|
|
||||||
maxEfficiencyClass = pr.EfficiencyClass
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if maxEfficiencyClass > 0 {
|
|
||||||
slog.Info("efficiency cores detected", "maxEfficiencyClass", maxEfficiencyClass)
|
|
||||||
}
|
|
||||||
|
|
||||||
// then match up the Cores to the Packages, count up cores, threads and efficiency cores
|
|
||||||
for bufOffset := 0; bufOffset < len(buf); bufOffset += int(slpi.Size) {
|
|
||||||
slpi = (*SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)(unsafe.Pointer(&buf[bufOffset]))
|
|
||||||
if slpi.Relationship != RelationProcessorCore {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
pr := (*PROCESSOR_RELATIONSHIP)(unsafe.Pointer(&slpi.U[0]))
|
|
||||||
ga0 := unsafe.Pointer(&pr.GroupMask[0])
|
|
||||||
for j := range pr.GroupCount {
|
|
||||||
gm := (*GROUP_AFFINITY)(unsafe.Pointer(uintptr(ga0) + uintptr(j)*unsafe.Sizeof(GROUP_AFFINITY{})))
|
|
||||||
for _, pkg := range packages {
|
|
||||||
if pkg.IsMember(gm) {
|
|
||||||
pkg.coreCount++
|
|
||||||
if pr.Flags == 0 {
|
|
||||||
pkg.threadCount++
|
|
||||||
} else {
|
|
||||||
pkg.threadCount += 2
|
|
||||||
}
|
|
||||||
if pr.EfficiencyClass < maxEfficiencyClass {
|
|
||||||
pkg.efficiencyCoreCount++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sumarize the results
|
|
||||||
for i, pkg := range packages {
|
|
||||||
slog.Info("", "package", i, "cores", pkg.coreCount, "efficiency", pkg.efficiencyCoreCount, "threads", pkg.threadCount)
|
|
||||||
}
|
|
||||||
|
|
||||||
return packages
|
|
||||||
}
|
|
||||||
|
|
||||||
func GetCPUDetails() ([]CPU, error) {
|
|
||||||
buf, err := getLogicalProcessorInformationEx()
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
packages := processSystemLogicalProcessorInforationList(buf)
|
|
||||||
cpus := make([]CPU, len(packages))
|
|
||||||
|
|
||||||
for i, pkg := range packages {
|
|
||||||
cpus[i].CoreCount = pkg.coreCount
|
|
||||||
cpus[i].EfficiencyCoreCount = pkg.efficiencyCoreCount
|
|
||||||
cpus[i].ThreadCount = pkg.threadCount
|
|
||||||
}
|
|
||||||
return cpus, nil
|
|
||||||
}
|
|
File diff suppressed because one or more lines are too long
|
@ -74,10 +74,6 @@ would set `HSA_OVERRIDE_GFX_VERSION="10.3.0"` as an environment variable for the
|
||||||
server. If you have an unsupported AMD GPU you can experiment using the list of
|
server. If you have an unsupported AMD GPU you can experiment using the list of
|
||||||
supported types below.
|
supported types below.
|
||||||
|
|
||||||
If you have multiple GPUs with different GFX versions, append the numeric device
|
|
||||||
number to the environment variable to set them individually. For example,
|
|
||||||
`HSA_OVERRIDE_GFX_VERSION_0=10.3.0` and `HSA_OVERRIDE_GFX_VERSION_1=11.0.0`
|
|
||||||
|
|
||||||
At this time, the known supported GPU types on linux are the following LLVM Targets.
|
At this time, the known supported GPU types on linux are the following LLVM Targets.
|
||||||
This table shows some example GPUs that map to these LLVM targets:
|
This table shows some example GPUs that map to these LLVM targets:
|
||||||
| **LLVM Target** | **An Example GPU** |
|
| **LLVM Target** | **An Example GPU** |
|
||||||
|
@ -103,10 +99,9 @@ Reach out on [Discord](https://discord.gg/ollama) or file an
|
||||||
### GPU Selection
|
### GPU Selection
|
||||||
|
|
||||||
If you have multiple AMD GPUs in your system and want to limit Ollama to use a
|
If you have multiple AMD GPUs in your system and want to limit Ollama to use a
|
||||||
subset, you can set `ROCR_VISIBLE_DEVICES` to a comma separated list of GPUs.
|
subset, you can set `HIP_VISIBLE_DEVICES` to a comma separated list of GPUs.
|
||||||
You can see the list of devices with `rocminfo`. If you want to ignore the GPUs
|
You can see the list of devices with `rocminfo`. If you want to ignore the GPUs
|
||||||
and force CPU usage, use an invalid GPU ID (e.g., "-1"). When available, use the
|
and force CPU usage, use an invalid GPU ID (e.g., "-1")
|
||||||
`Uuid` to uniquely identify the device instead of numeric value.
|
|
||||||
|
|
||||||
### Container Permission
|
### Container Permission
|
||||||
|
|
||||||
|
|
|
@ -37,7 +37,7 @@ response = client.chat.completions.create(
|
||||||
{"type": "text", "text": "What's in this image?"},
|
{"type": "text", "text": "What's in this image?"},
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "image_url",
|
||||||
"image_url": "",
|
"image_url": "iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
@ -86,7 +86,7 @@ const response = await openai.chat.completions.create({
|
||||||
{ type: "text", text: "What's in this image?" },
|
{ type: "text", text: "What's in this image?" },
|
||||||
{
|
{
|
||||||
type: "image_url",
|
type: "image_url",
|
||||||
image_url: "",
|
image_url: "iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
|
@ -142,7 +142,7 @@ curl http://localhost:11434/v1/chat/completions \
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "image_url",
|
||||||
"image_url": {
|
"image_url": {
|
||||||
"url": ""
|
"url": "iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA3VSURBVHgB7Z27r0zdG8fX743i1bi1ikMoFMQloXRpKFFIqI7LH4BEQ+NWIkjQuSWCRIEoULk0gsK1kCBI0IhrQVT7tz/7zZo888yz1r7MnDl7z5xvsjkzs2fP3uu71nNfa7lkAsm7d++Sffv2JbNmzUqcc8m0adOSzZs3Z+/XES4ZckAWJEGWPiCxjsQNLWmQsWjRIpMseaxcuTKpG/7HP27I8P79e7dq1ars/yL4/v27S0ejqwv+cUOGEGGpKHR37tzJCEpHV9tnT58+dXXCJDdECBE2Ojrqjh071hpNECjx4cMHVycM1Uhbv359B2F79+51586daxN/+pyRkRFXKyRDAqxEp4yMlDDzXG1NPnnyJKkThoK0VFd1ELZu3TrzXKxKfW7dMBQ6bcuWLW2v0VlHjx41z717927ba22U9APcw7Nnz1oGEPeL3m3p2mTAYYnFmMOMXybPPXv2bNIPpFZr1NHn4HMw0KRBjg9NuRw95s8PEcz/6DZELQd/09C9QGq5RsmSRybqkwHGjh07OsJSsYYm3ijPpyHzoiacg35MLdDSIS/O1yM778jOTwYUkKNHWUzUWaOsylE00MyI0fcnOwIdjvtNdW/HZwNLGg+sR1kMepSNJXmIwxBZiG8tDTpEZzKg0GItNsosY8USkxDhD0Rinuiko2gfL/RbiD2LZAjU9zKQJj8RDR0vJBR1/Phx9+PHj9Z7REF4nTZkxzX4LCXHrV271qXkBAPGfP/atWvu/PnzHe4C97F48eIsRLZ9+3a3f/9+87dwP1JxaF7/3r17ba+5l4EcaVo0lj3SBq5kGTJSQmLWMjgYNei2GPT1MuMqGTDEFHzeQSP2wi/jGnkmPJ/nhccs44jvDAxpVcxnq0F6eT8h4ni/iIWpR5lPyA6ETkNXoSukvpJAD3AsXLiwpZs49+fPn5ke4j10TqYvegSfn0OnafC+Tv9ooA/JPkgQysqQNBzagXY55nO/oa1F7qvIPWkRL12WRpMWUvpVDYmxAPehxWSe8ZEXL20sadYIozfmNch4QJPAfeJgW3rNsnzphBKNJM2KKODo1rVOMRYik5ETy3ix4qWNI81qAAirizgMIc+yhTytx0JWZuNI03qsrgWlGtwjoS9XwgUhWGyhUaRZZQNNIEwCiXD16tXcAHUs79co0vSD8rrJCIW98pzvxpAWyyo3HYwqS0+H0BjStClcZJT5coMm6D2LOF8TolGJtK9fvyZpyiC5ePFi9nc/oJU4eiEP0jVoAnHa9wyJycITMP78+eMeP37sXrx44d6+fdt6f82aNdkx1pg9e3Zb5W+RSRE+n+VjksQWifvVaTKFhn5O8my63K8Qabdv33b379/PiAP//vuvW7BggZszZ072/+TJk91YgkafPn166zXB1rQHFvouAWHq9z3SEevSUerqCn2/dDCeta2jxYbr69evk4MHDyY7d+7MjhMnTiTPnz9Pfv/+nfQT2ggpO2dMF8cghuoM7Ygj5iWCqRlGFml0QC/ftGmTmzt3rmsaKDsgBSPh0/8yPeLLBihLkOKJc0jp8H8vUzcxIA1k6QJ/c78tWEyj5P3o4u9+jywNPdJi5rAH9x0KHcl4Hg570eQp3+vHXGyrmEeigzQsQsjavXt38ujRo44LQuDDhw+TW7duRS1HGgMxhNXHgflaNTOsHyKvHK5Ijo2jbFjJBQK9YwFd6RVMzfgRBmEfP37suBBm/p49e1qjEP2mwTViNRo0VJWH1deMXcNK08uUjVUu7s/zRaL+oLNxz1bpANco4npUgX4G2eFbpDFyQoQxojBCpEGSytmOH8qrH5Q9vuzD6ofQylkCUmh8DBAr+q8JCyVNtWQIidKQE9wNtLSQnS4jDSsxNHogzFuQBw4cyM61UKVsjfr3ooBkPSqqQHesUPWVtzi9/vQi1T+rJj7WiTz4Pt/l3LxUkr5P2VYZaZ4URpsE+st/dujQoaBBYokbrz/8TJNQYLSonrPS9kUaSkPeZyj1AWSj+d+VBoy1pIWVNed8P0Ll/ee5HdGRhrHhR5GGN0r4LGZBaj8oFDJitBTJzIZgFcmU0Y8ytWMZMzJOaXUSrUs5RxKnrxmbb5YXO9VGUhtpXldhEUogFr3IzIsvlpmdosVcGVGXFWp2oU9kLFL3dEkSz6NHEY1sjSRdIuDFWEhd8KxFqsRi1uM/nz9/zpxnwlESONdg6dKlbsaMGS4EHFHtjFIDHwKOo46l4TxSuxgDzi+rE2jg+BaFruOX4HXa0Nnf1lwAPufZeF8/r6zD97WK2qFnGjBxTw5qNGPxT+5T/r7/7RawFC3j4vTp09koCxkeHjqbHJqArmH5UrFKKksnxrK7FuRIs8STfBZv+luugXZ2pR/pP9Ois4z+TiMzUUkUjD0iEi1fzX8GmXyuxUBRcaUfykV0YZnlJGKQpOiGB76x5GeWkWWJc3mOrK6S7xdND+W5N6XyaRgtWJFe13GkaZnKOsYqGdOVVVbGupsyA/l7emTLHi7vwTdirNEt0qxnzAvBFcnQF16xh/TMpUuXHDowhlA9vQVraQhkudRdzOnK+04ZSP3DUhVSP61YsaLtd/ks7ZgtPcXqPqEafHkdqa84X6aCeL7YWlv6edGFHb+ZFICPlljHhg0bKuk0CSvVznWsotRu433alNdFrqG45ejoaPCaUkWERpLXjzFL2Rpllp7PJU2a/v7Ab8N05/9t27Z16KUqoFGsxnI9EosS2niSYg9SpU6B4JgTrvVW1flt1sT+0ADIJU2maXzcUTraGCRaL1Wp9rUMk16PMom8QhruxzvZIegJjFU7LLCePfS8uaQdPny4jTTL0dbee5mYokQsXTIWNY46kuMbnt8Kmec+LGWtOVIl9cT1rCB0V8WqkjAsRwta93TbwNYoGKsUSChN44lgBNCoHLHzquYKrU6qZ8lolCIN0Rh6cP0Q3U6I6IXILYOQI513hJaSKAorFpuHXJNfVlpRtmYBk1Su1obZr5dnKAO+L10Hrj3WZW+E3qh6IszE37F6EB+68mGpvKm4eb9bFrlzrok7fvr0Kfv727dvWRmdVTJHw0qiiCUSZ6wCK+7XL/AcsgNyL74DQQ730sv78Su7+t/A36MdY0sW5o40ahslXr58aZ5HtZB8GH64m9EmMZ7FpYw4T6QnrZfgenrhFxaSiSGXtPnz57e9TkNZLvTjeqhr734CNtrK41L40sUQckmj1lGKQ0rC37x544r8eNXRpnVE3ZZY7zXo8NomiO0ZUCj2uHz58rbXoZ6gc0uA+F6ZeKS/jhRDUq8MKrTho9fEkihMmhxtBI1DxKFY9XLpVcSkfoi8JGnToZO5sU5aiDQIW716ddt7ZLYtMQlhECdBGXZZMWldY5BHm5xgAroWj4C0hbYkSc/jBmggIrXJWlZM6pSETsEPGqZOndr2uuuR5rF169a2HoHPdurUKZM4CO1WTPqaDaAd+GFGKdIQkxAn9RuEWcTRyN2KSUgiSgF5aWzPTeA/lN5rZubMmR2bE4SIC4nJoltgAV/dVefZm72AtctUCJU2CMJ327hxY9t7EHbkyJFseq+EJSY16RPo3Dkq1kkr7+q0bNmyDuLQcZBEPYmHVdOBiJyIlrRDq41YPWfXOxUysi5fvtyaj+2BpcnsUV/oSoEMOk2CQGlr4ckhBwaetBhjCwH0ZHtJROPJkyc7UjcYLDjmrH7ADTEBXFfOYmB0k9oYBOjJ8b4aOYSe7QkKcYhFlq3QYLQhSidNmtS2RATwy8YOM3EQJsUjKiaWZ+vZToUQgzhkHXudb/PW5YMHD9yZM2faPsMwoc7RciYJXbGuBqJ1UIGKKLv915jsvgtJxCZDubdXr165mzdvtr1Hz5LONA8jrUwKPqsmVesKa49S3Q4WxmRPUEYdTjgiUcfUwLx589ySJUva3oMkP6IYddq6HMS4o55xBJBUeRjzfa4Zdeg56QZ43LhxoyPo7Lf1kNt7oO8wWAbNwaYjIv5lhyS7kRf96dvm5Jah8vfvX3flyhX35cuX6HfzFHOToS1H4BenCaHvO8pr8iDuwoUL7tevX+b5ZdbBair0xkFIlFDlW4ZknEClsp/TzXyAKVOmmHWFVSbDNw1l1+4f90U6IY/q4V27dpnE9bJ+v87QEydjqx/UamVVPRG+mwkNTYN+9tjkwzEx+atCm/X9WvWtDtAb68Wy9LXa1UmvCDDIpPkyOQ5ZwSzJ4jMrvFcr0rSjOUh+GcT4LSg5ugkW1Io0/SCDQBojh0hPlaJdah+tkVYrnTZowP8iq1F1TgMBBauufyB33x1v+NWFYmT5KmppgHC+NkAgbmRkpD3yn9QIseXymoTQFGQmIOKTxiZIWpvAatenVqRVXf2nTrAWMsPnKrMZHz6bJq5jvce6QK8J1cQNgKxlJapMPdZSR64/UivS9NztpkVEdKcrs5alhhWP9NeqlfWopzhZScI6QxseegZRGeg5a8C3Re1Mfl1ScP36ddcUaMuv24iOJtz7sbUjTS4qBvKmstYJoUauiuD3k5qhyr7QdUHMeCgLa1Ear9NquemdXgmum4fvJ6w1lqsuDhNrg1qSpleJK7K3TF0Q2jSd94uSZ60kK1e3qyVpQK6PVWXp2/FC3mp6jBhKKOiY2h3gtUV64TWM6wDETRPLDfSakXmH3w8g9Jlug8ZtTt4kVF0kLUYYmCCtD/DrQ5YhMGbA9L3ucdjh0y8kOHW5gU/VEEmJTcL4Pz/f7mgoAbYkAAAAAElFTkSuQmCC"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
|
@ -72,7 +72,6 @@ func Origins() (origins []string) {
|
||||||
"app://*",
|
"app://*",
|
||||||
"file://*",
|
"file://*",
|
||||||
"tauri://*",
|
"tauri://*",
|
||||||
"vscode-webview://*",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return origins
|
return origins
|
||||||
|
@ -265,9 +264,9 @@ func AsMap() map[string]EnvVar {
|
||||||
|
|
||||||
if runtime.GOOS != "darwin" {
|
if runtime.GOOS != "darwin" {
|
||||||
ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
|
ret["CUDA_VISIBLE_DEVICES"] = EnvVar{"CUDA_VISIBLE_DEVICES", CudaVisibleDevices(), "Set which NVIDIA devices are visible"}
|
||||||
ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible by numeric ID"}
|
ret["HIP_VISIBLE_DEVICES"] = EnvVar{"HIP_VISIBLE_DEVICES", HipVisibleDevices(), "Set which AMD devices are visible"}
|
||||||
ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible by UUID or numeric ID"}
|
ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices(), "Set which AMD devices are visible"}
|
||||||
ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible by numeric ID"}
|
ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal(), "Set which AMD devices are visible"}
|
||||||
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
|
ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion(), "Override the gfx used for all detected AMD GPUs"}
|
||||||
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
|
ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
|
||||||
}
|
}
|
||||||
|
|
|
@ -68,7 +68,6 @@ func TestOrigins(t *testing.T) {
|
||||||
"app://*",
|
"app://*",
|
||||||
"file://*",
|
"file://*",
|
||||||
"tauri://*",
|
"tauri://*",
|
||||||
"vscode-webview://*",
|
|
||||||
}},
|
}},
|
||||||
{"http://10.0.0.1", []string{
|
{"http://10.0.0.1", []string{
|
||||||
"http://10.0.0.1",
|
"http://10.0.0.1",
|
||||||
|
@ -87,7 +86,6 @@ func TestOrigins(t *testing.T) {
|
||||||
"app://*",
|
"app://*",
|
||||||
"file://*",
|
"file://*",
|
||||||
"tauri://*",
|
"tauri://*",
|
||||||
"vscode-webview://*",
|
|
||||||
}},
|
}},
|
||||||
{"http://172.16.0.1,https://192.168.0.1", []string{
|
{"http://172.16.0.1,https://192.168.0.1", []string{
|
||||||
"http://172.16.0.1",
|
"http://172.16.0.1",
|
||||||
|
@ -107,7 +105,6 @@ func TestOrigins(t *testing.T) {
|
||||||
"app://*",
|
"app://*",
|
||||||
"file://*",
|
"file://*",
|
||||||
"tauri://*",
|
"tauri://*",
|
||||||
"vscode-webview://*",
|
|
||||||
}},
|
}},
|
||||||
{"http://totally.safe,http://definitely.legit", []string{
|
{"http://totally.safe,http://definitely.legit", []string{
|
||||||
"http://totally.safe",
|
"http://totally.safe",
|
||||||
|
@ -127,7 +124,6 @@ func TestOrigins(t *testing.T) {
|
||||||
"app://*",
|
"app://*",
|
||||||
"file://*",
|
"file://*",
|
||||||
"tauri://*",
|
"tauri://*",
|
||||||
"vscode-webview://*",
|
|
||||||
}},
|
}},
|
||||||
}
|
}
|
||||||
for _, tt := range cases {
|
for _, tt := range cases {
|
||||||
|
|
3
go.mod
3
go.mod
|
@ -1,6 +1,6 @@
|
||||||
module github.com/ollama/ollama
|
module github.com/ollama/ollama
|
||||||
|
|
||||||
go 1.22.8
|
go 1.22.5
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/containerd/console v1.0.3
|
github.com/containerd/console v1.0.3
|
||||||
|
@ -22,7 +22,6 @@ require (
|
||||||
github.com/mattn/go-runewidth v0.0.14
|
github.com/mattn/go-runewidth v0.0.14
|
||||||
github.com/nlpodyssey/gopickle v0.3.0
|
github.com/nlpodyssey/gopickle v0.3.0
|
||||||
github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
|
github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
|
||||||
golang.org/x/image v0.14.0
|
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
|
|
2
go.sum
2
go.sum
|
@ -230,8 +230,6 @@ golang.org/x/image v0.0.0-20200430140353-33d19683fad8/go.mod h1:FeLwcggjj3mMvU+o
|
||||||
golang.org/x/image v0.0.0-20200618115811-c13761719519/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
|
golang.org/x/image v0.0.0-20200618115811-c13761719519/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
|
||||||
golang.org/x/image v0.0.0-20201208152932-35266b937fa6/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
|
golang.org/x/image v0.0.0-20201208152932-35266b937fa6/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
|
||||||
golang.org/x/image v0.0.0-20210216034530-4410531fe030/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
|
golang.org/x/image v0.0.0-20210216034530-4410531fe030/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
|
||||||
golang.org/x/image v0.14.0 h1:tNgSxAFe3jC4uYqvZdTr84SZoM1KfwdC9SKIFrLjFn4=
|
|
||||||
golang.org/x/image v0.14.0/go.mod h1:HUYqC05R2ZcZ3ejNQsIHQDQiwWM4JBqmm6MKANTp4LE=
|
|
||||||
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
|
golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE=
|
||||||
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
|
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
|
||||||
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
|
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
//go:build linux || windows
|
//go:build linux || windows
|
||||||
|
|
||||||
package discover
|
package gpu
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
|
@ -37,6 +37,19 @@ func GetSupportedGFX(libDir string) ([]string, error) {
|
||||||
return ret, nil
|
return ret, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
||||||
|
ids := []string{}
|
||||||
|
for _, info := range gpuInfo {
|
||||||
|
if info.Library != "rocm" {
|
||||||
|
// TODO shouldn't happen if things are wired correctly...
|
||||||
|
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ids = append(ids, info.ID)
|
||||||
|
}
|
||||||
|
return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
|
||||||
|
}
|
||||||
|
|
||||||
func commonAMDValidateLibDir() (string, error) {
|
func commonAMDValidateLibDir() (string, error) {
|
||||||
// Favor our bundled version
|
// Favor our bundled version
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
package discover
|
package gpu
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
|
@ -64,7 +64,7 @@ func NewHipLib() (*HipLib, error) {
|
||||||
return hl, nil
|
return hl, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// The hip library only evaluates the ROCR_VISIBLE_DEVICES variable at startup
|
// The hip library only evaluates the HIP_VISIBLE_DEVICES variable at startup
|
||||||
// so we have to unload/reset the library after we do our initial discovery
|
// so we have to unload/reset the library after we do our initial discovery
|
||||||
// to make sure our updates to that variable are processed by llama.cpp
|
// to make sure our updates to that variable are processed by llama.cpp
|
||||||
func (hl *HipLib) Release() {
|
func (hl *HipLib) Release() {
|
|
@ -1,4 +1,4 @@
|
||||||
package discover
|
package gpu
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
|
@ -64,13 +64,16 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||||
// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
|
// Determine if the user has already pre-selected which GPUs to look at, then ignore the others
|
||||||
var visibleDevices []string
|
var visibleDevices []string
|
||||||
hipVD := envconfig.HipVisibleDevices() // zero based index only
|
hipVD := envconfig.HipVisibleDevices() // zero based index only
|
||||||
rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID
|
rocrVD := envconfig.RocrVisibleDevices() // zero based index or UUID, but consumer cards seem to not support UUID
|
||||||
gpuDO := envconfig.GpuDeviceOrdinal() // zero based index
|
gpuDO := envconfig.GpuDeviceOrdinal() // zero based index
|
||||||
switch {
|
switch {
|
||||||
case rocrVD != "":
|
// TODO is this priorty order right?
|
||||||
visibleDevices = strings.Split(rocrVD, ",")
|
|
||||||
case hipVD != "":
|
case hipVD != "":
|
||||||
visibleDevices = strings.Split(hipVD, ",")
|
visibleDevices = strings.Split(hipVD, ",")
|
||||||
|
case rocrVD != "":
|
||||||
|
visibleDevices = strings.Split(rocrVD, ",")
|
||||||
|
// TODO - since we don't yet support UUIDs, consider detecting and reporting here
|
||||||
|
// all our test systems show GPU-XX indicating UUID is not supported
|
||||||
case gpuDO != "":
|
case gpuDO != "":
|
||||||
visibleDevices = strings.Split(gpuDO, ",")
|
visibleDevices = strings.Split(gpuDO, ",")
|
||||||
}
|
}
|
||||||
|
@ -96,7 +99,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||||
}
|
}
|
||||||
return a < b
|
return a < b
|
||||||
})
|
})
|
||||||
gpuCount := 0
|
cpuCount := 0
|
||||||
for _, match := range matches {
|
for _, match := range matches {
|
||||||
slog.Debug("evaluating amdgpu node " + match)
|
slog.Debug("evaluating amdgpu node " + match)
|
||||||
fp, err := os.Open(match)
|
fp, err := os.Open(match)
|
||||||
|
@ -105,6 +108,11 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
defer fp.Close()
|
defer fp.Close()
|
||||||
|
nodeID, err := strconv.Atoi(filepath.Base(filepath.Dir(match)))
|
||||||
|
if err != nil {
|
||||||
|
slog.Debug("failed to parse node ID", "error", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
scanner := bufio.NewScanner(fp)
|
scanner := bufio.NewScanner(fp)
|
||||||
isCPU := false
|
isCPU := false
|
||||||
|
@ -178,18 +186,19 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||||
// do reliably report VRAM usage.
|
// do reliably report VRAM usage.
|
||||||
|
|
||||||
if isCPU {
|
if isCPU {
|
||||||
|
cpuCount++
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// Skip over any GPUs that are masked
|
// CPUs are always first in the list
|
||||||
if major == 0 && minor == 0 && patch == 0 {
|
gpuID := nodeID - cpuCount
|
||||||
slog.Debug("skipping gpu with gfx000")
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Keep track of numeric IDs based on valid GPUs
|
// Shouldn't happen, but just in case...
|
||||||
gpuID := gpuCount
|
if gpuID < 0 {
|
||||||
gpuCount += 1
|
err := fmt.Errorf("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
|
||||||
|
slog.Error(err.Error())
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
// Look up the memory for the current node
|
// Look up the memory for the current node
|
||||||
totalMemory := uint64(0)
|
totalMemory := uint64(0)
|
||||||
|
@ -264,14 +273,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||||
name = fmt.Sprintf("%04x:%04x", vendor, device)
|
name = fmt.Sprintf("%04x:%04x", vendor, device)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Favor UUIDs if available to reduce possibility of getting the numeric IDs wrong
|
|
||||||
var ID string
|
|
||||||
if uniqueID != 0 {
|
|
||||||
ID = fmt.Sprintf("GPU-%016x", uniqueID)
|
|
||||||
} else {
|
|
||||||
ID = strconv.Itoa(gpuID)
|
|
||||||
}
|
|
||||||
|
|
||||||
gpuInfo := RocmGPUInfo{
|
gpuInfo := RocmGPUInfo{
|
||||||
GpuInfo: GpuInfo{
|
GpuInfo: GpuInfo{
|
||||||
Library: "rocm",
|
Library: "rocm",
|
||||||
|
@ -279,7 +280,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||||
TotalMemory: totalMemory,
|
TotalMemory: totalMemory,
|
||||||
FreeMemory: (totalMemory - usedMemory),
|
FreeMemory: (totalMemory - usedMemory),
|
||||||
},
|
},
|
||||||
ID: ID,
|
ID: strconv.Itoa(gpuID),
|
||||||
Name: name,
|
Name: name,
|
||||||
Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
|
Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
|
||||||
MinimumMemory: rocmMinimumMemory,
|
MinimumMemory: rocmMinimumMemory,
|
||||||
|
@ -287,7 +288,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||||
DriverMinor: driverMinor,
|
DriverMinor: driverMinor,
|
||||||
},
|
},
|
||||||
usedFilepath: usedFile,
|
usedFilepath: usedFile,
|
||||||
index: gpuID,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
|
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
|
||||||
|
@ -319,7 +319,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||||
if len(visibleDevices) > 0 {
|
if len(visibleDevices) > 0 {
|
||||||
include := false
|
include := false
|
||||||
for _, visible := range visibleDevices {
|
for _, visible := range visibleDevices {
|
||||||
if visible == gpuInfo.ID || visible == strconv.Itoa(gpuInfo.index) {
|
if visible == gpuInfo.ID {
|
||||||
include = true
|
include = true
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
@ -516,20 +516,3 @@ func verifyKFDDriverAccess() error {
|
||||||
fd.Close()
|
fd.Close()
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
|
||||||
ids := []string{}
|
|
||||||
for _, info := range gpuInfo {
|
|
||||||
if info.Library != "rocm" {
|
|
||||||
// TODO shouldn't happen if things are wired correctly...
|
|
||||||
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ids = append(ids, info.ID)
|
|
||||||
}
|
|
||||||
// There are 3 potential env vars to use to select GPUs.
|
|
||||||
// ROCR_VISIBLE_DEVICES supports UUID or numeric so is our preferred on linux
|
|
||||||
// GPU_DEVICE_ORDINAL supports numeric IDs only
|
|
||||||
// HIP_VISIBLE_DEVICES supports numeric IDs only
|
|
||||||
return "ROCR_VISIBLE_DEVICES", strings.Join(ids, ",")
|
|
||||||
}
|
|
|
@ -1,4 +1,4 @@
|
||||||
package discover
|
package gpu
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
@ -43,7 +43,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
|
||||||
slog.Debug("error looking up amd driver version", "error", err)
|
slog.Debug("error looking up amd driver version", "error", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Note: the HIP library automatically handles subsetting to any *_VISIBLE_DEVICES the user specified
|
// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
|
||||||
count := hl.HipGetDeviceCount()
|
count := hl.HipGetDeviceCount()
|
||||||
if count == 0 {
|
if count == 0 {
|
||||||
err := fmt.Errorf("no compatible amdgpu devices detected")
|
err := fmt.Errorf("no compatible amdgpu devices detected")
|
||||||
|
@ -201,20 +201,3 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
|
|
||||||
ids := []string{}
|
|
||||||
for _, info := range gpuInfo {
|
|
||||||
if info.Library != "rocm" {
|
|
||||||
// TODO shouldn't happen if things are wired correctly...
|
|
||||||
slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ids = append(ids, info.ID)
|
|
||||||
}
|
|
||||||
// There are 3 potential env vars to use to select GPUs.
|
|
||||||
// ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
|
|
||||||
// HIP_VISIBLE_DEVICES supports numeric IDs only
|
|
||||||
// GPU_DEVICE_ORDINAL supports numeric IDs only
|
|
||||||
return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
|
|
||||||
}
|
|
|
@ -1,4 +1,4 @@
|
||||||
package discover
|
package gpu
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"os"
|
"os"
|
|
@ -1,6 +1,6 @@
|
||||||
//go:build linux || windows
|
//go:build linux || windows
|
||||||
|
|
||||||
package discover
|
package gpu
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"log/slog"
|
"log/slog"
|
|
@ -1,6 +1,6 @@
|
||||||
//go:build linux || windows
|
//go:build linux || windows
|
||||||
|
|
||||||
package discover
|
package gpu
|
||||||
|
|
||||||
/*
|
/*
|
||||||
#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
|
#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
|
||||||
|
@ -229,10 +229,7 @@ func GetGPUInfo() GpuInfoList {
|
||||||
slog.Warn("error looking up system memory", "error", err)
|
slog.Warn("error looking up system memory", "error", err)
|
||||||
}
|
}
|
||||||
depPath := LibraryDir()
|
depPath := LibraryDir()
|
||||||
details, err := GetCPUDetails()
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("failed to lookup CPU details", "error", err)
|
|
||||||
}
|
|
||||||
cpus = []CPUInfo{
|
cpus = []CPUInfo{
|
||||||
{
|
{
|
||||||
GpuInfo: GpuInfo{
|
GpuInfo: GpuInfo{
|
||||||
|
@ -242,7 +239,6 @@ func GetGPUInfo() GpuInfoList {
|
||||||
ID: "0",
|
ID: "0",
|
||||||
DependencyPath: depPath,
|
DependencyPath: depPath,
|
||||||
},
|
},
|
||||||
CPUs: details,
|
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
//go:build darwin
|
//go:build darwin
|
||||||
|
|
||||||
package discover
|
package gpu
|
||||||
|
|
||||||
/*
|
/*
|
||||||
#cgo CFLAGS: -x objective-c
|
#cgo CFLAGS: -x objective-c
|
||||||
|
@ -10,9 +10,7 @@ package discover
|
||||||
import "C"
|
import "C"
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"log/slog"
|
|
||||||
"runtime"
|
"runtime"
|
||||||
"syscall"
|
|
||||||
|
|
||||||
"github.com/ollama/ollama/format"
|
"github.com/ollama/ollama/format"
|
||||||
)
|
)
|
||||||
|
@ -71,30 +69,11 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
|
||||||
|
|
||||||
func GetSystemInfo() SystemInfo {
|
func GetSystemInfo() SystemInfo {
|
||||||
mem, _ := GetCPUMem()
|
mem, _ := GetCPUMem()
|
||||||
query := "hw.perflevel0.physicalcpu"
|
|
||||||
perfCores, err := syscall.SysctlUint32(query)
|
|
||||||
if err != nil {
|
|
||||||
slog.Warn("failed to discover physical CPU details", "query", query, "error", err)
|
|
||||||
}
|
|
||||||
query = "hw.perflevel1.physicalcpu"
|
|
||||||
efficiencyCores, _ := syscall.SysctlUint32(query) // On x86 xeon this wont return data
|
|
||||||
|
|
||||||
// Determine thread count
|
|
||||||
query = "hw.logicalcpu"
|
|
||||||
logicalCores, _ := syscall.SysctlUint32(query)
|
|
||||||
|
|
||||||
return SystemInfo{
|
return SystemInfo{
|
||||||
System: CPUInfo{
|
System: CPUInfo{
|
||||||
GpuInfo: GpuInfo{
|
GpuInfo: GpuInfo{
|
||||||
memInfo: mem,
|
memInfo: mem,
|
||||||
},
|
},
|
||||||
CPUs: []CPU{
|
|
||||||
{
|
|
||||||
CoreCount: int(perfCores + efficiencyCores),
|
|
||||||
EfficiencyCoreCount: int(efficiencyCores),
|
|
||||||
ThreadCount: int(logicalCores),
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
GPUs: GetGPUInfo(),
|
GPUs: GetGPUInfo(),
|
||||||
}
|
}
|
92
gpu/gpu_linux.go
Normal file
92
gpu/gpu_linux.go
Normal file
|
@ -0,0 +1,92 @@
|
||||||
|
package gpu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/ollama/ollama/format"
|
||||||
|
)
|
||||||
|
|
||||||
|
var CudartGlobs = []string{
|
||||||
|
"/usr/local/cuda/lib64/libcudart.so*",
|
||||||
|
"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
|
||||||
|
"/usr/lib/x86_64-linux-gnu/libcudart.so*",
|
||||||
|
"/usr/lib/wsl/lib/libcudart.so*",
|
||||||
|
"/usr/lib/wsl/drivers/*/libcudart.so*",
|
||||||
|
"/opt/cuda/lib64/libcudart.so*",
|
||||||
|
"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
|
||||||
|
"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
|
||||||
|
"/usr/lib/aarch64-linux-gnu/libcudart.so*",
|
||||||
|
"/usr/local/cuda/lib*/libcudart.so*",
|
||||||
|
"/usr/lib*/libcudart.so*",
|
||||||
|
"/usr/local/lib*/libcudart.so*",
|
||||||
|
}
|
||||||
|
|
||||||
|
var NvmlGlobs = []string{}
|
||||||
|
|
||||||
|
var NvcudaGlobs = []string{
|
||||||
|
"/usr/local/cuda*/targets/*/lib/libcuda.so*",
|
||||||
|
"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
|
||||||
|
"/usr/lib/*-linux-gnu/libcuda.so*",
|
||||||
|
"/usr/lib/wsl/lib/libcuda.so*",
|
||||||
|
"/usr/lib/wsl/drivers/*/libcuda.so*",
|
||||||
|
"/opt/cuda/lib*/libcuda.so*",
|
||||||
|
"/usr/local/cuda/lib*/libcuda.so*",
|
||||||
|
"/usr/lib*/libcuda.so*",
|
||||||
|
"/usr/local/lib*/libcuda.so*",
|
||||||
|
}
|
||||||
|
|
||||||
|
var OneapiGlobs = []string{
|
||||||
|
"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
|
||||||
|
"/usr/lib*/libze_intel_gpu.so*",
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
CudartMgmtName = "libcudart.so*"
|
||||||
|
NvcudaMgmtName = "libcuda.so*"
|
||||||
|
NvmlMgmtName = "" // not currently wired on linux
|
||||||
|
OneapiMgmtName = "libze_intel_gpu.so*"
|
||||||
|
)
|
||||||
|
|
||||||
|
func GetCPUMem() (memInfo, error) {
|
||||||
|
var mem memInfo
|
||||||
|
var total, available, free, buffers, cached, freeSwap uint64
|
||||||
|
f, err := os.Open("/proc/meminfo")
|
||||||
|
if err != nil {
|
||||||
|
return mem, err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
s := bufio.NewScanner(f)
|
||||||
|
for s.Scan() {
|
||||||
|
line := s.Text()
|
||||||
|
switch {
|
||||||
|
case strings.HasPrefix(line, "MemTotal:"):
|
||||||
|
_, err = fmt.Sscanf(line, "MemTotal:%d", &total)
|
||||||
|
case strings.HasPrefix(line, "MemAvailable:"):
|
||||||
|
_, err = fmt.Sscanf(line, "MemAvailable:%d", &available)
|
||||||
|
case strings.HasPrefix(line, "MemFree:"):
|
||||||
|
_, err = fmt.Sscanf(line, "MemFree:%d", &free)
|
||||||
|
case strings.HasPrefix(line, "Buffers:"):
|
||||||
|
_, err = fmt.Sscanf(line, "Buffers:%d", &buffers)
|
||||||
|
case strings.HasPrefix(line, "Cached:"):
|
||||||
|
_, err = fmt.Sscanf(line, "Cached:%d", &cached)
|
||||||
|
case strings.HasPrefix(line, "SwapFree:"):
|
||||||
|
_, err = fmt.Sscanf(line, "SwapFree:%d", &freeSwap)
|
||||||
|
default:
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return mem, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
mem.TotalMemory = total * format.KibiByte
|
||||||
|
mem.FreeSwap = freeSwap * format.KibiByte
|
||||||
|
if available > 0 {
|
||||||
|
mem.FreeMemory = available * format.KibiByte
|
||||||
|
} else {
|
||||||
|
mem.FreeMemory = (free + buffers + cached) * format.KibiByte
|
||||||
|
}
|
||||||
|
return mem, nil
|
||||||
|
}
|
|
@ -1,6 +1,6 @@
|
||||||
//go:build linux || windows
|
//go:build linux || windows
|
||||||
|
|
||||||
package discover
|
package gpu
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"log/slog"
|
"log/slog"
|
|
@ -1,4 +1,4 @@
|
||||||
package discover
|
package gpu
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"runtime"
|
"runtime"
|
57
gpu/gpu_windows.go
Normal file
57
gpu/gpu_windows.go
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
package gpu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"syscall"
|
||||||
|
"unsafe"
|
||||||
|
)
|
||||||
|
|
||||||
|
type MEMORYSTATUSEX struct {
|
||||||
|
length uint32
|
||||||
|
MemoryLoad uint32
|
||||||
|
TotalPhys uint64
|
||||||
|
AvailPhys uint64
|
||||||
|
TotalPageFile uint64
|
||||||
|
AvailPageFile uint64
|
||||||
|
TotalVirtual uint64
|
||||||
|
AvailVirtual uint64
|
||||||
|
AvailExtendedVirtual uint64
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
k32 = syscall.NewLazyDLL("kernel32.dll")
|
||||||
|
globalMemoryStatusExProc = k32.NewProc("GlobalMemoryStatusEx")
|
||||||
|
sizeofMemoryStatusEx = uint32(unsafe.Sizeof(MEMORYSTATUSEX{}))
|
||||||
|
)
|
||||||
|
|
||||||
|
var CudartGlobs = []string{
|
||||||
|
"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
|
||||||
|
}
|
||||||
|
|
||||||
|
var NvmlGlobs = []string{
|
||||||
|
"c:\\Windows\\System32\\nvml.dll",
|
||||||
|
}
|
||||||
|
|
||||||
|
var NvcudaGlobs = []string{
|
||||||
|
"c:\\windows\\system*\\nvcuda.dll",
|
||||||
|
}
|
||||||
|
|
||||||
|
var OneapiGlobs = []string{
|
||||||
|
"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
CudartMgmtName = "cudart64_*.dll"
|
||||||
|
NvcudaMgmtName = "nvcuda.dll"
|
||||||
|
NvmlMgmtName = "nvml.dll"
|
||||||
|
OneapiMgmtName = "ze_intel_gpu64.dll"
|
||||||
|
)
|
||||||
|
|
||||||
|
func GetCPUMem() (memInfo, error) {
|
||||||
|
memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
|
||||||
|
r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
|
||||||
|
if r1 == 0 {
|
||||||
|
return memInfo{}, fmt.Errorf("GlobalMemoryStatusEx failed: %w", err)
|
||||||
|
}
|
||||||
|
return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys, FreeSwap: memStatus.AvailPageFile}, nil
|
||||||
|
}
|
|
@ -1,4 +1,4 @@
|
||||||
package discover
|
package gpu
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
@ -10,11 +10,11 @@ import (
|
||||||
type memInfo struct {
|
type memInfo struct {
|
||||||
TotalMemory uint64 `json:"total_memory,omitempty"`
|
TotalMemory uint64 `json:"total_memory,omitempty"`
|
||||||
FreeMemory uint64 `json:"free_memory,omitempty"`
|
FreeMemory uint64 `json:"free_memory,omitempty"`
|
||||||
FreeSwap uint64 `json:"free_swap,omitempty"` // TODO split this out for system only
|
FreeSwap uint64 `json:"free_swap,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// Beginning of an `ollama info` command
|
// Beginning of an `ollama info` command
|
||||||
type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
|
type GpuInfo struct {
|
||||||
memInfo
|
memInfo
|
||||||
Library string `json:"library,omitempty"`
|
Library string `json:"library,omitempty"`
|
||||||
|
|
||||||
|
@ -49,17 +49,6 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
|
||||||
|
|
||||||
type CPUInfo struct {
|
type CPUInfo struct {
|
||||||
GpuInfo
|
GpuInfo
|
||||||
CPUs []CPU
|
|
||||||
}
|
|
||||||
|
|
||||||
// CPU type represents a CPU Package occupying a socket
|
|
||||||
type CPU struct {
|
|
||||||
ID string `cpuinfo:"processor"`
|
|
||||||
VendorID string `cpuinfo:"vendor_id"`
|
|
||||||
ModelName string `cpuinfo:"model name"`
|
|
||||||
CoreCount int
|
|
||||||
EfficiencyCoreCount int // Performance = CoreCount - Efficiency
|
|
||||||
ThreadCount int
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type CudaGPUInfo struct {
|
type CudaGPUInfo struct {
|
||||||
|
@ -169,12 +158,3 @@ type SystemInfo struct {
|
||||||
UnsupportedGPUs []UnsupportedGPUInfo `json:"unsupported_gpus"`
|
UnsupportedGPUs []UnsupportedGPUInfo `json:"unsupported_gpus"`
|
||||||
DiscoveryErrors []string `json:"discovery_errors"`
|
DiscoveryErrors []string `json:"discovery_errors"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return the optimal number of threads to use for inference
|
|
||||||
func (si SystemInfo) GetOptimalThreadCount() int {
|
|
||||||
if len(si.System.CPUs) == 0 {
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
// Allocate thread count matching the performance cores on a single socket
|
|
||||||
return si.System.CPUs[0].CoreCount - si.System.CPUs[0].EfficiencyCoreCount
|
|
||||||
}
|
|
|
@ -30,39 +30,6 @@ func TestOrcaMiniBlueSky(t *testing.T) {
|
||||||
GenerateTestHelper(ctx, t, req, []string{"rayleigh", "scattering"})
|
GenerateTestHelper(ctx, t, req, []string{"rayleigh", "scattering"})
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestUnicode(t *testing.T) {
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
|
||||||
defer cancel()
|
|
||||||
// Set up the test data
|
|
||||||
req := api.GenerateRequest{
|
|
||||||
// DeepSeek has a Unicode tokenizer regex, making it a unicode torture test
|
|
||||||
Model: "deepseek-coder-v2:16b-lite-instruct-q2_K",
|
|
||||||
Prompt: "天空为什么是蓝色的?",
|
|
||||||
Stream: &stream,
|
|
||||||
Options: map[string]interface{}{
|
|
||||||
"temperature": 0,
|
|
||||||
"seed": 123,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
GenerateTestHelper(ctx, t, req, []string{"散射", "频率"})
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestExtendedUnicodeOutput(t *testing.T) {
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
|
||||||
defer cancel()
|
|
||||||
// Set up the test data
|
|
||||||
req := api.GenerateRequest{
|
|
||||||
Model: "gemma2:2b",
|
|
||||||
Prompt: "Output some smily face emoji",
|
|
||||||
Stream: &stream,
|
|
||||||
Options: map[string]interface{}{
|
|
||||||
"temperature": 0,
|
|
||||||
"seed": 123,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
GenerateTestHelper(ctx, t, req, []string{"😀", "😊", "😁", "😂", "😄", "😃"})
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestUnicodeModelDir(t *testing.T) {
|
func TestUnicodeModelDir(t *testing.T) {
|
||||||
// This is only useful for Windows with utf-16 characters, so skip this test for other platforms
|
// This is only useful for Windows with utf-16 characters, so skip this test for other platforms
|
||||||
if runtime.GOOS != "windows" {
|
if runtime.GOOS != "windows" {
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,5 +1,5 @@
|
||||||
# Note: once we have fully transitioned to the Go server, this will replace the old Dockerfile at the top of the tree
|
# Note: once we have fully transitioned to the Go server, this will replace the old Dockerfile at the top of the tree
|
||||||
ARG GOLANG_VERSION=1.22.8
|
ARG GOLANG_VERSION=1.22.5
|
||||||
ARG CMAKE_VERSION=3.22.1
|
ARG CMAKE_VERSION=3.22.1
|
||||||
ARG CUDA_VERSION_11=11.3.1
|
ARG CUDA_VERSION_11=11.3.1
|
||||||
ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
|
ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
|
||||||
|
@ -9,7 +9,7 @@ ARG ROCM_VERSION=6.1.2
|
||||||
|
|
||||||
### To create a local image for building linux binaries on mac or windows with efficient incremental builds
|
### To create a local image for building linux binaries on mac or windows with efficient incremental builds
|
||||||
#
|
#
|
||||||
# docker build --platform linux/amd64 -t builder-amd64 -f llama/Dockerfile --target unified-builder-amd64 .
|
# docker build --platform linux/amd64 -t builder-amd64 -f Dockerfile.new --target unified-builder-amd64 .
|
||||||
# docker run --platform linux/amd64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-amd64
|
# docker run --platform linux/amd64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-amd64
|
||||||
#
|
#
|
||||||
### Then incremental builds will be much faster in this container
|
### Then incremental builds will be much faster in this container
|
||||||
|
@ -41,7 +41,7 @@ ENTRYPOINT [ "zsh" ]
|
||||||
### To create a local image for building linux binaries on mac or linux/arm64 with efficient incremental builds
|
### To create a local image for building linux binaries on mac or linux/arm64 with efficient incremental builds
|
||||||
# Note: this does not contain jetson variants
|
# Note: this does not contain jetson variants
|
||||||
#
|
#
|
||||||
# docker build --platform linux/arm64 -t builder-arm64 -f llama/Dockerfile --target unified-builder-arm64 .
|
# docker build --platform linux/arm64 -t builder-arm64 -f Dockerfile.new --target unified-builder-arm64 .
|
||||||
# docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64
|
# docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64
|
||||||
#
|
#
|
||||||
FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64
|
FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64
|
||||||
|
|
|
@ -9,7 +9,8 @@ ifeq ($(OS),windows)
|
||||||
CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
|
CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
|
||||||
CUDA_11:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
|
CUDA_11:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
|
||||||
CUDA_12:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
|
CUDA_12:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
|
||||||
HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null)
|
HIP_PATH_83 := $(shell cygpath -m -s "$(subst \,/,$(HIP_PATH))" 2>/dev/null)
|
||||||
|
HIP_LIB_DIR := $(shell ls -d $(HIP_PATH_83)/lib 2>/dev/null)
|
||||||
else ifeq ($(OS),linux)
|
else ifeq ($(OS),linux)
|
||||||
HIP_PATH?=/opt/rocm
|
HIP_PATH?=/opt/rocm
|
||||||
HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null)
|
HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null)
|
||||||
|
@ -40,12 +41,8 @@ runners: $(RUNNER_TARGETS)
|
||||||
$(RUNNER_TARGETS):
|
$(RUNNER_TARGETS):
|
||||||
$(MAKE) -f make/Makefile.$@
|
$(MAKE) -f make/Makefile.$@
|
||||||
|
|
||||||
help-sync apply-patches create-patches sync:
|
|
||||||
$(MAKE) -f make/Makefile.sync $@
|
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -rf $(BUILD_DIR) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
|
rm -rf $(BUILD_DIR) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
|
||||||
go clean -cache
|
|
||||||
|
|
||||||
clean-payload:
|
clean-payload:
|
||||||
rm -rf $(addprefix $(RUNNERS_PAYLOAD_DIR)/, $(RUNNER_TARGETS) metal cpu cpu_avx cpu_avx2)
|
rm -rf $(addprefix $(RUNNERS_PAYLOAD_DIR)/, $(RUNNER_TARGETS) metal cpu cpu_avx cpu_avx2)
|
||||||
|
|
|
@ -91,84 +91,10 @@ go build -tags avx,rocm .
|
||||||
make -j
|
make -j
|
||||||
```
|
```
|
||||||
|
|
||||||
## Vendoring
|
## Syncing with llama.cpp
|
||||||
|
|
||||||
Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model. While we generally strive to contribute changes back upstream to avoid drift, we cary a small set of patches which are applied to the tracking commit. A set of make targets are available to aid developers in updating to a newer tracking commit, or to work on changes.
|
To update this package to the latest llama.cpp code, use the `sync.sh` script:
|
||||||
|
|
||||||
> [!IMPORTANT]
|
|
||||||
> Prior to merging #7157 we continue to leverage a submodule for llama.cpp which establishes the tracking commit. After merging that PR a new manifest file we be utilized
|
|
||||||
|
|
||||||
If you update the vendoring code, start by running the following command to establish the tracking llama.cpp repo in the `./vendor/` directory.
|
|
||||||
|
|
||||||
```
|
```
|
||||||
make -C llama apply-patches
|
./sync.sh ../../llama.cpp
|
||||||
```
|
```
|
||||||
|
|
||||||
### Updating Base Commit
|
|
||||||
|
|
||||||
**Pin to new base commit**
|
|
||||||
|
|
||||||
To update to a newer base commit, select the upstream git tag or commit
|
|
||||||
|
|
||||||
> [!IMPORTANT]
|
|
||||||
> After merging #7157 a manifest will be used instead of the submodule
|
|
||||||
|
|
||||||
```
|
|
||||||
cd llm/llama.cpp
|
|
||||||
git fetch
|
|
||||||
git checkout NEW_BASE_COMMIT
|
|
||||||
cd ..
|
|
||||||
git add llama.cpp
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Applying patches
|
|
||||||
|
|
||||||
When updating to a newer base commit, the existing patches may not apply cleanly and require manual merge resolution.
|
|
||||||
|
|
||||||
Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure.
|
|
||||||
|
|
||||||
```
|
|
||||||
make -C llama apply-patches
|
|
||||||
```
|
|
||||||
|
|
||||||
If you see an error message about a conflict, go into the `./vendor/` directory, and perform merge resolution using your preferred tool to the patch commit which failed. Save the file(s) and continue the patch series with `git am --continue` . If any additional patches fail, follow the same pattern until the full patch series is applied. Once finished, run a final `create-patches` and `sync` target to ensure everything is updated.
|
|
||||||
|
|
||||||
```
|
|
||||||
make -C llama create-patches sync
|
|
||||||
```
|
|
||||||
|
|
||||||
Build and test Ollama, and make any necessary changes to the Go code based on the new base commit. Submit your PR to the Ollama repo.
|
|
||||||
|
|
||||||
### Generating Patches
|
|
||||||
|
|
||||||
When working on new fixes or features that impact vendored code, use the following model. First get a clean tracking repo with all current patches applied:
|
|
||||||
|
|
||||||
```
|
|
||||||
make -C llama apply-patches
|
|
||||||
```
|
|
||||||
|
|
||||||
Now edit the upstream native code in the `./vendor/` directory. You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing. Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama:
|
|
||||||
|
|
||||||
```
|
|
||||||
make -C llama sync
|
|
||||||
make -C llama -j 8
|
|
||||||
go build .
|
|
||||||
```
|
|
||||||
|
|
||||||
> [!IMPORTANT]
|
|
||||||
> Do **NOT** run `apply-patches` while you're iterating as that will reset the tracking repo. It will detect a dirty tree and abort, but if your tree is clean and you accidentally ran this target, use `git reflog` to recover your commit(s).
|
|
||||||
|
|
||||||
Iterate until you're ready to submit PRs. Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with
|
|
||||||
|
|
||||||
```
|
|
||||||
make -C llama create-patches
|
|
||||||
```
|
|
||||||
|
|
||||||
> [!IMPORTANT]
|
|
||||||
> Once you have completed this step, it is safe to run `apply-patches` since your change is preserved in the patches.
|
|
||||||
|
|
||||||
In your `./vendor/` directory, create a branch, and cherry-pick the new commit to that branch, then submit a PR upstream to llama.cpp.
|
|
||||||
|
|
||||||
Commit the changes in the ollama repo and submit a PR to Ollama, which will include the vendored code update with your change, along with the patches.
|
|
||||||
|
|
||||||
After your PR upstream is merged, follow the **Updating Base Commit** instructions above, however first remove your patch before running `apply-patches` since the new base commit contains your change already.
|
|
28
llama/build-info.cpp
vendored
28
llama/build-info.cpp
vendored
|
@ -1,4 +1,30 @@
|
||||||
|
/**
|
||||||
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
|
*
|
||||||
|
* MIT License
|
||||||
|
*
|
||||||
|
* Copyright (c) 2023-2024 The ggml authors
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
* of this software and associated documentation files (the "Software"), to deal
|
||||||
|
* in the Software without restriction, including without limitation the rights
|
||||||
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the Software is
|
||||||
|
* furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in all
|
||||||
|
* copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
int LLAMA_BUILD_NUMBER = 0;
|
int LLAMA_BUILD_NUMBER = 0;
|
||||||
char const *LLAMA_COMMIT = "3f1ae2e32cde00c39b96be6d01c2997c29bae555";
|
char const *LLAMA_COMMIT = "";
|
||||||
char const *LLAMA_COMPILER = "";
|
char const *LLAMA_COMPILER = "";
|
||||||
char const *LLAMA_BUILD_TARGET = "";
|
char const *LLAMA_BUILD_TARGET = "";
|
||||||
|
|
155
llama/clip.cpp
vendored
155
llama/clip.cpp
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -29,6 +29,7 @@
|
||||||
// I'll gradually clean and extend it
|
// I'll gradually clean and extend it
|
||||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||||
#include "clip.h"
|
#include "clip.h"
|
||||||
|
#include "log.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
|
@ -65,11 +66,6 @@
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
#include <limits>
|
#include <limits>
|
||||||
|
|
||||||
#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
|
|
||||||
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
|
||||||
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
|
||||||
#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
|
|
||||||
|
|
||||||
#if defined(_WIN32)
|
#if defined(_WIN32)
|
||||||
#define WIN32_LEAN_AND_MEAN
|
#define WIN32_LEAN_AND_MEAN
|
||||||
#ifndef NOMINMAX
|
#ifndef NOMINMAX
|
||||||
|
@ -208,7 +204,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||||
static int get_key_idx(const gguf_context * ctx, const char * key) {
|
static int get_key_idx(const gguf_context * ctx, const char * key) {
|
||||||
int i = gguf_find_key(ctx, key);
|
int i = gguf_find_key(ctx, key);
|
||||||
if (i == -1) {
|
if (i == -1) {
|
||||||
LOG_ERR("key %s not found in file\n", key);
|
LOG_TEE("key %s not found in file\n", key);
|
||||||
throw std::runtime_error(format("Missing required key: %s", key));
|
throw std::runtime_error(format("Missing required key: %s", key));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -313,7 +309,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
||||||
|
|
||||||
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
|
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
|
||||||
size_t tensor_size = ggml_nbytes(tensor);
|
size_t tensor_size = ggml_nbytes(tensor);
|
||||||
LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
|
LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
|
||||||
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
|
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
|
||||||
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
|
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
|
||||||
}
|
}
|
||||||
|
@ -331,7 +327,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
|
||||||
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
|
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
|
||||||
std::ofstream file(filename, std::ios::binary);
|
std::ofstream file(filename, std::ios::binary);
|
||||||
if (!file.is_open()) {
|
if (!file.is_open()) {
|
||||||
LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
|
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -350,7 +346,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
|
||||||
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
|
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
|
||||||
std::ofstream file(filename, std::ios::binary);
|
std::ofstream file(filename, std::ios::binary);
|
||||||
if (!file.is_open()) {
|
if (!file.is_open()) {
|
||||||
LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
|
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -611,7 +607,7 @@ struct clip_ctx {
|
||||||
|
|
||||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -625,7 +621,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
if (load_image_size == nullptr) {
|
if (load_image_size == nullptr) {
|
||||||
load_image_size = clip_image_size_init();
|
load_image_size = clip_image_size_init();
|
||||||
}
|
}
|
||||||
LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||||
image_size_width = load_image_size->width;
|
image_size_width = load_image_size->width;
|
||||||
image_size_height = load_image_size->height;
|
image_size_height = load_image_size->height;
|
||||||
if (is_inf) {
|
if (is_inf) {
|
||||||
|
@ -1090,21 +1086,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
const int idx_name = gguf_find_key(ctx, KEY_NAME);
|
const int idx_name = gguf_find_key(ctx, KEY_NAME);
|
||||||
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
|
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
|
||||||
const std::string name = gguf_get_val_str(ctx, idx_name);
|
const std::string name = gguf_get_val_str(ctx, idx_name);
|
||||||
LOG_INF("%s: model name: %s\n", __func__, name.c_str());
|
LOG_TEE("%s: model name: %s\n", __func__, name.c_str());
|
||||||
}
|
}
|
||||||
LOG_INF("%s: description: %s\n", __func__, description.c_str());
|
LOG_TEE("%s: description: %s\n", __func__, description.c_str());
|
||||||
LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
|
LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
|
||||||
LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
||||||
LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors);
|
LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors);
|
||||||
LOG_INF("%s: n_kv: %d\n", __func__, n_kv);
|
LOG_TEE("%s: n_kv: %d\n", __func__, n_kv);
|
||||||
LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
||||||
LOG_INF("\n");
|
LOG_TEE("\n");
|
||||||
}
|
}
|
||||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||||
|
|
||||||
// kv
|
// kv
|
||||||
const int n_kv = gguf_get_n_kv(ctx);
|
const int n_kv = gguf_get_n_kv(ctx);
|
||||||
LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
|
LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
|
||||||
__func__, n_kv, n_tensors, fname);
|
__func__, n_kv, n_tensors, fname);
|
||||||
{
|
{
|
||||||
std::map<enum ggml_type, uint32_t> n_type;
|
std::map<enum ggml_type, uint32_t> n_type;
|
||||||
|
@ -1115,7 +1111,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
n_type[type]++;
|
n_type[type]++;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
||||||
for (int i = 0; i < n_kv; i++) {
|
for (int i = 0; i < n_kv; i++) {
|
||||||
const char * name = gguf_get_key(ctx, i);
|
const char * name = gguf_get_key(ctx, i);
|
||||||
const enum gguf_type type = gguf_get_kv_type(ctx, i);
|
const enum gguf_type type = gguf_get_kv_type(ctx, i);
|
||||||
|
@ -1131,7 +1127,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
}
|
}
|
||||||
replace_all(value, "\n", "\\n");
|
replace_all(value, "\n", "\\n");
|
||||||
|
|
||||||
LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
// print type counts
|
// print type counts
|
||||||
|
@ -1140,7 +1136,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
|
LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1155,7 +1151,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
size_t tensor_size = ggml_nbytes(cur);
|
size_t tensor_size = ggml_nbytes(cur);
|
||||||
model_size += tensor_size;
|
model_size += tensor_size;
|
||||||
if (verbosity >= 3) {
|
if (verbosity >= 3) {
|
||||||
LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
||||||
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
|
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1182,27 +1178,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
|
|
||||||
#ifdef GGML_USE_CUDA
|
#ifdef GGML_USE_CUDA
|
||||||
new_clip->backend = ggml_backend_cuda_init(0);
|
new_clip->backend = ggml_backend_cuda_init(0);
|
||||||
LOG_INF("%s: CLIP using CUDA backend\n", __func__);
|
LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
new_clip->backend = ggml_backend_metal_init();
|
new_clip->backend = ggml_backend_metal_init();
|
||||||
LOG_INF("%s: CLIP using Metal backend\n", __func__);
|
LOG_TEE("%s: CLIP using Metal backend\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_CANN
|
#ifdef GGML_USE_CANN
|
||||||
new_clip->backend = ggml_backend_cann_init(0);
|
new_clip->backend = ggml_backend_cann_init(0);
|
||||||
LOG_INF("%s: CLIP using CANN backend\n", __func__);
|
LOG_TEE("%s: CLIP using CANN backend\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_VULKAN
|
#ifdef GGML_USE_VULKAN
|
||||||
new_clip->backend = ggml_backend_vk_init(0);
|
new_clip->backend = ggml_backend_vk_init(0);
|
||||||
LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
|
LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (!new_clip->backend) {
|
if (!new_clip->backend) {
|
||||||
new_clip->backend = ggml_backend_cpu_init();
|
new_clip->backend = ggml_backend_cpu_init();
|
||||||
LOG_INF("%s: CLIP using CPU backend\n", __func__);
|
LOG_TEE("%s: CLIP using CPU backend\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
// model size and capabilities
|
// model size and capabilities
|
||||||
|
@ -1237,16 +1233,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
||||||
|
|
||||||
if (verbosity >= 1) {
|
if (verbosity >= 1) {
|
||||||
LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||||
LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||||
LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||||
LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
||||||
LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||||
LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
|
LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
|
||||||
|
|
||||||
// load tensors
|
// load tensors
|
||||||
{
|
{
|
||||||
|
@ -1259,11 +1255,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
|
|
||||||
new_clip->ctx_data = ggml_init(params);
|
new_clip->ctx_data = ggml_init(params);
|
||||||
if (!new_clip->ctx_data) {
|
if (!new_clip->ctx_data) {
|
||||||
LOG_ERR("%s: ggml_init() failed\n", __func__);
|
LOG_TEE("%s: ggml_init() failed\n", __func__);
|
||||||
clip_free(new_clip);
|
clip_free(new_clip);
|
||||||
gguf_free(ctx);
|
gguf_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
|
int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
|
||||||
if (!wlen) {
|
if (!wlen) {
|
||||||
|
@ -1288,7 +1285,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
auto fin = std::ifstream(fname, std::ios::binary);
|
auto fin = std::ifstream(fname, std::ios::binary);
|
||||||
#endif
|
#endif
|
||||||
if (!fin) {
|
if (!fin) {
|
||||||
LOG_ERR("cannot open model file for loading tensors\n");
|
LOG_TEE("cannot open model file for loading tensors\n");
|
||||||
clip_free(new_clip);
|
clip_free(new_clip);
|
||||||
gguf_free(ctx);
|
gguf_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
@ -1310,7 +1307,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
||||||
fin.seekg(offset, std::ios::beg);
|
fin.seekg(offset, std::ios::beg);
|
||||||
if (!fin) {
|
if (!fin) {
|
||||||
LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
|
LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
|
||||||
clip_free(new_clip);
|
clip_free(new_clip);
|
||||||
gguf_free(ctx);
|
gguf_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
@ -1385,23 +1382,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (verbosity >= 2) {
|
if (verbosity >= 2) {
|
||||||
LOG_INF("\n%s: vision model hparams\n", __func__);
|
LOG_TEE("\n%s: vision model hparams\n", __func__);
|
||||||
LOG_INF("image_size %d\n", hparams.image_size);
|
LOG_TEE("image_size %d\n", hparams.image_size);
|
||||||
LOG_INF("patch_size %d\n", hparams.patch_size);
|
LOG_TEE("patch_size %d\n", hparams.patch_size);
|
||||||
LOG_INF("v_hidden_size %d\n", hparams.hidden_size);
|
LOG_TEE("v_hidden_size %d\n", hparams.hidden_size);
|
||||||
LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate);
|
LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate);
|
||||||
LOG_INF("v_projection_dim %d\n", hparams.projection_dim);
|
LOG_TEE("v_projection_dim %d\n", hparams.projection_dim);
|
||||||
LOG_INF("v_n_head %d\n", hparams.n_head);
|
LOG_TEE("v_n_head %d\n", hparams.n_head);
|
||||||
LOG_INF("v_n_layer %d\n", hparams.n_layer);
|
LOG_TEE("v_n_layer %d\n", hparams.n_layer);
|
||||||
LOG_INF("v_eps %f\n", hparams.eps);
|
LOG_TEE("v_eps %f\n", hparams.eps);
|
||||||
LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
||||||
LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
||||||
LOG_INF("v_image_grid_pinpoints: ");
|
LOG_TEE("v_image_grid_pinpoints: ");
|
||||||
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
|
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
|
||||||
LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
|
LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
|
||||||
}
|
}
|
||||||
LOG_INF("\n");
|
LOG_TEE("\n");
|
||||||
LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
|
LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1439,7 +1436,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
|
||||||
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||||
} catch(const std::exception& /*e*/) {
|
} catch(const std::exception& /*e*/) {
|
||||||
LOG_ERR("%s: failed to load vision model tensors\n", __func__);
|
LOG_TEE("%s: failed to load vision model tensors\n", __func__);
|
||||||
}
|
}
|
||||||
|
|
||||||
// LLaVA projection
|
// LLaVA projection
|
||||||
|
@ -1468,7 +1465,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
} catch (std::runtime_error & /*e*/) { }
|
} catch (std::runtime_error & /*e*/) { }
|
||||||
try {
|
try {
|
||||||
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
|
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
|
||||||
// LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
|
// LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
|
||||||
} catch (std::runtime_error & /*e*/) { }
|
} catch (std::runtime_error & /*e*/) { }
|
||||||
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
|
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
|
||||||
// MobileVLM projection
|
// MobileVLM projection
|
||||||
|
@ -1569,7 +1566,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
|
||||||
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
||||||
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
||||||
LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
return new_clip;
|
return new_clip;
|
||||||
|
@ -1620,7 +1617,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
||||||
int nx, ny, nc;
|
int nx, ny, nc;
|
||||||
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
|
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
|
||||||
if (!data) {
|
if (!data) {
|
||||||
LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
|
LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
build_clip_img_from_data(data, nx, ny, img);
|
build_clip_img_from_data(data, nx, ny, img);
|
||||||
|
@ -1632,7 +1629,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
||||||
int nx, ny, nc;
|
int nx, ny, nc;
|
||||||
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
||||||
if (!data) {
|
if (!data) {
|
||||||
LOG_ERR("%s: failed to decode image bytes\n", __func__);
|
LOG_TEE("%s: failed to decode image bytes\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
build_clip_img_from_data(data, nx, ny, img);
|
build_clip_img_from_data(data, nx, ny, img);
|
||||||
|
@ -1822,7 +1819,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
|
||||||
int downscaled_height = static_cast<int>(original_height * scale);
|
int downscaled_height = static_cast<int>(original_height * scale);
|
||||||
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
||||||
int wasted_resolution = (width * height) - effective_resolution;
|
int wasted_resolution = (width * height) - effective_resolution;
|
||||||
// LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
||||||
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
||||||
max_effective_resolution = effective_resolution;
|
max_effective_resolution = effective_resolution;
|
||||||
min_wasted_resolution = wasted_resolution;
|
min_wasted_resolution = wasted_resolution;
|
||||||
|
@ -1940,7 +1937,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
|
||||||
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
||||||
|
|
||||||
std::vector<std::vector<clip_image_u8 *>> images;
|
std::vector<std::vector<clip_image_u8 *>> images;
|
||||||
LOG_INF("%s: multiple %d\n", __func__, multiple);
|
LOG_TEE("%s: multiple %d\n", __func__, multiple);
|
||||||
images.push_back(std::vector<clip_image_u8 *>());
|
images.push_back(std::vector<clip_image_u8 *>());
|
||||||
|
|
||||||
if (multiple <= 1) {
|
if (multiple <= 1) {
|
||||||
|
@ -1955,17 +1952,17 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
|
||||||
clip_image_u8 * source_image = clip_image_u8_init();
|
clip_image_u8 * source_image = clip_image_u8_init();
|
||||||
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
||||||
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
|
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
|
||||||
LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
|
LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
|
||||||
images[images.size()-1].push_back(source_image);
|
images[images.size()-1].push_back(source_image);
|
||||||
|
|
||||||
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
|
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
|
||||||
LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
|
LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
|
||||||
|
|
||||||
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
|
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
|
||||||
clip_image_u8 * refine_image = clip_image_u8_init();
|
clip_image_u8 * refine_image = clip_image_u8_init();
|
||||||
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
|
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
|
||||||
|
|
||||||
LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
|
LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
|
||||||
|
|
||||||
// split_to_patches
|
// split_to_patches
|
||||||
int width = refine_image->nx;
|
int width = refine_image->nx;
|
||||||
|
@ -2022,7 +2019,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||||
int idx = 0;
|
int idx = 0;
|
||||||
for (size_t i = 0; i < imgs.size(); ++i) {
|
for (size_t i = 0; i < imgs.size(); ++i) {
|
||||||
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
||||||
LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
|
LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
|
||||||
clip_image_f32 * res = clip_image_f32_init();
|
clip_image_f32 * res = clip_image_f32_init();
|
||||||
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
|
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
|
||||||
res_imgs->data[idx++] = *res;
|
res_imgs->data[idx++] = *res;
|
||||||
|
@ -2034,7 +2031,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||||
|
|
||||||
bool pad_to_square = true;
|
bool pad_to_square = true;
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
auto & params = ctx->vision_model.hparams;
|
auto & params = ctx->vision_model.hparams;
|
||||||
|
@ -2111,7 +2108,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < patches.size(); i++) {
|
for (size_t i = 0; i < patches.size(); i++) {
|
||||||
// LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
|
// LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
|
||||||
clip_image_u8_free(patches[i]);
|
clip_image_u8_free(patches[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2347,7 +2344,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
|
||||||
|
|
||||||
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2359,7 +2356,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
|
||||||
|
|
||||||
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
||||||
if (!ctx->has_vision_encoder) {
|
if (!ctx->has_vision_encoder) {
|
||||||
LOG_ERR("This gguf file seems to have no vision encoder\n");
|
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2508,10 +2505,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||||
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
|
ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
if (ggml_backend_is_metal(ctx->backend)) {
|
||||||
|
ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
ggml_backend_graph_compute(ctx->backend, gf);
|
ggml_backend_graph_compute(ctx->backend, gf);
|
||||||
|
|
||||||
// the last node is the embedding tensor
|
// the last node is the embedding tensor
|
||||||
struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
|
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
|
||||||
|
|
||||||
// copy the embeddings to the location passed by the user
|
// copy the embeddings to the location passed by the user
|
||||||
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
|
||||||
|
@ -2583,7 +2586,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||||
new_type = type;
|
new_type = type;
|
||||||
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
|
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
|
||||||
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
|
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
|
||||||
// LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
|
// LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
|
||||||
}
|
}
|
||||||
const size_t n_elms = ggml_nelements(cur);
|
const size_t n_elms = ggml_nelements(cur);
|
||||||
float * f32_data;
|
float * f32_data;
|
||||||
|
@ -2602,7 +2605,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||||
f32_data = (float *)conv_buf.data();
|
f32_data = (float *)conv_buf.data();
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
LOG_ERR("Please use an input file in f32 or f16\n");
|
LOG_TEE("Please use an input file in f32 or f16\n");
|
||||||
gguf_free(ctx_out);
|
gguf_free(ctx_out);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -2629,7 +2632,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||||
fout.put(0);
|
fout.put(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
|
LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
|
||||||
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2645,8 +2648,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||||
gguf_free(ctx_out);
|
gguf_free(ctx_out);
|
||||||
|
|
||||||
{
|
{
|
||||||
LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
||||||
LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|
2
llama/clip.h
vendored
2
llama/clip.h
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2000
llama/common.cpp
vendored
2000
llama/common.cpp
vendored
File diff suppressed because it is too large
Load diff
187
llama/common.h
vendored
187
llama/common.h
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -30,9 +30,18 @@
|
||||||
|
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include "sampling.h"
|
||||||
|
|
||||||
|
#define LOG_NO_FILE_LINE_FUNCTION
|
||||||
|
#include "log.h"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <sstream>
|
#include <random>
|
||||||
|
#include <thread>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <tuple>
|
||||||
|
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
#define DIRECTORY_SEPARATOR '\\'
|
#define DIRECTORY_SEPARATOR '\\'
|
||||||
|
@ -71,6 +80,19 @@ struct llama_control_vector_load_info;
|
||||||
// CPU utils
|
// CPU utils
|
||||||
//
|
//
|
||||||
|
|
||||||
|
int32_t cpu_get_num_physical_cores();
|
||||||
|
int32_t cpu_get_num_math();
|
||||||
|
|
||||||
|
//
|
||||||
|
// CLI argument parsing
|
||||||
|
//
|
||||||
|
|
||||||
|
// dimensionality reduction methods, used by cvector-generator
|
||||||
|
enum dimre_method {
|
||||||
|
DIMRE_METHOD_PCA,
|
||||||
|
DIMRE_METHOD_MEAN,
|
||||||
|
};
|
||||||
|
|
||||||
struct cpu_params {
|
struct cpu_params {
|
||||||
int n_threads = -1;
|
int n_threads = -1;
|
||||||
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
|
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
|
||||||
|
@ -80,94 +102,9 @@ struct cpu_params {
|
||||||
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
|
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
|
||||||
};
|
};
|
||||||
|
|
||||||
int32_t cpu_get_num_physical_cores();
|
|
||||||
int32_t cpu_get_num_math();
|
|
||||||
|
|
||||||
//
|
|
||||||
// Common params
|
|
||||||
//
|
|
||||||
|
|
||||||
enum llama_example {
|
|
||||||
LLAMA_EXAMPLE_COMMON,
|
|
||||||
LLAMA_EXAMPLE_SPECULATIVE,
|
|
||||||
LLAMA_EXAMPLE_MAIN,
|
|
||||||
LLAMA_EXAMPLE_INFILL,
|
|
||||||
LLAMA_EXAMPLE_EMBEDDING,
|
|
||||||
LLAMA_EXAMPLE_PERPLEXITY,
|
|
||||||
LLAMA_EXAMPLE_RETRIEVAL,
|
|
||||||
LLAMA_EXAMPLE_PASSKEY,
|
|
||||||
LLAMA_EXAMPLE_IMATRIX,
|
|
||||||
LLAMA_EXAMPLE_BENCH,
|
|
||||||
LLAMA_EXAMPLE_SERVER,
|
|
||||||
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
|
|
||||||
LLAMA_EXAMPLE_EXPORT_LORA,
|
|
||||||
LLAMA_EXAMPLE_LLAVA,
|
|
||||||
LLAMA_EXAMPLE_LOOKUP,
|
|
||||||
LLAMA_EXAMPLE_PARALLEL,
|
|
||||||
|
|
||||||
LLAMA_EXAMPLE_COUNT,
|
|
||||||
};
|
|
||||||
|
|
||||||
enum gpt_sampler_type {
|
|
||||||
GPT_SAMPLER_TYPE_NONE = 0,
|
|
||||||
GPT_SAMPLER_TYPE_TOP_K = 1,
|
|
||||||
GPT_SAMPLER_TYPE_TOP_P = 2,
|
|
||||||
GPT_SAMPLER_TYPE_MIN_P = 3,
|
|
||||||
GPT_SAMPLER_TYPE_TFS_Z = 4,
|
|
||||||
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
|
|
||||||
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
|
|
||||||
};
|
|
||||||
|
|
||||||
// dimensionality reduction methods, used by cvector-generator
|
|
||||||
enum dimre_method {
|
|
||||||
DIMRE_METHOD_PCA,
|
|
||||||
DIMRE_METHOD_MEAN,
|
|
||||||
};
|
|
||||||
|
|
||||||
// sampler parameters
|
|
||||||
struct gpt_sampler_params {
|
|
||||||
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
|
|
||||||
|
|
||||||
int32_t n_prev = 64; // number of previous tokens to remember
|
|
||||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
|
||||||
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
|
|
||||||
int32_t top_k = 40; // <= 0 to use vocab size
|
|
||||||
float top_p = 0.95f; // 1.0 = disabled
|
|
||||||
float min_p = 0.05f; // 0.0 = disabled
|
|
||||||
float tfs_z = 1.00f; // 1.0 = disabled
|
|
||||||
float typ_p = 1.00f; // typical_p, 1.0 = disabled
|
|
||||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
|
||||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
|
||||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
|
||||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
|
||||||
float penalty_repeat = 1.00f; // 1.0 = disabled
|
|
||||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
|
||||||
float penalty_present = 0.00f; // 0.0 = disabled
|
|
||||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
|
||||||
float mirostat_tau = 5.00f; // target entropy
|
|
||||||
float mirostat_eta = 0.10f; // learning rate
|
|
||||||
bool penalize_nl = false; // consider newlines as a repeatable token
|
|
||||||
bool ignore_eos = false;
|
|
||||||
bool no_perf = false; // disable performance metrics
|
|
||||||
|
|
||||||
std::vector<enum gpt_sampler_type> samplers = {
|
|
||||||
GPT_SAMPLER_TYPE_TOP_K,
|
|
||||||
GPT_SAMPLER_TYPE_TFS_Z,
|
|
||||||
GPT_SAMPLER_TYPE_TYPICAL_P,
|
|
||||||
GPT_SAMPLER_TYPE_TOP_P,
|
|
||||||
GPT_SAMPLER_TYPE_MIN_P,
|
|
||||||
GPT_SAMPLER_TYPE_TEMPERATURE
|
|
||||||
};
|
|
||||||
|
|
||||||
std::string grammar; // optional BNF-like grammar to constrain sampling
|
|
||||||
|
|
||||||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
|
||||||
|
|
||||||
// print the parameters into a string
|
|
||||||
std::string print() const;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct gpt_params {
|
struct gpt_params {
|
||||||
|
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
||||||
|
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
int32_t n_predict = -1; // new tokens to predict
|
||||||
int32_t n_ctx = 0; // context size
|
int32_t n_ctx = 0; // context size
|
||||||
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
||||||
|
@ -209,25 +146,26 @@ struct gpt_params {
|
||||||
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
|
||||||
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
|
||||||
|
|
||||||
struct gpt_sampler_params sparams;
|
// // sampling parameters
|
||||||
|
struct llama_sampling_params sparams;
|
||||||
|
|
||||||
std::string model = ""; // model path // NOLINT
|
std::string model = ""; // model path
|
||||||
std::string model_draft = ""; // draft model for speculative decoding // NOLINT
|
std::string model_draft = ""; // draft model for speculative decoding
|
||||||
std::string model_alias = "unknown"; // model alias // NOLINT
|
std::string model_alias = "unknown"; // model alias
|
||||||
std::string model_url = ""; // model url to download // NOLINT
|
std::string model_url = ""; // model url to download
|
||||||
std::string hf_token = ""; // HF token // NOLINT
|
std::string hf_token = ""; // HF token
|
||||||
std::string hf_repo = ""; // HF repo // NOLINT
|
std::string hf_repo = ""; // HF repo
|
||||||
std::string hf_file = ""; // HF file // NOLINT
|
std::string hf_file = ""; // HF file
|
||||||
std::string prompt = ""; // NOLINT
|
std::string prompt = "";
|
||||||
std::string prompt_file = ""; // store the external prompt file name // NOLINT
|
std::string prompt_file = ""; // store the external prompt file name
|
||||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
|
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
||||||
std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
|
std::string input_prefix = ""; // string to prefix user inputs with
|
||||||
std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
|
std::string input_suffix = ""; // string to suffix user inputs with
|
||||||
std::string logdir = ""; // directory in which to save YAML log files // NOLINT
|
std::string logdir = ""; // directory in which to save YAML log files
|
||||||
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
|
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
|
||||||
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
|
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
|
||||||
std::string logits_file = ""; // file for saving *all* logits // NOLINT
|
std::string logits_file = ""; // file for saving *all* logits
|
||||||
std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
|
std::string rpc_servers = ""; // comma separated list of RPC servers
|
||||||
|
|
||||||
std::vector<std::string> in_files; // all input files
|
std::vector<std::string> in_files; // all input files
|
||||||
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
||||||
|
@ -271,15 +209,15 @@ struct gpt_params {
|
||||||
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
||||||
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
||||||
bool flash_attn = false; // flash attention
|
bool flash_attn = false; // flash attention
|
||||||
bool no_perf = false; // disable performance metrics
|
|
||||||
bool ctx_shift = true; // context shift on inifinite text generation
|
|
||||||
|
|
||||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||||
|
bool ignore_eos = false; // ignore generated EOS tokens
|
||||||
bool logits_all = false; // return logits for all tokens in the batch
|
bool logits_all = false; // return logits for all tokens in the batch
|
||||||
bool use_mmap = true; // use mmap for faster loads
|
bool use_mmap = true; // use mmap for faster loads
|
||||||
bool use_mlock = false; // use mlock to keep model in memory
|
bool use_mlock = false; // use mlock to keep model in memory
|
||||||
bool verbose_prompt = false; // print prompt tokens before generation
|
bool verbose_prompt = false; // print prompt tokens before generation
|
||||||
bool display_prompt = true; // print prompt before generation
|
bool display_prompt = true; // print prompt before generation
|
||||||
|
bool infill = false; // use infill mode
|
||||||
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
||||||
bool no_kv_offload = false; // disable KV offloading
|
bool no_kv_offload = false; // disable KV offloading
|
||||||
bool warmup = true; // warmup run
|
bool warmup = true; // warmup run
|
||||||
|
@ -289,7 +227,7 @@ struct gpt_params {
|
||||||
std::string cache_type_v = "f16"; // KV cache data type for the V
|
std::string cache_type_v = "f16"; // KV cache data type for the V
|
||||||
|
|
||||||
// multimodal models (see examples/llava)
|
// multimodal models (see examples/llava)
|
||||||
std::string mmproj = ""; // path to multimodal projector // NOLINT
|
std::string mmproj = ""; // path to multimodal projector
|
||||||
std::vector<std::string> image; // path to image file(s)
|
std::vector<std::string> image; // path to image file(s)
|
||||||
|
|
||||||
// embedding
|
// embedding
|
||||||
|
@ -297,7 +235,6 @@ struct gpt_params {
|
||||||
int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
||||||
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
||||||
std::string embd_sep = "\n"; // separator of embendings
|
std::string embd_sep = "\n"; // separator of embendings
|
||||||
bool reranking = false; // enable reranking support on server
|
|
||||||
|
|
||||||
// server params
|
// server params
|
||||||
int32_t port = 8080; // server listens on this network port
|
int32_t port = 8080; // server listens on this network port
|
||||||
|
@ -306,15 +243,15 @@ struct gpt_params {
|
||||||
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||||
|
|
||||||
std::string hostname = "127.0.0.1";
|
std::string hostname = "127.0.0.1";
|
||||||
std::string public_path = ""; // NOLINT
|
std::string public_path = "";
|
||||||
std::string chat_template = ""; // NOLINT
|
std::string chat_template = "";
|
||||||
std::string system_prompt = ""; // NOLINT
|
std::string system_prompt = "";
|
||||||
bool enable_chat_template = true;
|
bool enable_chat_template = true;
|
||||||
|
|
||||||
std::vector<std::string> api_keys;
|
std::vector<std::string> api_keys;
|
||||||
|
|
||||||
std::string ssl_file_key = ""; // NOLINT
|
std::string ssl_file_key = "";
|
||||||
std::string ssl_file_cert = ""; // NOLINT
|
std::string ssl_file_cert = "";
|
||||||
|
|
||||||
bool endpoint_slots = true;
|
bool endpoint_slots = true;
|
||||||
bool endpoint_metrics = false;
|
bool endpoint_metrics = false;
|
||||||
|
@ -364,14 +301,15 @@ struct gpt_params {
|
||||||
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||||
|
|
||||||
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
|
std::string lora_outfile = "ggml-lora-merged-f16.gguf";
|
||||||
|
|
||||||
// batched-bench params
|
|
||||||
bool batched_bench_output_jsonl = false;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// call once at the start of a program if it uses libcommon
|
void gpt_params_parse_from_env(gpt_params & params);
|
||||||
// initializes the logging system and prints info about the build
|
void gpt_params_handle_model_default(gpt_params & params);
|
||||||
void gpt_init();
|
|
||||||
|
bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
|
||||||
|
bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
|
||||||
|
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
|
||||||
|
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
|
||||||
|
|
||||||
std::string gpt_params_get_system_info(const gpt_params & params);
|
std::string gpt_params_get_system_info(const gpt_params & params);
|
||||||
|
|
||||||
|
@ -408,11 +346,6 @@ static std::vector<T> string_split(const std::string & str, char delim) {
|
||||||
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|
||||||
void string_process_escapes(std::string & input);
|
void string_process_escapes(std::string & input);
|
||||||
|
|
||||||
std::string string_from(bool value);
|
|
||||||
std::string string_from(const std::vector<int> & values);
|
|
||||||
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
|
|
||||||
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Filesystem utils
|
// Filesystem utils
|
||||||
//
|
//
|
||||||
|
|
1213
llama/ggml-aarch64.c
vendored
1213
llama/ggml-aarch64.c
vendored
File diff suppressed because it is too large
Load diff
2
llama/ggml-aarch64.h
vendored
2
llama/ggml-aarch64.h
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
8
llama/ggml-alloc.c
vendored
8
llama/ggml-alloc.c
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -320,12 +320,6 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
|
||||||
alloc->free_blocks[0].offset = 0;
|
alloc->free_blocks[0].offset = 0;
|
||||||
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
|
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
|
||||||
alloc->max_size = 0;
|
alloc->max_size = 0;
|
||||||
|
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
|
||||||
for (int i = 0; i < 1024; i++) {
|
|
||||||
alloc->allocated_tensors[i].tensor = NULL;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
|
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
|
||||||
|
|
2
llama/ggml-alloc.h
vendored
2
llama/ggml-alloc.h
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
7
llama/ggml-backend-impl.h
vendored
7
llama/ggml-backend-impl.h
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -65,10 +65,9 @@ extern "C" {
|
||||||
|
|
||||||
struct ggml_backend_buffer_i {
|
struct ggml_backend_buffer_i {
|
||||||
const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
|
const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
|
||||||
void (*GGML_CALL free_buffer) (ggml_backend_buffer_t buffer);
|
void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
|
||||||
void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
|
void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
|
||||||
void (*GGML_CALL init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
void (*GGML_CALL memset_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
|
|
||||||
void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||||
void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||||
bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
|
bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
|
||||||
|
|
41
llama/ggml-backend.c
vendored
41
llama/ggml-backend.c
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -277,22 +277,6 @@ GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void *
|
||||||
buf->iface.get_tensor(buf, tensor, data, offset, size);
|
buf->iface.get_tensor(buf, tensor, data, offset, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_API GGML_CALL void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
|
||||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
||||||
|
|
||||||
GGML_ASSERT(buf != NULL && "tensor buffer not set");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
|
||||||
|
|
||||||
if (!size) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
|
|
||||||
|
|
||||||
buf->iface.memset_tensor(buf, tensor, value, offset, size);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_backend_synchronize(ggml_backend_t backend) {
|
void ggml_backend_synchronize(ggml_backend_t backend) {
|
||||||
if (backend->iface.synchronize == NULL) {
|
if (backend->iface.synchronize == NULL) {
|
||||||
return;
|
return;
|
||||||
|
@ -616,12 +600,6 @@ GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t
|
||||||
free(buffer->context);
|
free(buffer->context);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static void ggml_backend_cpu_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
|
||||||
memset((char *)tensor->data + offset, value, size);
|
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
memcpy((char *)tensor->data + offset, data, size);
|
memcpy((char *)tensor->data + offset, data, size);
|
||||||
|
|
||||||
|
@ -653,7 +631,6 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
||||||
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
||||||
/* .init_tensor = */ NULL, // no initialization required
|
/* .init_tensor = */ NULL, // no initialization required
|
||||||
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
|
|
||||||
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
||||||
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
||||||
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
||||||
|
@ -667,7 +644,6 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
||||||
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
||||||
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
||||||
/* .init_tensor = */ NULL, // no initialization required
|
/* .init_tensor = */ NULL, // no initialization required
|
||||||
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
|
|
||||||
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
|
||||||
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
||||||
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
|
||||||
|
@ -882,10 +858,6 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
|
||||||
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
||||||
case GGML_OP_MUL_MAT:
|
case GGML_OP_MUL_MAT:
|
||||||
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
||||||
case GGML_OP_ROPE_BACK:
|
|
||||||
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
|
|
||||||
case GGML_OP_IM2COL_BACK:
|
|
||||||
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
|
||||||
default:
|
default:
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -1035,7 +1007,6 @@ static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(
|
||||||
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
|
||||||
/* .get_base = */ NULL,
|
/* .get_base = */ NULL,
|
||||||
/* .init_tensor = */ NULL,
|
/* .init_tensor = */ NULL,
|
||||||
/* .memset_tensor = */ NULL,
|
|
||||||
/* .set_tensor = */ NULL,
|
/* .set_tensor = */ NULL,
|
||||||
/* .get_tensor = */ NULL,
|
/* .get_tensor = */ NULL,
|
||||||
/* .cpy_tensor = */ NULL,
|
/* .cpy_tensor = */ NULL,
|
||||||
|
@ -1225,11 +1196,6 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
|
|
||||||
// since the tensor is pre-allocated, it cannot be moved to another backend
|
|
||||||
GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
|
|
||||||
}
|
|
||||||
|
|
||||||
// graph input
|
// graph input
|
||||||
if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
|
if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||||
cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
|
cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
|
||||||
|
@ -1709,7 +1675,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||||
sched->prev_leaf_backend_ids = tmp;
|
sched->prev_leaf_backend_ids = tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
int graph_size = MAX(graph->n_nodes, graph->n_leafs) + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
|
int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
||||||
if (sched->graph.size < graph_size) {
|
if (sched->graph.size < graph_size) {
|
||||||
sched->graph.size = graph_size;
|
sched->graph.size = graph_size;
|
||||||
sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
|
sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
|
||||||
|
@ -1761,7 +1727,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||||
for (int c = 0; c < sched->n_copies; c++) {
|
for (int c = 0; c < sched->n_copies; c++) {
|
||||||
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
||||||
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
||||||
assert(graph_copy->size > graph_copy->n_leafs);
|
|
||||||
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1775,7 +1740,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||||
for (int c = 0; c < sched->n_copies; c++) {
|
for (int c = 0; c < sched->n_copies; c++) {
|
||||||
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
|
||||||
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
|
||||||
assert(graph_copy->size > graph_copy->n_leafs);
|
|
||||||
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1786,7 +1750,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
|
||||||
for (int i = 0; i < graph->n_leafs; i++) {
|
for (int i = 0; i < graph->n_leafs; i++) {
|
||||||
struct ggml_tensor * leaf = graph->leafs[i];
|
struct ggml_tensor * leaf = graph->leafs[i];
|
||||||
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
|
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
|
||||||
assert(graph_copy->size > graph_copy->n_leafs);
|
|
||||||
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
graph_copy->leafs[graph_copy->n_leafs++] = leaf;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
5
llama/ggml-backend.h
vendored
5
llama/ggml-backend.h
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -92,7 +92,6 @@ extern "C" {
|
||||||
// "offset" refers to the offset of the tensor data for setting/getting data
|
// "offset" refers to the offset of the tensor data for setting/getting data
|
||||||
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||||
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||||
GGML_API GGML_CALL void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
|
|
||||||
|
|
||||||
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
|
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
|
||||||
|
|
||||||
|
@ -149,7 +148,7 @@ extern "C" {
|
||||||
// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
|
// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
|
||||||
|
|
||||||
GGML_API size_t ggml_backend_reg_get_count(void);
|
GGML_API size_t ggml_backend_reg_get_count(void);
|
||||||
GGML_API size_t ggml_backend_reg_find_by_name(const char * name); // returns index of backend with name, or SIZE_MAX if not found
|
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
|
||||||
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
|
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
|
||||||
GGML_API const char * ggml_backend_reg_get_name(size_t i);
|
GGML_API const char * ggml_backend_reg_get_name(size_t i);
|
||||||
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
|
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
|
||||||
|
|
3
llama/ggml-blas.cpp
vendored
3
llama/ggml-blas.cpp
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -26,7 +26,6 @@
|
||||||
|
|
||||||
#ifdef GGML_USE_BLAS
|
#ifdef GGML_USE_BLAS
|
||||||
|
|
||||||
#include "ggml-impl.h"
|
|
||||||
#include "ggml-blas.h"
|
#include "ggml-blas.h"
|
||||||
#include "ggml-backend-impl.h"
|
#include "ggml-backend-impl.h"
|
||||||
|
|
||||||
|
|
2
llama/ggml-blas.h
vendored
2
llama/ggml-blas.h
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
22
llama/ggml-common.h
vendored
22
llama/ggml-common.h
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -253,25 +253,6 @@ typedef struct {
|
||||||
} block_q8_0x8;
|
} block_q8_0x8;
|
||||||
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
|
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
|
||||||
|
|
||||||
//
|
|
||||||
// Ternary quantization
|
|
||||||
//
|
|
||||||
|
|
||||||
// 1.6875 bpw
|
|
||||||
typedef struct {
|
|
||||||
uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 5 elements per byte (3^5 = 243 < 256)
|
|
||||||
uint8_t qh[QK_K/64]; // 4 elements per byte
|
|
||||||
ggml_half d;
|
|
||||||
} block_tq1_0;
|
|
||||||
static_assert(sizeof(block_tq1_0) == sizeof(ggml_half) + QK_K / 64 + (QK_K - 4 * QK_K / 64) / 5, "wrong tq1_0 block size/padding");
|
|
||||||
|
|
||||||
// 2.0625 bpw
|
|
||||||
typedef struct {
|
|
||||||
uint8_t qs[QK_K/4]; // 2 bits per element
|
|
||||||
ggml_half d;
|
|
||||||
} block_tq2_0;
|
|
||||||
static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Super-block quantization structures
|
// Super-block quantization structures
|
||||||
//
|
//
|
||||||
|
@ -406,7 +387,6 @@ typedef struct {
|
||||||
} block_iq3_s;
|
} block_iq3_s;
|
||||||
static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
|
static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
|
||||||
|
|
||||||
// 1.5625 bpw
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
ggml_half d;
|
ggml_half d;
|
||||||
uint8_t qs[QK_K/8];
|
uint8_t qs[QK_K/8];
|
||||||
|
|
640
llama/ggml-cpu-impl.h
vendored
640
llama/ggml-cpu-impl.h
vendored
|
@ -1,640 +0,0 @@
|
||||||
/**
|
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
|
||||||
*
|
|
||||||
* MIT License
|
|
||||||
*
|
|
||||||
* Copyright (c) 2023-2024 The ggml authors
|
|
||||||
*
|
|
||||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
* of this software and associated documentation files (the "Software"), to deal
|
|
||||||
* in the Software without restriction, including without limitation the rights
|
|
||||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
* copies of the Software, and to permit persons to whom the Software is
|
|
||||||
* furnished to do so, subject to the following conditions:
|
|
||||||
*
|
|
||||||
* The above copyright notice and this permission notice shall be included in all
|
|
||||||
* copies or substantial portions of the Software.
|
|
||||||
*
|
|
||||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
* SOFTWARE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
// GGML CPU internal header
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "ggml-impl.h"
|
|
||||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
|
||||||
//#include <stddef.h>
|
|
||||||
#include <stdbool.h>
|
|
||||||
#include <string.h> // memcpy
|
|
||||||
#include <math.h> // fabsf
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
|
||||||
|
|
||||||
#define m512bh(p) p
|
|
||||||
#define m512i(p) p
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
#define m512bh(p) (__m512bh)(p)
|
|
||||||
#define m512i(p) (__m512i)(p)
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Converts brain16 to float32.
|
|
||||||
*
|
|
||||||
* The bfloat16 floating point format has the following structure:
|
|
||||||
*
|
|
||||||
* ┌sign
|
|
||||||
* │
|
|
||||||
* │ ┌exponent
|
|
||||||
* │ │
|
|
||||||
* │ │ ┌mantissa
|
|
||||||
* │ │ │
|
|
||||||
* │┌──┴───┐┌─┴───┐
|
|
||||||
* 0b0000000000000000 brain16
|
|
||||||
*
|
|
||||||
* Since bf16 has the same number of exponent bits as a 32bit float,
|
|
||||||
* encoding and decoding numbers becomes relatively straightforward.
|
|
||||||
*
|
|
||||||
* ┌sign
|
|
||||||
* │
|
|
||||||
* │ ┌exponent
|
|
||||||
* │ │
|
|
||||||
* │ │ ┌mantissa
|
|
||||||
* │ │ │
|
|
||||||
* │┌──┴───┐┌─┴───────────────────┐
|
|
||||||
* 0b00000000000000000000000000000000 IEEE binary32
|
|
||||||
*
|
|
||||||
* For comparison, the standard fp16 format has fewer exponent bits.
|
|
||||||
*
|
|
||||||
* ┌sign
|
|
||||||
* │
|
|
||||||
* │ ┌exponent
|
|
||||||
* │ │
|
|
||||||
* │ │ ┌mantissa
|
|
||||||
* │ │ │
|
|
||||||
* │┌─┴─┐┌─┴──────┐
|
|
||||||
* 0b0000000000000000 IEEE binary16
|
|
||||||
*
|
|
||||||
* @see IEEE 754-2008
|
|
||||||
*/
|
|
||||||
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
|
||||||
union {
|
|
||||||
float f;
|
|
||||||
uint32_t i;
|
|
||||||
} u;
|
|
||||||
u.i = (uint32_t)h.bits << 16;
|
|
||||||
return u.f;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Converts float32 to brain16.
|
|
||||||
*
|
|
||||||
* This is binary identical with Google Brain float conversion.
|
|
||||||
* Floats shall round to nearest even, and NANs shall be quiet.
|
|
||||||
* Subnormals aren't flushed to zero, except perhaps when used.
|
|
||||||
* This code should vectorize nicely if using modern compilers.
|
|
||||||
*/
|
|
||||||
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
|
||||||
ggml_bf16_t h;
|
|
||||||
union {
|
|
||||||
float f;
|
|
||||||
uint32_t i;
|
|
||||||
} u;
|
|
||||||
u.f = s;
|
|
||||||
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
|
|
||||||
h.bits = (u.i >> 16) | 64; /* force to quiet */
|
|
||||||
return h;
|
|
||||||
}
|
|
||||||
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
|
||||||
return h;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
|
||||||
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
|
||||||
|
|
||||||
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
|
||||||
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
|
||||||
#ifndef __FMA__
|
|
||||||
#define __FMA__
|
|
||||||
#endif
|
|
||||||
#ifndef __F16C__
|
|
||||||
#define __F16C__
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
|
|
||||||
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
|
|
||||||
#ifndef __SSE3__
|
|
||||||
#define __SSE3__
|
|
||||||
#endif
|
|
||||||
#ifndef __SSSE3__
|
|
||||||
#define __SSSE3__
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(__ARM_FEATURE_SVE)
|
|
||||||
#include <arm_sve.h>
|
|
||||||
#include <sys/prctl.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// 16-bit float
|
|
||||||
// on Arm, we use __fp16
|
|
||||||
// on x86, we use uint16_t
|
|
||||||
#if defined(__ARM_NEON)
|
|
||||||
|
|
||||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
|
||||||
//
|
|
||||||
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
|
||||||
//
|
|
||||||
#include <arm_neon.h>
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
|
|
||||||
typedef uint16_t ggml_fp16_internal_t;
|
|
||||||
|
|
||||||
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
typedef __fp16 ggml_fp16_internal_t;
|
|
||||||
|
|
||||||
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
|
|
||||||
|
|
||||||
#endif // _MSC_VER
|
|
||||||
|
|
||||||
#if !defined(__aarch64__)
|
|
||||||
|
|
||||||
// 32-bit ARM compatibility
|
|
||||||
|
|
||||||
// vaddlvq_s16
|
|
||||||
// vpaddq_s16
|
|
||||||
// vpaddq_s32
|
|
||||||
// vaddvq_s32
|
|
||||||
// vaddvq_f32
|
|
||||||
// vmaxvq_f32
|
|
||||||
// vcvtnq_s32_f32
|
|
||||||
// vzip1_u8
|
|
||||||
// vzip2_u8
|
|
||||||
|
|
||||||
inline static int32_t vaddlvq_s16(int16x8_t v) {
|
|
||||||
int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
|
|
||||||
return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
|
||||||
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
|
||||||
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
|
||||||
return vcombine_s16(a0, b0);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
|
|
||||||
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
|
|
||||||
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
|
|
||||||
return vcombine_s32(a0, b0);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static int32_t vaddvq_s32(int32x4_t v) {
|
|
||||||
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static float vaddvq_f32(float32x4_t v) {
|
|
||||||
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static float vmaxvq_f32(float32x4_t v) {
|
|
||||||
return
|
|
||||||
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
|
||||||
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
|
||||||
int32x4_t res;
|
|
||||||
|
|
||||||
res[0] = roundf(vgetq_lane_f32(v, 0));
|
|
||||||
res[1] = roundf(vgetq_lane_f32(v, 1));
|
|
||||||
res[2] = roundf(vgetq_lane_f32(v, 2));
|
|
||||||
res[3] = roundf(vgetq_lane_f32(v, 3));
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
|
||||||
uint8x8_t res;
|
|
||||||
|
|
||||||
res[0] = a[0]; res[1] = b[0];
|
|
||||||
res[2] = a[1]; res[3] = b[1];
|
|
||||||
res[4] = a[2]; res[5] = b[2];
|
|
||||||
res[6] = a[3]; res[7] = b[3];
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
|
||||||
uint8x8_t res;
|
|
||||||
|
|
||||||
res[0] = a[4]; res[1] = b[4];
|
|
||||||
res[2] = a[5]; res[3] = b[5];
|
|
||||||
res[4] = a[6]; res[5] = b[6];
|
|
||||||
res[6] = a[7]; res[7] = b[7];
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
// vld1q_s16_x2
|
|
||||||
// vld1q_u8_x2
|
|
||||||
// vld1q_u8_x4
|
|
||||||
// vld1q_s8_x2
|
|
||||||
// vld1q_s8_x4
|
|
||||||
// TODO: double-check these work correctly
|
|
||||||
|
|
||||||
typedef struct ggml_int16x8x2_t {
|
|
||||||
int16x8_t val[2];
|
|
||||||
} ggml_int16x8x2_t;
|
|
||||||
|
|
||||||
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
|
|
||||||
ggml_int16x8x2_t res;
|
|
||||||
|
|
||||||
res.val[0] = vld1q_s16(ptr + 0);
|
|
||||||
res.val[1] = vld1q_s16(ptr + 8);
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
typedef struct ggml_uint8x16x2_t {
|
|
||||||
uint8x16_t val[2];
|
|
||||||
} ggml_uint8x16x2_t;
|
|
||||||
|
|
||||||
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
|
|
||||||
ggml_uint8x16x2_t res;
|
|
||||||
|
|
||||||
res.val[0] = vld1q_u8(ptr + 0);
|
|
||||||
res.val[1] = vld1q_u8(ptr + 16);
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
typedef struct ggml_uint8x16x4_t {
|
|
||||||
uint8x16_t val[4];
|
|
||||||
} ggml_uint8x16x4_t;
|
|
||||||
|
|
||||||
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
|
|
||||||
ggml_uint8x16x4_t res;
|
|
||||||
|
|
||||||
res.val[0] = vld1q_u8(ptr + 0);
|
|
||||||
res.val[1] = vld1q_u8(ptr + 16);
|
|
||||||
res.val[2] = vld1q_u8(ptr + 32);
|
|
||||||
res.val[3] = vld1q_u8(ptr + 48);
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
typedef struct ggml_int8x16x2_t {
|
|
||||||
int8x16_t val[2];
|
|
||||||
} ggml_int8x16x2_t;
|
|
||||||
|
|
||||||
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
|
|
||||||
ggml_int8x16x2_t res;
|
|
||||||
|
|
||||||
res.val[0] = vld1q_s8(ptr + 0);
|
|
||||||
res.val[1] = vld1q_s8(ptr + 16);
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
typedef struct ggml_int8x16x4_t {
|
|
||||||
int8x16_t val[4];
|
|
||||||
} ggml_int8x16x4_t;
|
|
||||||
|
|
||||||
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
|
|
||||||
ggml_int8x16x4_t res;
|
|
||||||
|
|
||||||
res.val[0] = vld1q_s8(ptr + 0);
|
|
||||||
res.val[1] = vld1q_s8(ptr + 16);
|
|
||||||
res.val[2] = vld1q_s8(ptr + 32);
|
|
||||||
res.val[3] = vld1q_s8(ptr + 48);
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
// NOTE: not tested
|
|
||||||
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
|
||||||
int8x16_t res;
|
|
||||||
|
|
||||||
res[ 0] = a[b[ 0]];
|
|
||||||
res[ 1] = a[b[ 1]];
|
|
||||||
res[ 2] = a[b[ 2]];
|
|
||||||
res[ 3] = a[b[ 3]];
|
|
||||||
res[ 4] = a[b[ 4]];
|
|
||||||
res[ 5] = a[b[ 5]];
|
|
||||||
res[ 6] = a[b[ 6]];
|
|
||||||
res[ 7] = a[b[ 7]];
|
|
||||||
res[ 8] = a[b[ 8]];
|
|
||||||
res[ 9] = a[b[ 9]];
|
|
||||||
res[10] = a[b[10]];
|
|
||||||
res[11] = a[b[11]];
|
|
||||||
res[12] = a[b[12]];
|
|
||||||
res[13] = a[b[13]];
|
|
||||||
res[14] = a[b[14]];
|
|
||||||
res[15] = a[b[15]];
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
// NOTE: not tested
|
|
||||||
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
|
||||||
uint8x16_t res;
|
|
||||||
|
|
||||||
res[ 0] = a[b[ 0]];
|
|
||||||
res[ 1] = a[b[ 1]];
|
|
||||||
res[ 2] = a[b[ 2]];
|
|
||||||
res[ 3] = a[b[ 3]];
|
|
||||||
res[ 4] = a[b[ 4]];
|
|
||||||
res[ 5] = a[b[ 5]];
|
|
||||||
res[ 6] = a[b[ 6]];
|
|
||||||
res[ 7] = a[b[ 7]];
|
|
||||||
res[ 8] = a[b[ 8]];
|
|
||||||
res[ 9] = a[b[ 9]];
|
|
||||||
res[10] = a[b[10]];
|
|
||||||
res[11] = a[b[11]];
|
|
||||||
res[12] = a[b[12]];
|
|
||||||
res[13] = a[b[13]];
|
|
||||||
res[14] = a[b[14]];
|
|
||||||
res[15] = a[b[15]];
|
|
||||||
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
#define ggml_int16x8x2_t int16x8x2_t
|
|
||||||
#define ggml_uint8x16x2_t uint8x16x2_t
|
|
||||||
#define ggml_uint8x16x4_t uint8x16x4_t
|
|
||||||
#define ggml_int8x16x2_t int8x16x2_t
|
|
||||||
#define ggml_int8x16x4_t int8x16x4_t
|
|
||||||
|
|
||||||
#define ggml_vld1q_s16_x2 vld1q_s16_x2
|
|
||||||
#define ggml_vld1q_u8_x2 vld1q_u8_x2
|
|
||||||
#define ggml_vld1q_u8_x4 vld1q_u8_x4
|
|
||||||
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
|
||||||
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
|
||||||
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
|
||||||
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
|
||||||
|
|
||||||
#endif // !defined(__aarch64__)
|
|
||||||
|
|
||||||
#if !defined(__ARM_FEATURE_DOTPROD)
|
|
||||||
|
|
||||||
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
|
||||||
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
|
|
||||||
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
|
|
||||||
|
|
||||||
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
|
|
||||||
|
|
||||||
#endif // !defined(__ARM_FEATURE_DOTPROD)
|
|
||||||
|
|
||||||
#endif // defined(__ARM_NEON)
|
|
||||||
|
|
||||||
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
|
||||||
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
||||||
|
|
||||||
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
||||||
|
|
||||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
||||||
ggml_fp16_internal_t tmp;
|
|
||||||
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
|
||||||
return (float)tmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
||||||
ggml_fp16_t res;
|
|
||||||
ggml_fp16_internal_t tmp = f;
|
|
||||||
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
#ifdef __wasm_simd128__
|
|
||||||
#include <wasm_simd128.h>
|
|
||||||
#else
|
|
||||||
#ifdef __POWER9_VECTOR__
|
|
||||||
#include <altivec.h>
|
|
||||||
#undef bool
|
|
||||||
#define bool _Bool
|
|
||||||
#else
|
|
||||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
|
||||||
#include <intrin.h>
|
|
||||||
#else
|
|
||||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
|
|
||||||
#if !defined(__riscv)
|
|
||||||
#include <immintrin.h>
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __riscv_v_intrinsic
|
|
||||||
#include <riscv_vector.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(__loongarch64)
|
|
||||||
#if defined(__loongarch_asx)
|
|
||||||
#include <lasxintrin.h>
|
|
||||||
#endif
|
|
||||||
#if defined(__loongarch_sx)
|
|
||||||
#include <lsxintrin.h>
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(__loongarch_asx)
|
|
||||||
|
|
||||||
typedef union {
|
|
||||||
int32_t i;
|
|
||||||
float f;
|
|
||||||
} ft_union;
|
|
||||||
|
|
||||||
/* float type data load instructions */
|
|
||||||
static __m128 __lsx_vreplfr2vr_s(float val) {
|
|
||||||
ft_union fi_tmpval = {.f = val};
|
|
||||||
return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
|
|
||||||
}
|
|
||||||
|
|
||||||
static __m256 __lasx_xvreplfr2vr_s(float val) {
|
|
||||||
ft_union fi_tmpval = {.f = val};
|
|
||||||
return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __F16C__
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
|
||||||
#else
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#elif defined(__POWER9_VECTOR__)
|
|
||||||
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
||||||
/* the inline asm below is about 12% faster than the lookup method */
|
|
||||||
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
|
||||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
||||||
|
|
||||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
||||||
register float f;
|
|
||||||
register double d;
|
|
||||||
__asm__(
|
|
||||||
"mtfprd %0,%2\n"
|
|
||||||
"xscvhpdp %0,%0\n"
|
|
||||||
"frsp %1,%0\n" :
|
|
||||||
/* temp */ "=d"(d),
|
|
||||||
/* out */ "=f"(f):
|
|
||||||
/* in */ "r"(h));
|
|
||||||
return f;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
||||||
register double d;
|
|
||||||
register ggml_fp16_t r;
|
|
||||||
__asm__( /* xscvdphp can work on double or single precision */
|
|
||||||
"xscvdphp %0,%2\n"
|
|
||||||
"mffprd %1,%0\n" :
|
|
||||||
/* temp */ "=d"(d),
|
|
||||||
/* out */ "=r"(r):
|
|
||||||
/* in */ "f"(f));
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
// FP16 <-> FP32
|
|
||||||
// ref: https://github.com/Maratyszcza/FP16
|
|
||||||
|
|
||||||
static inline float fp32_from_bits(uint32_t w) {
|
|
||||||
union {
|
|
||||||
uint32_t as_bits;
|
|
||||||
float as_value;
|
|
||||||
} fp32;
|
|
||||||
fp32.as_bits = w;
|
|
||||||
return fp32.as_value;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline uint32_t fp32_to_bits(float f) {
|
|
||||||
union {
|
|
||||||
float as_value;
|
|
||||||
uint32_t as_bits;
|
|
||||||
} fp32;
|
|
||||||
fp32.as_value = f;
|
|
||||||
return fp32.as_bits;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
||||||
const uint32_t w = (uint32_t) h << 16;
|
|
||||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
|
||||||
const uint32_t two_w = w + w;
|
|
||||||
|
|
||||||
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
|
||||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
|
||||||
const float exp_scale = 0x1.0p-112f;
|
|
||||||
#else
|
|
||||||
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
|
|
||||||
#endif
|
|
||||||
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
|
|
||||||
|
|
||||||
const uint32_t magic_mask = UINT32_C(126) << 23;
|
|
||||||
const float magic_bias = 0.5f;
|
|
||||||
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
|
||||||
|
|
||||||
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
|
||||||
const uint32_t result = sign |
|
|
||||||
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
|
||||||
return fp32_from_bits(result);
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
||||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
|
||||||
const float scale_to_inf = 0x1.0p+112f;
|
|
||||||
const float scale_to_zero = 0x1.0p-110f;
|
|
||||||
#else
|
|
||||||
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
|
|
||||||
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
|
|
||||||
#endif
|
|
||||||
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
|
||||||
|
|
||||||
const uint32_t w = fp32_to_bits(f);
|
|
||||||
const uint32_t shl1_w = w + w;
|
|
||||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
|
||||||
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
|
||||||
if (bias < UINT32_C(0x71000000)) {
|
|
||||||
bias = UINT32_C(0x71000000);
|
|
||||||
}
|
|
||||||
|
|
||||||
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
|
||||||
const uint32_t bits = fp32_to_bits(base);
|
|
||||||
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
|
||||||
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
|
||||||
const uint32_t nonsign = exp_bits + mantissa_bits;
|
|
||||||
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
|
||||||
}
|
|
||||||
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
||||||
|
|
||||||
#endif // __F16C__
|
|
||||||
|
|
||||||
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
|
|
||||||
|
|
||||||
#ifdef __ARM_FEATURE_SVE
|
|
||||||
#include <arm_sve.h>
|
|
||||||
#endif // __ARM_FEATURE_SVE
|
|
||||||
|
|
||||||
// precomputed f32 table for f16 (256 KB)
|
|
||||||
// defined in ggml.c, initialized in ggml_init()
|
|
||||||
extern float ggml_table_f32_f16[1 << 16];
|
|
||||||
|
|
||||||
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
|
||||||
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
|
||||||
// This is also true for POWER9.
|
|
||||||
#if !defined(GGML_FP16_TO_FP32)
|
|
||||||
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
|
||||||
uint16_t s;
|
|
||||||
memcpy(&s, &f, sizeof(uint16_t));
|
|
||||||
return ggml_table_f32_f16[s];
|
|
||||||
}
|
|
||||||
|
|
||||||
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if !defined(GGML_FP32_TO_FP16)
|
|
||||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
143
llama/ggml-cuda.cu
vendored
143
llama/ggml-cuda.cu
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -25,7 +25,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "ggml-cuda.h"
|
#include "ggml-cuda.h"
|
||||||
#include "ggml-impl.h"
|
#include "ggml.h"
|
||||||
#include "ggml-backend-impl.h"
|
#include "ggml-backend-impl.h"
|
||||||
|
|
||||||
#include "ggml-cuda/common.cuh"
|
#include "ggml-cuda/common.cuh"
|
||||||
|
@ -47,20 +47,16 @@
|
||||||
#include "ggml-cuda/mmq.cuh"
|
#include "ggml-cuda/mmq.cuh"
|
||||||
#include "ggml-cuda/mmvq.cuh"
|
#include "ggml-cuda/mmvq.cuh"
|
||||||
#include "ggml-cuda/norm.cuh"
|
#include "ggml-cuda/norm.cuh"
|
||||||
#include "ggml-cuda/opt-step-adamw.cuh"
|
|
||||||
#include "ggml-cuda/out-prod.cuh"
|
|
||||||
#include "ggml-cuda/pad.cuh"
|
#include "ggml-cuda/pad.cuh"
|
||||||
#include "ggml-cuda/pool2d.cuh"
|
#include "ggml-cuda/pool2d.cuh"
|
||||||
#include "ggml-cuda/quantize.cuh"
|
#include "ggml-cuda/quantize.cuh"
|
||||||
#include "ggml-cuda/rope.cuh"
|
#include "ggml-cuda/rope.cuh"
|
||||||
#include "ggml-cuda/scale.cuh"
|
#include "ggml-cuda/scale.cuh"
|
||||||
#include "ggml-cuda/softmax.cuh"
|
#include "ggml-cuda/softmax.cuh"
|
||||||
#include "ggml-cuda/sum.cuh"
|
|
||||||
#include "ggml-cuda/sumrows.cuh"
|
#include "ggml-cuda/sumrows.cuh"
|
||||||
#include "ggml-cuda/tsembd.cuh"
|
#include "ggml-cuda/tsembd.cuh"
|
||||||
#include "ggml-cuda/unary.cuh"
|
#include "ggml-cuda/unary.cuh"
|
||||||
#include "ggml-cuda/upscale.cuh"
|
#include "ggml-cuda/upscale.cuh"
|
||||||
#include "ggml-cuda/rwkv-wkv.cuh"
|
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <array>
|
#include <array>
|
||||||
|
@ -162,7 +158,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
||||||
return res;
|
return res;
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#if !defined(GGML_USE_HIPBLAS)
|
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||||
cudaError_t err;
|
cudaError_t err;
|
||||||
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
|
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
|
||||||
{
|
{
|
||||||
|
@ -175,7 +171,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
||||||
return err;
|
return err;
|
||||||
#else
|
#else
|
||||||
return cudaMalloc(ptr, size);
|
return cudaMalloc(ptr, size);
|
||||||
#endif // !defined(GGML_USE_HIPBLAS)
|
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
@ -213,7 +209,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
for (int id = 0; id < info.device_count; ++id) {
|
for (int id = 0; id < info.device_count; ++id) {
|
||||||
int device_vmm = 0;
|
int device_vmm = 0;
|
||||||
|
|
||||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
|
||||||
CUdevice device;
|
CUdevice device;
|
||||||
CU_CHECK(cuDeviceGet(&device, id));
|
CU_CHECK(cuDeviceGet(&device, id));
|
||||||
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
||||||
|
@ -225,7 +221,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
alloc_prop.location.id = id;
|
alloc_prop.location.id = id;
|
||||||
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
||||||
}
|
}
|
||||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
|
||||||
info.devices[id].vmm = !!device_vmm;
|
info.devices[id].vmm = !!device_vmm;
|
||||||
|
|
||||||
cudaDeviceProp prop;
|
cudaDeviceProp prop;
|
||||||
|
@ -361,7 +357,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||||
};
|
};
|
||||||
|
|
||||||
// pool with virtual memory
|
// pool with virtual memory
|
||||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
|
||||||
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||||
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
||||||
|
|
||||||
|
@ -455,14 +451,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||||
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
|
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
|
||||||
|
|
||||||
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
||||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
|
||||||
if (ggml_cuda_info().devices[device].vmm) {
|
if (ggml_cuda_info().devices[device].vmm) {
|
||||||
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
||||||
}
|
}
|
||||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
|
||||||
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
|
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -526,14 +522,6 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
|
||||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
|
||||||
|
|
||||||
ggml_cuda_set_device(ctx->device);
|
|
||||||
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + offset, value, size, cudaStreamPerThread));
|
|
||||||
CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
|
|
||||||
}
|
|
||||||
|
|
||||||
GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
|
|
||||||
|
@ -585,7 +573,6 @@ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
|
||||||
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
||||||
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
|
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
|
||||||
/* .memset_tensor = */ ggml_backend_cuda_buffer_memset_tensor,
|
|
||||||
/* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
|
/* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
|
||||||
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
|
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
|
||||||
/* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
|
/* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
|
||||||
|
@ -902,7 +889,6 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
|
||||||
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
|
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
|
||||||
/* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
|
/* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
|
||||||
/* .memset_tensor = */ NULL,
|
|
||||||
/* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
|
/* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
|
||||||
/* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
|
/* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
|
||||||
/* .cpy_tensor = */ NULL,
|
/* .cpy_tensor = */ NULL,
|
||||||
|
@ -2211,9 +2197,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
case GGML_OP_REPEAT:
|
case GGML_OP_REPEAT:
|
||||||
ggml_cuda_op_repeat(ctx, dst);
|
ggml_cuda_op_repeat(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_OP_REPEAT_BACK:
|
|
||||||
ggml_cuda_op_repeat_back(ctx, dst);
|
|
||||||
break;
|
|
||||||
case GGML_OP_GET_ROWS:
|
case GGML_OP_GET_ROWS:
|
||||||
ggml_cuda_op_get_rows(ctx, dst);
|
ggml_cuda_op_get_rows(ctx, dst);
|
||||||
break;
|
break;
|
||||||
|
@ -2227,7 +2210,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
ggml_cuda_dup(ctx, dst);
|
ggml_cuda_dup(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_OP_ADD:
|
case GGML_OP_ADD:
|
||||||
case GGML_OP_ADD1: // TODO: more efficient implementation
|
|
||||||
ggml_cuda_op_add(ctx, dst);
|
ggml_cuda_op_add(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_OP_SUB:
|
case GGML_OP_SUB:
|
||||||
|
@ -2244,12 +2226,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
break;
|
break;
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
switch (ggml_get_unary_op(dst)) {
|
switch (ggml_get_unary_op(dst)) {
|
||||||
case GGML_UNARY_OP_NEG:
|
|
||||||
ggml_cuda_op_neg(ctx, dst);
|
|
||||||
break;
|
|
||||||
case GGML_UNARY_OP_STEP:
|
|
||||||
ggml_cuda_op_step(ctx, dst);
|
|
||||||
break;
|
|
||||||
case GGML_UNARY_OP_GELU:
|
case GGML_UNARY_OP_GELU:
|
||||||
ggml_cuda_op_gelu(ctx, dst);
|
ggml_cuda_op_gelu(ctx, dst);
|
||||||
break;
|
break;
|
||||||
|
@ -2274,9 +2250,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
case GGML_UNARY_OP_HARDSWISH:
|
case GGML_UNARY_OP_HARDSWISH:
|
||||||
ggml_cuda_op_hardswish(ctx, dst);
|
ggml_cuda_op_hardswish(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_UNARY_OP_EXP:
|
|
||||||
ggml_cuda_op_exp(ctx, dst);
|
|
||||||
break;
|
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -2296,9 +2269,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
case GGML_OP_PAD:
|
case GGML_OP_PAD:
|
||||||
ggml_cuda_op_pad(ctx, dst);
|
ggml_cuda_op_pad(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_OP_UNPAD:
|
|
||||||
ggml_cuda_op_unpad(ctx, dst);
|
|
||||||
break;
|
|
||||||
case GGML_OP_ARANGE:
|
case GGML_OP_ARANGE:
|
||||||
ggml_cuda_op_arange(ctx, dst);
|
ggml_cuda_op_arange(ctx, dst);
|
||||||
break;
|
break;
|
||||||
|
@ -2322,9 +2292,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
case GGML_OP_MUL_MAT_ID:
|
case GGML_OP_MUL_MAT_ID:
|
||||||
ggml_cuda_mul_mat_id(ctx, dst);
|
ggml_cuda_mul_mat_id(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_OP_OUT_PROD:
|
|
||||||
ggml_cuda_out_prod(ctx, dst);
|
|
||||||
break;
|
|
||||||
case GGML_OP_SCALE:
|
case GGML_OP_SCALE:
|
||||||
ggml_cuda_op_scale(ctx, dst);
|
ggml_cuda_op_scale(ctx, dst);
|
||||||
break;
|
break;
|
||||||
|
@ -2367,9 +2334,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
case GGML_OP_POOL_2D:
|
case GGML_OP_POOL_2D:
|
||||||
ggml_cuda_op_pool2d(ctx, dst);
|
ggml_cuda_op_pool2d(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_OP_SUM:
|
|
||||||
ggml_cuda_op_sum(ctx, dst);
|
|
||||||
break;
|
|
||||||
case GGML_OP_SUM_ROWS:
|
case GGML_OP_SUM_ROWS:
|
||||||
ggml_cuda_op_sum_rows(ctx, dst);
|
ggml_cuda_op_sum_rows(ctx, dst);
|
||||||
break;
|
break;
|
||||||
|
@ -2384,15 +2348,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||||
ggml_cuda_cross_entropy_loss(ctx, dst);
|
ggml_cuda_cross_entropy_loss(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_OP_RWKV_WKV:
|
|
||||||
ggml_cuda_op_rwkv_wkv(ctx, dst);
|
|
||||||
break;
|
|
||||||
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
|
||||||
ggml_cuda_cross_entropy_loss_back(ctx, dst);
|
|
||||||
break;
|
|
||||||
case GGML_OP_OPT_STEP_ADAMW:
|
|
||||||
ggml_cuda_opt_step_adamw(ctx, dst);
|
|
||||||
break;
|
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -2520,7 +2475,6 @@ static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_p
|
||||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||||
graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
|
graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
|
||||||
}
|
}
|
||||||
memcpy(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
|
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
|
||||||
|
@ -2552,12 +2506,6 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (node->op == GGML_OP_SCALE &&
|
|
||||||
memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2628,11 +2576,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
ggml_tensor * node = cgraph->nodes[i];
|
ggml_tensor * node = cgraph->nodes[i];
|
||||||
|
|
||||||
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
|
|
||||||
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
|
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
|
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
|
||||||
|
@ -2660,17 +2604,10 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||||
cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
|
cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
|
||||||
// store a pointer to each copy op CUDA kernel to identify it later
|
// store a pointer to each copy op CUDA kernel to identify it later
|
||||||
void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
||||||
if (!ptr) {
|
|
||||||
use_cuda_graph = false;
|
|
||||||
#ifndef NDEBUG
|
|
||||||
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
|
|
||||||
#endif
|
|
||||||
} else {
|
|
||||||
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
|
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
|
||||||
ggml_cuda_cpy_fn_ptrs.push_back(ptr);
|
ggml_cuda_cpy_fn_ptrs.push_back(ptr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if (!use_cuda_graph) {
|
if (!use_cuda_graph) {
|
||||||
break;
|
break;
|
||||||
|
@ -2769,9 +2706,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||||
// First call with null argument gets number of nodes in graph
|
// First call with null argument gets number of nodes in graph
|
||||||
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
|
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
|
||||||
// Subsequent call with non-null argument gets nodes
|
// Subsequent call with non-null argument gets nodes
|
||||||
cuda_ctx->cuda_graph->nodes.clear();
|
|
||||||
cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
|
cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
|
||||||
cuda_ctx->cuda_graph->params.clear();
|
|
||||||
cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
|
cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
|
||||||
if (cuda_ctx->cuda_graph->num_nodes > 0) {
|
if (cuda_ctx->cuda_graph->num_nodes > 0) {
|
||||||
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
|
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
|
||||||
|
@ -2838,8 +2773,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||||
switch (op->op) {
|
switch (op->op) {
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
switch (ggml_get_unary_op(op)) {
|
switch (ggml_get_unary_op(op)) {
|
||||||
case GGML_UNARY_OP_NEG:
|
|
||||||
case GGML_UNARY_OP_STEP:
|
|
||||||
case GGML_UNARY_OP_GELU:
|
case GGML_UNARY_OP_GELU:
|
||||||
case GGML_UNARY_OP_SILU:
|
case GGML_UNARY_OP_SILU:
|
||||||
case GGML_UNARY_OP_RELU:
|
case GGML_UNARY_OP_RELU:
|
||||||
|
@ -2848,7 +2781,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||||
case GGML_UNARY_OP_HARDSWISH:
|
case GGML_UNARY_OP_HARDSWISH:
|
||||||
case GGML_UNARY_OP_GELU_QUICK:
|
case GGML_UNARY_OP_GELU_QUICK:
|
||||||
case GGML_UNARY_OP_TANH:
|
case GGML_UNARY_OP_TANH:
|
||||||
case GGML_UNARY_OP_EXP:
|
|
||||||
return ggml_is_contiguous(op->src[0]);
|
return ggml_is_contiguous(op->src[0]);
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
|
@ -2865,12 +2797,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||||
if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
|
if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
#ifdef GGML_USE_MUSA
|
|
||||||
if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 &&
|
|
||||||
!ggml_is_transposed(a) && !ggml_is_transposed(b)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
#endif // GGML_USE_MUSA
|
|
||||||
switch (a->type) {
|
switch (a->type) {
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
|
@ -2894,18 +2820,11 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||||
case GGML_TYPE_IQ3_XXS:
|
case GGML_TYPE_IQ3_XXS:
|
||||||
case GGML_TYPE_IQ4_NL:
|
case GGML_TYPE_IQ4_NL:
|
||||||
case GGML_TYPE_IQ4_XS:
|
case GGML_TYPE_IQ4_XS:
|
||||||
#ifdef GGML_USE_MUSA
|
|
||||||
if (a->type == GGML_TYPE_Q3_K) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
#endif // GGML_USE_MUSA
|
|
||||||
return true;
|
return true;
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_OUT_PROD:
|
|
||||||
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
|
|
||||||
case GGML_OP_GET_ROWS:
|
case GGML_OP_GET_ROWS:
|
||||||
{
|
{
|
||||||
switch (op->src[0]->type) {
|
switch (op->src[0]->type) {
|
||||||
|
@ -2934,9 +2853,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_F32) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
|
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -2958,19 +2874,10 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||||
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
|
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
return false;
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_DUP:
|
case GGML_OP_DUP:
|
||||||
case GGML_OP_REPEAT:
|
case GGML_OP_REPEAT:
|
||||||
{
|
|
||||||
ggml_type src0_type = op->src[0]->type;
|
|
||||||
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
|
|
||||||
} break;
|
|
||||||
case GGML_OP_REPEAT_BACK:
|
|
||||||
return op->type == GGML_TYPE_F32 && op->src[0]->ne[3] == 1;
|
|
||||||
case GGML_OP_CONCAT:
|
case GGML_OP_CONCAT:
|
||||||
{
|
{
|
||||||
ggml_type src0_type = op->src[0]->type;
|
ggml_type src0_type = op->src[0]->type;
|
||||||
|
@ -2992,7 +2899,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||||
case GGML_OP_TRANSPOSE:
|
case GGML_OP_TRANSPOSE:
|
||||||
case GGML_OP_NORM:
|
case GGML_OP_NORM:
|
||||||
case GGML_OP_ADD:
|
case GGML_OP_ADD:
|
||||||
case GGML_OP_ADD1:
|
|
||||||
case GGML_OP_SUB:
|
case GGML_OP_SUB:
|
||||||
case GGML_OP_MUL:
|
case GGML_OP_MUL:
|
||||||
case GGML_OP_DIV:
|
case GGML_OP_DIV:
|
||||||
|
@ -3003,50 +2909,39 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||||
case GGML_OP_SIN:
|
case GGML_OP_SIN:
|
||||||
case GGML_OP_COS:
|
case GGML_OP_COS:
|
||||||
case GGML_OP_CLAMP:
|
case GGML_OP_CLAMP:
|
||||||
return true;
|
|
||||||
case GGML_OP_CONT:
|
case GGML_OP_CONT:
|
||||||
return op->src[0]->type != GGML_TYPE_BF16;
|
|
||||||
case GGML_OP_DIAG_MASK_INF:
|
case GGML_OP_DIAG_MASK_INF:
|
||||||
case GGML_OP_SOFT_MAX:
|
case GGML_OP_SOFT_MAX:
|
||||||
return true;
|
return true;
|
||||||
case GGML_OP_ROPE:
|
case GGML_OP_ROPE:
|
||||||
return ggml_is_contiguous(op->src[0]);
|
return ggml_is_contiguous(op->src[0]);
|
||||||
case GGML_OP_IM2COL:
|
case GGML_OP_IM2COL:
|
||||||
return op->src[0]->type == GGML_TYPE_F16;
|
|
||||||
case GGML_OP_POOL_2D:
|
case GGML_OP_POOL_2D:
|
||||||
case GGML_OP_SUM:
|
|
||||||
case GGML_OP_SUM_ROWS:
|
case GGML_OP_SUM_ROWS:
|
||||||
case GGML_OP_ARGSORT:
|
case GGML_OP_ARGSORT:
|
||||||
case GGML_OP_ACC:
|
case GGML_OP_ACC:
|
||||||
case GGML_OP_GROUP_NORM:
|
case GGML_OP_GROUP_NORM:
|
||||||
case GGML_OP_UPSCALE:
|
case GGML_OP_UPSCALE:
|
||||||
case GGML_OP_PAD:
|
case GGML_OP_PAD:
|
||||||
case GGML_OP_UNPAD:
|
|
||||||
case GGML_OP_ARANGE:
|
case GGML_OP_ARANGE:
|
||||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||||
case GGML_OP_LEAKY_RELU:
|
case GGML_OP_LEAKY_RELU:
|
||||||
case GGML_OP_RWKV_WKV:
|
|
||||||
return true;
|
return true;
|
||||||
case GGML_OP_FLASH_ATTN_EXT: {
|
case GGML_OP_FLASH_ATTN_EXT:
|
||||||
#ifndef FLASH_ATTN_AVAILABLE
|
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||||
return false;
|
return (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) || op->src[0]->ne[0] == 128;
|
||||||
#endif
|
#else
|
||||||
if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (op->src[0]->ne[0] == 128) {
|
if (op->src[0]->ne[0] == 128) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (op->src[0]->ne[0] == 256 && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16) {
|
if (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
const int cc = ggml_cuda_info().devices[cuda_ctx->device].cc;
|
return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
|
||||||
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
|
op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
|
||||||
}
|
|
||||||
case GGML_OP_CROSS_ENTROPY_LOSS:
|
case GGML_OP_CROSS_ENTROPY_LOSS:
|
||||||
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
|
|
||||||
case GGML_OP_OPT_STEP_ADAMW:
|
|
||||||
return true;
|
return true;
|
||||||
|
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
2
llama/ggml-cuda.h
vendored
2
llama/ggml-cuda.h
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/acc.cu
vendored
2
llama/ggml-cuda/acc.cu
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/acc.cuh
vendored
2
llama/ggml-cuda/acc.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/arange.cu
vendored
2
llama/ggml-cuda/arange.cu
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/arange.cuh
vendored
2
llama/ggml-cuda/arange.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/argsort.cu
vendored
2
llama/ggml-cuda/argsort.cu
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/argsort.cuh
vendored
2
llama/ggml-cuda/argsort.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
69
llama/ggml-cuda/binbcast.cu
vendored
69
llama/ggml-cuda/binbcast.cu
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -25,7 +25,6 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "binbcast.cuh"
|
#include "binbcast.cuh"
|
||||||
#include <cstdint>
|
|
||||||
|
|
||||||
static __device__ __forceinline__ float op_repeat(const float a, const float b) {
|
static __device__ __forceinline__ float op_repeat(const float a, const float b) {
|
||||||
return b;
|
return b;
|
||||||
|
@ -117,30 +116,6 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
|
||||||
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
static __global__ void k_repeat_back(
|
|
||||||
const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
|
||||||
const int64_t ne0, const int64_t ne1, const int64_t ne2) {
|
|
||||||
|
|
||||||
const int64_t tid0 = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
|
|
||||||
const int64_t tid1 = (int64_t) blockIdx.y*blockDim.y + threadIdx.y;
|
|
||||||
const int64_t tid2 = (int64_t) blockIdx.z*blockDim.z + threadIdx.z;
|
|
||||||
|
|
||||||
if (tid0 >= ne0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
T sum = 0;
|
|
||||||
for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
|
|
||||||
for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
|
|
||||||
for (int64_t i0 = tid0; i0 < ne00; i0 += ne0) {
|
|
||||||
sum += src[i2*ne01*ne00 + i1*ne00 + i0];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
dst[tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<float (*bin_op)(const float, const float)>
|
template<float (*bin_op)(const float, const float)>
|
||||||
struct bin_bcast_cuda {
|
struct bin_bcast_cuda {
|
||||||
template<typename src0_t, typename src1_t, typename dst_t>
|
template<typename src0_t, typename src1_t, typename dst_t>
|
||||||
|
@ -298,16 +273,6 @@ struct bin_bcast_cuda {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
static void repeat_back_cuda(
|
|
||||||
const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
|
||||||
const int64_t ne0, const int64_t ne1, const int64_t ne2, cudaStream_t stream) {
|
|
||||||
|
|
||||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
|
||||||
const dim3 block_nums((ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2);
|
|
||||||
k_repeat_back<T><<<block_nums, block_dims, 0, stream>>>(src, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class op>
|
template<class op>
|
||||||
static void ggml_cuda_op_bin_bcast(
|
static void ggml_cuda_op_bin_bcast(
|
||||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||||
|
@ -347,35 +312,3 @@ void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
||||||
const ggml_tensor * src0 = dst->src[0];
|
|
||||||
|
|
||||||
GGML_ASSERT(src0->type == dst->type);
|
|
||||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
||||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
|
||||||
GGML_ASSERT(ggml_can_repeat(dst, src0));
|
|
||||||
|
|
||||||
cudaStream_t stream = ctx.stream();
|
|
||||||
|
|
||||||
const int64_t ne00 = src0->ne[0];
|
|
||||||
const int64_t ne01 = src0->ne[1];
|
|
||||||
const int64_t ne02 = src0->ne[2];
|
|
||||||
GGML_ASSERT(src0->ne[3] == 1);
|
|
||||||
|
|
||||||
const int64_t ne0 = dst->ne[0];
|
|
||||||
const int64_t ne1 = dst->ne[1];
|
|
||||||
const int64_t ne2 = dst->ne[2];
|
|
||||||
GGML_ASSERT(dst->ne[3] == 1);
|
|
||||||
|
|
||||||
switch (dst->type) {
|
|
||||||
case GGML_TYPE_F32: {
|
|
||||||
const float * src0_d = (const float *) src0->data;
|
|
||||||
float * dst_d = (float *) dst->data;
|
|
||||||
repeat_back_cuda<float>(src0_d, dst_d, ne00, ne01, ne02, ne0, ne1, ne2, stream);
|
|
||||||
} break;
|
|
||||||
default: {
|
|
||||||
GGML_ASSERT(false);
|
|
||||||
} break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
4
llama/ggml-cuda/binbcast.cuh
vendored
4
llama/ggml-cuda/binbcast.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -31,5 +31,3 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
|
|
||||||
void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
||||||
|
|
2
llama/ggml-cuda/clamp.cu
vendored
2
llama/ggml-cuda/clamp.cu
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/clamp.cuh
vendored
2
llama/ggml-cuda/clamp.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
9
llama/ggml-cuda/common.cuh
vendored
9
llama/ggml-cuda/common.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -76,8 +76,6 @@
|
||||||
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
|
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
|
||||||
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
|
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
|
||||||
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
|
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
|
||||||
#define CC_QY1 210
|
|
||||||
#define CC_QY2 220
|
|
||||||
|
|
||||||
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
||||||
|
|
||||||
|
@ -162,10 +160,6 @@ typedef float2 dfloat2;
|
||||||
#define INT8_MMA_AVAILABLE
|
#define INT8_MMA_AVAILABLE
|
||||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
||||||
|
|
||||||
#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
|
|
||||||
#define FLASH_ATTN_AVAILABLE
|
|
||||||
#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
|
|
||||||
|
|
||||||
static constexpr bool fast_fp16_available(const int cc) {
|
static constexpr bool fast_fp16_available(const int cc) {
|
||||||
return cc >= CC_PASCAL && cc != 610;
|
return cc >= CC_PASCAL && cc != 610;
|
||||||
}
|
}
|
||||||
|
@ -601,7 +595,6 @@ struct ggml_graph_node_properties {
|
||||||
int64_t ne[GGML_MAX_DIMS];
|
int64_t ne[GGML_MAX_DIMS];
|
||||||
size_t nb[GGML_MAX_DIMS];
|
size_t nb[GGML_MAX_DIMS];
|
||||||
void * src_address[GGML_MAX_SRC];
|
void * src_address[GGML_MAX_SRC];
|
||||||
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_cuda_graph {
|
struct ggml_cuda_graph {
|
||||||
|
|
2
llama/ggml-cuda/concat.cu
vendored
2
llama/ggml-cuda/concat.cu
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/concat.cuh
vendored
2
llama/ggml-cuda/concat.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/conv-transpose-1d.cu
vendored
2
llama/ggml-cuda/conv-transpose-1d.cu
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/conv-transpose-1d.cuh
vendored
2
llama/ggml-cuda/conv-transpose-1d.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/convert.cu
vendored
2
llama/ggml-cuda/convert.cu
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/convert.cuh
vendored
2
llama/ggml-cuda/convert.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
68
llama/ggml-cuda/cpy.cu
vendored
68
llama/ggml-cuda/cpy.cu
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -107,17 +107,6 @@ static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
|
|
||||||
const block_q8_0 * xi = (const block_q8_0 *) cxi;
|
|
||||||
float * dsti = (float *) cdsti;
|
|
||||||
|
|
||||||
const float d = (float)xi->d;
|
|
||||||
|
|
||||||
for (int j = 0; j < QK8_0; j++) {
|
|
||||||
dsti[j] = xi->qs[j] * d;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
|
static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
|
||||||
const float * xi = (const float *) cxi;
|
const float * xi = (const float *) cxi;
|
||||||
block_q4_0 * dsti = (block_q4_0 *) cdsti;
|
block_q4_0 * dsti = (block_q4_0 *) cdsti;
|
||||||
|
@ -325,32 +314,6 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
|
||||||
cpy_blck(cx + x_offset, cdst + dst_offset);
|
cpy_blck(cx + x_offset, cdst + dst_offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <cpy_kernel_t cpy_blck, int qk>
|
|
||||||
static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
|
|
||||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
|
||||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
|
|
||||||
const int nb12, const int nb13) {
|
|
||||||
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
|
|
||||||
|
|
||||||
if (i >= ne) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int i03 = i/(ne00 * ne01 * ne02);
|
|
||||||
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
|
|
||||||
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
|
|
||||||
const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
|
|
||||||
const int x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
|
|
||||||
|
|
||||||
const int i13 = i/(ne10 * ne11 * ne12);
|
|
||||||
const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
|
|
||||||
const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
|
|
||||||
const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
|
|
||||||
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
|
|
||||||
|
|
||||||
cpy_blck(cx + x_offset, cdst + dst_offset);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_cpy_f16_f32_cuda(
|
static void ggml_cpy_f16_f32_cuda(
|
||||||
const char * cx, char * cdst, const int ne,
|
const char * cx, char * cdst, const int ne,
|
||||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||||
|
@ -392,16 +355,6 @@ static void ggml_cpy_f32_q8_0_cuda(
|
||||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cpy_q8_0_f32_cuda(
|
|
||||||
const char * cx, char * cdst, const int ne,
|
|
||||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
|
||||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
|
|
||||||
|
|
||||||
const int num_blocks = ne;
|
|
||||||
cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
|
|
||||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_cpy_f32_q4_0_cuda(
|
static void ggml_cpy_f32_q4_0_cuda(
|
||||||
const char * cx, char * cdst, const int ne,
|
const char * cx, char * cdst, const int ne,
|
||||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||||
|
@ -501,17 +454,12 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||||
char * src0_ddc = (char *) src0->data;
|
char * src0_ddc = (char *) src0->data;
|
||||||
char * src1_ddc = (char *) src1->data;
|
char * src1_ddc = (char *) src1->data;
|
||||||
|
|
||||||
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||||
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
|
|
||||||
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
|
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
|
||||||
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||||
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||||
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||||
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
|
|
||||||
ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
||||||
ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
||||||
|
@ -527,8 +475,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
|
||||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||||
ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
|
||||||
} else {
|
} else {
|
||||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
||||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||||
|
GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -538,16 +487,12 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
||||||
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
||||||
return nullptr;
|
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
|
|
||||||
return (void*) cpy_f32_f16<cpy_1_f32_f32>;
|
return (void*) cpy_f32_f16<cpy_1_f32_f32>;
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
|
||||||
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
|
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
|
||||||
return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
|
return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
|
||||||
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
|
|
||||||
return (void*) cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>;
|
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
|
||||||
return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
|
return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
|
||||||
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
|
||||||
|
@ -563,7 +508,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
|
||||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
|
||||||
return (void*) cpy_f32_f16<cpy_1_f16_f32>;
|
return (void*) cpy_f32_f16<cpy_1_f16_f32>;
|
||||||
} else {
|
} else {
|
||||||
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
|
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
|
||||||
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||||
|
GGML_ABORT("fatal error");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
2
llama/ggml-cuda/cpy.cuh
vendored
2
llama/ggml-cuda/cpy.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
66
llama/ggml-cuda/cross-entropy-loss.cu
vendored
66
llama/ggml-cuda/cross-entropy-loss.cu
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -26,7 +26,7 @@
|
||||||
|
|
||||||
#include "common.cuh"
|
#include "common.cuh"
|
||||||
#include "cross-entropy-loss.cuh"
|
#include "cross-entropy-loss.cuh"
|
||||||
#include "sum.cuh"
|
#include "sumrows.cuh"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
@ -97,32 +97,6 @@ static __global__ void cross_entropy_loss_f32(const float * logits, const float
|
||||||
dst[blockIdx.x] = loss;
|
dst[blockIdx.x] = loss;
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void cross_entropy_loss_back_f32(const float * logits, const float * labels, const float * loss, float * dst, const int nclasses) {
|
|
||||||
extern __shared__ float tmp[];
|
|
||||||
|
|
||||||
float maxval = -INFINITY;
|
|
||||||
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
|
|
||||||
const float val = logits[blockIdx.x*nclasses + i];
|
|
||||||
maxval = fmaxf(maxval, val);
|
|
||||||
tmp[i] = val;
|
|
||||||
}
|
|
||||||
maxval = warp_reduce_max(maxval);
|
|
||||||
|
|
||||||
float sum = 0.0f;
|
|
||||||
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
|
|
||||||
const float val = expf(tmp[i] - maxval);
|
|
||||||
sum += val;
|
|
||||||
tmp[i] = val;
|
|
||||||
}
|
|
||||||
sum = warp_reduce_sum(sum);
|
|
||||||
const float sm_scale = 1.0f/sum;
|
|
||||||
|
|
||||||
const float d_by_nrows = *loss/gridDim.x;
|
|
||||||
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
|
|
||||||
dst[blockIdx.x*nclasses + i] = (tmp[i]*sm_scale - labels[blockIdx.x*nclasses + i])*d_by_nrows;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
const ggml_tensor * src0 = dst->src[0];
|
const ggml_tensor * src0 = dst->src[0];
|
||||||
const ggml_tensor * src1 = dst->src[1];
|
const ggml_tensor * src1 = dst->src[1];
|
||||||
|
@ -154,39 +128,5 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||||
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
||||||
|
|
||||||
// Combine results from individual blocks:
|
// Combine results from individual blocks:
|
||||||
sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
|
sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
|
||||||
}
|
|
||||||
|
|
||||||
void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
||||||
const ggml_tensor * src0 = dst->src[0];
|
|
||||||
const ggml_tensor * src1 = dst->src[1];
|
|
||||||
const ggml_tensor * opt0 = dst->src[2];
|
|
||||||
|
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
||||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
||||||
GGML_ASSERT(opt0->type == GGML_TYPE_F32);
|
|
||||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
||||||
|
|
||||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
||||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
|
||||||
GGML_ASSERT(ggml_is_contiguous(opt0));
|
|
||||||
GGML_ASSERT(ggml_is_contiguous(dst));
|
|
||||||
GGML_ASSERT(ggml_are_same_shape(src0, src1));
|
|
||||||
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
|
||||||
|
|
||||||
const int64_t ne00 = src0->ne[0];
|
|
||||||
const int64_t nrows = ggml_nrows(src0);
|
|
||||||
|
|
||||||
const float * src0_d = (const float *) src0->data;
|
|
||||||
const float * src1_d = (const float *) src1->data;
|
|
||||||
const float * opt0_d = (const float *) opt0->data;
|
|
||||||
float * dst_d = (float *) dst->data;
|
|
||||||
|
|
||||||
cudaStream_t stream = ctx.stream();
|
|
||||||
|
|
||||||
const dim3 blocks_dim(WARP_SIZE, 1, 1);
|
|
||||||
const dim3 blocks_num(nrows, 1, 1);
|
|
||||||
const int shmem = ne00*sizeof(float);
|
|
||||||
|
|
||||||
cross_entropy_loss_back_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, opt0_d, dst_d, ne00);
|
|
||||||
}
|
}
|
||||||
|
|
4
llama/ggml-cuda/cross-entropy-loss.cuh
vendored
4
llama/ggml-cuda/cross-entropy-loss.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -29,5 +29,3 @@
|
||||||
#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256
|
#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256
|
||||||
|
|
||||||
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
|
|
||||||
void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
||||||
|
|
2
llama/ggml-cuda/dequantize.cuh
vendored
2
llama/ggml-cuda/dequantize.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/diagmask.cu
vendored
2
llama/ggml-cuda/diagmask.cu
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/diagmask.cuh
vendored
2
llama/ggml-cuda/diagmask.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/dmmv.cu
vendored
2
llama/ggml-cuda/dmmv.cu
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/dmmv.cuh
vendored
2
llama/ggml-cuda/dmmv.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/fattn-common.cuh
vendored
2
llama/ggml-cuda/fattn-common.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/fattn-tile-f16.cu
vendored
2
llama/ggml-cuda/fattn-tile-f16.cu
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/fattn-tile-f16.cuh
vendored
2
llama/ggml-cuda/fattn-tile-f16.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
8
llama/ggml-cuda/fattn-tile-f32.cu
vendored
8
llama/ggml-cuda/fattn-tile-f32.cu
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -70,17 +70,13 @@ static __global__ void flash_attn_tile_ext_f32(
|
||||||
const int ne1,
|
const int ne1,
|
||||||
const int ne2,
|
const int ne2,
|
||||||
const int ne3) {
|
const int ne3) {
|
||||||
#ifndef FLASH_ATTN_AVAILABLE
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
return;
|
|
||||||
#endif // FLASH_ATTN_AVAILABLE
|
|
||||||
// Skip unused kernel variants for faster compilation:
|
// Skip unused kernel variants for faster compilation:
|
||||||
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
if (use_logit_softcap && !(D == 128 || D == 256)) {
|
||||||
NO_DEVICE_CODE;
|
NO_DEVICE_CODE;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||||
|
|
||||||
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
||||||
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
|
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
|
||||||
|
|
2
llama/ggml-cuda/fattn-tile-f32.cuh
vendored
2
llama/ggml-cuda/fattn-tile-f32.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/fattn-vec-f16.cuh
vendored
2
llama/ggml-cuda/fattn-vec-f16.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/fattn-vec-f32.cuh
vendored
2
llama/ggml-cuda/fattn-vec-f32.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
2
llama/ggml-cuda/fattn-wmma-f16.cuh
vendored
2
llama/ggml-cuda/fattn-wmma-f16.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
8
llama/ggml-cuda/fattn.cu
vendored
8
llama/ggml-cuda/fattn.cu
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
@ -178,7 +178,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
|
||||||
} \
|
} \
|
||||||
|
|
||||||
static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
ggml_tensor * Q = dst->src[0];
|
ggml_tensor * Q = dst->src[1];
|
||||||
ggml_tensor * K = dst->src[1];
|
ggml_tensor * K = dst->src[1];
|
||||||
ggml_tensor * V = dst->src[2];
|
ggml_tensor * V = dst->src[2];
|
||||||
|
|
||||||
|
@ -253,7 +253,7 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg
|
||||||
} \
|
} \
|
||||||
|
|
||||||
static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
ggml_tensor * Q = dst->src[0];
|
ggml_tensor * Q = dst->src[1];
|
||||||
ggml_tensor * K = dst->src[1];
|
ggml_tensor * K = dst->src[1];
|
||||||
ggml_tensor * V = dst->src[2];
|
ggml_tensor * V = dst->src[2];
|
||||||
|
|
||||||
|
@ -340,7 +340,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!fast_fp16_available(cc)) {
|
if (!fast_fp16_available(cc)) {
|
||||||
if (Q->ne[1] <= 8 || Q->ne[0] == 256) {
|
if (Q->ne[1] <= 8) {
|
||||||
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||||
} else {
|
} else {
|
||||||
ggml_cuda_flash_attn_ext_tile_f32(ctx, dst);
|
ggml_cuda_flash_attn_ext_tile_f32(ctx, dst);
|
||||||
|
|
2
llama/ggml-cuda/fattn.cuh
vendored
2
llama/ggml-cuda/fattn.cuh
vendored
|
@ -1,5 +1,5 @@
|
||||||
/**
|
/**
|
||||||
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
|
* llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
|
||||||
*
|
*
|
||||||
* MIT License
|
* MIT License
|
||||||
*
|
*
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue