From b3f75fc812fc1559090a7fd9739bd203817a5979 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Wed, 14 Aug 2024 14:37:51 -0700
Subject: [PATCH 01/34] fix noprune

---
 server/images.go | 63 ++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 39 deletions(-)

diff --git a/server/images.go b/server/images.go
index 0e753f56..798ed818 100644
--- a/server/images.go
+++ b/server/images.go
@@ -215,25 +215,20 @@ func GetManifest(mp ModelPath) (*Manifest, string, error) {
 		return nil, "", err
 	}
 
-	if _, err = os.Stat(fp); err != nil {
-		return nil, "", err
-	}
-
-	var manifest *Manifest
-
-	bts, err := os.ReadFile(fp)
+	f, err := os.Open(fp)
 	if err != nil {
-		return nil, "", fmt.Errorf("couldn't open file '%s'", fp)
+		return nil, "", err
 	}
+	defer f.Close()
 
-	shaSum := sha256.Sum256(bts)
-	shaStr := hex.EncodeToString(shaSum[:])
+	sha256sum := sha256.New()
 
-	if err := json.Unmarshal(bts, &manifest); err != nil {
+	var manifest Manifest
+	if err := json.NewDecoder(io.TeeReader(f, sha256sum)).Decode(&manifest); err != nil {
 		return nil, "", err
 	}
 
-	return manifest, shaStr, nil
+	return &manifest, hex.EncodeToString(sha256sum.Sum(nil)), nil
 }
 
 func GetModel(name string) (*Model, error) {
@@ -716,7 +711,7 @@ func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{})
 		// save (i.e. delete from the deleteMap) any files used in other manifests
 		manifest, _, err := GetManifest(fmp)
 		if err != nil {
-			return err
+			return fmt.Errorf("error reading manifest %s: %w", path, err)
 		}
 
 		for _, layer := range manifest.Layers {
@@ -781,8 +776,7 @@ func PruneLayers() error {
 
 	slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap)))
 
-	err = deleteUnusedLayers(nil, deleteMap)
-	if err != nil {
+	if err := deleteUnusedLayers(nil, deleteMap); err != nil {
 		slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
 		return nil
 	}
@@ -877,26 +871,19 @@ func PushModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 
 func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn func(api.ProgressResponse)) error {
 	mp := ParseModelPath(name)
 
-	var manifest *Manifest
-	var err error
-	var noprune string
-
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})
-
-	if !envconfig.NoPrune() {
-		manifest, _, err = GetManifest(mp)
-		if err != nil && !errors.Is(err, os.ErrNotExist) {
-			return err
+	manifest, _, err := GetManifest(mp)
+	if errors.Is(err, os.ErrNotExist) {
+		// noop
+	} else if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return err
+	} else {
+		for _, l := range manifest.Layers {
+			deleteMap[l.Digest] = struct{}{}
 		}
-
-		if manifest != nil {
-			for _, l := range manifest.Layers {
-				deleteMap[l.Digest] = struct{}{}
-			}
-			if manifest.Config.Digest != "" {
-				deleteMap[manifest.Config.Digest] = struct{}{}
-			}
+		if manifest.Config.Digest != "" {
+			deleteMap[manifest.Config.Digest] = struct{}{}
 		}
 	}
 
@@ -975,11 +962,9 @@
 		return err
 	}
 
-	if noprune == "" {
-		fn(api.ProgressResponse{Status: "removing any unused layers"})
-		err = deleteUnusedLayers(nil, deleteMap)
-		if err != nil {
-			slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
+	if !envconfig.NoPrune() && len(deleteMap) > 0 {
+		fn(api.ProgressResponse{Status: "removing unused layers"})
+		if err := deleteUnusedLayers(nil, deleteMap); err != nil {
 			fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)})
 		}
 	}
@@ -1000,12 +985,12 @@ func pullModelManifest(ctx context.Context, mp ModelPath, regOpts *registryOptio
 	}
 	defer resp.Body.Close()
 
-	var m *Manifest
+	var m Manifest
 	if err := json.NewDecoder(resp.Body).Decode(&m); err != nil {
 		return nil, err
 	}
 
-	return m, err
+	return &m, err
 }
 
 // GetSHA256Digest returns the SHA256 hash of a given buffer and returns it, and the size of buffer

From 237dccba1edb41bb65ed1ffc6eafdd40dd6085e4 Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Wed, 14 Aug 2024 16:36:07 -0700
Subject: [PATCH 02/34] skip invalid manifest files

---
 server/images.go   | 35 +++++------------------------------
 server/manifest.go |  2 +-
 2 files changed, 6 insertions(+), 31 deletions(-)

diff --git a/server/images.go b/server/images.go
index 798ed818..8b3a67cf 100644
--- a/server/images.go
+++ b/server/images.go
@@ -687,43 +687,18 @@ func CopyModel(src, dst model.Name) error {
 	return err
 }
 
-func deleteUnusedLayers(skipModelPath *ModelPath, deleteMap map[string]struct{}) error {
-	fp, err := GetManifestPath()
+func deleteUnusedLayers(deleteMap map[string]struct{}) error {
+	manifests, err := Manifests()
 	if err != nil {
 		return err
 	}
 
-	walkFunc := func(path string, info os.FileInfo, _ error) error {
-		if info.IsDir() {
-			return nil
-		}
-
-		dir, file := filepath.Split(path)
-		dir = strings.Trim(strings.TrimPrefix(dir, fp), string(os.PathSeparator))
-		tag := strings.Join([]string{dir, file}, ":")
-		fmp := ParseModelPath(tag)
-
-		// skip the manifest we're trying to delete
-		if skipModelPath != nil && skipModelPath.GetFullTagname() == fmp.GetFullTagname() {
-			return nil
-		}
-
-		// save (i.e. delete from the deleteMap) any files used in other manifests
-		manifest, _, err := GetManifest(fmp)
-		if err != nil {
-			return fmt.Errorf("error reading manifest %s: %w", path, err)
-		}
-
+	for _, manifest := range manifests {
 		for _, layer := range manifest.Layers {
 			delete(deleteMap, layer.Digest)
 		}
 
 		delete(deleteMap, manifest.Config.Digest)
-		return nil
-	}
-
-	if err := filepath.Walk(fp, walkFunc); err != nil {
-		return err
 	}
 
 	// only delete the files which are still in the deleteMap
@@ -776,7 +751,7 @@ func PruneLayers() error {
 
 	slog.Info(fmt.Sprintf("total blobs: %d", len(deleteMap)))
 
-	if err := deleteUnusedLayers(nil, deleteMap); err != nil {
+	if err := deleteUnusedLayers(deleteMap); err != nil {
 		slog.Error(fmt.Sprintf("couldn't remove unused layers: %v", err))
 		return nil
 	}
@@ -964,7 +939,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 
 	if !envconfig.NoPrune() && len(deleteMap) > 0 {
 		fn(api.ProgressResponse{Status: "removing unused layers"})
-		if err := deleteUnusedLayers(nil, deleteMap); err != nil {
+		if err := deleteUnusedLayers(deleteMap); err != nil {
 			fn(api.ProgressResponse{Status: fmt.Sprintf("couldn't remove unused layers: %v", err)})
 		}
 	}
diff --git a/server/manifest.go b/server/manifest.go
index 6a5d7b88..0f19641d 100644
--- a/server/manifest.go
+++ b/server/manifest.go
@@ -150,7 +150,7 @@ func Manifests() (map[model.Name]*Manifest, error) {
 
 			n := model.ParseNameFromFilepath(rel)
 			if !n.IsValid() {
-				slog.Warn("bad manifest name", "path", rel, "error", err)
+				slog.Warn("bad manifest name", "path", rel)
 				continue
 			}
 

From 3a75e74e34c976d596437c8aa14587ada562301e Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Thu, 15 Aug 2024 10:29:14 -0700
Subject: [PATCH 03/34] only skip invalid json manifests

---
 server/manifest.go | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/server/manifest.go b/server/manifest.go
index 0f19641d..6b04753f 100644
--- a/server/manifest.go
+++ b/server/manifest.go
@@ -5,6 +5,7 @@ import (
 	"encoding/hex"
 	"encoding/json"
 	"errors"
+	"fmt"
 	"io"
 	"log/slog"
 	"os"
@@ -155,9 +156,11 @@ func Manifests() (map[model.Name]*Manifest, error) {
 			}
 
 			m, err := ParseNamedManifest(n)
-			if err != nil {
+			if syntax := &(json.SyntaxError{}); errors.As(err, &syntax) {
 				slog.Warn("bad manifest", "name", n, "error", err)
 				continue
+			} else if err != nil {
+				return nil, fmt.Errorf("%s: %w", n, err)
 			}
 
 			ms[n] = m

From a84c05cf9140c2eb288a6c7b56bb1c592bbaacc7 Mon Sep 17 00:00:00 2001
From: eust-w
Date: Fri, 16 Aug 2024 06:00:12 +0800
Subject: [PATCH 04/34] fix: Add tooltip to system tray icon

- Updated setIcon method to include tooltip text for the system tray icon.
- Added NIF_TIP flag and set the tooltip text using UTF16 encoding.

Resolves: #6372
---
 app/tray/wintray/tray.go   | 8 +++++++-
 app/tray/wintray/w32api.go | 1 +
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/app/tray/wintray/tray.go b/app/tray/wintray/tray.go
index ccd087a1..6f827893 100644
--- a/app/tray/wintray/tray.go
+++ b/app/tray/wintray/tray.go
@@ -11,6 +11,7 @@ import (
 	"path/filepath"
 	"sort"
 	"sync"
+	"syscall"
 	"unsafe"
 
 	"golang.org/x/sys/windows"
@@ -433,7 +434,12 @@ func (t *winTray) setIcon(src string) error {
 	t.muNID.Lock()
 	defer t.muNID.Unlock()
 	t.nid.Icon = h
-	t.nid.Flags |= NIF_ICON
+	t.nid.Flags |= NIF_ICON | NIF_TIP
+	if toolTipUTF16, err := syscall.UTF16FromString(commontray.ToolTip); err == nil {
+		copy(t.nid.Tip[:], toolTipUTF16)
+	} else {
+		return err
+	}
 	t.nid.Size = uint32(unsafe.Sizeof(*t.nid))
 
 	return t.nid.modify()
diff --git a/app/tray/wintray/w32api.go b/app/tray/wintray/w32api.go
index a1e0381d..7c7c0ac8 100644
--- a/app/tray/wintray/w32api.go
+++ b/app/tray/wintray/w32api.go
@@ -61,6 +61,7 @@ const (
 	MIIM_SUBMENU        = 0x00000004
 	MIM_APPLYTOSUBMENUS = 0x80000000
 	NIF_ICON            = 0x00000002
+	NIF_TIP             = 0x00000004
 	NIF_INFO            = 0x00000010
 	NIF_MESSAGE         = 0x00000001
 	SW_HIDE             = 0

From bdc4308afb72d47ce63583427f810b02d569d58a Mon Sep 17 00:00:00 2001
From: zwwhdls
Date: Fri, 16 Aug 2024 11:43:19 +0800
Subject: [PATCH 05/34] fix: chmod new layer to 0o644 when creating it

Signed-off-by: zwwhdls
---
 server/layer.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/server/layer.go b/server/layer.go
index c666bd10..0bdee72b 100644
--- a/server/layer.go
+++ b/server/layer.go
@@ -51,6 +51,9 @@ func NewLayer(r io.Reader, mediatype string) (Layer, error) {
 		if err := os.Rename(temp.Name(), blob); err != nil {
 			return Layer{}, err
 		}
+		if err := os.Chmod(blob, 0o644); err != nil {
+			return Layer{}, err
+		}
 	}
 
 	return Layer{

From 0ad0e738cd7ed1266b3c210ad54dcd2b70142563 Mon Sep 17 00:00:00 2001
From: Richard Lyons
Date: Sun, 18 Aug 2024 01:43:26 +0200
Subject: [PATCH 06/34] Override numParallel only if unset.

---
 server/sched.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/server/sched.go b/server/sched.go
index 9947fd32..4d9c0296 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -734,7 +734,9 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
 func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
-	*numParallel = 1
+	if *numParallel <= 0 {
+		*numParallel = 1
+	}
 	byLibrary := gpus.ByLibrary()
 	if len(byLibrary) <= 1 {
 		return gpus

From 9352eeb752531decccc7c6b91a07bc3dd5efa67e Mon Sep 17 00:00:00 2001
From: Richard Lyons
Date: Sun, 18 Aug 2024 02:55:01 +0200
Subject: [PATCH 07/34] Reset NumCtx.

---
 server/sched.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/server/sched.go b/server/sched.go
index 4d9c0296..3fe6d7fc 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -736,6 +736,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
 	if *numParallel <= 0 {
 		*numParallel = 1
+		req.opts.NumCtx = req.origNumCtx
 	}
 	byLibrary := gpus.ByLibrary()
 	if len(byLibrary) <= 1 {

From 885cf45087863aa2e064a05da99e8bd07d69970a Mon Sep 17 00:00:00 2001
From: Richard Lyons
Date: Sun, 18 Aug 2024 03:07:16 +0200
Subject: [PATCH 08/34] Fix white space.

---
 server/sched.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/sched.go b/server/sched.go
index 3fe6d7fc..9d8c4144 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -736,8 +736,8 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
 	if *numParallel <= 0 {
 		*numParallel = 1
-        req.opts.NumCtx = req.origNumCtx
-    }
+		req.opts.NumCtx = req.origNumCtx
+	}
 	byLibrary := gpus.ByLibrary()
 	if len(byLibrary) <= 1 {
 		return gpus

From 9fddef3731842bd8f40d217da6b84ab7ef5dfe97 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Mon, 19 Aug 2024 09:20:52 -0700
Subject: [PATCH 09/34] server: limit upload parts to 16 (#6411)

---
 server/upload.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/upload.go b/server/upload.go
index 2f115436..020e8955 100644
--- a/server/upload.go
+++ b/server/upload.go
@@ -45,7 +45,7 @@ type blobUpload struct {
 }
 
 const (
-	numUploadParts          = 64
+	numUploadParts          = 16
 	minUploadPartSize int64 = 100 * format.MegaByte
 	maxUploadPartSize int64 = 1000 * format.MegaByte
 )

From 74d45f010276c2f2653f3ca8c4f76cb0552fb46e Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Mon, 8 Jul 2024 12:50:11 -0700
Subject: [PATCH 10/34] Refactor linux packaging

This adjusts linux to follow a similar model to windows, with a discrete
archive (zip/tgz) to carry the primary executable and dependent
libraries. Runners are still carried as payloads inside the main binary.

Darwin retains the payload model, where the go binary is fully
self-contained.
---
 .github/workflows/release.yaml |  1 -
 Dockerfile                     | 29 ++++++------
 app/ollama.iss                 | 11 +----
 envconfig/config.go            |  4 +-
 gpu/amd_common.go              |  2 +-
 gpu/amd_windows.go             |  2 +-
 gpu/gpu.go                     | 50 ++++++++++++++------
 gpu/gpu_linux.go               |  2 +-
 llm/ext_server/CMakeLists.txt  |  3 +-
 llm/generate/gen_common.sh     | 17 ++++++-
 llm/generate/gen_linux.sh      | 81 ++++++++++++++++------------------
 llm/generate/gen_windows.ps1   | 43 +++++++++---------
 llm/server.go                  | 12 +++--
 scripts/build_linux.sh         | 10 ++---
 scripts/build_windows.ps1      | 12 ++---
 scripts/install.sh             | 31 ++++++++++---
 16 files changed, 171 insertions(+), 139 deletions(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 5ae630c3..9287f6f7 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -363,7 +363,6 @@ jobs:
       - run: |
           ./scripts/build_linux.sh
          ./scripts/build_docker.sh
-          mv dist/deps/* dist/
       - uses: actions/upload-artifact@v4
         with:
           name: dist-linux-amd64
diff --git a/Dockerfile b/Dockerfile
index c8efdd8a..120ddc21 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,6 +18,7 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
+ENV GOARCH amd64
 RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 
 FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
@@ -28,6 +29,7 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
+ENV GOARCH arm64
 RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
 
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
@@ -40,15 +42,10 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
+ENV GOARCH amd64
 RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
-RUN mkdir /tmp/scratch && \
-    for dep in $(zcat /go/src/github.com/ollama/ollama/llm/build/linux/x86_64/rocm*/bin/deps.txt.gz) ; do \
-        cp ${dep} /tmp/scratch/ || exit 1 ; \
-    done && \
-    (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd /tmp/scratch/ && tar xf - ) && \
-    mkdir -p /go/src/github.com/ollama/ollama/dist/deps/ && \
-    (cd /tmp/scratch/ && tar czvf /go/src/github.com/ollama/ollama/dist/deps/ollama-linux-amd64-rocm.tgz . )
-
+RUN mkdir -p ../../dist/linux-amd64/ollama_libs && \
+    (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/ollama_libs && tar xf - )
 
 FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64
 ARG CMAKE_VERSION
@@ -59,6 +56,7 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
+ENV GOARCH amd64
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 
 FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
@@ -79,6 +77,7 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
+ENV GOARCH arm64
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 
 FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
@@ -95,12 +94,13 @@ COPY . .
 COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build -trimpath .
+RUN go build -trimpath -o dist/linux-amd64/ollama .
 
 # Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
@@ -109,23 +109,24 @@ ARG GOLANG_VERSION
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build -trimpath .
+RUN go build -trimpath -o dist/linux-arm64/ollama .
 
 # Runtime stages
 FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
 RUN apt-get update && apt-get install -y ca-certificates
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/ollama /bin/ollama
 
 FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64
 RUN apt-get update && apt-get install -y ca-certificates
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/ollama /bin/ollama
 
 # Radeon images are much larger so we keep it distinct from the CPU/CUDA image
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm
 RUN update-pciids
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/ollama /bin/ollama
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/ollama /bin/ollama
 EXPOSE 11434
 ENV OLLAMA_HOST 0.0.0.0
 
diff --git a/app/ollama.iss b/app/ollama.iss
index dc6178f7..e9cf48ec 100644
--- a/app/ollama.iss
+++ b/app/ollama.iss
@@ -91,16 +91,7 @@ Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit
 Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs
 Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion
 Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion
-#if DirExists("..\dist\windows-amd64\cuda")
-  Source: "..\dist\windows-amd64\cuda\*"; DestDir: "{app}\cuda\"; Flags: ignoreversion recursesubdirs
-#endif
-#if DirExists("..\dist\windows-amd64\oneapi")
-  Source: "..\dist\windows-amd64\oneapi\*"; DestDir: "{app}\oneapi\"; Flags: ignoreversion recursesubdirs
-#endif
-#if DirExists("..\dist\windows-amd64\rocm")
-  Source: "..\dist\windows-amd64\rocm\*"; DestDir: "{app}\rocm\"; Flags: ignoreversion recursesubdirs
-#endif
-
+Source: "..\dist\windows-amd64\ollama_libs\*"; DestDir: "{app}\ollama_libs\"; Flags: ignoreversion recursesubdirs
 
 [Icons]
 Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico"
diff --git a/envconfig/config.go b/envconfig/config.go
index b82b773d..7f0976c0 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -193,8 +193,8 @@ func RunnersDir() (p string) {
 	for _, root := range []string{filepath.Dir(exe), cwd} {
 		paths = append(paths,
 			root,
-			filepath.Join(root, "windows-"+runtime.GOARCH),
-			filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
+			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
+			filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
 		)
 	}
 
diff --git a/gpu/amd_common.go b/gpu/amd_common.go
index 2839cb7c..05747208 100644
--- a/gpu/amd_common.go
+++ b/gpu/amd_common.go
@@ -54,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) {
 	// Installer payload location if we're running the installed binary
 	exe, err := os.Executable()
 	if err == nil {
-		rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
+		rocmTargetDir := filepath.Join(filepath.Dir(exe), "ollama_libs")
 		if rocmLibUsable(rocmTargetDir) {
 			slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
 			return rocmTargetDir, nil
diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go
index edabeb43..5d25a966 100644
--- a/gpu/amd_windows.go
+++ b/gpu/amd_windows.go
@@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) {
 	// Installer payload (if we're running from some other location)
 	localAppData := os.Getenv("LOCALAPPDATA")
 	appDir := filepath.Join(localAppData, "Programs", "Ollama")
-	rocmTargetDir := filepath.Join(appDir, "rocm")
+	rocmTargetDir := filepath.Join(appDir, "ollama_libs")
 	if rocmLibUsable(rocmTargetDir) {
 		slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
 		return rocmTargetDir, nil
diff --git a/gpu/gpu.go b/gpu/gpu.go
index dc124a3e..d0ae0f34 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -229,11 +229,7 @@ func GetGPUInfo() GpuInfoList {
 			return GpuInfoList{cpus[0].GpuInfo}
 		}
 
-		// On windows we bundle the nvidia library one level above the runner dir
-		depPath := ""
-		if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
-			depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "cuda")
-		}
+		depPath := GetDepDir()
 
 		// Load ALL libraries
 		cHandles = initCudaHandles()
@@ -306,13 +302,6 @@ func GetGPUInfo() GpuInfoList {
 		if envconfig.IntelGPU() {
 			oHandles = initOneAPIHandles()
 			if oHandles != nil && oHandles.oneapi != nil {
-
-				// On windows we bundle the oneapi library one level above the runner dir
-				depPath = ""
-				if runtime.GOOS == "windows" && envconfig.RunnersDir() != "" {
-					depPath = filepath.Join(filepath.Dir(envconfig.RunnersDir()), "oneapi")
-				}
-
 				for d := range oHandles.oneapi.num_drivers {
 					if oHandles.oneapi == nil {
 						// shouldn't happen
@@ -467,10 +456,12 @@ func GetGPUInfo() GpuInfoList {
 func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	var ldPaths []string
-	var patterns []string
 	gpuLibPaths := []string{}
 	slog.Debug("Searching for GPU library", "name", baseLibName)
 
+	// Start with our bundled libraries
+	patterns := []string{filepath.Join(GetDepDir(), baseLibName)}
+
 	switch runtime.GOOS {
 	case "windows":
 		ldPaths = strings.Split(os.Getenv("PATH"), ";")
@@ -479,13 +470,14 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	default:
 		return gpuLibPaths
 	}
-	// Start with whatever we find in the PATH/LD_LIBRARY_PATH
+
+	// Then with whatever we find in the PATH/LD_LIBRARY_PATH
 	for _, ldPath := range ldPaths {
 		d, err := filepath.Abs(ldPath)
 		if err != nil {
 			continue
 		}
-		patterns = append(patterns, filepath.Join(d, baseLibName+"*"))
+		patterns = append(patterns, filepath.Join(d, baseLibName))
 	}
 	patterns = append(patterns, defaultPatterns...)
 	slog.Debug("gpu library search", "globs", patterns)
@@ -641,3 +633,31 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return "", ""
 	}
 }
+
+func GetDepDir() string {
+	// On Windows/linux we bundle the dependencies at the same level as the executable
+	appExe, err := os.Executable()
+	if err != nil {
+		slog.Warn("failed to lookup executable path", "error", err)
+	}
+	cwd, err := os.Getwd()
+	if err != nil {
+		slog.Warn("failed to lookup working directory", "error", err)
+	}
+	// Scan for any of our dependencies, and pick first match
+	for _, root := range []string{filepath.Dir(appExe), cwd} {
+		libDep := "ollama_libs"
+		if _, err := os.Stat(filepath.Join(root, libDep)); err == nil {
+			return filepath.Join(root, libDep)
+		}
+		// Developer mode, local build
+		if _, err := os.Stat(filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
+			return filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)
+		}
+		if _, err := os.Stat(filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
+			return filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)
+		}
+	}
+	slog.Warn("unable to locate gpu dependency libraries")
+	return ""
+}
diff --git a/gpu/gpu_linux.go b/gpu/gpu_linux.go
index d6d2675c..d4d20bc4 100644
--- a/gpu/gpu_linux.go
+++ b/gpu/gpu_linux.go
@@ -47,7 +47,7 @@ var (
 	CudartMgmtName = "libcudart.so*"
 	NvcudaMgmtName = "libcuda.so*"
 	NvmlMgmtName   = "" // not currently wired on linux
-	OneapiMgmtName = "libze_intel_gpu.so"
+	OneapiMgmtName = "libze_intel_gpu.so*"
 )
 
 func GetCPUMem() (memInfo, error) {
diff --git a/llm/ext_server/CMakeLists.txt b/llm/ext_server/CMakeLists.txt
index bfc97c63..90fd0ef2 100644
--- a/llm/ext_server/CMakeLists.txt
+++ b/llm/ext_server/CMakeLists.txt
@@ -1,12 +1,13 @@
 set(TARGET ollama_llama_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS})
 if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh
index da1b0688..f1541f2a 100644
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@@ -9,11 +9,14 @@ init_vars() {
         ARCH="arm64"
         ;;
     *)
-        ARCH=$(uname -m | sed -e "s/aarch64/arm64/g")
+        echo "GOARCH must be set"
+        echo "this script is meant to be run from within go generate"
+        exit 1
+        ;;
     esac
 
     LLAMACPP_DIR=../llama.cpp
-    CMAKE_DEFS=""
+    CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on"
     CMAKE_TARGETS="--target ollama_llama_server"
     if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
         CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
@@ -27,6 +30,7 @@ init_vars() {
         WHOLE_ARCHIVE="-Wl,-force_load"
         NO_WHOLE_ARCHIVE=""
         GCC_ARCH="-arch ${ARCH}"
+        DIST_BASE=../../dist/darwin-${GOARCH}/
         ;;
     "Linux")
         LIB_EXT="so"
@@ -35,6 +39,7 @@ init_vars() {
         # Cross compiling not supported on linux - Use docker
         GCC_ARCH=""
+        DIST_BASE=../../dist/linux-${GOARCH}/
         ;;
     *)
         ;;
@@ -105,6 +110,14 @@ compress() {
     echo "Finished compression"
 }
 
+install() {
+    echo "Installing libraries to bin dir ${BUILD_DIR}/bin/"
+    for lib in $(find ${BUILD_DIR} -name \*.${LIB_EXT}); do
+        rm -f "${BUILD_DIR}/bin/$(basename ${lib})"
+        cp -af "${lib}" "${BUILD_DIR}/bin/"
+    done
+}
+
 # Keep the local tree clean after we're done with the build
 cleanup() {
     (cd ${LLAMACPP_DIR}/ && git checkout CMakeLists.txt)
diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index db2c6c30..70fc0313 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
         export CUDACXX=$(command -v nvcc)
     fi
 fi
-COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
+COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
@@ -77,10 +77,11 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
     if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
         init_vars
         echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
-        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
+        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
         BUILD_DIR="../build/linux/${ARCH}/cpu"
         echo "Building custom CPU"
         build
+        install
        compress
     else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
@@ -93,7 +94,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
        # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
        # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake
 
-        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
+        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
         if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
             #
             # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
@@ -103,6 +104,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             BUILD_DIR="../build/linux/${ARCH}/cpu"
             echo "Building LCD CPU"
             build
+            install
             compress
         fi
 
@@ -120,6 +122,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
             echo "Building AVX CPU"
             build
+            install
             compress
         fi
 
@@ -133,6 +136,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
             echo "Building AVX2 CPU"
             build
+            install
             compress
         fi
     fi
@@ -178,29 +182,18 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
         CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
         echo "Building custom CUDA GPU"
     else
-        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_FLAGS=-t8 -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
     fi
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
+    export CUDAFLAGS="-t8"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" - EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" + export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" + CUDA_DIST_DIR="${DIST_BASE}/ollama_libs" build - - # Carry the CUDA libs as payloads to help reduce dependency burden on users - # - # TODO - in the future we may shift to packaging these separately and conditionally - # downloading them in the install script. - DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )" - for lib in libcudart.so libcublas.so libcublasLt.so ; do - DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true) - if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then - cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/" - elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then - cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/" - elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then - cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/" - else - cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/" - fi + install + mkdir -p "${CUDA_DIST_DIR}" + for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do + cp -a "${lib}" "${CUDA_DIST_DIR}" done compress @@ -218,21 +211,24 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then CC=icx CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF" BUILD_DIR="../build/linux/${ARCH}/oneapi" - EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" + ONEAPI_DIST_DIR="${DIST_BASE}/ollama_libs" + export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it build # copy oneAPI dependencies + mkdir -p "${ONEAPI_DIST_DIR}" for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do - cp "${dep}" "${BUILD_DIR}/bin/" + cp -a "${dep}" "${ONEAPI_DIST_DIR}" done - cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${BUILD_DIR}/bin/" - cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${BUILD_DIR}/bin/" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}" + cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}" + install compress fi @@ -262,21 +258,18 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then echo "Building custom ROCM GPU" fi 
BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" - EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" + ROCM_DIST_DIR="${DIST_BASE}/ollama_libs" + # TODO figure out how to disable runpath (rpath) + # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work + export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" build - # Record the ROCM dependencies - rm -f "${BUILD_DIR}/bin/deps.txt" - touch "${BUILD_DIR}/bin/deps.txt" - for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do - echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt" + # copy the ROCM dependencies + mkdir -p "${ROCM_DIST_DIR}" + for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do + cp -a "${dep}"* "${ROCM_DIST_DIR}" done - # bomb out if for some reason we didn't get a few deps - if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then - cat "${BUILD_DIR}/bin/deps.txt" - echo "ERROR: deps file short" - exit 1 - fi + install compress fi diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index d8bce92d..1f8c96d8 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -286,12 +286,11 @@ function build_cuda() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" -ea 0 > $null - write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" - cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\cuda\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" -ea 0 > $null + write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" } else { write-host "Skipping CUDA generation step" } @@ -325,18 +324,17 @@ function build_oneapi() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" -ea 0 > $null - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" 
"${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\oneapi\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" -ea 0 > $null + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" } else { Write-Host "Skipping oneAPI generation step" } @@ -386,12 +384,11 @@ function build_rocm() { sign install - rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null - cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\rocblas\library\" -ea 0 > $null + cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs - cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" + cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\rocblas\library\" } else { write-host "Skipping ROCm generation step" } diff --git a/llm/server.go b/llm/server.go index d2b8db9b..9347a458 100644 --- a/llm/server.go +++ b/llm/server.go @@ -306,20 +306,18 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr if runtime.GOOS == "windows" { pathEnv = "PATH" } - // prepend the server directory to LD_LIBRARY_PATH/PATH and the parent dir for common dependencies - libraryPaths := []string{dir, filepath.Dir(dir)} + // Start with the server 
directory for the LD_LIBRARY_PATH/PATH + libraryPaths := []string{dir} if libraryPath, ok := os.LookupEnv(pathEnv); ok { - // Append our runner directory to the path - // This will favor system libraries over our bundled library dependencies + // favor our bundled library dependencies over system libraries libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...) } // Note: we always put the dependency path first - // since this was the exact version we verified for AMD GPUs - // and we favor what the user had in their path + // since this was the exact version we compiled/linked against if gpus[0].DependencyPath != "" { - // TODO refine for multi-gpu support + // assume gpus from the same library have the same dependency path libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...) } diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 27c4ff1f..4ea51229 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -21,11 +21,9 @@ for TARGETARCH in ${BUILD_ARCH}; do -t builder:$TARGETARCH \ . docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH - docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/ollama ./dist/ollama-linux-$TARGETARCH - - if [ "$TARGETARCH" = "amd64" ]; then - docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/deps/ ./dist/ - fi - + docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist docker rm builder-$TARGETARCH + echo "Compressing final linux bundle..." + rm -f ./dist/ollama-linux-$TARGETARCH.tgz + (cd dist/linux-$TARGETARCH && tar cf - . | gzip --best > ../ollama-linux-$TARGETARCH.tgz ) done diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index edc73759..e8d851f4 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -103,22 +103,22 @@ function buildApp() { function gatherDependencies() { write-host "Gathering runtime dependencies" cd "${script:SRC_DIR}" - md "${script:DEPS_DIR}\ollama_runners" -ea 0 > $null + md "${script:DEPS_DIR}\ollama_libs" -ea 0 > $null # TODO - this varies based on host build system and MSVC version - drive from dumpbin output # currently works for Win11 + MSVC 2019 + Cuda V11 - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_runners\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_runners\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_runners\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_libs\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_libs\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_libs\" foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) { - cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_runners\" + cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_libs\" } cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\" if ("${env:KEY_CONTAINER}") { write-host "about to sign" - foreach ($file in (get-childitem "${script:DEPS_DIR}\cuda\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ + foreach ($file in (get-childitem 
"${script:DEPS_DIR}\ollama_libs\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ write-host "signing $file" & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file diff --git a/scripts/install.sh b/scripts/install.sh index 03af5a69..f0439b00 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -63,16 +63,32 @@ if [ -n "$NEEDS" ]; then exit 1 fi -status "Downloading ollama..." -curl --fail --show-error --location --progress-bar -o $TEMP_DIR/ollama "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}" - for BINDIR in /usr/local/bin /usr/bin /bin; do echo $PATH | grep -q $BINDIR && break || continue done +OLLAMA_INSTALL_DIR=${OLLAMA_INSTALL_DIR:-${BINDIR}} -status "Installing ollama to $BINDIR..." +status "Installing ollama to $OLLAMA_INSTALL_DIR" $SUDO install -o0 -g0 -m755 -d $BINDIR -$SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama +$SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR" +if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" >/dev/null ; then + status "Downloading Linux ${ARCH} bundle" + curl --fail --show-error --location --progress-bar \ + "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \ + $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR" + BUNDLE=1 +else + status "Downloading Linux ${ARCH} CLI" + curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\ + "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}" + $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama + BUNDLE=0 +fi + +if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then + status "Making ollama accessible in the PATH in $BINDIR" + $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama" +fi install_success() { status 'The Ollama API is now available at 127.0.0.1:11434.' @@ -178,6 +194,11 @@ if ! check_gpu lspci nvidia && ! check_gpu lshw nvidia && ! check_gpu lspci amdg fi if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then + if [ $BUNDLE -ne 0 ]; then + install_success + status "AMD GPU ready." 
+        exit 0
+    fi
     # Look for pre-existing ROCm v6 before downloading the dependencies
     for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do
         if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then

From c7bcb0031965e33531358639620a11516d101b54 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Fri, 9 Aug 2024 07:21:40 -0700
Subject: [PATCH 11/34] Wire up ccache and pigz in the docker-based build

This should help speed things up a little.

---
 Dockerfile                 | 37 ++++++++++++++++++++++++++-----------
 llm/generate/gen_common.sh | 15 +++++++++------
 llm/generate/gen_darwin.sh |  2 ++
 llm/generate/gen_linux.sh  |  2 ++
 scripts/build_linux.sh     |  3 ++-
 scripts/rh_linux_deps.sh   | 14 ++++++++++++--
 6 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 120ddc21..8eb90057 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,7 +19,8 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ENV GOARCH amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
 
 FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64
 ARG CMAKE_VERSION
@@ -30,7 +31,12 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ENV GOARCH arm64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 \
+    OLLAMA_SKIP_CPU_GENERATE=1 \
+    CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \
+    CUDA_VARIANT="_v11" \
+    bash gen_linux.sh
 
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64
 ARG CMAKE_VERSION
@@ -43,7 +49,8 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ARG AMDGPU_TARGETS
 ENV GOARCH amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
 RUN mkdir -p ../../dist/linux-amd64/ollama_libs && \
     (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/ollama_libs && tar xf - )
 
@@ -60,13 +67,17 @@ ENV GOARCH amd64
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 
 FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
-RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_CPU_TARGET="static" bash gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx" bash gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu_avx2" bash gen_linux.sh
 
 FROM --platform=linux/arm64 rockylinux:8 AS cpu-builder-arm64
 ARG CMAKE_VERSION
@@ -81,9 +92,11 @@ ENV GOARCH arm64
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 
 FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
-RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_CPU_TARGET="static" bash gen_linux.sh
 FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
 
 
 # Intermediate stage used for ./scripts/build_linux.sh
@@ -100,7 +113,8 @@ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build -trimpath -o dist/linux-amd64/ollama .
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-amd64/ollama .
 
 # Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
@@ -113,7 +127,8 @@ COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
-RUN go build -trimpath -o dist/linux-arm64/ollama .
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-arm64/ollama .
 
 # Runtime stages
 FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64
diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh
index f1541f2a..40115936 100644
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@@ -47,6 +47,7 @@ init_vars() {
     if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
         CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
     fi
+    GZIP=$(which pigz 2>/dev/null || echo "gzip")
 }
 
 git_module_setup() {
@@ -90,21 +91,23 @@ build() {
 
 compress() {
     echo "Compressing payloads to reduce overall binary size..."
-    pids=""
     rm -rf ${BUILD_DIR}/bin/*.gz
     for f in ${BUILD_DIR}/bin/* ; do
-        gzip -n --best -f ${f} &
-        pids+=" $!"
+        ${GZIP} -n --best -f ${f} &
+        compress_pids+=" $!"
     done
     # check for lib directory
     if [ -d ${BUILD_DIR}/lib ]; then
         for f in ${BUILD_DIR}/lib/* ; do
-            gzip -n --best -f ${f} &
-            pids+=" $!"
+            ${GZIP} -n --best -f ${f} &
+            compress_pids+=" $!"
         done
     fi
     echo
-    for pid in ${pids}; do
+}
+
+wait_for_compress() {
+    for pid in ${compress_pids}; do
         wait $pid
     done
     echo "Finished compression"
diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh
index 6c0b62cb..f22c0f8e 100755
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -6,6 +6,7 @@
 set -ex
 set -o pipefail
+compress_pids=""
 echo "Starting darwin generate script"
 source $(dirname $0)/gen_common.sh
 init_vars
@@ -98,4 +99,5 @@ esac
 
 cleanup
+wait_for_compress
 echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index 70fc0313..1365d07d 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -13,6 +13,7 @@
 set -ex
 set -o pipefail
+compress_pids=""
 
 # See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
 amdGPUs() {
@@ -274,4 +275,5 @@ fi
 
 cleanup
+wait_for_compress
 echo "go generate completed. LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh
index 4ea51229..ebb60c5a 100755
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -4,6 +4,7 @@ set -eu
 
 export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
 export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
+GZIP=$(which pigz 2>/dev/null || echo "gzip")
 
 BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
 export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""}
@@ -25,5 +26,5 @@ for TARGETARCH in ${BUILD_ARCH}; do
     docker rm builder-$TARGETARCH
     echo "Compressing final linux bundle..."
     rm -f ./dist/ollama-linux-$TARGETARCH.tgz
-    (cd dist/linux-$TARGETARCH && tar cf - . | gzip --best > ../ollama-linux-$TARGETARCH.tgz )
+    (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz )
 done
diff --git a/scripts/rh_linux_deps.sh b/scripts/rh_linux_deps.sh
index 81648d68..b4c9afd6 100644
--- a/scripts/rh_linux_deps.sh
+++ b/scripts/rh_linux_deps.sh
@@ -3,6 +3,7 @@
 # Script for common Dockerfile dependency installation in redhat linux based images
 
 set -ex
+set -o pipefail
 MACHINE=$(uname -m)
 
 if grep -i "centos" /etc/system-release >/dev/null; then
@@ -29,7 +30,7 @@ if grep -i "centos" /etc/system-release >/dev/null; then
         dnf install -y rh-git227-git
         ln -s /opt/rh/rh-git227/root/usr/bin/git /usr/local/bin/git
     fi
-    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++
+    dnf install -y devtoolset-10-gcc devtoolset-10-gcc-c++ pigz
 elif grep -i "rocky" /etc/system-release >/dev/null; then
     # Temporary workaround until rocky 8 AppStream ships GCC 10.4 (10.3 is incompatible with NVCC)
     cat << EOF > /etc/yum.repos.d/Rocky-Vault.repo
@@ -43,12 +44,21 @@ gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-rockyofficial
 EOF
     dnf install -y git \
         gcc-toolset-10-gcc-10.2.1-8.2.el8 \
-        gcc-toolset-10-gcc-c++-10.2.1-8.2.el8
+        gcc-toolset-10-gcc-c++-10.2.1-8.2.el8 \
+        pigz
 else
     echo "ERROR Unexpected distro"
     exit 1
 fi
 
+if [ "${MACHINE}" = "x86_64" ] ; then
+    curl -s -L https://github.com/ccache/ccache/releases/download/v4.10.2/ccache-4.10.2-linux-x86_64.tar.xz | tar -Jx -C /tmp --strip-components 1 && \
+        mv /tmp/ccache /usr/local/bin/
+else
+    yum -y install epel-release
+    yum install -y ccache
+fi
+
 if [ -n "${CMAKE_VERSION}" ]; then
     curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1
 fi

From d470ebe78bc76c098bc378f98f08f7094063ab4d Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Thu, 30 May 2024 21:54:07 -0700
Subject: [PATCH 12/34] Add Jetson cuda variants for arm

This adds new variants for arm64 specific to Jetson platforms.

---
 Dockerfile                | 48 +++++++++++++++++++++++++++++----
 gpu/gpu.go                | 44 +++++++++++++++++++++++++++++--
 gpu/gpu_darwin.go         |  4 ++--
 gpu/types.go              |  6 ++---
 llm/generate/gen_linux.sh |  5 ++--
 llm/payload.go            |  4 ++--
 scripts/build_linux.sh    |  1 +
 7 files changed, 96 insertions(+), 16 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 8eb90057..79b2a696 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -3,6 +3,9 @@ ARG CMAKE_VERSION=3.22.1
 # this CUDA_VERSION corresponds with the one specified in docs/gpu.md
 ARG CUDA_VERSION=11.3.1
 ARG ROCM_VERSION=6.1.2
+ARG JETPACK_6=r36.2.0
+ARG JETPACK_5=r35.4.1
+ARG JETPACK_4=r32.7.1
 
 # Copy the minimal context we need to run the generate scripts
 FROM scratch AS llm-code
@@ -22,7 +25,7 @@ ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh -FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-arm64 +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-server-arm64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -31,11 +34,40 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ENV GOARCH arm64 +RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh + +FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64 +ARG CMAKE_VERSION +RUN apt-get update && apt-get install -y git curl && \ + curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ENV GOARCH arm64 +ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ - CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ - CUDA_VARIANT="_v11" \ + CUDA_VARIANT="_jetpack6" \ + CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack6" \ + CMAKE_CUDA_ARCHITECTURES="87" \ + bash gen_linux.sh + +FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS cuda-build-jetpack5-arm64 +ARG CMAKE_VERSION +RUN apt-get update && apt-get install -y git curl && \ + curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ENV GOARCH arm64 +ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CUDA_VARIANT="_jetpack5" \ + CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack5" \ + CMAKE_CUDA_ARCHITECTURES="72;87" \ bash gen_linux.sh FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64 @@ -123,8 +155,14 @@ ARG GOLANG_VERSION WORKDIR /go/src/github.com/ollama/ollama COPY . . 
COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +## arm binary += 381M +COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +## arm binary += 330M +COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ diff --git a/gpu/gpu.go b/gpu/gpu.go index d0ae0f34..22461922 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -15,7 +15,9 @@ import ( "log/slog" "os" "path/filepath" + "regexp" "runtime" + "strconv" "strings" "sync" "unsafe" @@ -215,7 +217,7 @@ func GetGPUInfo() GpuInfoList { GpuInfo: GpuInfo{ memInfo: mem, Library: "cpu", - Variant: cpuCapability, + Variant: cpuCapability.String(), ID: "0", }, }, @@ -231,6 +233,35 @@ func GetGPUInfo() GpuInfoList { depPath := GetDepDir() + var cudaVariant string + if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { + if CudaTegra != "" { + ver := strings.Split(CudaTegra, ".") + if len(ver) > 0 { + cudaVariant = "jetpack" + ver[0] + } + } else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil { + r := regexp.MustCompile(` R(\d+) `) + m := r.FindSubmatch(data) + if len(m) != 2 { + slog.Info("Unexpected format for /etc/nv_tegra_release. 
Set JETSON_JETPACK to select version") + } else { + if l4t, err := strconv.Atoi(string(m[1])); err == nil { + // Note: mapping from L4t -> JP is inconsistent (can't just subtract 30) + // https://developer.nvidia.com/embedded/jetpack-archive + switch l4t { + case 35: + cudaVariant = "jetpack5" + case 36: + cudaVariant = "jetpack6" + default: + slog.Info("unsupported L4T version", "nv_tegra_release", string(data)) + } + } + } + } + } + // Load ALL libraries cHandles = initCudaHandles() @@ -240,6 +271,7 @@ func GetGPUInfo() GpuInfoList { gpuInfo := CudaGPUInfo{ GpuInfo: GpuInfo{ Library: "cuda", + Variant: cudaVariant, }, index: i, } @@ -266,7 +298,15 @@ func GetGPUInfo() GpuInfoList { gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory - gpuInfo.DependencyPath = depPath + if depPath != "" { + gpuInfo.DependencyPath = depPath + // Check for variant specific directory + if cudaVariant != "" { + if _, err := os.Stat(filepath.Join(depPath, "cuda_"+cudaVariant)); err == nil { + gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+cudaVariant) + } + } + } gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) gpuInfo.DriverMajor = driverMajor gpuInfo.DriverMinor = driverMinor diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index 9d9fd84e..417b48df 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -25,7 +25,7 @@ func GetGPUInfo() GpuInfoList { return []GpuInfo{ { Library: "cpu", - Variant: GetCPUCapability(), + Variant: GetCPUCapability().String(), memInfo: mem, }, } @@ -48,7 +48,7 @@ func GetCPUInfo() GpuInfoList { return []GpuInfo{ { Library: "cpu", - Variant: GetCPUCapability(), + Variant: GetCPUCapability().String(), memInfo: mem, }, } diff --git a/gpu/types.go b/gpu/types.go index 8d22b06b..fc628d47 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -19,7 +19,7 @@ type GpuInfo struct { Library string `json:"library,omitempty"` // Optional variant to select (e.g. versions, cpu feature flags) - Variant CPUCapability `json:"variant"` + Variant string `json:"variant"` // MinimumMemory represents the minimum memory required to use the GPU MinimumMemory uint64 `json:"-"` @@ -81,8 +81,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList { for _, info := range l { found := false requested := info.Library - if info.Variant != CPUCapabilityNone { - requested += "_" + info.Variant.String() + if info.Variant != CPUCapabilityNone.String() { + requested += "_" + info.Variant } for i, lib := range libs { if lib == requested { diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 1365d07d..dc9dda5a 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -165,7 +165,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then echo "CUDA libraries detected - building dynamic CUDA library" init_vars CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. 
|| true) - if [ -n "${CUDA_MAJOR}" ]; then + if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then CUDA_VARIANT=_v${CUDA_MAJOR} fi if [ "${ARCH}" == "arm64" ]; then @@ -189,9 +189,10 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" - CUDA_DIST_DIR="${DIST_BASE}/ollama_libs" + CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/ollama_libs}" build install + echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}" mkdir -p "${CUDA_DIST_DIR}" for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do cp -a "${lib}" "${CUDA_DIST_DIR}" diff --git a/llm/payload.go b/llm/payload.go index b402e1f2..963b3295 100644 --- a/llm/payload.go +++ b/llm/payload.go @@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string { // glob workDir for files that start with ollama_ availableServers := getAvailableServers() requested := info.Library - if info.Variant != gpu.CPUCapabilityNone { - requested += "_" + info.Variant.String() + if info.Variant != gpu.CPUCapabilityNone.String() { + requested += "_" + info.Variant } servers := []string{} diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index ebb60c5a..adda2ad7 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -22,6 +22,7 @@ for TARGETARCH in ${BUILD_ARCH}; do -t builder:$TARGETARCH \ . docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH + rm -rf ./dist/linux-$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist docker rm builder-$TARGETARCH echo "Compressing final linux bundle..." From fc3b4cda89f468f923e2e6095c6a62a5c3c336ff Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 19 Jun 2024 09:36:30 -0700 Subject: [PATCH 13/34] Report GPU variant in log --- gpu/types.go | 1 + 1 file changed, 1 insertion(+) diff --git a/gpu/types.go b/gpu/types.go index fc628d47..88539078 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -105,6 +105,7 @@ func (l GpuInfoList) LogDetails() { slog.Info("inference compute", "id", g.ID, "library", g.Library, + "variant", g.Variant, "compute", g.Compute, "driver", fmt.Sprintf("%d.%d", g.DriverMajor, g.DriverMinor), "name", g.Name, From 4fe3a556faf790ba993223cfdda16e281b6cb76d Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 13 Jun 2024 20:46:14 -0700 Subject: [PATCH 14/34] Add cuda v12 variant and selection logic Based on compute capability and driver version, pick v12 or v11 cuda variants. 
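
In short, the selection boils down to the following (an illustrative,
self-contained sketch only; the real helper added in this patch is
cudaGetVariant(gpuInfo CudaGPUInfo) in gpu/cuda_common.go, which first checks
for Jetson/L4T systems and returns a "jetpackN" variant for those):

    package main

    import "fmt"

    // pickCudaVariant is a hypothetical name used here for illustration.
    func pickCudaVariant(computeMajor, driverMajor int) string {
        // Pre-Pascal GPUs (compute capability < 6.x) and drivers older
        // than CUDA 12 can only run the v11 build.
        if computeMajor < 6 || driverMajor < 12 {
            return "v11"
        }
        return "v12"
    }

    func main() {
        fmt.Println(pickCudaVariant(8, 12)) // e.g. an A100 on a 12.x driver -> "v12"
    }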
--- Dockerfile | 43 +++++++++++++++++++++++++++++++++---------- gpu/cuda_common.go | 43 +++++++++++++++++++++++++++++++++++++++++++ gpu/gpu.go | 40 ++++------------------------------------ gpu/types.go | 6 ++++-- 4 files changed, 84 insertions(+), 48 deletions(-) diff --git a/Dockerfile b/Dockerfile index 79b2a696..e200f5d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 -# this CUDA_VERSION corresponds with the one specified in docs/gpu.md -ARG CUDA_VERSION=11.3.1 +ARG CUDA_VERSION_11=11.3.1 +ARG CUDA_VERSION_12=12.4.0 ARG ROCM_VERSION=6.1.2 ARG JETPACK_6=r36.2.0 ARG JETPACK_5=r35.4.1 @@ -13,7 +13,7 @@ COPY .git .git COPY .gitmodules .gitmodules COPY llm llm -FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64 +FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -23,9 +23,29 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ - OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" \ + CUDA_VARIANT="_v11" \ + bash gen_linux.sh -FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-server-arm64 +FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64 +ARG CMAKE_VERSION +COPY ./scripts/rh_linux_deps.sh / +RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh +ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ENV GOARCH amd64 +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" \ + CUDA_VARIANT="_v12" \ + bash gen_linux.sh + +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64 ARG CMAKE_VERSION COPY ./scripts/rh_linux_deps.sh / RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh @@ -34,7 +54,8 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS ENV GOARCH arm64 -RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh +RUN --mount=type=cache,target=/root/.ccache \ + OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64 ARG CMAKE_VERSION @@ -139,8 +160,10 @@ COPY . . 
COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ARG GOFLAGS @@ -155,8 +178,8 @@ ARG GOLANG_VERSION WORKDIR /go/src/github.com/ollama/ollama COPY . . COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ## arm binary += 381M COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ diff --git a/gpu/cuda_common.go b/gpu/cuda_common.go index c90a644c..defaa60a 100644 --- a/gpu/cuda_common.go +++ b/gpu/cuda_common.go @@ -4,9 +4,17 @@ package gpu import ( "log/slog" + "os" + "regexp" + "runtime" + "strconv" "strings" ) +// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. +// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. +var CudaTegra string = os.Getenv("JETSON_JETPACK") + func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { ids := []string{} for _, info := range gpuInfo { @@ -19,3 +27,38 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { } return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",") } + +func cudaGetVariant(gpuInfo CudaGPUInfo) string { + if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { + if CudaTegra != "" { + ver := strings.Split(CudaTegra, ".") + if len(ver) > 0 { + return "jetpack" + ver[0] + } + } else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil { + r := regexp.MustCompile(` R(\d+) `) + m := r.FindSubmatch(data) + if len(m) != 2 { + slog.Info("Unexpected format for /etc/nv_tegra_release. 
Set JETSON_JETPACK to select version") + } else { + if l4t, err := strconv.Atoi(string(m[1])); err == nil { + // Note: mapping from L4t -> JP is inconsistent (can't just subtract 30) + // https://developer.nvidia.com/embedded/jetpack-archive + switch l4t { + case 35: + return "jetpack5" + case 36: + return "jetpack6" + default: + slog.Info("unsupported L4T version", "nv_tegra_release", string(data)) + } + } + } + } + } + + if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 { + return "v11" + } + return "v12" +} diff --git a/gpu/gpu.go b/gpu/gpu.go index 22461922..eb87807a 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -15,9 +15,7 @@ import ( "log/slog" "os" "path/filepath" - "regexp" "runtime" - "strconv" "strings" "sync" "unsafe" @@ -66,10 +64,6 @@ var RocmComputeMin = 9 // TODO find a better way to detect iGPU instead of minimum memory const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU -// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed. -// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices. -var CudaTegra string = os.Getenv("JETSON_JETPACK") - // Note: gpuMutex must already be held func initCudaHandles() *cudaHandles { // TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing @@ -233,35 +227,6 @@ func GetGPUInfo() GpuInfoList { depPath := GetDepDir() - var cudaVariant string - if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { - if CudaTegra != "" { - ver := strings.Split(CudaTegra, ".") - if len(ver) > 0 { - cudaVariant = "jetpack" + ver[0] - } - } else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil { - r := regexp.MustCompile(` R(\d+) `) - m := r.FindSubmatch(data) - if len(m) != 2 { - slog.Info("Unexpected format for /etc/nv_tegra_release. 
Set JETSON_JETPACK to select version") - } else { - if l4t, err := strconv.Atoi(string(m[1])); err == nil { - // Note: mapping from L4t -> JP is inconsistent (can't just subtract 30) - // https://developer.nvidia.com/embedded/jetpack-archive - switch l4t { - case 35: - cudaVariant = "jetpack5" - case 36: - cudaVariant = "jetpack6" - default: - slog.Info("unsupported L4T version", "nv_tegra_release", string(data)) - } - } - } - } - } - // Load ALL libraries cHandles = initCudaHandles() @@ -271,7 +236,6 @@ func GetGPUInfo() GpuInfoList { gpuInfo := CudaGPUInfo{ GpuInfo: GpuInfo{ Library: "cuda", - Variant: cudaVariant, }, index: i, } @@ -297,7 +261,10 @@ func GetGPUInfo() GpuInfoList { gpuInfo.FreeMemory = uint64(memInfo.free) gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor) + gpuInfo.computeMajor = int(memInfo.major) + gpuInfo.computeMinor = int(memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory + cudaVariant := cudaGetVariant(gpuInfo) if depPath != "" { gpuInfo.DependencyPath = depPath // Check for variant specific directory @@ -310,6 +277,7 @@ func GetGPUInfo() GpuInfoList { gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) gpuInfo.DriverMajor = driverMajor gpuInfo.DriverMinor = driverMinor + gpuInfo.Variant = cudaGetVariant(gpuInfo) // query the management library as well so we can record any skew between the two // which represents overhead on the GPU we must set aside on subsequent updates diff --git a/gpu/types.go b/gpu/types.go index 88539078..4cbbeb84 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -53,8 +53,10 @@ type CPUInfo struct { type CudaGPUInfo struct { GpuInfo - OSOverhead uint64 // Memory overhead between the driver library and management library - index int //nolint:unused,nolintlint + OSOverhead uint64 // Memory overhead between the driver library and management library + index int //nolint:unused,nolintlint + computeMajor int //nolint:unused,nolintlint + computeMinor int //nolint:unused,nolintlint } type CudaGPUInfoList []CudaGPUInfo From f6c811b32075cb3b7633d7d4213251d474a77682 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 12 Jul 2024 11:35:41 -0700 Subject: [PATCH 15/34] Enable cuda v12 flags --- Dockerfile | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index e200f5d4..e83a266a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,9 @@ ARG GOLANG_VERSION=1.22.5 ARG CMAKE_VERSION=3.22.1 ARG CUDA_VERSION_11=11.3.1 +ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" ARG CUDA_VERSION_12=12.4.0 +ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" ARG ROCM_VERSION=6.1.2 ARG JETPACK_6=r36.2.0 ARG JETPACK_5=r35.4.1 @@ -21,11 +23,12 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS +ARG CUDA_V11_ARCHITECTURES ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ - CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ CUDA_VARIANT="_v11" \ bash gen_linux.sh @@ -37,12 +40,14 @@ ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS +ARG CUDA_V12_ARCHITECTURES ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ 
OLLAMA_SKIP_STATIC_GENERATE=1 \ OLLAMA_SKIP_CPU_GENERATE=1 \ - CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \ CUDA_VARIANT="_v12" \ + OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ bash gen_linux.sh FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64 @@ -53,9 +58,31 @@ ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH COPY --from=llm-code / /go/src/github.com/ollama/ollama/ WORKDIR /go/src/github.com/ollama/ollama/llm/generate ARG CGO_CFLAGS +ARG CUDA_V11_ARCHITECTURES +ENV GOARCH arm64 +RUN OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V11_ARCHITECTURES}" \ + CUDA_VARIANT="_v11" \ + bash gen_linux.sh + +FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-server-arm64 +ARG CMAKE_VERSION +COPY ./scripts/rh_linux_deps.sh / +RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh +ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH +COPY --from=llm-code / /go/src/github.com/ollama/ollama/ +WORKDIR /go/src/github.com/ollama/ollama/llm/generate +ARG CGO_CFLAGS +ARG CUDA_V12_ARCHITECTURES ENV GOARCH arm64 RUN --mount=type=cache,target=/root/.ccache \ - OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh + OLLAMA_SKIP_STATIC_GENERATE=1 \ + OLLAMA_SKIP_CPU_GENERATE=1 \ + CMAKE_CUDA_ARCHITECTURES="${CUDA_V12_ARCHITECTURES}" \ + CUDA_VARIANT="_v12" \ + OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ + bash gen_linux.sh FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64 ARG CMAKE_VERSION @@ -180,6 +207,8 @@ COPY . . COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ +COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ +COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ ## arm binary += 381M COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ From 927d98a6cde43ffee3ef269cf013df5e96cbe767 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 12 Jul 2024 14:33:13 -0700 Subject: [PATCH 16/34] Add windows cuda v12 + v11 support --- .github/workflows/release.yaml | 93 ++++++++++++++++++++++++++++++++-- llm/generate/gen_windows.ps1 | 6 +-- scripts/build_windows.ps1 | 63 ++++++++++++++++++----- 3 files changed, 142 insertions(+), 20 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 9287f6f7..4bd68455 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -183,8 +183,8 @@ jobs: name: windows-rocm-deps path: dist/deps/* - # CUDA generation step - generate-windows-cuda: + # CUDA v11 generation step + generate-windows-cuda-v11: environment: release runs-on: windows env: @@ -256,7 +256,89 @@ jobs: cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\" - uses: actions/upload-artifact@v4 with: - name: generate-windows-cuda + name: generate-windows-cuda-v11 + path: | + llm/build/**/bin/* + dist/windows-amd64/** + - uses: actions/upload-artifact@v4 + with: + name: windows-cuda-deps + path: 
dist/deps/* + + # CUDA v12 generation step + generate-windows-cuda-v12: + environment: release + runs-on: windows + env: + KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} + steps: + - uses: actions/checkout@v4 + - name: Set Version + shell: bash + run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV + - uses: 'google-github-actions/auth@v2' + with: + project_id: 'ollama' + credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}' + - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt + - name: install Windows SDK 8.1 to get signtool + run: | + $ErrorActionPreference = "Stop" + write-host "downloading SDK" + Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe" + Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait + write-host "Win SDK 8.1 installed" + gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe' + - name: install signing plugin + run: | + $ErrorActionPreference = "Stop" + write-host "downloading plugin" + Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip" + Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\ + write-host "Installing plugin" + & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet + write-host "plugin installed" + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + - name: 'Install CUDA' + run: | + $ErrorActionPreference = "Stop" + write-host "downloading CUDA Installer" + Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" + write-host "Installing CUDA" + Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait + write-host "Completed CUDA" + $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path) + $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' + echo "$cudaPath\bin" >> $env:GITHUB_PATH + echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV + echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV + echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV + - name: 'Verify CUDA' + run: nvcc -V + - run: go get ./... + - name: go generate + run: | + $gopath=(get-command go).source | split-path -parent + $cudabin=(get-command nvcc).source | split-path + & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" + cd $env:GITHUB_WORKSPACE + $env:CMAKE_SYSTEM_VERSION="10.0.22621.0" + $env:PATH="$gopath;$cudabin;$env:PATH" + $env:OLLAMA_SKIP_CPU_GENERATE="1" + go generate -x ./... 
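+      # Apart from the CUDA 12.4 installer URL above and the artifact names
+      # below, this job mirrors the v11 job so the two stay easy to keep in
+      # sync.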
+ - name: 'gather cuda dependencies' + run: | + $NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0] + md "dist\deps" + cp "${NVIDIA_DIR}\cudart64_*.dll" "dist\deps\" + cp "${NVIDIA_DIR}\cublas64_*.dll" "dist\deps\" + cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\" + - uses: actions/upload-artifact@v4 + with: + name: generate-windows-cuda-v12 path: | llm/build/**/bin/* dist/windows-amd64/** @@ -270,7 +352,8 @@ jobs: environment: release runs-on: windows needs: - - generate-windows-cuda + - generate-windows-cuda-v11 + - generate-windows-cuda-v12 - generate-windows-rocm - generate-windows-cpu env: @@ -314,7 +397,7 @@ jobs: name: generate-windows-cpu - uses: actions/download-artifact@v4 with: - name: generate-windows-cuda + name: generate-windows-cuda-v11 - uses: actions/download-artifact@v4 with: name: windows-cuda-deps diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 1f8c96d8..42708d3e 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -261,7 +261,7 @@ function build_cuda() { if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) { # Then build cuda as a dynamically loaded library $nvcc = "$script:CUDA_LIB_DIR\nvcc.exe" - $script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename + $script:CUDA_VERSION=((get-item ($nvcc | split-path | split-path)).Basename -Split "\.")[0] if ($null -ne $script:CUDA_VERSION) { $script:CUDA_VARIANT="_"+$script:CUDA_VERSION } @@ -273,9 +273,9 @@ function build_cuda() { "-DGGML_CUDA=ON", "-DGGML_AVX=on", "-DGGML_AVX2=off", - "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_FLAGS=-t8", - "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}" + "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}", + "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH" ) if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) { write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`"" diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index e8d851f4..50b60230 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -7,6 +7,7 @@ $ErrorActionPreference = "Stop" function checkEnv() { + $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() $script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower() Write-host "Building for ${script:TARGET_ARCH}" write-host "Locating required tools and paths" @@ -15,26 +16,23 @@ function checkEnv() { $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0] } - # Try to find the CUDA dir - if ($null -eq $env:NVIDIA_DIR) { + # Locate CUDA versions + # Note: this assumes every version found will be built + $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue') + if ($cudaList.length -eq 0) { $d=(get-command -ea 'silentlycontinue' nvcc).path - if ($d -ne $null) { - $script:NVIDIA_DIR=($d| split-path -parent) - } else { - $cudaList=(get-item "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v*\bin\" -ea 'silentlycontinue') - if ($cudaList.length > 0) { - $script:NVIDIA_DIR=$cudaList[0] - } + if ($null -ne $d) { + $script:CUDA_DIRS=@($d| split-path -parent) } } else { - $script:NVIDIA_DIR=$env:NVIDIA_DIR + $script:CUDA_DIRS=$cudaList } $script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0] $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}" $env:CGO_ENABLED="1" - 
echo "Checking version" + Write-Output "Checking version" if (!$env:VERSION) { $data=(git describe --tags --first-parent --abbrev=7 --long --dirty --always) $pattern="v(.+)" @@ -71,7 +69,48 @@ function checkEnv() { function buildOllama() { write-host "Building ollama CLI" if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) { - & go generate ./... + Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}" + + # TODO - consider trying to parallelize this with Start-ThreadJob, but env vars can't be used to toggle + # which targets to build + + # Start by skipping CUDA to build everything else + pwsh -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... } + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + + # Then skip everyhting else and build all the CUDA variants + foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) { + write-host "Building CUDA ${env:CUDA_LIB_DIR}" + + if ($env:CUDA_LIB_DIR.Contains("v12")) { + pwsh -Command { + $env:OLLAMA_SKIP_CUDA_GENERATE="" + $env:OLLAMA_SKIP_STATIC_GENERATE="1" + $env:OLLAMA_SKIP_CPU_GENERATE="1" + $env:OLLAMA_SKIP_ONEAPI_GENERATE="1" + $env:OLLAMA_SKIP_ROCM_GENERATE="1" + $env:CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" + $env:OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" + $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent + $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH" + & go generate ./... + } + } else { + pwsh -Command { + $env:OLLAMA_SKIP_CUDA_GENERATE="" + $env:OLLAMA_SKIP_STATIC_GENERATE="1" + $env:OLLAMA_SKIP_CPU_GENERATE="1" + $env:OLLAMA_SKIP_ONEAPI_GENERATE="1" + $env:OLLAMA_SKIP_ROCM_GENERATE="1" + $env:CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" + $env:OLLAMA_CUSTOM_CUDA_DEFS="" + $env:CUDA_PATH=split-path -path $env:CUDA_LIB_DIR -parent + $env:PATH="$envs:CUDA_LIB_DIR;$env:PATH" + & go generate ./... 
+ } + } + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + } if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } else { write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set" From 3b19cdba2a090772b2e886dbfbf712992fafe0cd Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 13 Aug 2024 13:30:28 -0700 Subject: [PATCH 17/34] Remove Jetpack --- Dockerfile | 42 ------------------------------------------ 1 file changed, 42 deletions(-) diff --git a/Dockerfile b/Dockerfile index e83a266a..99ba5b65 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,9 +5,6 @@ ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" ARG CUDA_VERSION_12=12.4.0 ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" ARG ROCM_VERSION=6.1.2 -ARG JETPACK_6=r36.2.0 -ARG JETPACK_5=r35.4.1 -ARG JETPACK_4=r32.7.1 # Copy the minimal context we need to run the generate scripts FROM scratch AS llm-code @@ -84,39 +81,6 @@ RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \ bash gen_linux.sh -FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64 -ARG CMAKE_VERSION -RUN apt-get update && apt-get install -y git curl && \ - curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 -COPY --from=llm-code / /go/src/github.com/ollama/ollama/ -WORKDIR /go/src/github.com/ollama/ollama/llm/generate -ARG CGO_CFLAGS -ENV GOARCH arm64 -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs -RUN --mount=type=cache,target=/root/.ccache \ - OLLAMA_SKIP_STATIC_GENERATE=1 \ - OLLAMA_SKIP_CPU_GENERATE=1 \ - CUDA_VARIANT="_jetpack6" \ - CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack6" \ - CMAKE_CUDA_ARCHITECTURES="87" \ - bash gen_linux.sh - -FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS cuda-build-jetpack5-arm64 -ARG CMAKE_VERSION -RUN apt-get update && apt-get install -y git curl && \ - curl -s -L https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-$(uname -m).tar.gz | tar -zx -C /usr --strip-components 1 -COPY --from=llm-code / /go/src/github.com/ollama/ollama/ -WORKDIR /go/src/github.com/ollama/ollama/llm/generate -ARG CGO_CFLAGS -ENV GOARCH arm64 -ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs -RUN --mount=type=cache,target=/root/.ccache \ - OLLAMA_SKIP_STATIC_GENERATE=1 \ - OLLAMA_SKIP_CPU_GENERATE=1 \ - CUDA_VARIANT="_jetpack5" \ - CUDA_DIST_DIR="/go/src/github.com/ollama/ollama/dist/linux-arm64/ollama_libs/cuda_jetpack5" \ - CMAKE_CUDA_ARCHITECTURES="72;87" \ - bash gen_linux.sh FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS rocm-build-amd64 ARG CMAKE_VERSION @@ -209,12 +173,6 @@ COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ di COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -## arm binary += 381M -COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ -## arm binary += 330M -COPY --from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/ -COPY 
--from=cuda-build-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/ ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ From 88bb9e332877dfbba40030c19570fdbe00f41a21 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Wed, 14 Aug 2024 16:32:57 -0700 Subject: [PATCH 18/34] Adjust layout to bin+lib/ollama --- Dockerfile | 23 ++++++++++++++------ app/ollama.iss | 12 +++++------ docs/linux.md | 10 ++++----- envconfig/config.go | 6 +++--- gpu/amd_common.go | 2 +- gpu/amd_windows.go | 2 +- gpu/gpu.go | 4 ++-- llm/generate/gen_linux.sh | 6 +++--- llm/generate/gen_windows.ps1 | 42 ++++++++++++++++++------------------ scripts/build_windows.ps1 | 16 +++++++------- scripts/install.sh | 14 +++++++----- 11 files changed, 74 insertions(+), 63 deletions(-) diff --git a/Dockerfile b/Dockerfile index 99ba5b65..d4b86918 100644 --- a/Dockerfile +++ b/Dockerfile @@ -95,8 +95,8 @@ ARG AMDGPU_TARGETS ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh -RUN mkdir -p ../../dist/linux-amd64/ollama_libs && \ - (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/ollama_libs && tar xf - ) +RUN mkdir -p ../../dist/linux-amd64/lib/ollama && \ + (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/lib/ollama && tar xf - ) FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64 ARG CMAKE_VERSION @@ -160,7 +160,7 @@ COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ l ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ - go build -trimpath -o dist/linux-amd64/ollama . + go build -trimpath -o dist/linux-amd64/bin/ollama . # Intermediate stage used for ./scripts/build_linux.sh FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64 @@ -176,20 +176,29 @@ COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/buil ARG GOFLAGS ARG CGO_CFLAGS RUN --mount=type=cache,target=/root/.ccache \ - go build -trimpath -o dist/linux-arm64/ollama . + go build -trimpath -o dist/linux-arm64/bin/ollama . 
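+
+# Layout note: from here on the dist trees follow a bin+lib split, with the
+# binary at dist/linux-<arch>/bin/ollama and bundled GPU libraries under
+# dist/linux-<arch>/lib/ollama/, so the Linux tarball can be extracted
+# straight onto a prefix such as /usr or /usr/local.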
+ +# Strip out ROCm dependencies to keep the primary image lean +FROM --platform=linux/amd64 ubuntu:22.04 as amd64-libs-without-rocm +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/ +RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa* # Runtime stages FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64 +COPY --from=amd64-libs-without-rocm /scratch/ /lib/ RUN apt-get update && apt-get install -y ca-certificates -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/ollama /bin/ollama +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ + FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64 +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/ RUN apt-get update && apt-get install -y ca-certificates -COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/ollama /bin/ollama +COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/ # Radeon images are much larger so we keep it distinct from the CPU/CUDA image FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete as runtime-rocm RUN update-pciids -COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/ollama /bin/ollama +COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/ +RUN ln -s /opt/rocm/lib /lib/ollama EXPOSE 11434 ENV OLLAMA_HOST 0.0.0.0 diff --git a/app/ollama.iss b/app/ollama.iss index e9cf48ec..bce0a337 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -87,11 +87,11 @@ DialogFontSize=12 [Files] Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit -Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit -Source: "..\dist\windows-{#ARCH}\ollama_runners\*"; DestDir: "{app}\ollama_runners"; Flags: ignoreversion 64bit recursesubdirs +Source: "..\ollama.exe"; DestDir: "{app}\bin"; Flags: ignoreversion 64bit +Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion Source: ".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion -Source: "..\dist\windows-amd64\ollama_libs\*"; DestDir: "{app}\ollama_libs\"; Flags: ignoreversion recursesubdirs +Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs [Icons] Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" @@ -99,7 +99,7 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" [Run] -Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden +Filename: "{cmd}"; Parameters: "/C set PATH={app}\bin;%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden [UninstallRun] ; Filename: "{cmd}"; Parameters: "/C ""taskkill /im ''{#MyAppExeName}'' /f /t"; Flags: runhidden @@ -134,8 +134,8 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi [Registry] Root: HKCU; Subkey: "Environment"; \ - ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}"; \ - Check: NeedsAddPath('{app}') + ValueType: expandsz; ValueName: "Path"; ValueData: "{olddata};{app}\bin"; \ + Check: 
NeedsAddPath('{app}\bin') [Code] diff --git a/docs/linux.md b/docs/linux.md index ec730656..3ed2bed0 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -20,13 +20,12 @@ GPU. ## Manual install -### Download the `ollama` binary +### Download the `ollama` tar file -Ollama is distributed as a self-contained binary. Download it to a directory in your PATH: +Ollama is distributed as a tar file including GPU library dependencies. ```bash -sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama -sudo chmod +x /usr/bin/ollama +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -C /usr -zxf - ``` ### Adding Ollama as a startup service (recommended) @@ -96,8 +95,7 @@ curl -fsSL https://ollama.com/install.sh | sh Or by downloading the ollama binary: ```bash -sudo curl -L https://ollama.com/download/ollama-linux-amd64 -o /usr/bin/ollama -sudo chmod +x /usr/bin/ollama +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -C /usr -zxf - ``` ## Installing specific versions diff --git a/envconfig/config.go b/envconfig/config.go index 7f0976c0..7e45a4f5 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -174,7 +174,7 @@ func RunnersDir() (p string) { defer func() { if p == "" { - slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama_runners'") + slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'") } }() @@ -190,7 +190,7 @@ func RunnersDir() (p string) { } var paths []string - for _, root := range []string{filepath.Dir(exe), cwd} { + for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), ".."), cwd} { paths = append(paths, root, filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH), @@ -200,7 +200,7 @@ func RunnersDir() (p string) { // Try a few variations to improve developer experience when building from source in the local tree for _, path := range paths { - candidate := filepath.Join(path, "ollama_runners") + candidate := filepath.Join(path, "lib", "ollama", "runners") if _, err := os.Stat(candidate); err == nil { p = candidate break diff --git a/gpu/amd_common.go b/gpu/amd_common.go index 05747208..72d204f7 100644 --- a/gpu/amd_common.go +++ b/gpu/amd_common.go @@ -54,7 +54,7 @@ func commonAMDValidateLibDir() (string, error) { // Installer payload location if we're running the installed binary exe, err := os.Executable() if err == nil { - rocmTargetDir := filepath.Join(filepath.Dir(exe), "ollama_libs") + rocmTargetDir := filepath.Join(filepath.Dir(exe), "..", "lib", "ollama") if rocmLibUsable(rocmTargetDir) { slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir) return rocmTargetDir, nil diff --git a/gpu/amd_windows.go b/gpu/amd_windows.go index 5d25a966..a0ae7c96 100644 --- a/gpu/amd_windows.go +++ b/gpu/amd_windows.go @@ -153,7 +153,7 @@ func AMDValidateLibDir() (string, error) { // Installer payload (if we're running from some other location) localAppData := os.Getenv("LOCALAPPDATA") appDir := filepath.Join(localAppData, "Programs", "Ollama") - rocmTargetDir := filepath.Join(appDir, "ollama_libs") + rocmTargetDir := filepath.Join(appDir, "..", "lib", "ollama") if rocmLibUsable(rocmTargetDir) { slog.Debug("detected ollama installed ROCm at " + rocmTargetDir) return rocmTargetDir, nil diff --git a/gpu/gpu.go b/gpu/gpu.go index eb87807a..391c98a8 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -653,8 +653,8 @@ func GetDepDir() string { slog.Warn("failed to lookup working directory", 
"error", err) } // Scan for any of our dependeices, and pick first match - for _, root := range []string{filepath.Dir(appExe), cwd} { - libDep := "ollama_libs" + for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), ".."), cwd} { + libDep := filepath.Join("lib", "ollama") if _, err := os.Stat(filepath.Join(root, libDep)); err == nil { return filepath.Join(root, libDep) } diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index dc9dda5a..aef03f9a 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -189,7 +189,7 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off" BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}" export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" - CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/ollama_libs}" + CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}" build install echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}" @@ -213,7 +213,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then CC=icx CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF" BUILD_DIR="../build/linux/${ARCH}/oneapi" - ONEAPI_DIST_DIR="${DIST_BASE}/ollama_libs" + ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama" export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb" DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it build @@ -260,7 +260,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then echo "Building custom ROCM GPU" fi BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" - ROCM_DIST_DIR="${DIST_BASE}/ollama_libs" + ROCM_DIST_DIR="${DIST_BASE}/lib/ollama" # TODO figure out how to disable runpath (rpath) # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 42708d3e..4d43c9e2 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -35,7 +35,7 @@ function init_vars { ) $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on") $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() - $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners" + $script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\runners" md "$script:DIST_BASE" -ea 0 > $null if ($env:CGO_CFLAGS -contains "-g") { $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo") @@ -286,11 +286,11 @@ function build_cuda() { sign install - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" -ea 0 > $null - write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null 
+ write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" } else { write-host "Skipping CUDA generation step" } @@ -324,17 +324,17 @@ function build_oneapi() { sign install - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" -ea 0 > $null - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + md "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" -ea 0 > $null + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" } else { Write-Host "Skipping oneAPI generation step" } @@ -384,11 +384,11 @@ function build_rocm() { sign install - md "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\rocblas\library\" -ea 0 > $null - cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\" + md 
"${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" -ea 0 > $null + cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\" # amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs - cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_libs\rocblas\library\" + cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\lib\ollama\rocblas\library\" } else { write-host "Skipping ROCm generation step" } diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index 50b60230..9cebf1f4 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -122,8 +122,8 @@ function buildOllama() { /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} ollama.exe if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } - New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\ -Force - cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\ + New-Item -ItemType Directory -Path .\dist\windows-${script:TARGET_ARCH}\bin\ -Force + cp .\ollama.exe .\dist\windows-${script:TARGET_ARCH}\bin\ } function buildApp() { @@ -142,22 +142,22 @@ function buildApp() { function gatherDependencies() { write-host "Gathering runtime dependencies" cd "${script:SRC_DIR}" - md "${script:DEPS_DIR}\ollama_libs" -ea 0 > $null + md "${script:DEPS_DIR}\lib\ollama" -ea 0 > $null # TODO - this varies based on host build system and MSVC version - drive from dumpbin output # currently works for Win11 + MSVC 2019 + Cuda V11 - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\ollama_libs\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\ollama_libs\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\ollama_libs\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\lib\ollama\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\lib\ollama\" + cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\lib\ollama\" foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) { - cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\ollama_libs\" + cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\lib\ollama\" } cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\" if ("${env:KEY_CONTAINER}") { write-host "about to sign" - foreach ($file in (get-childitem "${script:DEPS_DIR}\ollama_libs\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ + foreach ($file in (get-childitem "${script:DEPS_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ write-host "signing $file" & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file diff --git a/scripts/install.sh b/scripts/install.sh index f0439b00..a02a0675 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -66,7 +66,7 @@ fi for BINDIR in /usr/local/bin /usr/bin /bin; do echo $PATH | grep -q $BINDIR && break || continue done 
-OLLAMA_INSTALL_DIR=${OLLAMA_INSTALL_DIR:-${BINDIR}}
+OLLAMA_INSTALL_DIR=$(dirname ${BINDIR})
 status "Installing ollama to $OLLAMA_INSTALL_DIR"
 $SUDO install -o0 -g0 -m755 -d $BINDIR
@@ -77,18 +77,22 @@ if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-
         "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \
         $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
     BUNDLE=1
+    if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then
+        status "Making ollama accessible in the PATH in $BINDIR"
+        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/bin/ollama" "$BINDIR/ollama"
+    fi
 else
     status "Downloading Linux ${ARCH} CLI"
     curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\
         "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
     $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama
     BUNDLE=0
+    if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then
+        status "Making ollama accessible in the PATH in $BINDIR"
+        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
+    fi
 fi
-if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then
-    status "Making ollama accessible in the PATH in $BINDIR"
-    $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
-fi
 
 install_success() {
     status 'The Ollama API is now available at 127.0.0.1:11434.'

From f9e31da9463092d7b3661594788c259d6d55b3d9 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Thu, 15 Aug 2024 14:38:14 -0700
Subject: [PATCH 19/34] Review comments

---
 .github/workflows/release.yaml | 106 ++++++---------------------------
 docs/linux.md                  |   8 +--
 gpu/cuda_common.go             |   2 +-
 gpu/gpu.go                     |  16 ++---
 llm/generate/gen_windows.ps1   |   4 +-
 5 files changed, 32 insertions(+), 104 deletions(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 4bd68455..508fbb35 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -183,10 +183,17 @@ jobs:
           name: windows-rocm-deps
           path: dist/deps/*
 
-  # CUDA v11 generation step
-  generate-windows-cuda-v11:
+  # CUDA generation step
+  generate-windows-cuda:
     environment: release
     runs-on: windows
+    strategy:
+      matrix:
+        cuda:
+          - version: "11"
+            url: 'https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe'
+          - version: "12"
+            url: 'https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe'
     env:
KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} - steps: - - uses: actions/checkout@v4 - - name: Set Version - shell: bash - run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV - - uses: 'google-github-actions/auth@v2' - with: - project_id: 'ollama' - credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}' - - run: echo "${{ vars.OLLAMA_CERT }}" > ollama_inc.crt - - name: install Windows SDK 8.1 to get signtool - run: | - $ErrorActionPreference = "Stop" - write-host "downloading SDK" - Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe" - Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait - write-host "Win SDK 8.1 installed" - gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe' - - name: install signing plugin - run: | - $ErrorActionPreference = "Stop" - write-host "downloading plugin" - Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip" - Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\ - write-host "Installing plugin" - & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet - write-host "plugin installed" - - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - cache: true - - name: 'Install CUDA' - run: | - $ErrorActionPreference = "Stop" - write-host "downloading CUDA Installer" - Invoke-WebRequest -Uri "https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_551.61_windows.exe" -OutFile "${env:RUNNER_TEMP}\cuda-install.exe" - write-host "Installing CUDA" - Start-Process "${env:RUNNER_TEMP}\cuda-install.exe" -ArgumentList '-s' -NoNewWindow -Wait - write-host "Completed CUDA" - $cudaPath=((resolve-path "c:\Program Files\NVIDIA*\CUDA\v*\bin\nvcc.exe")[0].path | split-path | split-path) - $cudaVer=($cudaPath | split-path -leaf ) -replace 'v(\d+).(\d+)', '$1_$2' - echo "$cudaPath\bin" >> $env:GITHUB_PATH - echo "CUDA_PATH=$cudaPath" >> $env:GITHUB_ENV - echo "CUDA_PATH_V${cudaVer}=$cudaPath" >> $env:GITHUB_ENV - echo "CUDA_PATH_VX_Y=CUDA_PATH_V${cudaVer}" >> $env:GITHUB_ENV - - name: 'Verify CUDA' - run: nvcc -V - - run: go get ./... - - name: go generate - run: | - $gopath=(get-command go).source | split-path -parent - $cudabin=(get-command nvcc).source | split-path - & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1" - cd $env:GITHUB_WORKSPACE - $env:CMAKE_SYSTEM_VERSION="10.0.22621.0" - $env:PATH="$gopath;$cudabin;$env:PATH" - $env:OLLAMA_SKIP_CPU_GENERATE="1" - go generate -x ./... 
- - name: 'gather cuda dependencies' - run: | - $NVIDIA_DIR=(resolve-path 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*\bin\')[0] - md "dist\deps" - cp "${NVIDIA_DIR}\cudart64_*.dll" "dist\deps\" - cp "${NVIDIA_DIR}\cublas64_*.dll" "dist\deps\" - cp "${NVIDIA_DIR}\cublasLt64_*.dll" "dist\deps\" - - uses: actions/upload-artifact@v4 - with: - name: generate-windows-cuda-v12 - path: | - llm/build/**/bin/* - dist/windows-amd64/** - - uses: actions/upload-artifact@v4 - with: - name: windows-cuda-deps - path: dist/deps/* # Import the prior generation steps and build the final windows assets build-windows: environment: release runs-on: windows needs: - - generate-windows-cuda-v11 - - generate-windows-cuda-v12 + - generate-windows-cuda - generate-windows-rocm - generate-windows-cpu env: @@ -397,7 +322,10 @@ jobs: name: generate-windows-cpu - uses: actions/download-artifact@v4 with: - name: generate-windows-cuda-v11 + name: generate-windows-cuda-11 + - uses: actions/download-artifact@v4 + with: + name: generate-windows-cuda-12 - uses: actions/download-artifact@v4 with: name: windows-cuda-deps diff --git a/docs/linux.md b/docs/linux.md index 3ed2bed0..d1d5892c 100644 --- a/docs/linux.md +++ b/docs/linux.md @@ -20,12 +20,12 @@ GPU. ## Manual install -### Download the `ollama` tar file +### Download `ollama` -Ollama is distributed as a tar file including GPU library dependencies. +Download and extract the Linux package: ```bash -curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -C /usr -zxf - +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr ``` ### Adding Ollama as a startup service (recommended) @@ -95,7 +95,7 @@ curl -fsSL https://ollama.com/install.sh | sh Or by downloading the ollama binary: ```bash -curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar -C /usr -zxf - +curl -fsSL https://ollama.com/download/ollama-linux-amd64.tgz | sudo tar zx -C /usr ``` ## Installing specific versions diff --git a/gpu/cuda_common.go b/gpu/cuda_common.go index defaa60a..827cc9b4 100644 --- a/gpu/cuda_common.go +++ b/gpu/cuda_common.go @@ -28,7 +28,7 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",") } -func cudaGetVariant(gpuInfo CudaGPUInfo) string { +func cudaVariant(gpuInfo CudaGPUInfo) string { if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" { if CudaTegra != "" { ver := strings.Split(CudaTegra, ".") diff --git a/gpu/gpu.go b/gpu/gpu.go index 391c98a8..72d237a6 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -225,7 +225,7 @@ func GetGPUInfo() GpuInfoList { return GpuInfoList{cpus[0].GpuInfo} } - depPath := GetDepDir() + depPath := LibraryDir() // Load ALL libraries cHandles = initCudaHandles() @@ -264,20 +264,20 @@ func GetGPUInfo() GpuInfoList { gpuInfo.computeMajor = int(memInfo.major) gpuInfo.computeMinor = int(memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory - cudaVariant := cudaGetVariant(gpuInfo) + variant := cudaVariant(gpuInfo) if depPath != "" { gpuInfo.DependencyPath = depPath // Check for variant specific directory - if cudaVariant != "" { - if _, err := os.Stat(filepath.Join(depPath, "cuda_"+cudaVariant)); err == nil { - gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+cudaVariant) + if variant != "" { + if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil { + gpuInfo.DependencyPath = filepath.Join(depPath, "cuda_"+variant) } } } gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) gpuInfo.DriverMajor = 
driverMajor gpuInfo.DriverMinor = driverMinor - gpuInfo.Variant = cudaGetVariant(gpuInfo) + gpuInfo.Variant = variant // query the management library as well so we can record any skew between the two // which represents overhead on the GPU we must set aside on subsequent updates @@ -468,7 +468,7 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string { slog.Debug("Searching for GPU library", "name", baseLibName) // Start with our bundled libraries - patterns := []string{filepath.Join(GetDepDir(), baseLibName)} + patterns := []string{filepath.Join(LibraryDir(), baseLibName)} switch runtime.GOOS { case "windows": @@ -642,7 +642,7 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) { } } -func GetDepDir() string { +func LibraryDir() string { // On Windows/linux we bundle the dependencies at the same level as the executable appExe, err := os.Executable() if err != nil { diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 4d43c9e2..cbdfd09f 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -117,7 +117,7 @@ function build { if ($cmakeDefs -contains "-G") { $extra=@("-j8") } else { - $extra= @("--", "/p:CL_MPcount=8") + $extra= @("--", "/maxCpuCount:8") } write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ }) $extra" & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ }) $extra @@ -273,7 +273,7 @@ function build_cuda() { "-DGGML_CUDA=ON", "-DGGML_AVX=on", "-DGGML_AVX2=off", - "-DCMAKE_CUDA_FLAGS=-t8", + "-DCMAKE_CUDA_FLAGS=-t6", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}", "-DCMAKE_CUDA_COMPILER_TOOLKIT_ROOT=$env:CUDA_PATH" ) From d8be22e47d460d1483846e2effb9b67fbfce1c0b Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 19 Aug 2024 12:07:18 -0700 Subject: [PATCH 20/34] Fix overlapping artifact name on CI --- .github/workflows/release.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 508fbb35..f6489dac 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -269,7 +269,7 @@ jobs: dist/windows-amd64/** - uses: actions/upload-artifact@v4 with: - name: windows-cuda-deps + name: windows-cuda-deps-${{ matrix.cuda.version }} path: dist/deps/* @@ -328,7 +328,10 @@ jobs: name: generate-windows-cuda-12 - uses: actions/download-artifact@v4 with: - name: windows-cuda-deps + name: windows-cuda-deps-11 + - uses: actions/download-artifact@v4 + with: + name: windows-cuda-deps-12 - uses: actions/download-artifact@v4 with: name: windows-rocm-deps From f91c9e370923d3b10a88732ab577e2728022152d Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 19 Aug 2024 13:48:45 -0700 Subject: [PATCH 21/34] CI: handle directories during checksum (#6427) --- .github/workflows/release.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index f6489dac..aad49d98 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -472,7 +472,8 @@ jobs: merge-multiple: true - run: | ls -lh dist/ - (cd dist; sha256sum * > sha256sum.txt) + (cd dist; find . 
-type f | xargs sha256sum > ../sha256sum.txt) + mv sha256sum.txt dist/ cat dist/sha256sum.txt - name: Create or update Release run: | From 19e5a890f70b95a55c9de6a55357d78fc0a4ff81 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Mon, 19 Aug 2024 15:19:21 -0700 Subject: [PATCH 22/34] CI: remove directories from dist dir before upload step (#6429) --- .github/workflows/release.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index aad49d98..2cf4d2c2 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -474,6 +474,7 @@ jobs: ls -lh dist/ (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt) mv sha256sum.txt dist/ + mv dist/linux-???64 . cat dist/sha256sum.txt - name: Create or update Release run: | From a017cf2fea4aaa376087520382058c42cffce097 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 20 Aug 2024 07:26:38 -0700 Subject: [PATCH 23/34] Split rocm back out of bundle (#6432) We're over budget for github's maximum release artifact size with rocm + 2 cuda versions. This splits rocm back out as a discrete artifact, but keeps the layout so it can be extracted into the same location as the main bundle. --- .github/workflows/release.yaml | 1 + Dockerfile | 4 ++-- llm/generate/gen_linux.sh | 3 ++- scripts/build_linux.sh | 6 ++++++ scripts/install.sh | 5 +++++ 5 files changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 2cf4d2c2..9c1e3e13 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -475,6 +475,7 @@ jobs: (cd dist; find . -type f | xargs sha256sum > ../sha256sum.txt) mv sha256sum.txt dist/ mv dist/linux-???64 . + mv dist/linux-amd64-rocm . 
cat dist/sha256sum.txt - name: Create or update Release run: | diff --git a/Dockerfile b/Dockerfile index d4b86918..c46477b4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -95,8 +95,8 @@ ARG AMDGPU_TARGETS ENV GOARCH amd64 RUN --mount=type=cache,target=/root/.ccache \ OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh -RUN mkdir -p ../../dist/linux-amd64/lib/ollama && \ - (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64/lib/ollama && tar xf - ) +RUN mkdir -p ../../dist/linux-amd64-rocm/lib/ollama && \ + (cd /opt/rocm/lib && tar cf - rocblas/library) | (cd ../../dist/linux-amd64-rocm/lib/ollama && tar xf - ) FROM --platform=linux/amd64 centos:7 AS cpu-builder-amd64 ARG CMAKE_VERSION diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index aef03f9a..6927dda8 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -260,7 +260,8 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then echo "Building custom ROCM GPU" fi BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}" - ROCM_DIST_DIR="${DIST_BASE}/lib/ollama" + # ROCm dependencies are too large to fit into a unified bundle + ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama" # TODO figure out how to disable runpath (rpath) # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu" diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index adda2ad7..6cb0d0cd 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -24,8 +24,14 @@ for TARGETARCH in ${BUILD_ARCH}; do docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH rm -rf ./dist/linux-$TARGETARCH docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist + if echo ${TARGETARCH} | grep "amd64" > /dev/null; then + docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist + fi docker rm builder-$TARGETARCH echo "Compressing final linux bundle..." rm -f ./dist/ollama-linux-$TARGETARCH.tgz (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz ) + if [ -d dist/linux-$TARGETARCH-rocm ]; then + (cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz ) + fi done diff --git a/scripts/install.sh b/scripts/install.sh index a02a0675..25f57565 100644 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -199,6 +199,11 @@ fi if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then if [ $BUNDLE -ne 0 ]; then + status "Downloading Linux ROCm ${ARCH} bundle" + curl --fail --show-error --location --progress-bar \ + "https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \ + $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR" + install_success status "AMD GPU ready." 
exit 0 From 5a28b9cf5fcb3994aa1a143118c73c7d1fbf3bf9 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 6 Jun 2024 08:59:04 -0700 Subject: [PATCH 24/34] bert --- convert/convert.go | 12 ++ convert/convert_bert.go | 176 +++++++++++++++++++++++++ convert/convert_test.go | 1 + convert/reader.go | 2 + convert/testdata/all-MiniLM-L6-v2.json | 124 +++++++++++++++++ convert/tokenizer.go | 31 ++--- 6 files changed, 331 insertions(+), 15 deletions(-) create mode 100644 convert/convert_bert.go create mode 100644 convert/testdata/all-MiniLM-L6-v2.json diff --git a/convert/convert.go b/convert/convert.go index 24c19aa4..f51e9665 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -66,6 +66,10 @@ type Converter interface { writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error } +type moreParser interface { + parseMore(fs.FS) error +} + // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations // and files it finds in the input path. // Supported input model formats include safetensors. @@ -95,6 +99,8 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { conv = &gemma{} case "Phi3ForCausalLM": conv = &phi3{} + case "BertModel": + conv = &bert{} default: return errors.New("unsupported architecture") } @@ -103,6 +109,12 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { return err } + if t, ok := conv.(moreParser); ok { + if err := t.parseMore(fsys); err != nil { + return err + } + } + t, err := parseTokenizer(fsys, conv.specialTokenTypes()) if err != nil { return err diff --git a/convert/convert_bert.go b/convert/convert_bert.go new file mode 100644 index 00000000..62fad147 --- /dev/null +++ b/convert/convert_bert.go @@ -0,0 +1,176 @@ +package convert + +import ( + "cmp" + "encoding/json" + "io/fs" + "path/filepath" + "slices" + "strings" + + "github.com/ollama/ollama/llm" +) + +type bert struct { + Parameters + NLayers uint32 `json:"n_layers"` + NumHiddenLayers uint32 `json:"num_hidden_layers"` + NLayer uint32 `json:"n_layer"` + MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` + NCtx uint32 `json:"n_ctx"` + HiddenSize uint32 `json:"hidden_size"` + NEmbd uint32 `json:"n_embd"` + IntermediateSize uint32 `json:"intermediate_size"` + NInner uint32 `json:"n_inner"` + NumAttentionHeads uint32 `json:"num_attention_heads"` + NHead uint32 `json:"n_head"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` + LayerNormEPS float32 `json:"layer_norm_eps"` + LayerNormEpsilon float32 `json:"layer_norm_epsilon"` + NormEpsilon float32 `json:"norm_epsilon"` + + PoolingType uint32 +} + +var ( + _ Converter = (*bert)(nil) + _ moreParser = (*bert)(nil) +) + +func (p *bert) parseMore(fsys fs.FS) error { + bts, err := fs.ReadFile(fsys, "modules.json") + if err != nil { + return err + } + + var modules []struct { + Type string `json:"type"` + Path string `json:"path"` + } + + if err := json.Unmarshal(bts, &modules); err != nil { + return err + } + + var pooling string + for _, m := range modules { + if m.Type == "sentence_transformers.models.Pooling" { + pooling = m.Path + break + } + } + + if pooling != "" { + bts, err := fs.ReadFile(fsys, filepath.Join(pooling, "config.json")) + if err != nil { + return err + } + + var pc struct { + PoolingModeCLSToken bool `json:"pooling_mode_cls_token"` + PoolingModeMeanTokens bool `json:"pooling_mode_mean_tokens"` + } + + if err := json.Unmarshal(bts, &pc); err != nil { + return err + } + + if pc.PoolingModeMeanTokens { + p.PoolingType = 1 + } else if pc.PoolingModeCLSToken { + p.PoolingType = 2 + } + } + + return 
nil +} + +func (p *bert) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + kv["general.architecture"] = "bert" + kv["general.name"] = "bert" + kv["bert.attention.causal"] = false + kv["bert.pooling_type"] = p.PoolingType + + kv["bert.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) + + if contextLength := cmp.Or(p.MaxPositionEmbeddings, p.NCtx); contextLength > 0 { + kv["bert.context_length"] = contextLength + } + + if embeddingLength := cmp.Or(p.HiddenSize, p.NEmbd); embeddingLength > 0 { + kv["bert.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) + } + + if feedForwardLength := cmp.Or(p.IntermediateSize, p.NInner); feedForwardLength > 0 { + kv["bert.feed_forward_length"] = cmp.Or(p.IntermediateSize, p.NInner) + } + + if headCount := cmp.Or(p.NumAttentionHeads, p.NHead); headCount > 0 { + kv["bert.attention.head_count"] = cmp.Or(p.NumAttentionHeads, p.NHead) + } + + if layerNormEpsilon := cmp.Or(p.LayerNormEPS, p.LayerNormEpsilon, p.NormEpsilon); layerNormEpsilon > 0 { + kv["bert.attention.layer_norm_epsilon"] = layerNormEpsilon + } + + kv["tokenizer.ggml.model"] = "bert" + kv["tokenizer.ggml.token_type_count"] = uint32(2) + + // convert to phantom space tokens + for i, e := range t.Tokens { + if strings.HasPrefix(e, "[") && strings.HasSuffix(e, "]") { + // noop + } else if strings.HasPrefix(e, "##") { + t.Tokens[i] = e[2:] + } else { + t.Tokens[i] = "\u2581" + e + } + } + + kv["tokenizer.ggml.tokens"] = t.Tokens + + return kv +} + +func (p *bert) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor + for _, t := range ts { + if slices.Contains([]string{ + "embeddings.position_ids", + "pooler.dense.weight", + "pooler.dense.bias", + }, t.Name()) { + continue + } + + name := p.tensorName(t.Name()) + out = append(out, llm.Tensor{ + Name: name, + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + + return out +} + +func (bert) tensorName(n string) string { + return strings.NewReplacer( + "encoder.layer", "blk", + "encoder.layers", "blk", + "embeddings.word_embeddings", "token_embd", + "embeddings.token_type_embeddings", "token_types", + "embeddings.LayerNorm", "token_embd_norm", + "embeddings.position_embeddings", "position_embd", + "attention.self.query", "attn_q", + "attention.self.key", "attn_k", + "attention.self.value", "attn_v", + "attention.output.dense", "attn_output", + "attention.output.LayerNorm", "attn_output_norm", + "intermediate.dense", "ffn_up", + "output.dense", "ffn_down", + "output.LayerNorm", "layer_output_norm", + ).Replace(n) +} diff --git a/convert/convert_test.go b/convert/convert_test.go index cb2c585e..e3ab0098 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -67,6 +67,7 @@ func TestConvertFull(t *testing.T) { "gemma-2b-it", // microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8 "Phi-3-mini-128k-instruct", + "all-MiniLM-L6-v2", } for i := range cases { diff --git a/convert/reader.go b/convert/reader.go index ce95208e..294a7c40 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -37,6 +37,8 @@ const ( func (t tensorBase) Kind() uint32 { if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") { return 0 + } else if t.name == "embeddings.token_type_embeddings.weight" { + return 0 } switch len(t.shape) { diff --git a/convert/testdata/all-MiniLM-L6-v2.json b/convert/testdata/all-MiniLM-L6-v2.json new file mode 100644 index 00000000..15c8f039 --- /dev/null +++ b/convert/testdata/all-MiniLM-L6-v2.json @@ -0,0 +1,124 @@ +{ + "general.architecture": "bert", + "general.file_type": 
"1", + "general.quantization_version": "2", + "bert.attention.causal": "false", + "bert.attention.head_count": "12", + "bert.attention.layer_norm_epsilon": "1e-12", + "bert.block_count": "6", + "bert.context_length": "512", + "bert.embedding_length": "384", + "bert.feed_forward_length": "1536", + "bert.pooling_type": "1", + "tokenizer.ggml.model": "bert", + "tokenizer.ggml.padding_token_id": "0", + "tokenizer.ggml.unknown_token_id": "100", + "tokenizer.ggml.cls_token_id": "101", + "tokenizer.ggml.seperator_token_id": "102", + "tokenizer.ggml.mask_token_id": "103", + "tokenizer.ggml.token_type_count": "2", + "tokenizer.ggml.scores": "6db964fe67338aca57790481a390121ff3dd643eebe49f7dd308029ad99abb6f", + "tokenizer.ggml.token_type": "98d247c5404b6b18f05f133b92dd56edf6efefefac326794b00d7b351f6c5aa1", + "tokenizer.ggml.tokens": "9efe405e229a45ff9916f54c475d151d2200cd2ab0006f347abfb069cf096c86", + "token_embd.weight": "8c1ee80a9ea4f65aa385ba30112010068af3d209bebc6e149d3d4589c2cd0a5a", + "position_embd.weight": "6c516f0b1c4e2388ab90394dd80ad69e4e4509b890982fc3408108ae66210eb6", + "token_types.weight": "f879f8e422ed211948f28b560d3c5e17aae7993f063b51196a28cf5c0fb3da21", + "token_embd_norm.weight": "75076e095d717aab96f8b6beeee503c27940d9a76f2b891a0e3de72f8a6043e4", + "token_embd_norm.bias": "298735285ffe944e1bf03e5d35c7280326b85cf121bde9874f1af5dc51ab939d", + "blk.0.attn_q.weight": "ab0923ce4c1549175112dcdfcc860fe30137f991e03ea6857fb5993670adaf6c", + "blk.0.attn_q.bias": "a3ec29551dabf976e1d34256b8ab5ab7b758f3ed9742c3cafdbd984d5441df62", + "blk.0.attn_k.weight": "4c1038a6d035c3e9ffed7fa672b614627814752503755fbad0cfb76a41ad71ba", + "blk.0.attn_k.bias": "e0363930eb588d91816aa3d230bb03b6e2551c165117b80b8d60397413819ef9", + "blk.0.attn_v.weight": "425e2e53e3f00ce98d29c3e6a161eb55d3e6ae0d96fdb9f6242d1c4fd6eef4b3", + "blk.0.attn_v.bias": "6579173a1e65ee124fbd0bd53cbdca4225515b4f2c5f18fb1bfd000f5978f9bb", + "blk.0.attn_output.weight": "a6d70a08cd7164de5d12af65d86d657c3db35aaecde778b2b3fda9193c4c9802", + "blk.0.attn_output.bias": "2b8d12c4f9a9c5bfaa29c597839568f6e0525cb41eeaf64ddeb6bd84dfeb9701", + "blk.0.attn_output_norm.weight": "bbe6e502a473228b525aeed26cc31b7db123ad63bdc5a6eebac6ea70b8b51d62", + "blk.0.attn_output_norm.bias": "36eaacaf0007c5c62daea97aab0115390c0682914f78482e37eb76885f4b7a50", + "blk.0.ffn_up.weight": "24654561c76ce387d125759ba843f06b904ef721fcceaeff6ccc62180a48e874", + "blk.0.ffn_up.bias": "fd3f0126aa1d95768fa60eb6f4ab8a2763cfcb7e5405f35b92353031d86f4d34", + "blk.0.ffn_down.weight": "97a829763a6a5bf3329ceb4d39c424ba4787d61653a5b0bbd1f84782e4d4e0ca", + "blk.0.ffn_down.bias": "7aa980c30ae8b4ee7f69df28808dbf5c431f56ccc4a80340f644a0419f16c054", + "blk.0.layer_output_norm.weight": "ef30dad4c2a083ae1ff5039a2a6cda60ecc89bf1e486a6f8c0d15f50589603f8", + "blk.0.layer_output_norm.bias": "8b1b77e67568b1bce43fc476de1b177c53ff688d66beb66995e8eb3dc290da8a", + "blk.1.attn_q.weight": "284331622a1f6f9b87ccee4f652bd66a394ca493c4d93be4d1844e4f6159ad10", + "blk.1.attn_q.bias": "e24ebd4860330e08f6bfdd077a82db0bee33f4c8846cf1db26327a34754c7069", + "blk.1.attn_k.weight": "729dd0d555544b5bd0f7580b3c8b384256b974605f0e7487b95f295aa032997d", + "blk.1.attn_k.bias": "2aa51a828a858f35473f54477583fea54ce2ccc34ea60fbd1d228fbe9bca827f", + "blk.1.attn_v.weight": "6be304671cc311d5ca5c103f2b51467ee800c589bc5b8101e09ff5aed1f68c21", + "blk.1.attn_v.bias": "43bcbab78a8819e07f723bc9e5b737b71e87a7594f15234e882b63e327a64199", + "blk.1.attn_output.weight": "15ec8a1a12b26c9976445308a09f748ab0e4bef0f583d13ab08c3129f8738d73", + 
"blk.1.attn_output.bias": "dac2146f4baa6ed16f6c0dc7443831fb7ec79bedcceafd80d1a4b628a1bb072d", + "blk.1.attn_output_norm.weight": "d2151eb33bffac536787a4c9a5d2b31c7a80b17c4611877842a3cce2cd6e98d8", + "blk.1.attn_output_norm.bias": "31e1b779716dafb855d2cf5631ee168a0ccf372eb9c6ea6091f66fa97a9b9d2d", + "blk.1.ffn_up.weight": "a57547fc3fc3b77406f5cdcb0c87af9bc184701f175c39c1f35297826fce3cc7", + "blk.1.ffn_up.bias": "123be6d541d086202913c75d878c54d59a749f3af7b58f7ef9eb9e7c62a24c9a", + "blk.1.ffn_down.weight": "cfdb79788377e5cbded8790cd41b9e66c397ecab75474071fcd7cf32d30f9613", + "blk.1.ffn_down.bias": "bcb58315519a573097960891c9ae41cf4c685ab78c3e0e77471471758a7eae88", + "blk.1.layer_output_norm.weight": "819b554271452bfb1d84c2603b90377b2e41a0ac1e3aa8b417ccf9dce63375bd", + "blk.1.layer_output_norm.bias": "47a3433ac27f5ce8947fb38dd491f3706df4ef6adb0ddf74612bf0f54b19e164", + "blk.2.attn_q.weight": "1557a9ea852b1880551f7290e00aded4f35e6c4180fdcbed1b0039bf805f639e", + "blk.2.attn_q.bias": "c3bfe5f3066f655fd36b055530997b59ff33ef013563aaeb3cb8ff07dabd59a9", + "blk.2.attn_k.weight": "cfd08eb69c61ae2f9f14f9b7ff5c5394ca264b1a9f3d48156677f90dd1766289", + "blk.2.attn_k.bias": "9b839bc0e79974a0b3f5d1895972bc6f5c9a1bc16052e1af786e6a530758152d", + "blk.2.attn_v.weight": "02b26b1208480eaeeb00e7b4cf8b690006ca14759357fc44ed4a2a8924ead993", + "blk.2.attn_v.bias": "e7e6f0089fded1659a867ab736c220d9653ea7da6b1b94baf5c8d30a748b63ab", + "blk.2.attn_output.weight": "a1db121c7d33806b349cadd050300a57db49fdc91224fd07c9ac43bf4299dc79", + "blk.2.attn_output.bias": "7675128b6a92555cd955c820311e91e9417d31f48848f45d047b4100c62148b3", + "blk.2.attn_output_norm.weight": "5b4595e0fbcba67a700c4331adf746d2fba3546364a4db5607ae241947bb1a21", + "blk.2.attn_output_norm.bias": "7b8e16826ea30e5a2ba0b02e0095a901775981a296e98819625320e983060d08", + "blk.2.ffn_up.weight": "a0d815d946ac07a65095c4ae4df77b818845e6d97795c7d82f55e689d944db59", + "blk.2.ffn_up.bias": "ce37c0a4174d6bf773ded7bd016ede627ad3bdb8bc99b9992a18dc8e8898f252", + "blk.2.ffn_down.weight": "f6231d2a25426fbd45b9f1160aa484220eb227ceef0348c4a6a6de890606e5ef", + "blk.2.ffn_down.bias": "429e00556e8dc63a785238b309b9d83738500c1ef6d736fe6526ad88ea496d27", + "blk.2.layer_output_norm.weight": "651457a573adf3f7dd9ee5dfe1c8e89389e94443993aab77ec6a0b05aa621e35", + "blk.2.layer_output_norm.bias": "41fbbeda7fd89b0cef5f945ae44011c316982390401d6f75ba8c6d365e185247", + "blk.3.attn_q.weight": "95a43f32949d2cb8d22815bb27a44abfc6665ba96221af817dfe058cb6ca72c6", + "blk.3.attn_q.bias": "f4e34385e75d8108b6b3bd336106e2133a8c9be0cc343dfe5dc48c32a823c7cb", + "blk.3.attn_k.weight": "6b892da6a17d4d3265265a15f695864a31813ee8c8e710ae9bc9e1adbc6c9a18", + "blk.3.attn_k.bias": "40b8067b641a56014cee42548240aa8930820958b1933004892b5f04fbaef39e", + "blk.3.attn_v.weight": "9fcd5922319dd2a461082a5ce040c1dfe65d87d70ca6547dd0b46eeecc3eeb2b", + "blk.3.attn_v.bias": "b528c56212e66931fdbe267ac327a9c2f87cd03baff3ea719e30afe681da15f1", + "blk.3.attn_output.weight": "e3b178c1b03981e75510e0d277af23ea59cc404b5394e61bd32291825719b502", + "blk.3.attn_output.bias": "712c84d39a6a5a9c06a09da8fd9939ba0d5525524a4bba61ea4de09b48f45cae", + "blk.3.attn_output_norm.weight": "d1ffac88e675592ff72f8a617be32b4a381d443b2f8f2645dbe44a1e5745aac0", + "blk.3.attn_output_norm.bias": "ea31a1c73146234c50e0e43f485c458413714867b8e2703af66482f7db2d6c40", + "blk.3.ffn_up.weight": "4ef4f3b9a1ea6ab2ef2eb6e8b008e06a44790d099d97482a05a51e39a29afac0", + "blk.3.ffn_up.bias": "06a4296dda16f452675c51f108079fe7722552d6521c737d97734943818b9a2b", + 
"blk.3.ffn_down.weight": "f114b2bebe392c7d80433bb880c6730293aa4561b0b0370dcdaf7472daebd847", + "blk.3.ffn_down.bias": "2c8e67831d28a3bf613fc7912ae3259b63d72abcaf4d30efd8800758400158de", + "blk.3.layer_output_norm.weight": "a1dfeb7b5a51dd56447312ca41e2ad2f361a3ea12ddc355127f5f4219fb0a482", + "blk.3.layer_output_norm.bias": "1ed630021b25c6c6fc93fd32988b9907df966d4982a93081f639aac3044618ab", + "blk.4.attn_q.weight": "b5fae4c1f9a5f33a2a2e816ac0c01c25f422e4efdd59ef1ed93da2610e5370fc", + "blk.4.attn_q.bias": "c2e376524ea98ac3b10d9eee19ecb1b1e261fa5149efe0232844c923dfb428fb", + "blk.4.attn_k.weight": "a4632f5ebf9321d9d08f9112a4e5dda2efe5671df4a4e67fee24845f5b14af16", + "blk.4.attn_k.bias": "a9a02ffb8b8b4f6dfe487a7e0341f1d5318c9d2b793a688f34cb1b22fc66ef60", + "blk.4.attn_v.weight": "10ad8deb81d9fa093b1e5c0f24ea82aa7df43e6aca49e260fcbea56eab8cc86a", + "blk.4.attn_v.bias": "7326813e181e021130bd33ac136293fcffccce2d1d8cb59041e5b13a8cceacf6", + "blk.4.attn_output.weight": "c92573088c7437c2b3cda51490e152c27fb19e5468df591eabba5a49d5398d44", + "blk.4.attn_output.bias": "14e10b419e5859af1eb685af5c330aee67048cd704dcead9217840c6f5393222", + "blk.4.attn_output_norm.weight": "02b6831c0e0fb0edbc579a92812a1dd972cb15d14fcd382d4427c5a7b300ac44", + "blk.4.attn_output_norm.bias": "7eed5cd503bb6bb6ceb1bc8b07cc077903a4f14fb8b9d6cdf39644815ecf1374", + "blk.4.ffn_up.weight": "8d0c91d62e74d6431321116a37cf3339e630bd50ba164d3304fc4fe8dd831223", + "blk.4.ffn_up.bias": "d325f07f73c005a273c484c7be8e7abb4d6e8a5c4fd093f5869133b97629d017", + "blk.4.ffn_down.weight": "7ba7bd81143f40537b84f938e403e19f30e4928625eb371de052b9025beb4d21", + "blk.4.ffn_down.bias": "2853d9c2a75288214a4bf4907dc19d04d01926f4913d302b1aa7bdbfcce0f7a1", + "blk.4.layer_output_norm.weight": "a4ed1885fa77b90fed5300c355ef0aa0c876a8c747151d9d790939d464d57d4f", + "blk.4.layer_output_norm.bias": "62142a81e813a9e636333b2b805d6bc3b17c5e7cd4b15adce1ada6bc9a32563c", + "blk.5.attn_q.weight": "afc1dff080a72c3daad01384b1448d476aaf789871017c8ff8e144788887995d", + "blk.5.attn_q.bias": "748a820371c1d4f872c84545b36358d239c35bf6c99e2812c237d88c3292763b", + "blk.5.attn_k.weight": "59e30c1ed8acd2cbb01de5f62e7804015b9ecf98ba157d98cab016344639eda5", + "blk.5.attn_k.bias": "f839520078f9e589496e982e86d0126c7aa14196047339abffcf49a696229f77", + "blk.5.attn_v.weight": "3e21fb874e21b90308e1f46af034a3c32d3eba1628d62ae5f2246d6af5818923", + "blk.5.attn_v.bias": "5cd4852bf95c1444d10d756750f6bf49f842c0b39e9953c7f408bb67c325ac8c", + "blk.5.attn_output.weight": "636ce6a7752895f204b9d01ba0aedd9a294f908b42f372c22a16d9dd590d7471", + "blk.5.attn_output.bias": "82d924d4b0d2b94f2bbff91619216d6967a3541ce9b1531a6a60457a67b5d219", + "blk.5.attn_output_norm.weight": "5e7bd0a8d3396080f3360d7c4700bf094a06216431bd014c4479eef72ecf4271", + "blk.5.attn_output_norm.bias": "66c6de5edda5466d029c6753780be81ccd4218bf8bc00680000e0f06856ab712", + "blk.5.ffn_up.weight": "5bbf6e7ea380e216e33f8bee06d25f2265359d3876a300e92bc6e41d48e33430", + "blk.5.ffn_up.bias": "9d795388bb36fb33ad3a37fea3ccb4937838e02800a608fb47d363cd06b47370", + "blk.5.ffn_down.weight": "2fd628974e7f075479dd227b46fbd48ae8d3ca34d735b36f391ac06410730368", + "blk.5.ffn_down.bias": "cd213ba9eaa75fa541648097fbe9c96e58077e6c3ad6ad2fb1f21f8350f44291", + "blk.5.layer_output_norm.weight": "159a9df41d15b7022d136f86a2a2631c4635f9816e957472217077b522bcf52a", + "blk.5.layer_output_norm.bias": "24c1f27ffd1eb4e5be7e3a2909943e6f0980635d761fa1efdd0c19645da23766" +} diff --git a/convert/tokenizer.go b/convert/tokenizer.go index 0d42a6d8..653df6d2 100644 --- 
a/convert/tokenizer.go +++ b/convert/tokenizer.go @@ -1,7 +1,6 @@ package convert import ( - "cmp" "crypto/sha256" "encoding/hex" "encoding/json" @@ -11,6 +10,8 @@ import ( "log/slog" "os" "slices" + + "golang.org/x/exp/maps" ) const ( @@ -184,32 +185,32 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) { return nil, err } - var tokens []token + tokens := make(map[int]token, len(t.Model.Vocab)) for k, v := range t.Model.Vocab { - tokens = append(tokens, token{ + tokens[v] = token{ ID: v, Content: k, - }) + } } - for _, t := range t.AddedTokens { - t.UserDefined = true - tokens = append(tokens, t) + for _, token := range t.AddedTokens { + token.UserDefined = true + tokens[token.ID] = token } - slices.SortFunc(tokens, func(i, j token) int { - return cmp.Compare(i.ID, j.ID) - }) + keys := maps.Keys(tokens) + slices.Sort(keys) v := Vocabulary{Model: "gpt2"} - for _, t := range tokens { - v.Tokens = append(v.Tokens, t.Content) - v.Scores = append(v.Scores, float32(t.ID)) + for _, k := range keys { + token := tokens[k] + v.Tokens = append(v.Tokens, token.Content) + v.Scores = append(v.Scores, float32(token.ID)) switch { - case t.Special: + case token.Special: v.Types = append(v.Types, tokenTypeControl) - case t.UserDefined: + case token.UserDefined: v.Types = append(v.Types, tokenTypeUserDefined) default: v.Types = append(v.Types, tokenTypeNormal) From beb49eef65acefc64a6ae0562ce58467e6974fde Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 7 Jun 2024 14:55:56 -0700 Subject: [PATCH 25/34] create bert models from cli --- cmd/cmd.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cmd/cmd.go b/cmd/cmd.go index fd7246c8..a8a02605 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -223,6 +223,14 @@ func tempZipFiles(path string) (string, error) { } files = append(files, js...) + // bert models require a nested config.json + // TODO(mxyng): merge this with the glob above + js, err = glob(filepath.Join(path, "**/*.json"), "text/plain") + if err != nil { + return "", err + } + files = append(files, js...) 
+ if tks, _ := glob(filepath.Join(path, "tokenizer.model"), "application/octet-stream"); len(tks) > 0 { // add tokenizer.model if it exists, tokenizer.json is automatically picked up by the previous glob // tokenizer.model might be a unresolved git lfs reference; error if it is @@ -252,6 +260,11 @@ func tempZipFiles(path string) (string, error) { return "", err } + zfi.Name, err = filepath.Rel(path, file) + if err != nil { + return "", err + } + zf, err := zipfile.CreateHeader(zfi) if err != nil { return "", err From 3546bbd08c52df73eb6523b06b13f1b2dfeaa5fb Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 28 Jun 2024 13:27:05 -0700 Subject: [PATCH 26/34] convert gemma2 --- convert/convert.go | 11 ++++++-- convert/convert_bert.go | 9 +++--- convert/convert_gemma.go | 14 ++++----- convert/convert_gemma2.go | 44 +++++++++++++++++++++++++++++ convert/convert_llama.go | 19 ++++++------- convert/convert_mixtral.go | 9 ++++-- convert/convert_phi3.go | 11 ++++---- convert/convert_test.go | 1 + convert/reader.go | 12 ++++---- convert/reader_safetensors.go | 5 ++-- convert/reader_torch.go | 5 ++-- convert/testdata/gemma-2-9b-it.json | 6 ++++ convert/tokenizer_spm.go | 32 ++++++++++++++++++++- 13 files changed, 132 insertions(+), 46 deletions(-) create mode 100644 convert/convert_gemma2.go create mode 100644 convert/testdata/gemma-2-9b-it.json diff --git a/convert/convert.go b/convert/convert.go index f51e9665..5a314cdd 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -7,6 +7,7 @@ import ( "io" "io/fs" "log/slog" + "strings" "github.com/ollama/ollama/llm" ) @@ -58,11 +59,13 @@ type Converter interface { KV(*Tokenizer) llm.KV // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here. Tensors([]Tensor) []llm.Tensor + // Replacements returns a list of string pairs to replace in tensor names. 
+ // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details + Replacements() []string - // tensorName returns the LLM tensor name for a specific input name - tensorName(string) string // specialTokenTypes returns any special token types the model uses specialTokenTypes() []string + // writeFile writes the model to the provided io.WriteSeeker writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error } @@ -97,6 +100,8 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { conv = &mixtral{} case "GemmaForCausalLM": conv = &gemma{} + case "Gemma2ForCausalLM": + conv = &gemma2{} case "Phi3ForCausalLM": conv = &phi3{} case "BertModel": @@ -131,7 +136,7 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { slog.Debug("vocabulary", "size", len(t.Vocabulary.Tokens)) } - ts, err := parseTensors(fsys) + ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...)) if err != nil { return err } diff --git a/convert/convert_bert.go b/convert/convert_bert.go index 62fad147..4547a705 100644 --- a/convert/convert_bert.go +++ b/convert/convert_bert.go @@ -144,9 +144,8 @@ func (p *bert) Tensors(ts []Tensor) []llm.Tensor { continue } - name := p.tensorName(t.Name()) out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -156,8 +155,8 @@ func (p *bert) Tensors(ts []Tensor) []llm.Tensor { return out } -func (bert) tensorName(n string) string { - return strings.NewReplacer( +func (bert) Replacements() []string { + return []string{ "encoder.layer", "blk", "encoder.layers", "blk", "embeddings.word_embeddings", "token_embd", @@ -172,5 +171,5 @@ func (bert) tensorName(n string) string { "intermediate.dense", "ffn_up", "output.dense", "ffn_down", "output.LayerNorm", "layer_output_norm", - ).Replace(n) + } } diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index 9213e157..333e4c83 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -44,15 +44,14 @@ func (p *gemma) KV(t *Tokenizer) llm.KV { } func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { - var out []llm.Tensor + out := make([]llm.Tensor, 0, len(ts)) for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasSuffix(name, "_norm.weight") { + if strings.HasSuffix(t.Name(), "_norm.weight") { t.SetRepacker(p.addOne) } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -62,8 +61,8 @@ func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *gemma) tensorName(n string) string { - return strings.NewReplacer( +func (p *gemma) Replacements() []string { + return []string{ "model.embed_tokens", "token_embd", "model.norm", "output_norm", "model.layers", "blk", @@ -76,8 +75,7 @@ func (p *gemma) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - "block_sparse_moe.gate", "ffn_inp", - ).Replace(n) + } } func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) { diff --git a/convert/convert_gemma2.go b/convert/convert_gemma2.go new file mode 100644 index 00000000..66be02d6 --- /dev/null +++ b/convert/convert_gemma2.go @@ -0,0 +1,44 @@ +package convert + +import ( + "github.com/ollama/ollama/llm" +) + +type gemma2 struct { + gemma + SlidingWindow uint32 `json:"sliding_window"` + AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"` + FinalLogitSoftcap float32 `json:"final_logit_softcapping"` +} + +func (p *gemma2) KV(t *Tokenizer) llm.KV { + kv := p.Parameters.KV(t) + 
kv["general.architecture"] = "gemma2" + kv["general.name"] = "gemma2" + kv["gemma2.context_length"] = p.MaxPositionEmbeddings + kv["gemma2.embedding_length"] = p.HiddenSize + kv["gemma2.block_count"] = p.HiddenLayers + kv["gemma2.feed_forward_length"] = p.IntermediateSize + kv["gemma2.attention.head_count"] = p.NumAttentionHeads + kv["gemma2.attention.head_count_kv"] = p.NumKeyValueHeads + kv["gemma2.attention.layer_norm_rms_epsilon"] = p.RMSNormEPS + kv["gemma2.attention.key_length"] = p.HeadDim + kv["gemma2.attention.value_length"] = p.HeadDim + kv["gemma2.attention.sliding_window"] = p.SlidingWindow + kv["gemma2.attn_logit_softcapping"] = p.AttentionLogitSoftcap + kv["gemma2.final_logit_softcapping"] = p.FinalLogitSoftcap + kv["tokenizer.ggml.eot_token_id"] = uint32(107) + kv["tokenizer.ggml.middle_token_id"] = uint32(68) + kv["tokenizer.ggml.prefix_token_id"] = uint32(67) + kv["tokenizer.ggml.suffix_token_id"] = uint32(69) + return kv +} + +func (p *gemma2) Replacements() []string { + return append( + p.gemma.Replacements(), + "post_attention_layernorm", "post_attention_norm", + "pre_feedforward_layernorm", "ffn_norm", + "post_feedforward_layernorm", "post_ffw_norm", + ) +} diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 178b13f3..498d1321 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -96,14 +96,13 @@ func (p *llama) KV(t *Tokenizer) llm.KV { func (p *llama) Tensors(ts []Tensor) []llm.Tensor { var out []llm.Tensor for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasSuffix(name, "attn_q.weight") || - strings.HasSuffix(name, "attn_k.weight") { + if strings.HasSuffix(t.Name(), "attn_q.weight") || + strings.HasSuffix(t.Name(), "attn_k.weight") { t.SetRepacker(p.repack) } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -113,8 +112,8 @@ func (p *llama) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *llama) tensorName(n string) string { - return strings.NewReplacer( +func (p *llama) Replacements() []string { + return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", "model.norm", "output_norm", @@ -128,9 +127,7 @@ func (p *llama) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - // mixtral - "block_sparse_moe.gate", "ffn_gate_inp", - ).Replace(n) + } } func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) { @@ -140,9 +137,9 @@ func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, } var heads uint32 - if strings.HasSuffix(name, "q_proj.weight") { + if strings.HasSuffix(name, "attn_q.weight") { heads = p.NumAttentionHeads - } else if strings.HasSuffix(name, "k_proj.weight") { + } else if strings.HasSuffix(name, "attn_k.weight") { heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) } else { return nil, fmt.Errorf("unknown tensor for repack: %s", name) diff --git a/convert/convert_mixtral.go b/convert/convert_mixtral.go index 3263a27b..97a86b30 100644 --- a/convert/convert_mixtral.go +++ b/convert/convert_mixtral.go @@ -15,8 +15,6 @@ type mixtral struct { NumExpertsPerToken uint32 `json:"num_experts_per_tok"` } -var _ Converter = (*mixtral)(nil) - func (p *mixtral) KV(t *Tokenizer) llm.KV { kv := p.llama.KV(t) @@ -72,6 +70,13 @@ func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor { return append(out, p.llama.Tensors(ts)...) 
} +func (p *mixtral) Replacements() []string { + return append( + p.llama.Replacements(), + "block_sparse_moe.gate", "ffn_gate_inp", + ) +} + type experts []Tensor func (e experts) WriteTo(w io.Writer) (int64, error) { diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index 0f645217..4ee59ff5 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -74,8 +74,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { out := make([]llm.Tensor, 0, len(ts)+2) for _, t := range ts { - name := p.tensorName(t.Name()) - if strings.HasPrefix(name, "blk.0.") { + if strings.HasPrefix(t.Name(), "blk.0.") { addRopeFactors.Do(func() { out = append(out, llm.Tensor{ Name: "rope_factors_long.weight", @@ -92,7 +91,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { } out = append(out, llm.Tensor{ - Name: name, + Name: t.Name(), Kind: t.Kind(), Shape: t.Shape(), WriterTo: t, @@ -102,8 +101,8 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *phi3) tensorName(n string) string { - return strings.NewReplacer( +func (p *phi3) Replacements() []string { + return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", "model.norm", "output_norm", @@ -114,7 +113,7 @@ func (p *phi3) tensorName(n string) string { "mlp.down_proj", "ffn_down", "mlp.gate_up_proj", "ffn_up", "post_attention_layernorm", "ffn_norm", - ).Replace(n) + } } type ropeFactor []float32 diff --git a/convert/convert_test.go b/convert/convert_test.go index e3ab0098..e78afab7 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -68,6 +68,7 @@ func TestConvertFull(t *testing.T) { // microsoft/Phi-3-mini-128-instruct@d548c233192db00165d842bf8edff054bb3212f8 "Phi-3-mini-128k-instruct", "all-MiniLM-L6-v2", + "gemma-2-9b-it", } for i := range cases { diff --git a/convert/reader.go b/convert/reader.go index 294a7c40..5bba0406 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -35,9 +35,9 @@ const ( ) func (t tensorBase) Kind() uint32 { - if strings.HasSuffix(t.name, ".block_sparse_moe.gate.weight") { - return 0 - } else if t.name == "embeddings.token_type_embeddings.weight" { + if strings.HasSuffix(t.name, ".ffn_gate_inp.weight") || + t.name == "token_types.weight" { + // these tensors are always F32 return 0 } @@ -57,10 +57,10 @@ func (t *tensorBase) SetRepacker(fn repacker) { type repacker func(string, []float32, []uint64) ([]float32, error) -func parseTensors(fsys fs.FS) ([]Tensor, error) { +func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) { patterns := []struct { Pattern string - Func func(fs.FS, ...string) ([]Tensor, error) + Func func(fs.FS, *strings.Replacer, ...string) ([]Tensor, error) }{ {"model-*-of-*.safetensors", parseSafetensors}, {"model.safetensors", parseSafetensors}, @@ -76,7 +76,7 @@ func parseTensors(fsys fs.FS) ([]Tensor, error) { } if len(matches) > 0 { - return pattern.Func(fsys, matches...) + return pattern.Func(fsys, replacer, matches...) 
} } diff --git a/convert/reader_safetensors.go b/convert/reader_safetensors.go index 42f902a5..32a362cd 100644 --- a/convert/reader_safetensors.go +++ b/convert/reader_safetensors.go @@ -8,6 +8,7 @@ import ( "io" "io/fs" "slices" + "strings" "github.com/d4l3k/go-bfloat16" "github.com/x448/float16" @@ -20,7 +21,7 @@ type safetensorMetadata struct { Offsets []int64 `json:"data_offsets"` } -func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) { +func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) { var ts []Tensor for _, p := range ps { f, err := fsys.Open(p) @@ -56,7 +57,7 @@ func parseSafetensors(fsys fs.FS, ps ...string) ([]Tensor, error) { offset: safetensorsPad(n, value.Offsets[0]), size: safetensorsPad(n, value.Offsets[1]) - safetensorsPad(n, value.Offsets[0]), tensorBase: &tensorBase{ - name: key, + name: replacer.Replace(key), shape: value.Shape, }, }) diff --git a/convert/reader_torch.go b/convert/reader_torch.go index 531996bf..1b3e1c9f 100644 --- a/convert/reader_torch.go +++ b/convert/reader_torch.go @@ -3,12 +3,13 @@ package convert import ( "io" "io/fs" + "strings" "github.com/nlpodyssey/gopickle/pytorch" "github.com/nlpodyssey/gopickle/types" ) -func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) { +func parseTorch(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]Tensor, error) { var ts []Tensor for _, p := range ps { pt, err := pytorch.Load(p) @@ -27,7 +28,7 @@ func parseTorch(fsys fs.FS, ps ...string) ([]Tensor, error) { ts = append(ts, torch{ storage: t.(*pytorch.Tensor).Source, tensorBase: &tensorBase{ - name: k.(string), + name: replacer.Replace(k.(string)), shape: shape, }, }) diff --git a/convert/testdata/gemma-2-9b-it.json b/convert/testdata/gemma-2-9b-it.json new file mode 100644 index 00000000..90cdbee4 --- /dev/null +++ b/convert/testdata/gemma-2-9b-it.json @@ -0,0 +1,6 @@ +{ + "general.architecture": "gemma2", + "gemma2.attention.sliding_window": "4096", + "gemma2.attn_logit_softcapping": "50", + "gemma2.final_logit_softcapping": "30" +} diff --git a/convert/tokenizer_spm.go b/convert/tokenizer_spm.go index babf702c..5e506087 100644 --- a/convert/tokenizer_spm.go +++ b/convert/tokenizer_spm.go @@ -15,6 +15,11 @@ import ( ) func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { + ast, err := parseAdditionalSpecialTokens(fsys) + if err != nil { + return nil, err + } + bts, err := fs.ReadFile(fsys, "tokenizer.model") if err != nil { return nil, err @@ -37,7 +42,12 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { sentencepiece.ModelProto_SentencePiece_BYTE: v.Types = append(v.Types, int32(t)) default: - v.Types = append(v.Types, int32(sentencepiece.ModelProto_SentencePiece_NORMAL)) + tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL) + if slices.Contains(ast, piece.GetPiece()) { + tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL) + } + + v.Types = append(v.Types, tt) } } @@ -81,3 +91,23 @@ func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { return &v, nil } + +func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) { + f, err := fsys.Open("special_tokens_map.json") + if errors.Is(err, os.ErrNotExist) { + return nil, nil + } else if err != nil { + return nil, err + } + defer f.Close() + + var m struct { + AdditionalSpecialTokens []string `json:"additional_special_tokens"` + } + + if err := json.NewDecoder(f).Decode(&m); err != nil { + return nil, err + } + + return m.AdditionalSpecialTokens, nil +} From 77903ab8b4fb8075faad7bde5bde2eee3173e407 Mon Sep 
17 00:00:00 2001 From: Michael Yang Date: Mon, 29 Jul 2024 14:53:02 -0700 Subject: [PATCH 27/34] llama3.1 --- convert/convert_bert.go | 1 - convert/convert_gemma.go | 1 - convert/convert_gemma2.go | 1 - convert/convert_llama.go | 43 +++++++++++++++++-- convert/convert_phi3.go | 1 - convert/convert_test.go | 1 + .../testdata/Meta-Llama-3.1-8B-Instruct.json | 3 ++ llm/memory_test.go | 1 - server/sched_test.go | 1 - 9 files changed, 44 insertions(+), 9 deletions(-) create mode 100644 convert/testdata/Meta-Llama-3.1-8B-Instruct.json diff --git a/convert/convert_bert.go b/convert/convert_bert.go index 4547a705..6e7d59fe 100644 --- a/convert/convert_bert.go +++ b/convert/convert_bert.go @@ -88,7 +88,6 @@ func (p *bert) parseMore(fsys fs.FS) error { func (p *bert) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "bert" - kv["general.name"] = "bert" kv["bert.attention.causal"] = false kv["bert.pooling_type"] = p.PoolingType diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index 333e4c83..c4316808 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -26,7 +26,6 @@ var _ Converter = (*gemma)(nil) func (p *gemma) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "gemma" - kv["general.name"] = "gemma" kv["gemma.context_length"] = p.MaxPositionEmbeddings kv["gemma.embedding_length"] = p.HiddenSize kv["gemma.block_count"] = p.HiddenLayers diff --git a/convert/convert_gemma2.go b/convert/convert_gemma2.go index 66be02d6..084f9c52 100644 --- a/convert/convert_gemma2.go +++ b/convert/convert_gemma2.go @@ -14,7 +14,6 @@ type gemma2 struct { func (p *gemma2) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "gemma2" - kv["general.name"] = "gemma2" kv["gemma2.context_length"] = p.MaxPositionEmbeddings kv["gemma2.embedding_length"] = p.HiddenSize kv["gemma2.block_count"] = p.HiddenLayers diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 498d1321..27f924fb 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -3,6 +3,7 @@ package convert import ( "cmp" "fmt" + "math" "strings" "github.com/pdevine/tensor" @@ -27,8 +28,14 @@ type llama struct { NumKeyValueHeads uint32 `json:"num_key_value_heads"` RopeTheta float32 `json:"rope_theta"` RopeScaling struct { - Type string `json:"type"` - Factor float32 `json:"factor"` + Type string `json:"type"` + RopeType string `json:"rope_type"` + Factor float32 `json:"factor"` + LowFrequencyFactor float32 `json:"low_freq_factor"` + HighFrequencyFactor float32 `json:"high_freq_factor"` + OriginalMaxPositionalEmbeddings uint32 `json:"original_max_positional_embeddings"` + + factors ropeFactor } `json:"rope_scaling"` RMSNormEPS float32 `json:"rms_norm_eps"` LayerNormEPS float32 `json:"layer_norm_eps"` @@ -42,7 +49,6 @@ var _ Converter = (*llama)(nil) func (p *llama) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "llama" - kv["general.name"] = "llama" kv["llama.vocab_size"] = p.VocabSize kv["llama.block_count"] = cmp.Or(p.NLayers, p.NumHiddenLayers, p.NLayer) @@ -71,6 +77,27 @@ func (p *llama) KV(t *Tokenizer) llm.KV { if p.RopeScaling.Type == "linear" { kv["llama.rope.scaling.type"] = p.RopeScaling.Type kv["llama.rope.scaling.factor"] = p.RopeScaling.Factor + } else if p.RopeScaling.RopeType == "llama3" { + dim := p.HiddenSize / p.NumAttentionHeads + for i := uint32(0); i < dim; i += 2 { + factor := cmp.Or(p.RopeScaling.Factor, 8.0) + factorLow := cmp.Or(p.RopeScaling.LowFrequencyFactor, 
1.0) + factorHigh := cmp.Or(p.RopeScaling.HighFrequencyFactor, 4.0) + + original := cmp.Or(p.RopeScaling.OriginalMaxPositionalEmbeddings, 8192) + lambdaLow := float32(original) / factorLow + lambdaHigh := float32(original) / factorHigh + + lambda := 2 * math.Pi * math.Pow(float64(p.RopeTheta), float64(i)/float64(dim)) + if lambda < float64(lambdaHigh) { + p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0) + } else if lambda > float64(lambdaLow) { + p.RopeScaling.factors = append(p.RopeScaling.factors, factor) + } else { + smooth := (float32(original)/float32(lambda) - factorLow) / (factorHigh - factorLow) + p.RopeScaling.factors = append(p.RopeScaling.factors, 1.0/((1-smooth)/factor+smooth)) + } + } } if p.NumKeyValueHeads > 0 { @@ -95,6 +122,16 @@ func (p *llama) KV(t *Tokenizer) llm.KV { func (p *llama) Tensors(ts []Tensor) []llm.Tensor { var out []llm.Tensor + + if p.RopeScaling.factors != nil { + out = append(out, llm.Tensor{ + Name: "rope_freqs.weight", + Kind: 0, + Shape: []uint64{uint64(len(p.RopeScaling.factors))}, + WriterTo: p.RopeScaling.factors, + }) + } + for _, t := range ts { if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") { diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index 4ee59ff5..64d3d012 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -40,7 +40,6 @@ var _ Converter = (*phi3)(nil) func (p *phi3) KV(t *Tokenizer) llm.KV { kv := p.Parameters.KV(t) kv["general.architecture"] = "phi3" - kv["general.name"] = "phi3" kv["phi3.context_length"] = p.MaxPositionEmbeddings kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) kv["phi3.feed_forward_length"] = p.IntermediateSize diff --git a/convert/convert_test.go b/convert/convert_test.go index e78afab7..64b7df3b 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -62,6 +62,7 @@ func TestMain(m *testing.M) { func TestConvertFull(t *testing.T) { cases := []string{ "Meta-Llama-3-8B-Instruct", + "Meta-Llama-3.1-8B-Instruct", "Mistral-7B-Instruct-v0.2", "Mixtral-8x7B-Instruct-v0.1", "gemma-2b-it", diff --git a/convert/testdata/Meta-Llama-3.1-8B-Instruct.json b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json new file mode 100644 index 00000000..ad7cd20a --- /dev/null +++ b/convert/testdata/Meta-Llama-3.1-8B-Instruct.json @@ -0,0 +1,3 @@ +{ + "rope_freqs.weight": "80fd5efb2f729381785b293a091a268cfeceb0079167f6ece9b07070e662b222" +} diff --git a/llm/memory_test.go b/llm/memory_test.go index 6cf0119f..ffb14286 100644 --- a/llm/memory_test.go +++ b/llm/memory_test.go @@ -33,7 +33,6 @@ func TestEstimateGPULayers(t *testing.T) { assert.Len(t, tensors, inputLayerCount+1) err = WriteGGUF(f, KV{ "general.architecture": "llama", - "general.name": "name", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), "llama.block_count": uint32(inputLayerCount), diff --git a/server/sched_test.go b/server/sched_test.go index 713b9259..fb049574 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -117,7 +117,6 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est require.NoError(t, llm.WriteGGUF(f, llm.KV{ "general.architecture": "llama", - "general.name": "name", "llama.context_length": uint32(32), "llama.embedding_length": uint32(4096), "llama.block_count": uint32(1), From 90ca84172c2a98ecfd76eb7e05cd3e33e1dde507 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 22 Aug 2024 14:51:42 -0700 Subject: [PATCH 28/34] Fix embeddings memory corruption (#6467) * Fix embeddings 
memory corruption

The patch was leading to a buffer overrun corruption. Once removed
though, parallelism in server.cpp led to hitting an assert due to
slot/seq IDs being >= token count. To work around this, only use
slot 0 for embeddings.

* Fix embed integration test assumption

The token eval count has changed with recent llama.cpp bumps (0.3.5+)
---
 integration/embed_test.go   |  8 ++---
 llm/ext_server/server.cpp   |  8 ++++-
 llm/patches/08-pooling.diff | 60 -------------------------------------
 server/sched.go             |  5 ++++
 4 files changed, 16 insertions(+), 65 deletions(-)
 delete mode 100644 llm/patches/08-pooling.diff

diff --git a/integration/embed_test.go b/integration/embed_test.go
index 10333d5d..4a68af68 100644
--- a/integration/embed_test.go
+++ b/integration/embed_test.go
@@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
 	}
 
-	if res.PromptEvalCount != 8 {
-		t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 6 {
+		t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }
 
@@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
 	}
 
-	if res.PromptEvalCount != 16 {
-		t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 12 {
+		t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }
 
diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 5717c17a..8e08b850 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -1429,7 +1429,13 @@ struct llama_server_context
             switch (task.type)
             {
                 case TASK_TYPE_COMPLETION: {
-                    server_slot *slot = prefix_slot(task.data["prompt"]);
+                    server_slot *slot = nullptr;
+                    if (task.embedding_mode) {
+                        // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
+                        slot = slots[0].available() ? 
&slots[0] : nullptr; + } else { + slot = prefix_slot(task.data["prompt"]); + } if (slot == nullptr) { // if no slot is available, we defer this task for processing later diff --git a/llm/patches/08-pooling.diff b/llm/patches/08-pooling.diff deleted file mode 100644 index 2e4fe11e..00000000 --- a/llm/patches/08-pooling.diff +++ /dev/null @@ -1,60 +0,0 @@ -diff --git a/src/llama.cpp b/src/llama.cpp -index 721b8f4e..cfe7ac40 100644 ---- a/src/llama.cpp -+++ b/src/llama.cpp -@@ -8420,14 +8420,14 @@ struct llm_build_context { - } - - struct ggml_tensor * build_inp_mean() { -- lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); -+ lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max); - cb(lctx.inp_mean, "inp_mean", -1); - ggml_set_input(lctx.inp_mean); - return lctx.inp_mean; - } - - struct ggml_tensor * build_inp_cls() { -- lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); -+ lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max); - cb(lctx.inp_cls, "inp_cls", -1); - ggml_set_input(lctx.inp_cls); - return lctx.inp_cls; -@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); - - float * data = (float *) lctx.inp_mean->data; -- memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean)); -+ memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean)); - - std::vector sum(n_tokens, 0); - for (int i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = batch.seq_id[i][0]; -- -- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN"); -- - sum[seq_id] += 1; - } - -- std::vector div(n_tokens, 0.0f); -- for (int i = 0; i < n_tokens; ++i) { -+ std::vector div(cparams.n_seq_max, 0.0f); -+ for (uint32_t i = 0; i < cparams.n_seq_max; ++i) { - const uint64_t s = sum[i]; - if (s > 0) { - div[i] = 1.0f/float(s); -@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); - - uint32_t * data = (uint32_t *) lctx.inp_cls->data; -- memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls)); -+ memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls)); - - for (int i = 0; i < n_tokens; ++i) { - const llama_seq_id seq_id = batch.seq_id[i][0]; - const llama_pos pos = batch.pos[i]; -- -- GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS"); -- - if (pos == 0) { - data[seq_id] = i; - } diff --git a/server/sched.go b/server/sched.go index 9d8c4144..58071bf0 100644 --- a/server/sched.go +++ b/server/sched.go @@ -193,6 +193,11 @@ func (s *Scheduler) processPending(ctx context.Context) { break } + // Embedding models should always be loaded with parallel=1 + if pending.model.CheckCapabilities(CapabilityCompletion) != nil { + numParallel = 1 + } + // Evaluate if the model will fit in the available system memory, or if we should unload a model first if len(gpus) == 1 && gpus[0].Library == "cpu" { // simplifying assumption of defaultParallel when in CPU mode From 0b03b9c32f483be2d7a4e902d13a909b546ae6bf Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 23 Aug 2024 11:20:39 -0700 Subject: [PATCH 29/34] llm: Align cmake define for cuda no peer copy (#6455) Define changed recently and this slipped through the 
---
 llm/generate/gen_linux.sh | 2 +-
 llm/generate/gen_windows.ps1 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index 6927dda8..1f702ca2 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -252,7 +252,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
         ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
     fi
     init_vars
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DGGML_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
     # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
     if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
         echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1
index cbdfd09f..7179c1bc 100644
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -355,7 +355,7 @@ function build_rocm() {
             "-DCMAKE_C_COMPILER=clang.exe",
             "-DCMAKE_CXX_COMPILER=clang++.exe",
             "-DGGML_HIPBLAS=on",
-            "-DLLAMA_CUDA_NO_PEER_COPY=on",
+            "-DGGML_CUDA_NO_PEER_COPY=on",
            "-DHIP_PLATFORM=amd",
             "-DGGML_AVX=on",
             "-DGGML_AVX2=off",

From 7a1e1c1cafe4d3f3f935dc7192f9e66d4b2185b3 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen
Date: Fri, 23 Aug 2024 11:21:12 -0700
Subject: [PATCH 30/34] gpu: Ensure driver version set before variant (#6480)

During rebasing, the ordering was inverted, breaking the CUDA version
selection logic: the driver version was evaluated as zero, incorrectly
causing a downgrade to v11.
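A minimal, self-contained sketch of the failure mode follows. The gpuInfo
type and the v11/v12 threshold here are illustrative assumptions; the real
cudaVariant in the gpu package checks more fields. The point is simply that
deriving the variant before the driver fields are assigned leaves
DriverMajor at Go's zero value, which always selects v11:

package main

import "fmt"

// Illustrative stand-in for the GPU info the variant is derived from.
type gpuInfo struct{ DriverMajor int }

// cudaVariant stands in for the real selection logic, which keys off the
// driver version recorded on the GPU info (threshold assumed here).
func cudaVariant(g gpuInfo) string {
	if g.DriverMajor < 12 {
		return "v11"
	}
	return "v12"
}

func main() {
	var g gpuInfo               // driver fields not yet populated
	fmt.Println(cudaVariant(g)) // "v11": the incorrect downgrade
	g.DriverMajor = 12          // populate before deriving, as this patch does
	fmt.Println(cudaVariant(g)) // "v12"
}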
--- gpu/gpu.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpu/gpu.go b/gpu/gpu.go index 72d237a6..10afb1e3 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -264,6 +264,8 @@ func GetGPUInfo() GpuInfoList { gpuInfo.computeMajor = int(memInfo.major) gpuInfo.computeMinor = int(memInfo.minor) gpuInfo.MinimumMemory = cudaMinimumMemory + gpuInfo.DriverMajor = driverMajor + gpuInfo.DriverMinor = driverMinor variant := cudaVariant(gpuInfo) if depPath != "" { gpuInfo.DependencyPath = depPath @@ -275,8 +277,6 @@ func GetGPUInfo() GpuInfoList { } } gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) - gpuInfo.DriverMajor = driverMajor - gpuInfo.DriverMinor = driverMinor gpuInfo.Variant = variant // query the management library as well so we can record any skew between the two From 0c819e167becd7f08312d2a1a1e2ac8e8ea5d4da Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Fri, 23 Aug 2024 11:29:56 -0700 Subject: [PATCH 31/34] convert safetensor adapters into GGUF (#6327) --- cmd/cmd.go | 6 + convert/convert.go | 111 +++++++++++-- convert/convert_bert.go | 18 +- convert/convert_gemma.go | 18 +- convert/convert_gemma2.go | 12 +- convert/convert_gemma2_adapter.go | 91 +++++++++++ convert/convert_llama.go | 16 +- convert/convert_llama_adapter.go | 169 +++++++++++++++++++ convert/convert_mixtral.go | 16 +- convert/convert_phi3.go | 14 +- convert/convert_test.go | 262 +++++++++++++++++++++++++++--- convert/reader.go | 2 + llm/ggml.go | 8 + server/images.go | 11 +- server/model.go | 38 ++++- server/model_test.go | 6 +- 16 files changed, 697 insertions(+), 101 deletions(-) create mode 100644 convert/convert_gemma2_adapter.go create mode 100644 convert/convert_llama_adapter.go diff --git a/cmd/cmd.go b/cmd/cmd.go index a8a02605..b75c0b5e 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -204,6 +204,12 @@ func tempZipFiles(path string) (string, error) { // safetensors files might be unresolved git lfs references; skip if they are // covers model-x-of-y.safetensors, model.fp32-x-of-y.safetensors, model.safetensors files = append(files, st...) + } else if st, _ := glob(filepath.Join(path, "adapters.safetensors"), "application/octet-stream"); len(st) > 0 { + // covers adapters.safetensors + files = append(files, st...) + } else if st, _ := glob(filepath.Join(path, "adapter_model.safetensors"), "application/octet-stream"); len(st) > 0 { + // covers adapter_model.safetensors + files = append(files, st...) 
} else if pt, _ := glob(filepath.Join(path, "pytorch_model*.bin"), "application/zip"); len(pt) > 0 { // pytorch files might also be unresolved git lfs references; skip if they are // covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin diff --git a/convert/convert.go b/convert/convert.go index 5a314cdd..8c7b0943 100644 --- a/convert/convert.go +++ b/convert/convert.go @@ -12,12 +12,22 @@ import ( "github.com/ollama/ollama/llm" ) -type Parameters struct { +type ModelParameters struct { Architectures []string `json:"architectures"` VocabSize uint32 `json:"vocab_size"` } -func (Parameters) KV(t *Tokenizer) llm.KV { +type AdapterParameters struct { + Alpha uint32 `json:"lora_alpha"` + LoraLayers uint32 `json:"lora_layers"` + LoraParameters struct { + Rank uint32 `json:"rank"` + Alpha float32 `json:"alpha"` + Scale float32 `json:"scale"` + } `json:"lora_parameters"` +} + +func (ModelParameters) KV(t *Tokenizer) llm.KV { kv := llm.KV{ "general.file_type": uint32(1), "general.quantization_version": uint32(2), @@ -44,17 +54,40 @@ func (Parameters) KV(t *Tokenizer) llm.KV { return kv } -func (Parameters) specialTokenTypes() []string { +func (p AdapterParameters) KV() llm.KV { + var alpha float32 + if p.LoraParameters.Alpha == 0 { + alpha = float32(p.Alpha) + } else { + alpha = p.LoraParameters.Alpha + } + + kv := llm.KV{ + "adapter.lora.alpha": alpha, + "adapter.type": "lora", + "general.file_type": uint32(1), + "general.type": "adapter", + "general.version": "v0.2", + } + + return kv +} + +func (ModelParameters) specialTokenTypes() []string { return []string{ "bos", "eos", "unk", "sep", "pad", "cls", "mask", } } -func (Parameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error { +func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error { return llm.WriteGGUF(ws, kv, ts) } -type Converter interface { +func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error { + return llm.WriteGGUF(ws, kv, ts) +} + +type ModelConverter interface { // KV maps parameters to LLM key-values KV(*Tokenizer) llm.KV // Tensors maps input tensors to LLM tensors. Model specific modifications can be done here. @@ -73,17 +106,67 @@ type moreParser interface { parseMore(fs.FS) error } +type AdapterConverter interface { + // KV maps parameters to LLM key-values + KV(llm.KV) llm.KV + // Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here. + Tensors([]Tensor) []llm.Tensor + // Replacements returns a list of string pairs to replace in tensor names. 
+ // See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details + Replacements() []string + + writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error +} + +func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error { + bts, err := fs.ReadFile(fsys, "adapter_config.json") + if err != nil { + return err + } + + var p AdapterParameters + if err := json.Unmarshal(bts, &p); err != nil { + return err + } + + arch, ok := baseKV["general.architecture"] + if !ok { + return errors.New("architecture not set for the base model") + } + + var conv AdapterConverter + switch arch { + case "llama": + conv = &llamaAdapter{} + case "gemma2": + conv = &gemma2Adapter{} + default: + return errors.New("unsupported architecture") + } + + ts, err := parseTensors(fsys, strings.NewReplacer(conv.Replacements()...)) + if err != nil { + return err + } + + if err := json.Unmarshal(bts, conv); err != nil { + return err + } + + return conv.writeFile(ws, conv.KV(baseKV), conv.Tensors(ts)) +} + // Convert writes an Ollama compatible model to the provided io.WriteSeeker based on configurations // and files it finds in the input path. // Supported input model formats include safetensors. // Supported input tokenizers files include tokenizer.json (preferred) and tokenizer.model. -func Convert(fsys fs.FS, ws io.WriteSeeker) error { +func ConvertModel(fsys fs.FS, ws io.WriteSeeker) error { bts, err := fs.ReadFile(fsys, "config.json") if err != nil { return err } - var p Parameters + var p ModelParameters if err := json.Unmarshal(bts, &p); err != nil { return err } @@ -92,20 +175,20 @@ func Convert(fsys fs.FS, ws io.WriteSeeker) error { return errors.New("unknown architecture") } - var conv Converter + var conv ModelConverter switch p.Architectures[0] { case "LlamaForCausalLM", "MistralForCausalLM": - conv = &llama{} + conv = &llamaModel{} case "MixtralForCausalLM": - conv = &mixtral{} + conv = &mixtralModel{} case "GemmaForCausalLM": - conv = &gemma{} + conv = &gemmaModel{} case "Gemma2ForCausalLM": - conv = &gemma2{} + conv = &gemma2Model{} case "Phi3ForCausalLM": - conv = &phi3{} + conv = &phi3Model{} case "BertModel": - conv = &bert{} + conv = &bertModel{} default: return errors.New("unsupported architecture") } diff --git a/convert/convert_bert.go b/convert/convert_bert.go index 6e7d59fe..ea5facaa 100644 --- a/convert/convert_bert.go +++ b/convert/convert_bert.go @@ -11,8 +11,8 @@ import ( "github.com/ollama/ollama/llm" ) -type bert struct { - Parameters +type bertModel struct { + ModelParameters NLayers uint32 `json:"n_layers"` NumHiddenLayers uint32 `json:"num_hidden_layers"` NLayer uint32 `json:"n_layer"` @@ -33,11 +33,11 @@ type bert struct { } var ( - _ Converter = (*bert)(nil) - _ moreParser = (*bert)(nil) + _ ModelConverter = (*bertModel)(nil) + _ moreParser = (*bertModel)(nil) ) -func (p *bert) parseMore(fsys fs.FS) error { +func (p *bertModel) parseMore(fsys fs.FS) error { bts, err := fs.ReadFile(fsys, "modules.json") if err != nil { return err @@ -85,8 +85,8 @@ func (p *bert) parseMore(fsys fs.FS) error { return nil } -func (p *bert) KV(t *Tokenizer) llm.KV { - kv := p.Parameters.KV(t) +func (p *bertModel) KV(t *Tokenizer) llm.KV { + kv := p.ModelParameters.KV(t) kv["general.architecture"] = "bert" kv["bert.attention.causal"] = false kv["bert.pooling_type"] = p.PoolingType @@ -132,7 +132,7 @@ func (p *bert) KV(t *Tokenizer) llm.KV { return kv } -func (p *bert) Tensors(ts []Tensor) []llm.Tensor { +func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor { var out []llm.Tensor for _, t := range 
ts { if slices.Contains([]string{ @@ -154,7 +154,7 @@ func (p *bert) Tensors(ts []Tensor) []llm.Tensor { return out } -func (bert) Replacements() []string { +func (bertModel) Replacements() []string { return []string{ "encoder.layer", "blk", "encoder.layers", "blk", diff --git a/convert/convert_gemma.go b/convert/convert_gemma.go index c4316808..b8865294 100644 --- a/convert/convert_gemma.go +++ b/convert/convert_gemma.go @@ -9,8 +9,8 @@ import ( "github.com/ollama/ollama/llm" ) -type gemma struct { - Parameters +type gemmaModel struct { + ModelParameters MaxPositionEmbeddings uint32 `json:"max_position_embeddings"` HiddenSize uint32 `json:"hidden_size"` HiddenLayers uint32 `json:"num_hidden_layers"` @@ -21,10 +21,10 @@ type gemma struct { HeadDim uint32 `json:"head_dim"` } -var _ Converter = (*gemma)(nil) +var _ ModelConverter = (*gemmaModel)(nil) -func (p *gemma) KV(t *Tokenizer) llm.KV { - kv := p.Parameters.KV(t) +func (p *gemmaModel) KV(t *Tokenizer) llm.KV { + kv := p.ModelParameters.KV(t) kv["general.architecture"] = "gemma" kv["gemma.context_length"] = p.MaxPositionEmbeddings kv["gemma.embedding_length"] = p.HiddenSize @@ -42,8 +42,8 @@ func (p *gemma) KV(t *Tokenizer) llm.KV { return kv } -func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { - out := make([]llm.Tensor, 0, len(ts)) +func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor for _, t := range ts { if strings.HasSuffix(t.Name(), "_norm.weight") { t.SetRepacker(p.addOne) @@ -60,7 +60,7 @@ func (p *gemma) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *gemma) Replacements() []string { +func (p *gemmaModel) Replacements() []string { return []string{ "model.embed_tokens", "token_embd", "model.norm", "output_norm", @@ -77,7 +77,7 @@ func (p *gemma) Replacements() []string { } } -func (*gemma) addOne(_ string, data []float32, shape []uint64) ([]float32, error) { +func (*gemmaModel) addOne(_ string, data []float32, shape []uint64) ([]float32, error) { n := tensor.New(tensor.WithShape(int(shape[0])), tensor.WithBacking(data)) ones := tensor.Ones(tensor.Float32, int(shape[0])) diff --git a/convert/convert_gemma2.go b/convert/convert_gemma2.go index 084f9c52..c4ee2d09 100644 --- a/convert/convert_gemma2.go +++ b/convert/convert_gemma2.go @@ -4,15 +4,15 @@ import ( "github.com/ollama/ollama/llm" ) -type gemma2 struct { - gemma +type gemma2Model struct { + gemmaModel SlidingWindow uint32 `json:"sliding_window"` AttentionLogitSoftcap float32 `json:"attn_logit_softcapping"` FinalLogitSoftcap float32 `json:"final_logit_softcapping"` } -func (p *gemma2) KV(t *Tokenizer) llm.KV { - kv := p.Parameters.KV(t) +func (p *gemma2Model) KV(t *Tokenizer) llm.KV { + kv := p.ModelParameters.KV(t) kv["general.architecture"] = "gemma2" kv["gemma2.context_length"] = p.MaxPositionEmbeddings kv["gemma2.embedding_length"] = p.HiddenSize @@ -33,9 +33,9 @@ func (p *gemma2) KV(t *Tokenizer) llm.KV { return kv } -func (p *gemma2) Replacements() []string { +func (p *gemma2Model) Replacements() []string { return append( - p.gemma.Replacements(), + p.gemmaModel.Replacements(), "post_attention_layernorm", "post_attention_norm", "pre_feedforward_layernorm", "ffn_norm", "post_feedforward_layernorm", "post_ffw_norm", diff --git a/convert/convert_gemma2_adapter.go b/convert/convert_gemma2_adapter.go new file mode 100644 index 00000000..a89a25f4 --- /dev/null +++ b/convert/convert_gemma2_adapter.go @@ -0,0 +1,91 @@ +package convert + +import ( + "strings" + + "github.com/pdevine/tensor" + "github.com/pdevine/tensor/native" + + 
"github.com/ollama/ollama/llm" +) + +type gemma2Adapter struct { + AdapterParameters +} + +var _ AdapterConverter = (*gemma2Adapter)(nil) + +func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV { + kv := p.AdapterParameters.KV() + kv["general.architecture"] = "gemma2" + return kv +} + +func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor + for _, t := range ts { + shape := t.Shape() + if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) || + (strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) { + shape[0], shape[1] = shape[1], shape[0] + t.SetRepacker(p.repack) + } + + out = append(out, llm.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: t.Shape(), + WriterTo: t, + }) + } + + return out +} + +func (p *gemma2Adapter) Replacements() []string { + return []string{ + "base_model.model.", "", + "model.layers", "blk", + "self_attn.q_proj", "attn_q", + "self_attn.k_proj", "attn_k", + "self_attn.v_proj", "attn_v", + "self_attn.o_proj", "attn_output", + "mlp.gate_proj", "ffn_gate", + "mlp.down_proj", "ffn_down", + "mlp.up_proj", "ffn_up", + "lora_A.weight", "weight.lora_a", + "lora_B.weight", "weight.lora_b", + "lora_a", "weight.lora_a", + "lora_b", "weight.lora_b", + } +} + +func (p *gemma2Adapter) repack(name string, data []float32, shape []uint64) ([]float32, error) { + dims := []int{int(shape[1]), int(shape[0])} + + n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + + if err := n.T(1, 0); err != nil { + return nil, err + } + + if err := n.Reshape(dims...); err != nil { + return nil, err + } + + if err := n.Transpose(); err != nil { + return nil, err + } + + ts, err := native.SelectF32(n, 1) + if err != nil { + return nil, err + } + + var f32s []float32 + for _, t := range ts { + f32s = append(f32s, t...) 
+ } + + return f32s, nil +} diff --git a/convert/convert_llama.go b/convert/convert_llama.go index 27f924fb..5dedb829 100644 --- a/convert/convert_llama.go +++ b/convert/convert_llama.go @@ -12,8 +12,8 @@ import ( "github.com/ollama/ollama/llm" ) -type llama struct { - Parameters +type llamaModel struct { + ModelParameters NLayers uint32 `json:"n_layers"` NumHiddenLayers uint32 `json:"num_hidden_layers"` NLayer uint32 `json:"n_layer"` @@ -44,10 +44,10 @@ type llama struct { HeadDim uint32 `json:"head_dim"` } -var _ Converter = (*llama)(nil) +var _ ModelConverter = (*llamaModel)(nil) -func (p *llama) KV(t *Tokenizer) llm.KV { - kv := p.Parameters.KV(t) +func (p *llamaModel) KV(t *Tokenizer) llm.KV { + kv := p.ModelParameters.KV(t) kv["general.architecture"] = "llama" kv["llama.vocab_size"] = p.VocabSize @@ -120,7 +120,7 @@ func (p *llama) KV(t *Tokenizer) llm.KV { return kv } -func (p *llama) Tensors(ts []Tensor) []llm.Tensor { +func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor { var out []llm.Tensor if p.RopeScaling.factors != nil { @@ -149,7 +149,7 @@ func (p *llama) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *llama) Replacements() []string { +func (p *llamaModel) Replacements() []string { return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", @@ -167,7 +167,7 @@ func (p *llama) Replacements() []string { } } -func (p *llama) repack(name string, data []float32, shape []uint64) ([]float32, error) { +func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]float32, error) { var dims []int for _, dim := range shape { dims = append(dims, int(dim)) diff --git a/convert/convert_llama_adapter.go b/convert/convert_llama_adapter.go new file mode 100644 index 00000000..08ddee10 --- /dev/null +++ b/convert/convert_llama_adapter.go @@ -0,0 +1,169 @@ +package convert + +import ( + "cmp" + "strings" + + "github.com/pdevine/tensor" + "github.com/pdevine/tensor/native" + + "github.com/ollama/ollama/llm" +) + +type llamaAdapter struct { + AdapterParameters + NumAttentionHeads uint32 `json:"num_attention_heads"` + NumKeyValueHeads uint32 `json:"num_key_value_heads"` +} + +var _ AdapterConverter = (*llamaAdapter)(nil) + +func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV { + kv := p.AdapterParameters.KV() + kv["general.architecture"] = "llama" + kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"] + kv["llama.attention.head_count_kv"] = baseKV["llama.attention.head_count_kv"] + + p.NumAttentionHeads = baseKV["llama.attention.head_count"].(uint32) + + return kv +} + +func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor { + var out []llm.Tensor + for _, t := range ts { + shape := t.Shape() + if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) || + (strings.HasSuffix(t.Name(), "weight.lora_b") && shape[0] < shape[1]) { + shape[0], shape[1] = shape[1], shape[0] + t.SetRepacker(p.repackAndTranspose) + } else { + t.SetRepacker(p.repack) + } + + out = append(out, llm.Tensor{ + Name: t.Name(), + Kind: t.Kind(), + Shape: shape, + WriterTo: t, + }) + } + + return out +} + +func (p *llamaAdapter) Replacements() []string { + return []string{ + "base_model.model.", "", + "model.layers", "blk", + "self_attn.q_proj", "attn_q", + "self_attn.k_proj", "attn_k", + "self_attn.v_proj", "attn_v", + "self_attn.o_proj", "attn_output", + "mlp.gate_proj", "ffn_gate", + "mlp.down_proj", "ffn_down", + "mlp.up_proj", "ffn_up", + "lora_A.weight", "weight.lora_a", + "lora_B.weight", "weight.lora_b", + "lora_a", "weight.lora_a", + "lora_b", 
"weight.lora_b", + } +} + +func (p *llamaAdapter) repack(name string, data []float32, shape []uint64) ([]float32, error) { + dims := []int{int(shape[1]), int(shape[0])} + + var heads uint32 + if strings.HasSuffix(name, "attn_q.weight.lora_a") { + heads = p.NumAttentionHeads + } else if strings.HasSuffix(name, "attn_k.weight.lora_a") { + heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) + } else { + return data, nil + } + + n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + + if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil { + return nil, err + } + + if err := n.T(0, 2, 1, 3); err != nil { + return nil, err + } + + if err := n.Reshape(dims...); err != nil { + return nil, err + } + + if err := n.Transpose(); err != nil { + return nil, err + } + + ts, err := native.SelectF32(n, 1) + if err != nil { + return nil, err + } + + var f32s []float32 + for _, t := range ts { + f32s = append(f32s, t...) + } + + return f32s, nil +} + +func (p *llamaAdapter) repackAndTranspose(name string, data []float32, shape []uint64) ([]float32, error) { + dims := []int{int(shape[1]), int(shape[0])} + + n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + + var heads uint32 + if strings.HasSuffix(name, "attn_q.weight.lora_a") { + heads = p.NumAttentionHeads + } else if strings.HasSuffix(name, "attn_k.weight.lora_a") { + heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads) + } + + if heads > 0 { + if err := n.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil { + return nil, err + } + + if err := n.T(0, 2, 1, 3); err != nil { + return nil, err + } + + if err := n.Reshape(dims...); err != nil { + return nil, err + } + + if err := n.Transpose(); err != nil { + return nil, err + } + } + + if err := n.T(1, 0); err != nil { + return nil, err + } + + if err := n.Reshape(dims...); err != nil { + return nil, err + } + + if err := n.Transpose(); err != nil { + return nil, err + } + + ts, err := native.SelectF32(n, 1) + if err != nil { + return nil, err + } + + var f32s []float32 + for _, t := range ts { + f32s = append(f32s, t...) + } + + return f32s, nil +} diff --git a/convert/convert_mixtral.go b/convert/convert_mixtral.go index 97a86b30..43b7c8b1 100644 --- a/convert/convert_mixtral.go +++ b/convert/convert_mixtral.go @@ -9,14 +9,14 @@ import ( "github.com/ollama/ollama/llm" ) -type mixtral struct { - llama +type mixtralModel struct { + llamaModel NumLocalExperts uint32 `json:"num_local_experts"` NumExpertsPerToken uint32 `json:"num_experts_per_tok"` } -func (p *mixtral) KV(t *Tokenizer) llm.KV { - kv := p.llama.KV(t) +func (p *mixtralModel) KV(t *Tokenizer) llm.KV { + kv := p.llamaModel.KV(t) if p.NumLocalExperts > 0 { kv["llama.expert_count"] = p.NumLocalExperts @@ -29,7 +29,7 @@ func (p *mixtral) KV(t *Tokenizer) llm.KV { return kv } -func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor { +func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor { oldnew := []string{ "model.layers", "blk", "w1", "ffn_gate_exps", @@ -67,12 +67,12 @@ func (p *mixtral) Tensors(ts []Tensor) []llm.Tensor { }) } - return append(out, p.llama.Tensors(ts)...) + return append(out, p.llamaModel.Tensors(ts)...) 
} -func (p *mixtral) Replacements() []string { +func (p *mixtralModel) Replacements() []string { return append( - p.llama.Replacements(), + p.llamaModel.Replacements(), "block_sparse_moe.gate", "ffn_gate_inp", ) } diff --git a/convert/convert_phi3.go b/convert/convert_phi3.go index 64d3d012..3de0d404 100644 --- a/convert/convert_phi3.go +++ b/convert/convert_phi3.go @@ -11,8 +11,8 @@ import ( "github.com/ollama/ollama/llm" ) -type phi3 struct { - Parameters +type phi3Model struct { + ModelParameters NumHiddenLayers uint32 `json:"num_hidden_layers"` NLayers uint32 `json:"n_layers"` HiddenSize uint32 `json:"hidden_size"` @@ -35,10 +35,10 @@ type phi3 struct { SlidingWindow uint32 `json:"sliding_window"` } -var _ Converter = (*phi3)(nil) +var _ ModelConverter = (*phi3Model)(nil) -func (p *phi3) KV(t *Tokenizer) llm.KV { - kv := p.Parameters.KV(t) +func (p *phi3Model) KV(t *Tokenizer) llm.KV { + kv := p.ModelParameters.KV(t) kv["general.architecture"] = "phi3" kv["phi3.context_length"] = p.MaxPositionEmbeddings kv["phi3.embedding_length"] = cmp.Or(p.HiddenSize, p.NEmbd) @@ -68,7 +68,7 @@ func (p *phi3) KV(t *Tokenizer) llm.KV { return kv } -func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { +func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor { var addRopeFactors sync.Once out := make([]llm.Tensor, 0, len(ts)+2) @@ -100,7 +100,7 @@ func (p *phi3) Tensors(ts []Tensor) []llm.Tensor { return out } -func (p *phi3) Replacements() []string { +func (p *phi3Model) Replacements() []string { return []string{ "lm_head", "output", "model.embed_tokens", "token_embd", diff --git a/convert/convert_test.go b/convert/convert_test.go index 64b7df3b..56b34f22 100644 --- a/convert/convert_test.go +++ b/convert/convert_test.go @@ -1,7 +1,9 @@ package convert import ( + "bytes" "crypto/sha256" + "encoding/binary" "encoding/hex" "encoding/json" "flag" @@ -29,7 +31,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) { } defer f.Close() - if err := Convert(fsys, f); err != nil { + if err := ConvertModel(fsys, f); err != nil { t.Fatal(err) } @@ -51,6 +53,34 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, llm.Tensors) { return r, m.KV(), m.Tensors() } +func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors llm.Tensors) map[string]string { + actual := make(map[string]string) + for k, v := range kv { + if s, ok := v.(json.Marshaler); !ok { + actual[k] = fmt.Sprintf("%v", v) + } else { + bts, err := json.Marshal(s) + if err != nil { + t.Fatal(err) + } + + actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts)) + } + } + + for _, tensor := range tensors.Items { + sha256sum := sha256.New() + sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size())) + if _, err := io.Copy(sha256sum, sr); err != nil { + t.Fatal(err) + } + + actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil)) + } + + return actual +} + func TestMain(m *testing.M) { var level slog.Level flag.TextVar(&level, "level", slog.LevelInfo, "log level") @@ -85,29 +115,7 @@ func TestConvertFull(t *testing.T) { } f, kv, tensors := convertFull(t, os.DirFS(p)) - actual := make(map[string]string) - for k, v := range kv { - if s, ok := v.(json.Marshaler); !ok { - actual[k] = fmt.Sprintf("%v", v) - } else { - bts, err := json.Marshal(s) - if err != nil { - t.Fatal(err) - } - - actual[k] = fmt.Sprintf("%x", sha256.Sum256(bts)) - } - } - - for _, tensor := range tensors.Items { - sha256sum := sha256.New() - sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), 
int64(tensor.Size())) - if _, err := io.Copy(sha256sum, sr); err != nil { - t.Fatal(err) - } - - actual[tensor.Name] = hex.EncodeToString(sha256sum.Sum(nil)) - } + actual := generateResultsJSON(t, f, kv, tensors) expectFile, err := os.Open(filepath.Join("testdata", fmt.Sprintf("%s.json", tt))) if err != nil { @@ -131,3 +139,209 @@ func TestConvertFull(t *testing.T) { }) } } + +func TestConvertAdapter(t *testing.T) { + type AdapterCase struct { + Name string + BaseKV map[string]any + Expected map[string]string + } + + cases := []AdapterCase{ + { + Name: "discollama", + BaseKV: map[string]any{ + "general.architecture": "llama", + "llama.attention.head_count": uint32(32), + "llama.attention.head_count_kv": uint32(8), + }, + Expected: map[string]string{ + "general.architecture": "llama", + "general.file_type": "1", + "general.parameter_count": "106496", + "general.type": "adapter", + "general.version": "v0.2", + "adapter.lora.alpha": "16", + "adapter.type": "lora", + "llama.attention.head_count": "32", + "llama.attention.head_count_kv": "8", + "blk.31.attn_q.weight.lora_a": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50", + "blk.31.attn_q.weight.lora_b": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50", + "blk.31.attn_v.weight.lora_a": "0eb3318b02cd313429bcc7621b539fdbb10240fea190c56c9e5f93fcd37a4e50", + "blk.31.attn_v.weight.lora_b": "071dcafe89df065d6e1c935ecb8fdf6479b3c202eb912e7da938597673ff5857", + }, + }, + } + + for _, c := range cases { + t.Run(c.Name, func(t *testing.T) { + t.Parallel() + + f, err := os.CreateTemp(t.TempDir(), "f16") + if err != nil { + t.Fatal(err) + } + defer f.Close() + + tempDir := t.TempDir() + generateLoraTestData(t, tempDir) + + if err = ConvertAdapter(os.DirFS(tempDir), f, c.BaseKV); err != nil { + t.Fatal(err) + } + + r, err := os.Open(f.Name()) + if err != nil { + t.Fatal(err) + } + defer r.Close() + + m, _, err := llm.DecodeGGML(r, math.MaxInt) + if err != nil { + t.Fatal(err) + } + + if _, err := r.Seek(0, io.SeekStart); err != nil { + t.Fatal(err) + } + + actual := generateResultsJSON(t, r, m.KV(), m.Tensors()) + + keys := maps.Keys(c.Expected) + slices.Sort(keys) + for _, k := range keys { + if v, ok := actual[k]; !ok { + t.Errorf("missing %s", k) + } else if v != c.Expected[k] { + t.Errorf("unexpected %s: want %s, got %s", k, c.Expected[k], v) + } + } + }) + } +} + +func generateLoraTestData(t *testing.T, tempDir string) { + type tensorData struct { + Offsets []int `json:"data_offsets"` + Type string `json:"dtype"` + Shape []int `json:"shape"` + } + offset := 4096 * 8 * 4 + + td := map[string]*tensorData{"__metadata__": nil} + td["model.layers.31.self_attn.q_proj.lora_a"] = &tensorData{ + Offsets: []int{0, offset}, + Type: "F32", + Shape: []int{4096, 8}, + } + td["model.layers.31.self_attn.q_proj.lora_b"] = &tensorData{ + Offsets: []int{offset, offset * 2}, + Type: "F32", + Shape: []int{8, 4096}, + } + td["model.layers.31.self_attn.v_proj.lora_a"] = &tensorData{ + Offsets: []int{offset * 2, offset * 3}, + Type: "F32", + Shape: []int{4096, 8}, + } + td["model.layers.31.self_attn.v_proj.lora_b"] = &tensorData{ + Offsets: []int{offset * 3, offset*3 + 8*1024*4}, + Type: "F32", + Shape: []int{8, 1024}, + } + + data, err := json.Marshal(td) + if err != nil { + t.Fatal(err) + } + + var buf bytes.Buffer + + l := int64(len(data)) + err = binary.Write(&buf, binary.LittleEndian, l) + if err != nil { + t.Fatal(err) + } + + _, err = buf.Write(data) + if err != nil { + t.Fatal(err) + } + + // write some data for the tensors + + 
ones := make([]float32, 4096*8) + for i := range ones { + ones[i] = float32(1) + } + + for range 3 { + err = binary.Write(&buf, binary.LittleEndian, ones) + if err != nil { + t.Fatal(err) + } + } + + ones = make([]float32, 1024*8) + for i := range ones { + ones[i] = float32(1) + } + + err = binary.Write(&buf, binary.LittleEndian, ones) + if err != nil { + t.Fatal(err) + } + + fdata, err := os.Create(filepath.Join(tempDir, "adapters.safetensors")) + if err != nil { + t.Fatal(err) + } + defer fdata.Close() + + _, err = fdata.Write(buf.Bytes()) + if err != nil { + t.Fatal(err) + } + + configData := ` +{ + "adapter_path": "adapters-test", + "batch_size": 8, + "config": "config-tiny.json", + "data": "../discollama-completion", + "grad_checkpoint": null, + "iters": 1000, + "learning_rate": 1e-05, + "lora_layers": 1, + "lora_parameters": { + "rank": 8, + "alpha": 16, + "dropout": 0.0, + "scale": 2.0 + }, + "lr_schedule": null, + "max_seq_length": 2048, + "model": "/Users/pdevine/git/Meta-Llama-3-8B-Instruct", + "resume_adapter_file": null, + "save_every": 100, + "seed": 0, + "steps_per_eval": 200, + "steps_per_report": 10, + "test": false, + "test_batches": 500, + "train": true, + "use_dora": false, + "val_batches": 25 +} +` + f, err := os.Create(filepath.Join(tempDir, "adapter_config.json")) + if err != nil { + t.Fatal(err) + } + defer f.Close() + + _, err = f.WriteString(configData) + if err != nil { + t.Fatal(err) + } +} diff --git a/convert/reader.go b/convert/reader.go index 5bba0406..c1218e66 100644 --- a/convert/reader.go +++ b/convert/reader.go @@ -64,6 +64,8 @@ func parseTensors(fsys fs.FS, replacer *strings.Replacer) ([]Tensor, error) { }{ {"model-*-of-*.safetensors", parseSafetensors}, {"model.safetensors", parseSafetensors}, + {"adapters.safetensors", parseSafetensors}, + {"adapter_model.safetensors", parseSafetensors}, {"pytorch_model-*-of-*.bin", parseTorch}, {"pytorch_model.bin", parseTorch}, {"consolidated.*.pth", parseTorch}, diff --git a/llm/ggml.go b/llm/ggml.go index 4c68adf9..ab436095 100644 --- a/llm/ggml.go +++ b/llm/ggml.go @@ -43,6 +43,14 @@ func (kv KV) Architecture() string { return "unknown" } +func (kv KV) Kind() string { + if s, ok := kv["general.type"].(string); ok { + return s + } + + return "unknown" +} + func (kv KV) ParameterCount() uint64 { return kv.u64("general.parameter_count") } diff --git a/server/images.go b/server/images.go index 8b3a67cf..b5bf7ad6 100644 --- a/server/images.go +++ b/server/images.go @@ -369,13 +369,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio parameters := make(map[string]any) var layers []Layer + var baseLayers []*layerGGML for _, c := range modelfile.Commands { mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name) + command := c.Name - switch c.Name { + switch command { case "model", "adapter": - var baseLayers []*layerGGML - if name := model.ParseName(c.Args); name.IsValid() { + if name := model.ParseName(c.Args); name.IsValid() && command == "model" { baseLayers, err = parseFromModel(ctx, name, fn) if err != nil { return err @@ -409,14 +410,14 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio } defer blob.Close() - baseLayers, err = parseFromFile(ctx, blob, digest, fn) + baseLayers, err = parseFromFile(ctx, command, baseLayers, blob, digest, fn) if err != nil { return err } } else if file, err := os.Open(realpath(modelFileDir, c.Args)); err == nil { defer file.Close() - baseLayers, err = parseFromFile(ctx, file, "", fn) + baseLayers, err = 
parseFromFile(ctx, command, baseLayers, file, "", fn) if err != nil { return err } diff --git a/server/model.go b/server/model.go index b17bf0e3..55fb2d8d 100644 --- a/server/model.go +++ b/server/model.go @@ -81,7 +81,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe return layers, nil } -func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { +func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML, f *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { fi, err := f.Stat() if err != nil { return nil, err @@ -108,16 +108,38 @@ func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api. defer t.Close() defer os.Remove(t.Name()) - fn(api.ProgressResponse{Status: "converting model"}) - if err := convert.Convert(convert.NewZipReader(r, p, 32<<20), t); err != nil { - return nil, err + var layerType string + + switch command { + case "adapter": + var baseModel *llm.GGML + for _, l := range baseLayers { + if l.GGML != nil { + baseModel = l.GGML + break + } + } + + if baseModel == nil { + return nil, fmt.Errorf("no base model specified for the adapter") + } + + if err := convert.ConvertAdapter(convert.NewZipReader(r, p, 32<<20), t, baseModel.KV()); err != nil { + return nil, err + } + layerType = "application/vnd.ollama.image.adapter" + case "model": + if err := convert.ConvertModel(convert.NewZipReader(r, p, 32<<20), t); err != nil { + return nil, err + } + layerType = "application/vnd.ollama.image.model" } if _, err := t.Seek(0, io.SeekStart); err != nil { return nil, err } - layer, err := NewLayer(t, "application/vnd.ollama.image.model") + layer, err := NewLayer(t, layerType) if err != nil { return nil, err } @@ -139,7 +161,7 @@ func parseFromZipFile(_ context.Context, f *os.File, digest string, fn func(api. 
return detectChatTemplate(layers) } -func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { +func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) { sr := io.NewSectionReader(file, 0, 512) contentType, err := detectContentType(sr) if err != nil { @@ -150,7 +172,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap case "gguf", "ggla": // noop case "application/zip": - return parseFromZipFile(ctx, file, digest, fn) + return parseFromZipFile(ctx, command, baseLayers, file, digest, fn) default: return nil, fmt.Errorf("unsupported content type: %s", contentType) } @@ -170,7 +192,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap } mediatype := "application/vnd.ollama.image.model" - if ggml.Name() == "ggla" { + if ggml.Name() == "ggla" || ggml.KV().Kind() == "adapter" { mediatype = "application/vnd.ollama.image.adapter" } else if ggml.KV().Architecture() == "clip" { mediatype = "application/vnd.ollama.image.projector" diff --git a/server/model_test.go b/server/model_test.go index 63fc408d..7753c549 100644 --- a/server/model_test.go +++ b/server/model_test.go @@ -153,7 +153,7 @@ func TestParseFromFileFromLayer(t *testing.T) { t.Fatalf("failed to seek to start: %v", err) } - layers, err := parseFromFile(context.Background(), file, "", func(api.ProgressResponse) {}) + layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, "", func(api.ProgressResponse) {}) if err != nil { t.Fatalf("failed to parse from file: %v", err) } @@ -166,7 +166,7 @@ func TestParseFromFileFromLayer(t *testing.T) { t.Fatalf("failed to seek to start: %v", err) } - layers2, err := parseFromFile(context.Background(), file, layers[0].Digest, func(api.ProgressResponse) {}) + layers2, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file, layers[0].Digest, func(api.ProgressResponse) {}) if err != nil { t.Fatalf("failed to parse from file: %v", err) } @@ -206,7 +206,7 @@ func TestParseLayerFromCopy(t *testing.T) { t.Fatalf("failed to seek to start: %v", err) } - layers, err := parseFromFile(context.Background(), file2, "", func(api.ProgressResponse) {}) + layers, err := parseFromFile(context.Background(), "model", []*layerGGML{}, file2, "", func(api.ProgressResponse) {}) if err != nil { t.Fatalf("failed to parse from file: %v", err) } From bb362caf88487a08cb29c8263d383a7d9e448803 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Tue, 2 Jul 2024 15:02:07 -0700 Subject: [PATCH 32/34] update faq --- docs/faq.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/faq.md b/docs/faq.md index 324116d1..25b68248 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -111,7 +111,10 @@ On Windows, Ollama inherits your user and system environment variables. ## How do I use Ollama behind a proxy? -Ollama is compatible with proxy servers if `HTTP_PROXY` or `HTTPS_PROXY` are configured. When using either variables, ensure it is set where `ollama serve` can access the values. When using `HTTPS_PROXY`, ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform. +Ollama pulls models from the Internet and may require a proxy server to access the models. Use `HTTPS_PROXY` to redirect outbound requests through the proxy. 
Ensure the proxy certificate is installed as a system certificate. Refer to the section above for how to use environment variables on your platform. + +> [!NOTE] +> Avoid setting `HTTP_PROXY`. Ollama does not use HTTP for model pulls, only HTTPS. Setting `HTTP_PROXY` may interrupt client connections to the server. ### How do I use Ollama behind a proxy in Docker? @@ -276,4 +279,4 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit ## How does Ollama load models on multiple GPUs? -Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs. \ No newline at end of file +Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs. From 69be940bf6d2816f61c79facfa336183bc882720 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 23 Aug 2024 15:11:56 -0700 Subject: [PATCH 33/34] gpu: Group GPU Library sets by variant (#6483) The recent cuda variant changes uncovered a bug in ByLibrary which failed to group by common variant for GPU types. 
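A hedged sketch of the corrected grouping semantics, mirroring the new test
cases below (GpuInfoList, GpuInfo, and ByLibrary are the types this patch
touches; the call site itself is illustrative): GPUs that share a library
but differ in variant must land in separate groups, so the grouping key has
to include the variant, not just the bare library name.

gpus := GpuInfoList{
	{Library: "cpu"},
	{Library: "cuda", Variant: "v11"},
	{Library: "cuda", Variant: "v12"},
}
// Before the fix the two cuda entries collapsed into a single group keyed
// by the bare library name; with the fix ByLibrary returns three groups:
// [cpu], [cuda v11], [cuda v12].
groups := gpus.ByLibrary()
fmt.Println(len(groups)) // 3 with the fix, 2 before it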
--- gpu/gpu_test.go | 25 +++++++++++++++++++++++++ gpu/types.go | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/gpu/gpu_test.go b/gpu/gpu_test.go index 46d3201e..13a3f544 100644 --- a/gpu/gpu_test.go +++ b/gpu/gpu_test.go @@ -32,4 +32,29 @@ func TestCPUMemInfo(t *testing.T) { } } +func TestByLibrary(t *testing.T) { + type testCase struct { + input []GpuInfo + expect int + } + + testCases := map[string]*testCase{ + "empty": {input: []GpuInfo{}, expect: 0}, + "cpu": {input: []GpuInfo{{Library: "cpu"}}, expect: 1}, + "cpu + GPU": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}}, expect: 2}, + "cpu + 2 GPU no variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda"}, {Library: "cuda"}}, expect: 2}, + "cpu + 2 GPU same variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v11"}}, expect: 2}, + "cpu + 2 GPU diff variant": {input: []GpuInfo{{Library: "cpu"}, {Library: "cuda", Variant: "v11"}, {Library: "cuda", Variant: "v12"}}, expect: 3}, + } + + for k, v := range testCases { + t.Run(k, func(t *testing.T) { + resp := (GpuInfoList)(v.input).ByLibrary() + if len(resp) != v.expect { + t.Fatalf("expected length %d, got %d => %+v", v.expect, len(resp), resp) + } + }) + } +} + // TODO - add some logic to figure out card type through other means and actually verify we got back what we expected diff --git a/gpu/types.go b/gpu/types.go index 4cbbeb84..a30e5fb3 100644 --- a/gpu/types.go +++ b/gpu/types.go @@ -94,7 +94,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList { } } if !found { - libs = append(libs, info.Library) + libs = append(libs, requested) resp = append(resp, []GpuInfo{info}) } } From 0f92b19bec97198b035a7801eda14e3d48149033 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sat, 24 Aug 2024 17:24:50 -0700 Subject: [PATCH 34/34] Only enable numa on CPUs (#6484) The numa flag may be having a performance impact on multi-socket systems with GPU loads --- llm/server.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/server.go b/llm/server.go index 9347a458..4e5dac28 100644 --- a/llm/server.go +++ b/llm/server.go @@ -258,7 +258,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr params = append(params, "--mlock") } - if gpu.IsNUMA() { + if gpu.IsNUMA() && gpus[0].Library == "cpu" { numaMode := "distribute" if runtime.GOOS == "linux" { if _, err := exec.LookPath("numactl"); err == nil {
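For context, a hedged sketch of the resulting behavior (the tail of this
function is not shown above, so the numactl fallback and the exact flag
shape are assumptions): NUMA tuning is now requested only when the entire
load runs on CPU.

if gpu.IsNUMA() && gpus[0].Library == "cpu" {
	// GPU loads on multi-socket systems skip the flag entirely, avoiding
	// the performance impact described in the commit message.
	numaMode := "distribute"
	if runtime.GOOS == "linux" {
		if _, err := exec.LookPath("numactl"); err == nil {
			numaMode = "numactl" // assumed: prefer numactl when available
		}
	}
	params = append(params, "--numa", numaMode) // assumed flag shape
}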