From 27331ae3a8fa3d5fba2a4b7105c18eed10f5b8af Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Mon, 8 Jan 2024 11:44:59 -0800 Subject: [PATCH 01/22] download: add inactivity monitor if a download part is inactive for some time, restart it --- server/download.go | 119 ++++++++++++++++++++++++++++----------------- 1 file changed, 75 insertions(+), 44 deletions(-) diff --git a/server/download.go b/server/download.go index cf86f994..12e19cab 100644 --- a/server/download.go +++ b/server/download.go @@ -25,6 +25,11 @@ import ( "github.com/jmorganca/ollama/format" ) +const maxRetries = 6 + +var errMaxRetriesExceeded = errors.New("max retries exceeded") +var errPartStalled = errors.New("part stalled") + var blobDownloadManager sync.Map type blobDownload struct { @@ -44,10 +49,11 @@ type blobDownload struct { } type blobDownloadPart struct { - N int - Offset int64 - Size int64 - Completed int64 + N int + Offset int64 + Size int64 + Completed int64 + lastUpdated time.Time *blobDownload `json:"-"` } @@ -72,6 +78,13 @@ func (p *blobDownloadPart) StopsAt() int64 { return p.Offset + p.Size } +func (p *blobDownloadPart) Write(b []byte) (n int, err error) { + n = len(b) + p.blobDownload.Completed.Add(int64(n)) + p.lastUpdated = time.Now() + return n, nil +} + func (b *blobDownload) Prepare(ctx context.Context, requestURL *url.URL, opts *RegistryOptions) error { partFilePaths, err := filepath.Glob(b.Name + "-partial-*") if err != nil { @@ -157,6 +170,9 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *Regis case errors.Is(err, context.Canceled), errors.Is(err, syscall.ENOSPC): // return immediately if the context is canceled or the device is out of space return err + case errors.Is(err, errPartStalled): + try-- + continue case err != nil: sleep := time.Second * time.Duration(math.Pow(2, float64(try))) log.Printf("%s part %d attempt %d failed: %v, retrying in %s", b.Digest[7:19], part.N, try, err, sleep) @@ -195,28 +211,54 @@ func (b *blobDownload) run(ctx context.Context, requestURL *url.URL, opts *Regis } func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w io.Writer, part *blobDownloadPart, opts *RegistryOptions) error { - headers := make(http.Header) - headers.Set("Range", fmt.Sprintf("bytes=%d-%d", part.StartsAt(), part.StopsAt()-1)) - resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, headers, nil, opts) - if err != nil { - return err - } - defer resp.Body.Close() + g, ctx := errgroup.WithContext(ctx) + g.Go(func() error { + headers := make(http.Header) + headers.Set("Range", fmt.Sprintf("bytes=%d-%d", part.StartsAt(), part.StopsAt()-1)) + resp, err := makeRequestWithRetry(ctx, http.MethodGet, requestURL, headers, nil, opts) + if err != nil { + return err + } + defer resp.Body.Close() - n, err := io.Copy(w, io.TeeReader(resp.Body, b)) - if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.ErrUnexpectedEOF) { - // rollback progress - b.Completed.Add(-n) - return err - } + n, err := io.Copy(w, io.TeeReader(resp.Body, part)) + if err != nil && !errors.Is(err, context.Canceled) && !errors.Is(err, io.ErrUnexpectedEOF) { + // rollback progress + b.Completed.Add(-n) + return err + } - part.Completed += n - if err := b.writePart(part.Name(), part); err != nil { - return err - } + part.Completed += n + if err := b.writePart(part.Name(), part); err != nil { + return err + } - // return nil or context.Canceled or UnexpectedEOF (resumable) - return err + // return nil or context.Canceled or UnexpectedEOF (resumable) + 
return err + }) + + g.Go(func() error { + ticker := time.NewTicker(time.Second) + for { + select { + case <-ticker.C: + if part.Completed >= part.Size { + return nil + } + + if !part.lastUpdated.IsZero() && time.Since(part.lastUpdated) > 5*time.Second { + log.Printf("%s part %d stalled; retrying", b.Digest[7:19], part.N) + // reset last updated + part.lastUpdated = time.Time{} + return errPartStalled + } + case <-ctx.Done(): + return ctx.Err() + } + } + }) + + return g.Wait() } func (b *blobDownload) newPart(offset, size int64) error { @@ -255,12 +297,6 @@ func (b *blobDownload) writePart(partName string, part *blobDownloadPart) error return json.NewEncoder(partFile).Encode(part) } -func (b *blobDownload) Write(p []byte) (n int, err error) { - n = len(p) - b.Completed.Add(int64(n)) - return n, nil -} - func (b *blobDownload) acquire() { b.references.Add(1) } @@ -279,20 +315,19 @@ func (b *blobDownload) Wait(ctx context.Context, fn func(api.ProgressResponse)) for { select { case <-ticker.C: + fn(api.ProgressResponse{ + Status: fmt.Sprintf("pulling %s", b.Digest[7:19]), + Digest: b.Digest, + Total: b.Total, + Completed: b.Completed.Load(), + }) + + if b.done || b.err != nil { + return b.err + } case <-ctx.Done(): return ctx.Err() } - - fn(api.ProgressResponse{ - Status: fmt.Sprintf("pulling %s", b.Digest[7:19]), - Digest: b.Digest, - Total: b.Total, - Completed: b.Completed.Load(), - }) - - if b.done || b.err != nil { - return b.err - } } } @@ -303,10 +338,6 @@ type downloadOpts struct { fn func(api.ProgressResponse) } -const maxRetries = 6 - -var errMaxRetriesExceeded = errors.New("max retries exceeded") - // downloadBlob downloads a blob from the registry and stores it in the blobs directory func downloadBlob(ctx context.Context, opts downloadOpts) error { fp, err := GetBlobsPath(opts.digest) From 681a91499010be819dd45a1390e668b0817e7338 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sat, 20 Jan 2024 10:48:43 -0800 Subject: [PATCH 02/22] Add support for CUDA 5.2 cards --- gpu/gpu.go | 7 ++++--- llm/generate/gen_linux.sh | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/gpu/gpu.go b/gpu/gpu.go index 67bd8352..06d022fe 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -29,8 +29,9 @@ type handles struct { var gpuMutex sync.Mutex var gpuHandles *handles = nil -// With our current CUDA compile flags, 5.2 and older will not work properly -const CudaComputeMajorMin = 6 +// With our current CUDA compile flags, older than 5.2 will not work properly +// 5.0: CUDA error: no kernel image is available for execution on the device +var CudaComputeMin = [2]C.int{5, 2} // Possible locations for the nvidia-ml library var CudaLinuxGlobs = []string{ @@ -133,7 +134,7 @@ func GetGPUInfo() GpuInfo { if cc.err != nil { slog.Info(fmt.Sprintf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err))) C.free(unsafe.Pointer(cc.err)) - } else if cc.major >= CudaComputeMajorMin { + } else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) { slog.Info(fmt.Sprintf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor)) resp.Library = "cuda" } else { diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 0fcf1356..f00d717c 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -125,7 +125,7 @@ if [ -d "${CUDA_LIB_DIR}" ]; then if [ -n "${CUDA_MAJOR}" ]; then CUDA_VARIANT=_v${CUDA_MAJOR} fi - CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}" + CMAKE_DEFS="-DLLAMA_CUBLAS=on 
-DLLAMA_CUDA_FORCE_MMQ=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}" BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}" EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" build From a447a083f2169e2a3c975cb5951d8b0b0dcddb04 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sat, 20 Jan 2024 12:15:50 -0800 Subject: [PATCH 03/22] Add compute capability 5.0, 7.5, and 8.0 --- docs/development.md | 3 ++- gpu/gpu.go | 5 ++--- llm/generate/gen_common.sh | 3 +++ llm/generate/gen_linux.sh | 2 +- llm/generate/gen_windows.ps1 | 7 ++++++- 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/docs/development.md b/docs/development.md index 5369f5a8..04599f6c 100644 --- a/docs/development.md +++ b/docs/development.md @@ -50,7 +50,8 @@ development and runtime packages. Typically the build scripts will auto-detect CUDA, however, if your Linux distro or installation approach uses unusual paths, you can specify the location by specifying an environment variable `CUDA_LIB_DIR` to the location of the shared -libraries, and `CUDACXX` to the location of the nvcc compiler. +libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize +the set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70"). Then generate dependencies: diff --git a/gpu/gpu.go b/gpu/gpu.go index 06d022fe..61821b5d 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -29,9 +29,8 @@ type handles struct { var gpuMutex sync.Mutex var gpuHandles *handles = nil -// With our current CUDA compile flags, older than 5.2 will not work properly -// 5.0: CUDA error: no kernel image is available for execution on the device -var CudaComputeMin = [2]C.int{5, 2} +// With our current CUDA compile flags, older than 5.0 will not work properly +var CudaComputeMin = [2]C.int{5, 0} // Possible locations for the nvidia-ml library var CudaLinuxGlobs = []string{ diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index d1e64d7d..4762e185 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -39,6 +39,9 @@ init_vars() { *) ;; esac + if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then + CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80" + fi } git_module_setup() { diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index f00d717c..507c54c8 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -125,7 +125,7 @@ if [ -d "${CUDA_LIB_DIR}" ]; then if [ -n "${CUDA_MAJOR}" ]; then CUDA_VARIANT=_v${CUDA_MAJOR} fi - CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}" + CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}" BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}" EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda" build diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 109b8602..e7fac03a 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -25,6 +25,11 @@ function init_vars { } $script:GZIP=(get-command -ea 'silentlycontinue' gzip).path $script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path + if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) { + $script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80" + } else { + $script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES + } } function git_module_setup { @@ -128,7 +133,7 @@ if ($null -ne $script:CUDA_LIB_DIR) { } init_vars
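    # init_vars restores the default build settings before the CUDA-specific defs are appended below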
$script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT" - $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on") + $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}") build install cp "${script:CUDA_LIB_DIR}/cudart64_*.dll" "${script:buildDir}/lib" From cd22855ef868609d74c64516f9b9cf92f1c662c9 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 24 Jan 2024 10:48:31 -0800 Subject: [PATCH 04/22] refactor tensor read --- llm/gguf.go | 115 ++++++++++++++++++++++++++++------------------------ 1 file changed, 61 insertions(+), 54 deletions(-) diff --git a/llm/gguf.go b/llm/gguf.go index cfcab758..436be42c 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -69,12 +69,65 @@ type tensor struct { name string kind uint32 offset uint64 - size uint64 // shape is the number of elements in each dimension shape [4]uint64 } +func (t tensor) blockSize() uint64 { + switch { + case t.kind < 2: + return 1 + case t.kind < 10: + return 32 + default: + return 256 + } +} + +func (t tensor) typeSize() uint64 { + blockSize := t.blockSize() + + switch t.kind { + case 0: // FP32 + return 4 + case 1: // FP16 + return 2 + case 2: // Q4_0 + return 2 + blockSize/2 + case 3: // Q4_1 + return 2 + 2 + blockSize/2 + case 6: // Q5_0 + return 2 + 4 + blockSize/2 + case 7: // Q5_1 + return 2 + 2 + 4 + blockSize/2 + case 8: // Q8_0 + return 2 + blockSize + case 9: // Q8_1 + return 4 + 4 + blockSize + case 10: // Q2_K + return blockSize/16 + blockSize/4 + 2 + 2 + case 11: // Q3_K + return blockSize/8 + blockSize/4 + 12 + 2 + case 12: // Q4_K + return 2 + 2 + 12 + blockSize/2 + case 13: // Q5_K + return 2 + 2 + 12 + blockSize/8 + blockSize/2 + case 14: // Q6_K + return blockSize/2 + blockSize/4 + blockSize/16 + 2 + default: + return 0 + } +} + +func (t tensor) parameters() uint64 { + return t.shape[0] * t.shape[1] * t.shape[2] * t.shape[3] +} + +func (t tensor) size() uint64 { + return t.parameters() * t.typeSize() / t.blockSize() +} + type ggufModel struct { *containerGGUF @@ -201,61 +254,15 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error { shape[i] = llm.readU64(rso) } - kind := llm.readU32(rso) - offset := llm.readU64(rso) - - var blockSize uint64 - switch { - case kind < 2: - blockSize = 1 - case kind < 10: - blockSize = 32 - default: - blockSize = 256 - } - - var typeSize uint64 - switch kind { - case 0: // FP32 - typeSize = 4 - case 1: // FP16 - typeSize = 2 - case 2: // Q4_0 - typeSize = 2 + blockSize/2 - case 3: // Q4_1 - typeSize = 2 + 2 + blockSize/2 - case 6: // Q5_0 - typeSize = 2 + 4 + blockSize/2 - case 7: // Q5_1 - typeSize = 2 + 2 + 4 + blockSize/2 - case 8: // Q8_0 - typeSize = 2 + blockSize - case 9: // Q8_1 - typeSize = 4 + 4 + blockSize - case 10: // Q2_K - typeSize = blockSize/16 + blockSize/4 + 2 + 2 - case 11: // Q3_K - typeSize = blockSize/8 + blockSize/4 + 12 + 2 - case 12: // Q4_K - typeSize = 2 + 2 + 12 + blockSize/2 - case 13: // Q5_K - typeSize = 2 + 2 + 12 + blockSize/8 + blockSize/2 - case 14: // Q6_K - typeSize = blockSize/2 + blockSize/4 + blockSize/16 + 2 - } - - parameters := shape[0] * shape[1] * shape[2] * shape[3] - size := parameters * typeSize / blockSize - - llm.tensors = append(llm.tensors, tensor{ + tensor := tensor{ name: name, - kind: kind, - offset: offset, - size: size, + kind: llm.readU32(rso), + offset: llm.readU64(rso), shape: shape, - }) + } - llm.parameters += parameters + llm.tensors = append(llm.tensors, tensor) + llm.parameters += tensor.parameters() 
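+		// note: tensor.size() is now derived on demand as parameters()*typeSize()/blockSize(),
+		// e.g. a Q4_0 tensor packs each 32-element block into 18 bytes (2-byte scale + 16 bytes of nibbles)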
} alignment, ok := llm.kv["general.alignment"].(uint32) @@ -265,7 +272,7 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error { rso.Seek(int64(alignment)-rso.offset%int64(alignment), io.SeekCurrent) for _, tensor := range llm.tensors { - padded := (int64(tensor.size) + int64(alignment) - 1) & ^(int64(alignment) - 1) + padded := (int64(tensor.size()) + int64(alignment) - 1) & ^(int64(alignment) - 1) rso.Seek(padded, io.SeekCurrent) } From 8e5d359a03ac2339fe410892b271a8cd31764220 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Wed, 24 Jan 2024 17:29:47 -0800 Subject: [PATCH 05/22] stub generate outputs for lint --- .github/workflows/test.yaml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index ad178cab..fc58cfce 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -45,7 +45,6 @@ jobs: path: | llm/llama.cpp/build/**/lib/* lint: - needs: generate strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] @@ -69,10 +68,19 @@ jobs: with: go-version: '1.21' cache: false - - uses: actions/download-artifact@v4 - with: - name: ${{ matrix.os }}-${{ matrix.arch }}-libraries - path: llm/llama.cpp/build + - run: | + mkdir -p llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/ + touch llm/llama.cpp/build/linux/${{ matrix.arch }}/stub/lib/stub.so + if: ${{ startsWith(matrix.os, 'ubuntu-') }} + - run: | + mkdir -p llm/llama.cpp/build/darwin/${{ matrix.arch }}/stub/lib/ + touch llm/llama.cpp/build/darwin/${{ matrix.arch }}/stub/lib/stub.dylib + touch llm/llama.cpp/ggml-metal.metal + if: ${{ startsWith(matrix.os, 'macos-') }} + - run: | + mkdir -p llm/llama.cpp/build/windows/${{ matrix.arch }}/stub/lib/ + touch llm/llama.cpp/build/windows/${{ matrix.arch }}/stub/lib/stub.dll + if: ${{ startsWith(matrix.os, 'windows-') }} - uses: golangci/golangci-lint-action@v3 test: needs: generate From 7c40a67841fd32073b66984e24605e5a0cc46f1a Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Thu, 25 Jan 2024 12:12:36 -0800 Subject: [PATCH 06/22] Save and load sessions (#2063) --- api/types.go | 2 + cmd/cmd.go | 20 +++--- cmd/interactive.go | 151 +++++++++++++++++++++++++++++++++------- cmd/interactive_test.go | 65 +++++++++++++++++ parser/parser.go | 11 +++ parser/parser_test.go | 35 ++++++++++ server/images.go | 52 ++++++++++++-- server/routes.go | 15 +++- 8 files changed, 312 insertions(+), 39 deletions(-) diff --git a/api/types.go b/api/types.go index d4e385bf..585daf6c 100644 --- a/api/types.go +++ b/api/types.go @@ -171,6 +171,7 @@ type ShowResponse struct { Template string `json:"template,omitempty"` System string `json:"system,omitempty"` Details ModelDetails `json:"details,omitempty"` + Messages []Message `json:"messages,omitempty"` } type CopyRequest struct { @@ -236,6 +237,7 @@ type GenerateResponse struct { } type ModelDetails struct { + ParentModel string `json:"parent_model"` Format string `json:"format"` Family string `json:"family"` Families []string `json:"families"` diff --git a/cmd/cmd.go b/cmd/cmd.go index 76e3c7a9..915fa993 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -458,15 +458,17 @@ func RunGenerate(cmd *cobra.Command, args []string) error { type generateContextKey string type runOptions struct { - Model string - Prompt string - Messages []api.Message - WordWrap bool - Format string - System string - Template string - Images []api.ImageData - Options map[string]interface{} + Model string + ParentModel string + Prompt string + Messages []api.Message + 
WordWrap bool + Format string + System string + Template string + Images []api.ImageData + Options map[string]interface{} + MultiModal bool } type displayResponseState struct { diff --git a/cmd/interactive.go b/cmd/interactive.go index da3c5b72..d337e555 100644 --- a/cmd/interactive.go +++ b/cmd/interactive.go @@ -7,12 +7,14 @@ import ( "net/http" "os" "regexp" + "sort" "strings" "github.com/spf13/cobra" "golang.org/x/exp/slices" "github.com/jmorganca/ollama/api" + "github.com/jmorganca/ollama/progress" "github.com/jmorganca/ollama/readline" ) @@ -25,43 +27,75 @@ const ( MultilineTemplate ) -func modelIsMultiModal(cmd *cobra.Command, name string) bool { - // get model details +func loadModel(cmd *cobra.Command, opts *runOptions) error { client, err := api.ClientFromEnvironment() if err != nil { - fmt.Println("error: couldn't connect to ollama server") - return false + return err } - req := api.ShowRequest{Name: name} - resp, err := client.Show(cmd.Context(), &req) + p := progress.NewProgress(os.Stderr) + defer p.StopAndClear() + + spinner := progress.NewSpinner("") + p.Add("", spinner) + + showReq := api.ShowRequest{Name: opts.Model} + showResp, err := client.Show(cmd.Context(), &showReq) if err != nil { - return false + return err + } + opts.MultiModal = slices.Contains(showResp.Details.Families, "clip") + opts.ParentModel = showResp.Details.ParentModel + + if len(showResp.Messages) > 0 { + opts.Messages = append(opts.Messages, showResp.Messages...) } - return slices.Contains(resp.Details.Families, "clip") + chatReq := &api.ChatRequest{ + Model: opts.Model, + Messages: []api.Message{}, + } + err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error { + p.StopAndClear() + if len(opts.Messages) > 0 { + for _, msg := range opts.Messages { + switch msg.Role { + case "user": + fmt.Printf(">>> %s\n", msg.Content) + case "assistant": + state := &displayResponseState{} + displayResponse(msg.Content, opts.WordWrap, state) + fmt.Println() + fmt.Println() + } + } + } + return nil + }) + if err != nil { + return err + } + + return nil } func generateInteractive(cmd *cobra.Command, opts runOptions) error { - multiModal := modelIsMultiModal(cmd, opts.Model) + opts.Messages = make([]api.Message, 0) - // load the model - loadOpts := runOptions{ - Model: opts.Model, - Prompt: "", - Messages: []api.Message{}, - } - if _, err := chat(cmd, loadOpts); err != nil { + err := loadModel(cmd, &opts) + if err != nil { return err } usage := func() { fmt.Fprintln(os.Stderr, "Available Commands:") - fmt.Fprintln(os.Stderr, " /set Set session variables") - fmt.Fprintln(os.Stderr, " /show Show model information") - fmt.Fprintln(os.Stderr, " /bye Exit") - fmt.Fprintln(os.Stderr, " /?, /help Help for a command") - fmt.Fprintln(os.Stderr, " /? shortcuts Help for keyboard shortcuts") + fmt.Fprintln(os.Stderr, " /set Set session variables") + fmt.Fprintln(os.Stderr, " /show Show model information") + fmt.Fprintln(os.Stderr, " /load Load a session or model") + fmt.Fprintln(os.Stderr, " /save Save your current session") + fmt.Fprintln(os.Stderr, " /bye Exit") + fmt.Fprintln(os.Stderr, " /?, /help Help for a command") + fmt.Fprintln(os.Stderr, " /? 
shortcuts Help for keyboard shortcuts") fmt.Fprintln(os.Stderr, "") fmt.Fprintln(os.Stderr, "Use \"\"\" to begin a multi-line message.") fmt.Fprintln(os.Stderr, "") @@ -140,7 +174,6 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { var sb strings.Builder var multiline MultilineState - opts.Messages = make([]api.Message, 0) for { line, err := scanner.Readline() @@ -203,6 +236,44 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { if err := ListHandler(cmd, args[1:]); err != nil { return err } + case strings.HasPrefix(line, "/load"): + args := strings.Fields(line) + if len(args) != 2 { + fmt.Println("Usage:\n /load ") + continue + } + opts.Model = args[1] + opts.Messages = []api.Message{} + fmt.Printf("Loading model '%s'\n", opts.Model) + if err := loadModel(cmd, &opts); err != nil { + return err + } + continue + case strings.HasPrefix(line, "/save"): + args := strings.Fields(line) + if len(args) != 2 { + fmt.Println("Usage:\n /save ") + continue + } + + client, err := api.ClientFromEnvironment() + if err != nil { + fmt.Println("error: couldn't connect to ollama server") + return err + } + + req := &api.CreateRequest{ + Name: args[1], + Modelfile: buildModelfile(opts), + } + fn := func(resp api.ProgressResponse) error { return nil } + err = client.Create(cmd.Context(), req, fn) + if err != nil { + fmt.Println("error: couldn't save model") + return err + } + fmt.Printf("Created new model '%s'\n", args[1]) + continue case strings.HasPrefix(line, "/set"): args := strings.Fields(line) if len(args) > 1 { @@ -389,7 +460,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { args := strings.Fields(line) isFile := false - if multiModal { + if opts.MultiModal { for _, f := range extractFileNames(line) { if strings.HasPrefix(f, args[0]) { isFile = true @@ -411,7 +482,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { if sb.Len() > 0 && multiline == MultilineNone { newMessage := api.Message{Role: "user", Content: sb.String()} - if multiModal { + if opts.MultiModal { msg, images, err := extractFileData(sb.String()) if err != nil { return err @@ -454,6 +525,38 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error { } } +func buildModelfile(opts runOptions) string { + var mf strings.Builder + model := opts.ParentModel + if model == "" { + model = opts.Model + } + fmt.Fprintf(&mf, "FROM %s\n", model) + if opts.System != "" { + fmt.Fprintf(&mf, "SYSTEM \"\"\"%s\"\"\"\n", opts.System) + } + + if opts.Template != "" { + fmt.Fprintf(&mf, "TEMPLATE \"\"\"%s\"\"\"\n", opts.Template) + } + + keys := make([]string, 0) + for k := range opts.Options { + keys = append(keys, k) + } + sort.Strings(keys) + for _, k := range keys { + fmt.Fprintf(&mf, "PARAMETER %s %v\n", k, opts.Options[k]) + } + fmt.Fprintln(&mf) + + for _, msg := range opts.Messages { + fmt.Fprintf(&mf, "MESSAGE %s \"\"\"%s\"\"\"\n", msg.Role, msg.Content) + } + + return mf.String() +} + func normalizeFilePath(fp string) string { // Define a map of escaped characters and their replacements replacements := map[string]string{ diff --git a/cmd/interactive_test.go b/cmd/interactive_test.go index 1bd5058a..19e43287 100644 --- a/cmd/interactive_test.go +++ b/cmd/interactive_test.go @@ -1,9 +1,13 @@ package cmd import ( + "bytes" "testing" + "text/template" "github.com/stretchr/testify/assert" + + "github.com/jmorganca/ollama/api" ) func TestExtractFilenames(t *testing.T) { @@ -49,3 +53,64 @@ d:\path with\spaces\seven.svg inbetween7 c:\users\jdoe\eight.png 
inbetween8 assert.Contains(t, res[9], "ten.svg") assert.Contains(t, res[9], "E:") } + +func TestModelfileBuilder(t *testing.T) { + opts := runOptions{ + Model: "hork", + System: "You are part horse and part shark, but all hork. Do horklike things", + Template: "This is a template.", + Messages: []api.Message{ + {Role: "user", Content: "Hey there hork!"}, + {Role: "assistant", Content: "Yes it is true, I am half horse, half shark."}, + }, + Options: map[string]interface{}{}, + } + + opts.Options["temperature"] = 0.9 + opts.Options["seed"] = 42 + opts.Options["penalize_newline"] = false + opts.Options["stop"] = []string{"hi", "there"} + + mf := buildModelfile(opts) + expectedModelfile := `FROM {{.Model}} +SYSTEM """{{.System}}""" +TEMPLATE """{{.Template}}""" +PARAMETER penalize_newline false +PARAMETER seed 42 +PARAMETER stop [hi there] +PARAMETER temperature 0.9 + +MESSAGE user """Hey there hork!""" +MESSAGE assistant """Yes it is true, I am half horse, half shark.""" +` + + tmpl, err := template.New("").Parse(expectedModelfile) + assert.Nil(t, err) + + var buf bytes.Buffer + err = tmpl.Execute(&buf, opts) + assert.Nil(t, err) + assert.Equal(t, buf.String(), mf) + + opts.ParentModel = "horseshark" + mf = buildModelfile(opts) + expectedModelfile = `FROM {{.ParentModel}} +SYSTEM """{{.System}}""" +TEMPLATE """{{.Template}}""" +PARAMETER penalize_newline false +PARAMETER seed 42 +PARAMETER stop [hi there] +PARAMETER temperature 0.9 + +MESSAGE user """Hey there hork!""" +MESSAGE assistant """Yes it is true, I am half horse, half shark.""" +` + + tmpl, err = template.New("").Parse(expectedModelfile) + assert.Nil(t, err) + + var parentBuf bytes.Buffer + err = tmpl.Execute(&parentBuf, opts) + assert.Nil(t, err) + assert.Equal(t, parentBuf.String(), mf) +} diff --git a/parser/parser.go b/parser/parser.go index 2fbd3cc5..947848b2 100644 --- a/parser/parser.go +++ b/parser/parser.go @@ -7,6 +7,7 @@ import ( "fmt" "io" "log/slog" + "slices" ) type Command struct { @@ -56,6 +57,16 @@ func Parse(reader io.Reader) ([]Command, error) { command.Args = string(bytes.TrimSpace(fields[1])) case "EMBED": return nil, fmt.Errorf("deprecated command: EMBED is no longer supported, use the /embed API endpoint instead") + case "MESSAGE": + command.Name = string(bytes.ToLower(fields[0])) + fields = bytes.SplitN(fields[1], []byte(" "), 2) + if len(fields) < 2 { + return nil, fmt.Errorf("should be in the format ") + } + if !slices.Contains([]string{"system", "user", "assistant"}, string(bytes.ToLower(fields[0]))) { + return nil, fmt.Errorf("role must be one of \"system\", \"user\", or \"assistant\"") + } + command.Args = fmt.Sprintf("%s: %s", string(bytes.ToLower(fields[0])), string(fields[1])) default: if !bytes.HasPrefix(fields[0], []byte("#")) { // log a warning for unknown commands diff --git a/parser/parser_test.go b/parser/parser_test.go index 53555ad1..25e849b5 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -61,3 +61,38 @@ PARAMETER param1 assert.ErrorContains(t, err, "missing value for [param1]") } + +func Test_Parser_Messages(t *testing.T) { + + input := ` +FROM foo +MESSAGE system You are a Parser. Always Parse things. +MESSAGE user Hey there! +MESSAGE assistant Hello, I want to parse all the things! +` + + reader := strings.NewReader(input) + commands, err := Parse(reader) + assert.Nil(t, err) + + expectedCommands := []Command{ + {Name: "model", Args: "foo"}, + {Name: "message", Args: "system: You are a Parser. 
Always Parse things."}, + {Name: "message", Args: "user: Hey there!"}, + {Name: "message", Args: "assistant: Hello, I want to parse all the things!"}, + } + + assert.Equal(t, expectedCommands, commands) +} + +func Test_Parser_Messages_BadRole(t *testing.T) { + + input := ` +FROM foo +MESSAGE badguy I'm a bad guy! +` + + reader := strings.NewReader(input) + _, err := Parse(reader) + assert.ErrorContains(t, err, "role must be one of \"system\", \"user\", or \"assistant\"") +} diff --git a/server/images.go b/server/images.go index a20f6bd7..ab3b4faa 100644 --- a/server/images.go +++ b/server/images.go @@ -41,7 +41,7 @@ type Model struct { Config ConfigV2 ShortName string ModelPath string - OriginalModel string + ParentModel string AdapterPaths []string ProjectorPaths []string Template string @@ -50,6 +50,12 @@ type Model struct { Digest string Size int64 Options map[string]interface{} + Messages []Message +} + +type Message struct { + Role string `json:"role"` + Content string `json:"content"` } type PromptVars struct { @@ -333,7 +339,7 @@ func GetModel(name string) (*Model, error) { switch layer.MediaType { case "application/vnd.ollama.image.model": model.ModelPath = filename - model.OriginalModel = layer.From + model.ParentModel = layer.From case "application/vnd.ollama.image.embed": // Deprecated in versions > 0.1.2 // TODO: remove this warning in a future version @@ -374,6 +380,16 @@ func GetModel(name string) (*Model, error) { if err = json.NewDecoder(params).Decode(&model.Options); err != nil { return nil, err } + case "application/vnd.ollama.image.messages": + msgs, err := os.Open(filename) + if err != nil { + return nil, err + } + defer msgs.Close() + + if err = json.NewDecoder(msgs).Decode(&model.Messages); err != nil { + return nil, err + } case "application/vnd.ollama.image.license": bts, err := os.ReadFile(filename) if err != nil { @@ -428,12 +444,12 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars } var layers Layers + messages := []string{} params := make(map[string][]string) fromParams := make(map[string]any) for _, c := range commands { - slog.Info(fmt.Sprintf("[%s] - %s", c.Name, c.Args)) mediatype := fmt.Sprintf("application/vnd.ollama.image.%s", c.Name) switch c.Name { @@ -607,11 +623,37 @@ func CreateModel(ctx context.Context, name, modelFileDir string, commands []pars } layers.Replace(layer) + case "message": + messages = append(messages, c.Args) default: params[c.Name] = append(params[c.Name], c.Args) } } + if len(messages) > 0 { + fn(api.ProgressResponse{Status: "creating parameters layer"}) + + msgs := make([]api.Message, 0) + + for _, m := range messages { + // todo: handle images + msg := strings.SplitN(m, ": ", 2) + msgs = append(msgs, api.Message{Role: msg[0], Content: msg[1]}) + } + + var b bytes.Buffer + if err := json.NewEncoder(&b).Encode(msgs); err != nil { + return err + } + + layer, err := NewLayer(&b, "application/vnd.ollama.image.messages") + if err != nil { + return err + } + + layers.Replace(layer) + } + if len(params) > 0 { fn(api.ProgressResponse{Status: "creating parameters layer"}) @@ -908,8 +950,8 @@ func ShowModelfile(model *Model) (string, error) { mt.Model = model mt.From = model.ModelPath - if model.OriginalModel != "" { - mt.From = model.OriginalModel + if model.ParentModel != "" { + mt.From = model.ParentModel } modelFile := `# Modelfile generated by "ollama show" diff --git a/server/routes.go b/server/routes.go index 0c145ae6..141f05d4 100644 --- a/server/routes.go +++ b/server/routes.go @@ -659,6 +659,7 @@ 
func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) { } modelDetails := api.ModelDetails{ + ParentModel: model.ParentModel, Format: model.Config.ModelFormat, Family: model.Config.ModelFamily, Families: model.Config.ModelFamilies, @@ -674,11 +675,17 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) { model.Template = req.Template } + msgs := make([]api.Message, 0) + for _, msg := range model.Messages { + msgs = append(msgs, api.Message{Role: msg.Role, Content: msg.Content}) + } + resp := &api.ShowResponse{ License: strings.Join(model.License, "\n"), System: model.System, Template: model.Template, Details: modelDetails, + Messages: msgs, } var params []string @@ -1075,7 +1082,13 @@ func ChatHandler(c *gin.Context) { // an empty request loads the model if len(req.Messages) == 0 { - c.JSON(http.StatusOK, api.ChatResponse{CreatedAt: time.Now().UTC(), Model: req.Model, Done: true, Message: api.Message{Role: "assistant"}}) + resp := api.ChatResponse{ + CreatedAt: time.Now().UTC(), + Model: req.Model, + Done: true, + Message: api.Message{Role: "assistant"}, + } + c.JSON(http.StatusOK, resp) return } From a64570dcae17794adf100a85667180e03b6d7ef2 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Thu, 25 Jan 2024 13:46:20 -0800 Subject: [PATCH 07/22] Fix clearing kv cache between requests with the same prompt (#2186) * Fix clearing kv cache between requests with the same prompt * fix powershell script --- llm/dyn_ext_server.go | 1 + llm/generate/gen_common.sh | 11 +++++++++++ llm/generate/gen_windows.ps1 | 23 +++++++++++++++++++++++ llm/patches/01-cache.diff | 30 ++++++++++++++++++++++++++++++ 4 files changed, 65 insertions(+) create mode 100644 llm/patches/01-cache.diff diff --git a/llm/dyn_ext_server.go b/llm/dyn_ext_server.go index 45e2dc72..8674a514 100644 --- a/llm/dyn_ext_server.go +++ b/llm/dyn_ext_server.go @@ -190,6 +190,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu "seed": predict.Options.Seed, "stop": predict.Options.Stop, "image_data": imageData, + "cache_prompt": true, } if predict.Format == "json" { diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index d1e64d7d..0b7b2ae4 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -61,6 +61,17 @@ apply_patches() { if ! 
grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt fi + + # apply temporary patches until fix is upstream + for patch in ../patches/*.diff; do + for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do + (cd ${LLAMACPP_DIR}; git checkout ${file}) + done + done + for patch in ../patches/*.diff; do + (cd ${LLAMACPP_DIR} && git apply ${patch}) + done + # Avoid duplicate main symbols when we link into the cgo binary sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp && mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 109b8602..e4271997 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -40,6 +40,29 @@ function apply_patches { if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) { Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama' } + + # Apply temporary patches until fix is upstream + $patches = Get-ChildItem "../patches/*.diff" + foreach ($patch in $patches) { + # Extract file paths from the patch file + $filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object { + $parts = $_ -split ' ' + ($parts[1] -split '/', 2)[1] + } + + # Checkout each file + foreach ($file in $filePaths) { + Set-Location -Path ${script:llamacppDir} + git checkout $file + } + } + + # Apply each patch + foreach ($patch in $patches) { + Set-Location -Path ${script:llamacppDir} + git apply $patch.FullName + } + # Avoid duplicate main symbols when we link into the cgo binary $content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp" $content = $content -replace 'int main\(', 'int __main(' diff --git a/llm/patches/01-cache.diff b/llm/patches/01-cache.diff new file mode 100644 index 00000000..f8392495 --- /dev/null +++ b/llm/patches/01-cache.diff @@ -0,0 +1,30 @@ +diff --git a/examples/server/server.cpp b/examples/server/server.cpp +index 0462fbd2..4fa7b57f 100644 +--- a/examples/server/server.cpp ++++ b/examples/server/server.cpp +@@ -1857,12 +1857,6 @@ struct llama_server_context + LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); + } + +- LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); +- +- llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); +- +- slot.cache_tokens = prompt_tokens; +- + if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0) + { + // we have to evaluate at least 1 token to generate logits. 
+@@ -1870,6 +1864,12 @@ struct llama_server_context + slot.n_past--; + } + ++ LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); ++ ++ llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); ++ ++ slot.cache_tokens = prompt_tokens; ++ + LOG_VERBOSE("prompt ingested", { + {"n_past", slot.n_past}, + {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)}, From 3ebd6a83fcfbdecf3ccbae13ebaf4435c853465d Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Thu, 25 Jan 2024 13:54:11 -0800 Subject: [PATCH 08/22] update submodule to `cd4fddb29f81d6a1f6d51a0c016bc6b486d68def` --- llm/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/llama.cpp b/llm/llama.cpp index 011e8ec5..cd4fddb2 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit 011e8ec577fd135cbc02993d3ea9840c516d6a1c +Subproject commit cd4fddb29f81d6a1f6d51a0c016bc6b486d68def From 0610126049e1aa61945aeaabbc55e9aae99d8c77 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Thu, 18 Jan 2024 17:19:12 -0800 Subject: [PATCH 09/22] remove env setting --- .github/workflows/test.yaml | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index fc58cfce..36691b26 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -23,20 +23,6 @@ jobs: with: go-version: '1.21' cache: true - - if: ${{ startsWith(matrix.os, 'windows-') }} - shell: pwsh - run: | - $path = vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath - if ($path) { - $path = join-path $path 'Common7\Tools\vsdevcmd.bat' - if (test-path $path) { - cmd /s /c """$path"" $args && set" | where { $_ -match '(\w+)=(.*)' } | foreach { - echo "$($Matches[1])=$($Matches[2])" | Out-File -FilePath $Env:GITHUB_ENV -Encoding utf8 -Append - } - } - } - - echo "C:\Program Files\Git\usr\bin" | Out-File -FilePath $Env:GITHUB_PATH -Encoding utf8 -Append - run: go get ./... - run: go generate -x ./... - uses: actions/upload-artifact@v4 From 946431d5b073ff2f620faabb182336895cd174dc Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 22 Dec 2023 12:17:37 -0800 Subject: [PATCH 10/22] build cuda and rocm --- .github/workflows/test.yaml | 82 +++++++++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 36691b26..5ba4c6e8 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -28,8 +28,66 @@ jobs: - uses: actions/upload-artifact@v4 with: name: ${{ matrix.os }}-${{ matrix.arch }}-libraries - path: | - llm/llama.cpp/build/**/lib/* + path: llm/llama.cpp/build/**/lib/* + generate-cuda: + strategy: + matrix: + cuda-version: + - '11.8.0' + runs-on: ubuntu-latest + container: nvidia/cuda:${{ matrix.cuda-version }}-devel-ubuntu20.04 + steps: + - run: | + apt-get update && apt-get install -y git build-essential curl + curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \ + | tar -zx -C /usr --strip-components 1 + env: + DEBIAN_FRONTEND: noninteractive + - uses: actions/checkout@v4 + - uses: actions/setup-go@v4 + with: + go-version: '1.21' + cache: true + - run: go get ./... + - run: | + git config --global --add safe.directory /__w/ollama/ollama + go generate -x ./... 
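+          # OLLAMA_SKIP_CPU_GENERATE=1 limits this job to the CUDA libraries; the plain generate job already covers the CPU variants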
+ env: + OLLAMA_SKIP_CPU_GENERATE: '1' + - uses: actions/upload-artifact@v4 + with: + name: cuda-${{ matrix.cuda-version }}-libraries + path: llm/llama.cpp/build/**/lib/* + generate-rocm: + strategy: + matrix: + rocm-version: + - '5.7.1' + - '6.0' + runs-on: ubuntu-latest + container: rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }} + steps: + - run: | + apt-get update && apt-get install -y git build-essential curl rocm-libs + curl -fsSL https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-linux-x86_64.tar.gz \ + | tar -zx -C /usr --strip-components 1 + env: + DEBIAN_FRONTEND: noninteractive + - uses: actions/checkout@v4 + - uses: actions/setup-go@v4 + with: + go-version: '1.21' + cache: true + - run: go get ./... + - run: | + git config --global --add safe.directory /__w/ollama/ollama + go generate -x ./... + env: + OLLAMA_SKIP_CPU_GENERATE: '1' + - uses: actions/upload-artifact@v4 + with: + name: rocm-${{ matrix.rocm-version }}-libraries + path: llm/llama.cpp/build/**/lib/* lint: strategy: matrix: @@ -69,7 +127,10 @@ jobs: if: ${{ startsWith(matrix.os, 'windows-') }} - uses: golangci/golangci-lint-action@v3 test: - needs: generate + needs: + - generate + - generate-cuda + - generate-rocm strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] @@ -96,5 +157,20 @@ jobs: with: name: ${{ matrix.os }}-${{ matrix.arch }}-libraries path: llm/llama.cpp/build + - if: ${{ matrix.os == 'ubuntu-latest' }} + uses: actions/download-artifact@v4 + with: + name: cuda-11.8.0-libraries + path: llm/llama.cpp/build + - if: ${{ matrix.os == 'ubuntu-latest' }} + uses: actions/download-artifact@v4 + with: + name: rocm-5.7.1-libraries + path: llm/llama.cpp/build + - if: ${{ matrix.os == 'ubuntu-latest' }} + uses: actions/download-artifact@v4 + with: + name: rocm-6.0-libraries + path: llm/llama.cpp/build - run: go build - run: go test -v ./... From 5580de45717c2f1b85ff66f4012d87d6bf8c2963 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 22 Dec 2023 16:06:35 -0800 Subject: [PATCH 11/22] archive ollama binaries --- .github/workflows/test.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 5ba4c6e8..c98b1f6e 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -174,3 +174,7 @@ jobs: path: llm/llama.cpp/build - run: go build - run: go test -v ./... 
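+      # archive the ollama binary built above so each platform build can be downloaded from the workflow run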
+ - uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.os }}-binaries + path: ollama From a8c5413d06b417f9484d8723da308215a7eed922 Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 19 Jan 2024 09:20:19 -0800 Subject: [PATCH 12/22] only generate gpu libs --- .github/workflows/test.yaml | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index c98b1f6e..f5174c33 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -127,10 +127,7 @@ jobs: if: ${{ startsWith(matrix.os, 'windows-') }} - uses: golangci/golangci-lint-action@v3 test: - needs: - - generate - - generate-cuda - - generate-rocm + needs: generate strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] @@ -157,21 +154,6 @@ jobs: with: name: ${{ matrix.os }}-${{ matrix.arch }}-libraries path: llm/llama.cpp/build - - if: ${{ matrix.os == 'ubuntu-latest' }} - uses: actions/download-artifact@v4 - with: - name: cuda-11.8.0-libraries - path: llm/llama.cpp/build - - if: ${{ matrix.os == 'ubuntu-latest' }} - uses: actions/download-artifact@v4 - with: - name: rocm-5.7.1-libraries - path: llm/llama.cpp/build - - if: ${{ matrix.os == 'ubuntu-latest' }} - uses: actions/download-artifact@v4 - with: - name: rocm-6.0-libraries - path: llm/llama.cpp/build - run: go build - run: go test -v ./... - uses: actions/upload-artifact@v4 From b706794905cc154a269f099c804ddfd8bed1f1b2 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Thu, 25 Jan 2024 16:29:32 -0800 Subject: [PATCH 13/22] Update modelfile.md to include `MESSAGE` --- docs/modelfile.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/modelfile.md b/docs/modelfile.md index 6134bf9c..7a61da99 100644 --- a/docs/modelfile.md +++ b/docs/modelfile.md @@ -38,6 +38,7 @@ INSTRUCTION arguments | [`SYSTEM`](#system) | Specifies the system message that will be set in the template. | | [`ADAPTER`](#adapter) | Defines the (Q)LoRA adapters to apply to the model. | | [`LICENSE`](#license) | Specifies the legal license. | +| [`MESSAGE`](#message) | Specify message history. | ## Examples @@ -205,6 +206,19 @@ LICENSE """ """ ``` +### MESSAGE + +The `MESSAGE` instruction allows you to specify a message history for the model to use when responding: + +```modelfile +MESSAGE user Is Toronto in Canada? +MESSAGE assistant yes +MESSAGE user Is Sacramento in Canada? +MESSAGE assistant no +MESSAGE user Is Ontario in Canada? +MESSAGE assistant yes +``` + ## Notes - the **`Modelfile` is not case sensitive**. In the examples, uppercase instructions are used to make it easier to distinguish it from arguments. From 5be9bdd444d69d86cf87ee5ae0b8c70e75b5ea35 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Thu, 25 Jan 2024 16:29:48 -0800 Subject: [PATCH 14/22] Update modelfile.md --- docs/modelfile.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/modelfile.md b/docs/modelfile.md index 7a61da99..6d6ac152 100644 --- a/docs/modelfile.md +++ b/docs/modelfile.md @@ -19,6 +19,7 @@ A model file is the blueprint to create and share models with Ollama. 
- [SYSTEM](#system) - [ADAPTER](#adapter) - [LICENSE](#license) + - [MESSAGE](#message) - [Notes](#notes) ## Format From a34e1ad3cf371d402032af5099a056d04d72ea0f Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 25 Jan 2024 16:46:01 -0800 Subject: [PATCH 15/22] Switch back to ubuntu base The size increase for rocm support in the standard image is problematic. We'll revisit multiple tags for rocm support in a follow-up PR. --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 9767faa3..a58b963a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -109,7 +109,8 @@ ARG CGO_CFLAGS RUN go build . # Runtime stages -FROM --platform=linux/amd64 rocm/dev-centos-7:6.0-complete as runtime-amd64 +FROM --platform=linux/amd64 ubuntu:22.04 as runtime-amd64 +RUN apt-get update && apt-get install -y ca-certificates COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64 RUN apt-get update && apt-get install -y ca-certificates From 5d9c4a5f5a5d6aab0e8b4aedf504c6a0e526b2f4 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 26 Jan 2024 09:18:33 -0800 Subject: [PATCH 16/22] Fix crash on cuda ml init failure The new driver lookup code was triggering after init failure due to a missing return. --- gpu/gpu_info_cuda.c | 1 + 1 file changed, 1 insertion(+) diff --git a/gpu/gpu_info_cuda.c b/gpu/gpu_info_cuda.c index 9299b22c..d877ff0c 100644 --- a/gpu/gpu_info_cuda.c +++ b/gpu/gpu_info_cuda.c @@ -70,6 +70,7 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) { resp->ch.handle = NULL; snprintf(buf, buflen, "nvml vram init failure: %d", ret); resp->err = strdup(buf); + return; } // Report driver version if we're in verbose mode, ignore errors From 9d7b5d6c91b0686619010f5355d7eb44c856a95d Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 25 Jan 2024 15:57:32 -0800 Subject: [PATCH 17/22] Ignore AMD integrated GPUs Detect and ignore integrated GPUs reported by rocm.
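For illustration, the filtering boils down to a standalone sketch like the one below (the function name is ours; count and igpuIndex mirror the count and igpu_index fields of the C mem_info_t struct in the change):

```go
package gpu

import (
	"strconv"
	"strings"
)

// visibleDevices builds a ROCR_VISIBLE_DEVICES value that exposes every
// device except the integrated GPU at igpuIndex, so ROCm work lands only
// on the discrete GPUs.
func visibleDevices(count, igpuIndex int) string {
	devices := make([]string, 0, count)
	for i := 0; i < count; i++ {
		if i == igpuIndex {
			continue // the iGPU's reported VRAM is really carved-out system memory
		}
		devices = append(devices, strconv.Itoa(i))
	}
	return strings.Join(devices, ",")
}
```

As in the real change, a user-provided ROCR_VISIBLE_DEVICES is left untouched so deliberate device pinning is not clobbered.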
--- gpu/gpu.go | 26 +++++++++++++++++++++++++- gpu/gpu_info.h | 1 + gpu/gpu_info_rocm.c | 11 +++++++++-- 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/gpu/gpu.go b/gpu/gpu.go index fb120ea5..743b27d1 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -16,6 +16,7 @@ import ( "os" "path/filepath" "runtime" + "strconv" "strings" "sync" "unsafe" @@ -147,7 +148,28 @@ func GetGPUInfo() GpuInfo { if memInfo.err != nil { slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))) C.free(unsafe.Pointer(memInfo.err)) + } else if memInfo.igpu_index >= 0 && memInfo.count == 1 { + // Only one GPU detected and it appears to be an integrated GPU - skip it + slog.Info("ROCm unsupported integrated GPU detected") } else { + if memInfo.igpu_index >= 0 { + // We have multiple GPUs reported, and one of them is an integrated GPU + // so we have to set the env var to bypass it + // If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it + val := os.Getenv("ROCR_VISIBLE_DEVICES") + if val == "" { + devices := []string{} + for i := 0; i < int(memInfo.count); i++ { + if i == int(memInfo.igpu_index) { + continue + } + devices = append(devices, strconv.Itoa(i)) + } + val = strings.Join(devices, ",") + os.Setenv("ROCR_VISIBLE_DEVICES", val) + } + slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val)) + } resp.Library = "rocm" var version C.rocm_version_resp_t C.rocm_get_version(*gpuHandles.rocm, &version) @@ -199,7 +221,9 @@ func CheckVRAM() (int64, error) { if overhead < gpus*1024*1024*1024 { overhead = gpus * 1024 * 1024 * 1024 } - return int64(gpuInfo.FreeMemory - overhead), nil + avail := int64(gpuInfo.FreeMemory - overhead) + slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024)) + return avail, nil } return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation diff --git a/gpu/gpu_info.h b/gpu/gpu_info.h index f32efa8e..e52d2066 100644 --- a/gpu/gpu_info.h +++ b/gpu/gpu_info.h @@ -42,6 +42,7 @@ typedef struct mem_info { uint64_t total; uint64_t free; unsigned int count; + int igpu_index; // If >= 0, we detected an integrated GPU to ignore char *err; // If non-nill, caller responsible for freeing } mem_info_t; diff --git a/gpu/gpu_info_rocm.c b/gpu/gpu_info_rocm.c index 59ab0817..2d1db7bb 100644 --- a/gpu/gpu_info_rocm.c +++ b/gpu/gpu_info_rocm.c @@ -77,6 +77,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) { resp->err = NULL; + resp->igpu_index = -1; uint64_t totalMem = 0; uint64_t usedMem = 0; rsmi_status_t ret; @@ -162,8 +163,14 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) { } LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem); LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem); - resp->total += totalMem; - resp->free += totalMem - usedMem; + if (totalMem < 1024 * 1024 * 1024) { + // Do not add up integrated GPU memory capacity, it's a bogus 512M, and actually uses system memory + LOG(h.verbose, "[%d] ROCm integrated GPU\n", i); + resp->igpu_index = i; + } else { + resp->total += totalMem; + resp->free += totalMem - usedMem; + } } } From 75c44aa319738b696cd13e82b016bbdcdc39cdad Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Thu, 25 Jan 2024 16:58:05 -0800 Subject: [PATCH 18/22] Add back ROCm container support This adds ROCm support back as a discrete image. 
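As a rough usage sketch (the tag, volume, and port below are illustrative; /dev/kfd and /dev/dri are the standard device nodes ROCm needs inside a container), once scripts/build_docker.sh has produced the -rocm image:

```sh
# build the default and rocm images
./scripts/build_docker.sh

# run the rocm variant with the GPU device nodes mapped in
docker run -d --device /dev/kfd --device /dev/dri \
    -v ollama:/root/.ollama -p 11434:11434 \
    ollama/ollama:$VERSION-rocm
```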
--- Dockerfile | 12 +++++++++++- scripts/build_docker.sh | 10 ++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index a58b963a..7c921df8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -116,11 +116,21 @@ FROM --platform=linux/arm64 ubuntu:22.04 as runtime-arm64 RUN apt-get update && apt-get install -y ca-certificates COPY --from=build-arm64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama +# Radeon images are much larger so we keep it distinct from the CPU/CUDA image +FROM --platform=linux/amd64 rocm/dev-centos-7:5.7.1-complete as runtime-rocm +RUN update-pciids +COPY --from=build-amd64 /go/src/github.com/jmorganca/ollama/ollama /bin/ollama +EXPOSE 11434 +ENV OLLAMA_HOST 0.0.0.0 + +ENTRYPOINT ["/bin/ollama"] +CMD ["serve"] + FROM runtime-$TARGETARCH EXPOSE 11434 ENV OLLAMA_HOST 0.0.0.0 ENV PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin -ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/opt/rocm/lib: +ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility ENTRYPOINT ["/bin/ollama"] diff --git a/scripts/build_docker.sh b/scripts/build_docker.sh index ef02a144..40054ca6 100755 --- a/scripts/build_docker.sh +++ b/scripts/build_docker.sh @@ -13,3 +13,13 @@ docker build \ -f Dockerfile \ -t ollama/ollama:$VERSION \ . + +docker build \ + --load \ + --platform=linux/amd64 \ + --build-arg=VERSION \ + --build-arg=GOFLAGS \ + --target runtime-rocm \ + -f Dockerfile \ + -t ollama/ollama:$VERSION-rocm \ + . From 9d3dcfd0ec94df07f9b10be3c09b93d6ad52c95e Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 26 Jan 2024 11:04:27 -0800 Subject: [PATCH 19/22] fix logging --- server/download.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/download.go b/server/download.go index 8570b590..f089bd41 100644 --- a/server/download.go +++ b/server/download.go @@ -247,7 +247,7 @@ func (b *blobDownload) downloadChunk(ctx context.Context, requestURL *url.URL, w } if !part.lastUpdated.IsZero() && time.Since(part.lastUpdated) > 5*time.Second { - log.Printf("%s part %d stalled; retrying", b.Digest[7:19], part.N) + slog.Info(fmt.Sprintf("%s part %d stalled; retrying", b.Digest[7:19], part.N)) // reset last updated part.lastUpdated = time.Time{} return errPartStalled From 667a2ba18add1031cc4b208eba7cedf8b33548e6 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 26 Jan 2024 11:11:09 -0800 Subject: [PATCH 20/22] Detect lack of AVX and fallback to CPU mode We build the GPU libraries with AVX enabled to ensure that if not all layers fit on the GPU we get better performance in a mixed mode. If the user is using a virtualization/emulation system that lacks AVX this used to result in an illegal instruction error and crash before this fix. Now we will report a warning in the server log, and just use CPU mode to ensure we don't crash. 
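A minimal sketch of the variant probe this relies on, assuming golang.org/x/sys/cpu for the feature flags (the shipped GetCPUVariant may be implemented differently):

```go
package gpu

import "golang.org/x/sys/cpu"

// GetCPUVariant reports the vector-instruction level of the host CPU:
// "avx2", "avx", or "" when neither is available. An empty variant now
// disables the GPU paths, since the GPU libraries are built with AVX on.
func GetCPUVariant() string {
	switch {
	case cpu.X86.HasAVX2:
		return "avx2"
	case cpu.X86.HasAVX:
		return "avx"
	default:
		return ""
	}
}
```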
--- gpu/gpu.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/gpu/gpu.go b/gpu/gpu.go index 743b27d1..e234ec39 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -122,9 +122,15 @@ func GetGPUInfo() GpuInfo { initGPUHandles() } + // All our GPU builds have AVX enabled, so fallback to CPU if we don't detect at least AVX + cpuVariant := GetCPUVariant() + if cpuVariant == "" { + slog.Warn("CPU does not have AVX or AVX2, disabling GPU support.") + } + var memInfo C.mem_info_t resp := GpuInfo{} - if gpuHandles.cuda != nil { + if gpuHandles.cuda != nil && cpuVariant != "" { C.cuda_check_vram(*gpuHandles.cuda, &memInfo) if memInfo.err != nil { slog.Info(fmt.Sprintf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err))) @@ -143,7 +149,7 @@ func GetGPUInfo() GpuInfo { slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor)) } } - } else if gpuHandles.rocm != nil { + } else if gpuHandles.rocm != nil && cpuVariant != "" { C.rocm_check_vram(*gpuHandles.rocm, &memInfo) if memInfo.err != nil { slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))) @@ -185,7 +191,7 @@ func GetGPUInfo() GpuInfo { if resp.Library == "" { C.cpu_check_ram(&memInfo) resp.Library = "cpu" - resp.Variant = GetCPUVariant() + resp.Variant = cpuVariant } if memInfo.err != nil { slog.Info(fmt.Sprintf("error looking up CPU memory: %s", C.GoString(memInfo.err))) From b5cf31b4606a1faa083bd713ea9233bcf46ee570 Mon Sep 17 00:00:00 2001 From: Patrick Devine Date: Fri, 26 Jan 2024 14:28:02 -0800 Subject: [PATCH 21/22] add keep_alive to generate/chat/embedding api endpoints (#2146) --- api/types.go | 42 +++++++++++++++++++++++++----------------- server/routes.go | 26 +++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 20 deletions(-) diff --git a/api/types.go b/api/types.go index 585daf6c..609c4a8a 100644 --- a/api/types.go +++ b/api/types.go @@ -34,24 +34,26 @@ func (e StatusError) Error() string { type ImageData []byte type GenerateRequest struct { - Model string `json:"model"` - Prompt string `json:"prompt"` - System string `json:"system"` - Template string `json:"template"` - Context []int `json:"context,omitempty"` - Stream *bool `json:"stream,omitempty"` - Raw bool `json:"raw,omitempty"` - Format string `json:"format"` - Images []ImageData `json:"images,omitempty"` + Model string `json:"model"` + Prompt string `json:"prompt"` + System string `json:"system"` + Template string `json:"template"` + Context []int `json:"context,omitempty"` + Stream *bool `json:"stream,omitempty"` + Raw bool `json:"raw,omitempty"` + Format string `json:"format"` + KeepAlive *Duration `json:"keep_alive,omitempty"` + Images []ImageData `json:"images,omitempty"` Options map[string]interface{} `json:"options"` } type ChatRequest struct { - Model string `json:"model"` - Messages []Message `json:"messages"` - Stream *bool `json:"stream,omitempty"` - Format string `json:"format"` + Model string `json:"model"` + Messages []Message `json:"messages"` + Stream *bool `json:"stream,omitempty"` + Format string `json:"format"` + KeepAlive *Duration `json:"keep_alive,omitempty"` Options map[string]interface{} `json:"options"` } @@ -126,8 +128,9 @@ type Runner struct { } type EmbeddingRequest struct { - Model string `json:"model"` - Prompt string `json:"prompt"` + Model string `json:"model"` + Prompt string `json:"prompt"` + KeepAlive *Duration `json:"keep_alive,omitempty"` Options map[string]interface{} `json:"options"` } @@ 
-413,14 +416,19 @@ func (d *Duration) UnmarshalJSON(b []byte) (err error) { case float64: if t < 0 { t = math.MaxFloat64 + d.Duration = time.Duration(t) + } else { + d.Duration = time.Duration(t * float64(time.Second)) } - - d.Duration = time.Duration(t) case string: d.Duration, err = time.ParseDuration(t) if err != nil { return err } + if d.Duration < 0 { + mf := math.MaxFloat64 + d.Duration = time.Duration(mf) + } } return nil diff --git a/server/routes.go b/server/routes.go index 141f05d4..56c275c9 100644 --- a/server/routes.go +++ b/server/routes.go @@ -186,7 +186,13 @@ func GenerateHandler(c *gin.Context) { return } - sessionDuration := defaultSessionDuration + var sessionDuration time.Duration + if req.KeepAlive == nil { + sessionDuration = defaultSessionDuration + } else { + sessionDuration = req.KeepAlive.Duration + } + if err := load(c, model, opts, sessionDuration); err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return @@ -378,7 +384,14 @@ func EmbeddingHandler(c *gin.Context) { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } - sessionDuration := defaultSessionDuration + + var sessionDuration time.Duration + if req.KeepAlive == nil { + sessionDuration = defaultSessionDuration + } else { + sessionDuration = req.KeepAlive.Duration + } + if err := load(c, model, opts, sessionDuration); err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return @@ -1074,7 +1087,14 @@ func ChatHandler(c *gin.Context) { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } - sessionDuration := defaultSessionDuration + + var sessionDuration time.Duration + if req.KeepAlive == nil { + sessionDuration = defaultSessionDuration + } else { + sessionDuration = req.KeepAlive.Duration + } + if err := load(c, model, opts, sessionDuration); err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return From 59d87127f51bb6e8eb0091c12ad5e58165ddc4f2 Mon Sep 17 00:00:00 2001 From: Jagadish Krishnamoorthy Date: Fri, 26 Jan 2024 22:08:27 -0800 Subject: [PATCH 22/22] Update gpu_info_rocm.c --- gpu/gpu_info_rocm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gpu/gpu_info_rocm.c b/gpu/gpu_info_rocm.c index 2d1db7bb..7ac88611 100644 --- a/gpu/gpu_info_rocm.c +++ b/gpu/gpu_info_rocm.c @@ -178,7 +178,7 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) { const int buflen = 256; char buf[buflen + 1]; if (h.handle == NULL) { - resp->str = strdup("nvml handle not initialized"); + resp->str = strdup("rocm handle not initialized"); resp->status = 1; return; } @@ -195,4 +195,4 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) { resp->str = strdup(buf); } -#endif // __APPLE__ \ No newline at end of file +#endif // __APPLE__
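For reference, the keep_alive field added in patch 21 can be exercised as below (model name illustrative). Per the Duration handling above, a string value is parsed with time.ParseDuration and a negative value is clamped to the maximum duration, i.e. the model stays loaded indefinitely:

```sh
# keep the model loaded for 10 minutes after this request completes
curl http://localhost:11434/api/generate -d '{
  "model": "llama2",
  "prompt": "Why is the sky blue?",
  "keep_alive": "10m"
}'
```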