add "stop" command (#6739)

2024-09-11 16:36:21 -07:00 · 2024-09-11 16:36:21 -07:00 · abed273de3
commit abed273de3
parent 034392624c
5 changed files with 172 additions and 25 deletions
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@ -346,6 +346,39 @@ func (w *progressWriter) Write(p []byte) (n int, err error) {
 	return len(p), nil
 }
 func loadOrUnloadModel(cmd *cobra.Command, opts *runOptions) error {
 	p := progress.NewProgress(os.Stderr)
 	defer p.StopAndClear()
 	spinner := progress.NewSpinner("")
 	p.Add("", spinner)
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		return err
 	}
 	req := &api.GenerateRequest{
 		Model:     opts.Model,
 		KeepAlive: opts.KeepAlive,
 	}
 	return client.Generate(cmd.Context(), req, func(api.GenerateResponse) error { return nil })
 }
 func StopHandler(cmd *cobra.Command, args []string) error {
 	opts := &runOptions{
 		Model:     args[0],
 		KeepAlive: &api.Duration{Duration: 0},
 	}
 	if err := loadOrUnloadModel(cmd, opts); err != nil {
 		if strings.Contains(err.Error(), "not found") {
 			return fmt.Errorf("couldn't find model \"%s\" to stop", args[0])
 		}
 	}
 	return nil
 }
 func RunHandler(cmd *cobra.Command, args []string) error {
 	interactive := true
@ -424,7 +457,7 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	opts.ParentModel = info.Details.ParentModel
 	if interactive {
-		if err := loadModel(cmd, &opts); err != nil {
+		if err := loadOrUnloadModel(cmd, &opts); err != nil {
 			return err
 		}
@ -615,7 +648,15 @@ func ListRunningHandler(cmd *cobra.Command, args []string) error {
 				cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
 				procStr = fmt.Sprintf("%d%%/%d%% CPU/GPU", int(cpuPercent), int(100-cpuPercent))
 			}
-			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, format.HumanTime(m.ExpiresAt, "Never")})
+
 			var until string
 			delta := time.Since(m.ExpiresAt)
 			if delta > 0 {
 				until = "Stopping..."
 			} else {
 				until = format.HumanTime(m.ExpiresAt, "Never")
 			}
 			data = append(data, []string{m.Name, m.Digest[:12], format.HumanBytes(m.Size), procStr, until})
 		}
 	}
@ -1294,6 +1335,15 @@ func NewCLI() *cobra.Command {
 	runCmd.Flags().Bool("insecure", false, "Use an insecure registry")
 	runCmd.Flags().Bool("nowordwrap", false, "Don't wrap words to the next line automatically")
 	runCmd.Flags().String("format", "", "Response format (e.g. json)")
 	stopCmd := &cobra.Command{
 		Use:     "stop MODEL",
 		Short:   "Stop a running model",
 		Args:    cobra.ExactArgs(1),
 		PreRunE: checkServerHeartbeat,
 		RunE:    StopHandler,
 	}
 	serveCmd := &cobra.Command{
 		Use:     "serve",
 		Aliases: []string{"start"},
@ -1361,6 +1411,7 @@ func NewCLI() *cobra.Command {
 		createCmd,
 		showCmd,
 		runCmd,
 		stopCmd,
 		pullCmd,
 		pushCmd,
 		listCmd,
@ -1400,6 +1451,7 @@ func NewCLI() *cobra.Command {
 		createCmd,
 		showCmd,
 		runCmd,
 		stopCmd,
 		pullCmd,
 		pushCmd,
 		listCmd,
--- a/cmd/interactive.go
+++ b/cmd/interactive.go
@ -18,7 +18,6 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/readline"
 	"github.com/ollama/ollama/types/errtypes"
 )
@ -31,26 +30,6 @@ const (
 	MultilineSystem
 )
 func loadModel(cmd *cobra.Command, opts *runOptions) error {
 	p := progress.NewProgress(os.Stderr)
 	defer p.StopAndClear()
 	spinner := progress.NewSpinner("")
 	p.Add("", spinner)
 	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		return err
 	}
 	chatReq := &api.ChatRequest{
 		Model:     opts.Model,
 		KeepAlive: opts.KeepAlive,
 	}
 	return client.Chat(cmd.Context(), chatReq, func(api.ChatResponse) error { return nil })
 }
 func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 	usage := func() {
 		fmt.Fprintln(os.Stderr, "Available Commands:")
@ -217,7 +196,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 			opts.Model = args[1]
 			opts.Messages = []api.Message{}
 			fmt.Printf("Loading model '%s'\n", opts.Model)
-			if err := loadModel(cmd, &opts); err != nil {
+			if err := loadOrUnloadModel(cmd, &opts); err != nil {
 				return err
 			}
 			continue
--- a/server/routes.go
+++ b/server/routes.go
@ -117,6 +117,32 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}
 	// expire the runner
 	if req.Prompt == "" && req.KeepAlive != nil && int(req.KeepAlive.Seconds()) == 0 {
 		model, err := GetModel(req.Model)
 		if err != nil {
 			switch {
 			case os.IsNotExist(err):
 				c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
 			case err.Error() == "invalid model name":
 				c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 			default:
 				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 			}
 			return
 		}
 		s.sched.expireRunner(model)
 		c.JSON(http.StatusOK, api.GenerateResponse{
 			Model:      req.Model,
 			CreatedAt:  time.Now().UTC(),
 			Response:   "",
 			Done:       true,
 			DoneReason: "unload",
 		})
 		return
 	}
 	if req.Format != "" && req.Format != "json" {
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be empty or \"json\""})
 		return
@ -1322,6 +1348,32 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		return
 	}
 	// expire the runner
 	if len(req.Messages) == 0 && req.KeepAlive != nil && int(req.KeepAlive.Seconds()) == 0 {
 		model, err := GetModel(req.Model)
 		if err != nil {
 			switch {
 			case os.IsNotExist(err):
 				c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model '%s' not found", req.Model)})
 			case err.Error() == "invalid model name":
 				c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
 			default:
 				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 			}
 			return
 		}
 		s.sched.expireRunner(model)
 		c.JSON(http.StatusOK, api.ChatResponse{
 			Model:      req.Model,
 			CreatedAt:  time.Now().UTC(),
 			Message:    api.Message{Role: "assistant"},
 			Done:       true,
 			DoneReason: "unload",
 		})
 		return
 	}
 	caps := []Capability{CapabilityCompletion}
 	if len(req.Tools) > 0 {
 		caps = append(caps, CapabilityTools)
--- a/server/sched.go
+++ b/server/sched.go
@ -360,7 +360,6 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
 			slog.Debug("runner expired event received", "modelPath", runner.modelPath)
 			runner.refMu.Lock()
 			if runner.refCount > 0 {
 				// Shouldn't happen, but safeguard to ensure no leaked runners
 				slog.Debug("expired event with positive ref count, retrying", "modelPath", runner.modelPath, "refCount", runner.refCount)
 				go func(runner *runnerRef) {
 					// We can't unload yet, but want to as soon as the current request completes
@ -802,6 +801,25 @@ func (s *Scheduler) unloadAllRunners() {
 	}
 }
 func (s *Scheduler) expireRunner(model *Model) {
 	s.loadedMu.Lock()
 	defer s.loadedMu.Unlock()
 	runner, ok := s.loaded[model.ModelPath]
 	if ok {
 		runner.refMu.Lock()
 		runner.expiresAt = time.Now()
 		if runner.expireTimer != nil {
 			runner.expireTimer.Stop()
 			runner.expireTimer = nil
 		}
 		runner.sessionDuration = 0
 		if runner.refCount <= 0 {
 			s.expiredCh <- runner
 		}
 		runner.refMu.Unlock()
 	}
 }
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
 func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
--- a/server/sched_test.go
+++ b/server/sched_test.go
@ -406,6 +406,52 @@ func TestGetRunner(t *testing.T) {
 	b.ctxDone()
 }
 func TestExpireRunner(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
 	req := &LlmRequest{
 		ctx:             ctx,
 		model:           &Model{ModelPath: "foo"},
 		opts:            api.DefaultOptions(),
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
 		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
 	}
 	var ggml *llm.GGML
 	gpus := gpu.GpuInfoList{}
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
 	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return server, nil
 	}
 	s.load(req, ggml, gpus, 0)
 	select {
 	case err := <-req.errCh:
 		if err != nil {
 			t.Fatalf("expected no errors when loading, got '%s'", err.Error())
 		}
 	case resp := <-req.successCh:
 		s.loadedMu.Lock()
 		if resp.refCount != uint(1) || len(s.loaded) != 1 {
 			t.Fatalf("expected a model to be loaded")
 		}
 		s.loadedMu.Unlock()
 	}
 	s.expireRunner(&Model{ModelPath: "foo"})
 	s.finishedReqCh <- req
 	s.processCompleted(ctx)
 	s.loadedMu.Lock()
 	if len(s.loaded) != 0 {
 		t.Fatalf("expected model to be unloaded")
 	}
 	s.loadedMu.Unlock()
 }
 // TODO - add one scenario that triggers the bogus finished event with positive ref count
 func TestPrematureExpired(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)