optional parameter to not stream response (#639)

* update streaming request accept header * add optional stream param to request bodies
2023-10-11 12:54:27 -04:00 · 2023-10-11 12:54:27 -04:00 · 274d5a5fdf
commit 274d5a5fdf
parent 77295f716e
4 changed files with 94 additions and 18 deletions
--- a/api/client.go
+++ b/api/client.go
@ -17,6 +17,10 @@ import (
 	"github.com/jmorganca/ollama/version"
 )

+const DefaultHost = "127.0.0.1:11434" // loopback host:port literal; NOTE(review): presumably the fallback when OLLAMA_HOST is unset — confirm at the use site
+
+var envHost = os.Getenv("OLLAMA_HOST") // read once, when the package is initialized
+
 type Client struct {
 	base *url.URL
 	http http.Client
@ -143,7 +147,7 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 	}

 	request.Header.Set("Content-Type", "application/json")
-	request.Header.Set("Accept", "application/json")
+	request.Header.Set("Accept", "application/x-ndjson")
 	request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))

 	response, err := c.http.Do(request)
--- a/api/types.go
+++ b/api/types.go
@ -37,6 +37,7 @@ type GenerateRequest struct {
 	System   string `json:"system"`
 	Template string `json:"template"`
 	Context  []int  `json:"context,omitempty"`
+	Stream   *bool  `json:"stream,omitempty"`

 	Options map[string]interface{} `json:"options"`
 }
@ -53,8 +54,9 @@ type EmbeddingResponse struct {
 }

 type CreateRequest struct {
-	Name string `json:"name"`
-	Path string `json:"path"`
+	Name   string `json:"name"`
+	Path   string `json:"path"`
+	Stream *bool  `json:"stream,omitempty"` // optional; the create handler replies with a single JSON object only when this is non-nil and false
 }

 type DeleteRequest struct {
@ -81,6 +83,9 @@ type CopyRequest struct {
 type PullRequest struct {
 	Name     string `json:"name"`
 	Insecure bool   `json:"insecure,omitempty"`
+	Username string `json:"username"` // NOTE(review): no omitempty, so serialized even when empty; presumably registry credentials — confirm
+	Password string `json:"password"` // NOTE(review): no omitempty, so serialized even when empty; presumably registry credentials — confirm
+	Stream   *bool  `json:"stream,omitempty"` // optional; the pull handler replies with a single JSON object only when this is non-nil and false
 }

 type ProgressResponse struct {
@ -93,6 +98,9 @@ type ProgressResponse struct {
 type PushRequest struct {
 	Name     string `json:"name"`
 	Insecure bool   `json:"insecure,omitempty"`
+	Username string `json:"username"` // NOTE(review): no omitempty, so serialized even when empty; presumably registry credentials — confirm
+	Password string `json:"password"` // NOTE(review): no omitempty, so serialized even when empty; presumably registry credentials — confirm
+	Stream   *bool  `json:"stream,omitempty"` // optional; the push handler replies with a single JSON object only when this is non-nil and false
 }

 type ListResponse struct {
@ -113,7 +121,7 @@ type TokenResponse struct {
 type GenerateResponse struct {
 	Model     string    `json:"model"`
 	CreatedAt time.Time `json:"created_at"`
-	Response  string    `json:"response,omitempty"`
+	Response  string    `json:"response"`

 	Done    bool  `json:"done"`
 	Context []int `json:"context,omitempty"`
--- a/docs/api.md
+++ b/docs/api.md
@ -12,7 +12,6 @@
 - [Push a Model](#push-a-model)
 - [Generate Embeddings](#generate-embeddings)

-
 ## Conventions

 ### Model names
@ -40,12 +39,13 @@ Generate a response for a given prompt with a provided model. This is a streamin
 - `model`: (required) the [model name](#model-names)
 - `prompt`: the prompt to generate a response for

-Advanced parameters:
+Advanced parameters (optional):

 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `system`: system prompt to use (overrides what is defined in the `Modelfile`)
 - `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
 - `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
+- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects

 ### Request

@ -80,6 +80,7 @@ The final response in the stream also includes additional data about the generat
 - `eval_count`: number of tokens in the response
 - `eval_duration`: time in nanoseconds spent generating the response
 - `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
+- `response`: empty if the response was streamed; if not streamed, this will contain the full response

 To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` by `eval_duration` and multiply by 10^9, because `eval_duration` is measured in nanoseconds.

@ -87,6 +88,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 {
  "model": "llama2:7b",
  "created_at": "2023-08-04T19:22:45.499127Z",
+  "response": "",
  "context": [1, 2, 3],
  "done": true,
  "total_duration": 5589157167,
@ -112,6 +114,7 @@ Create a model from a [`Modelfile`](./modelfile.md)

 - `name`: name of the model to create
 - `path`: path to the Modelfile
+- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects

 ### Request

@ -189,10 +192,10 @@ curl http://localhost:11434/api/show -d '{

 ```json
 {
-    "license": "<contents of license block>",
-    "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
-    "parameters": "stop                           [INST]\nstop                           [/INST]\nstop                           <<SYS>>\nstop                           <</SYS>>",
-    "template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
+  "license": "<contents of license block>",
+  "modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
+  "parameters": "stop                           [INST]\nstop                           [/INST]\nstop                           <<SYS>>\nstop                           <</SYS>>",
+  "template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
 }
 ```

@ -245,6 +248,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where

 - `name`: name of the model to pull
 - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.
+- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects

 ### Request

@ -276,6 +280,7 @@ Upload a model to a model library. Requires registering for ollama.ai and adding

 - `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
 - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
+- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects

 ### Request

@ -290,15 +295,16 @@ curl -X POST http://localhost:11434/api/push -d '{
 Streaming response that starts with:

 ```json
-{"status":"retrieving manifest"}
+{ "status": "retrieving manifest" }
 ```

 and then:

 ```json
 {
-"status":"starting upload","digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
-"total":1928429856
+  "status": "starting upload",
+  "digest": "sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
+  "total": 1928429856
 }
 ```

@ -306,9 +312,10 @@ Then there is a series of uploading responses:

 ```json
 {
-"status":"starting upload",
-"digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
-"total":1928429856}
+  "status": "starting upload",
+  "digest": "sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
+  "total": 1928429856
+}
 ```

 Finally, when the upload is complete:
--- a/server/routes.go
+++ b/server/routes.go
@ -240,6 +240,23 @@ func GenerateHandler(c *gin.Context) {
 		}
 	}()

+	// A client that sends "stream": false gets a single JSON object: collect
+	// every message from ch and reply once with the concatenated text.
+	if req.Stream != nil && !*req.Stream {
+		var response api.GenerateResponse
+		generated := ""
+		for resp := range ch {
+			switch r := resp.(type) {
+			case api.GenerateResponse:
+				generated += r.Response
+				// Keep the most recent message; its other fields are sent in the reply.
+				response = r
+			case gin.H:
+				// The outer err is not assigned inside this loop, so it cannot
+				// describe this message; report the message's own "error" text.
+				// NOTE(review): assumes failures arrive on ch as
+				// gin.H{"error": ...} — confirm against whatever feeds ch.
+				errorMsg, ok := r["error"].(string)
+				if !ok {
+					errorMsg = "unexpected error format in response"
+				}
+				c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
+				return
+			default:
+				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected response"})
+				return
+			}
+		}
+		response.Response = generated
+		c.JSON(http.StatusOK, response)
+		return
+	}
+
 	streamResponse(c, ch)
 }

@ -309,6 +326,11 @@ func PullModelHandler(c *gin.Context) {
 		}
 	}()

+	if req.Stream != nil && !*req.Stream {
+		waitForStream(c, ch)
+		return
+	}
+
 	streamResponse(c, ch)
 }

@ -336,6 +358,11 @@ func PushModelHandler(c *gin.Context) {
 		}
 	}()

+	if req.Stream != nil && !*req.Stream {
+		waitForStream(c, ch)
+		return
+	}
+
 	streamResponse(c, ch)
 }

@ -363,6 +390,11 @@ func CreateModelHandler(c *gin.Context) {
 		}
 	}()

+	if req.Stream != nil && !*req.Stream {
+		waitForStream(c, ch)
+		return
+	}
+
 	streamResponse(c, ch)
 }

@ -603,6 +635,31 @@ func Serve(ln net.Listener, allowOrigins []string) error {
 	return s.Serve(ln)
 }

+// waitForStream reads messages from ch and writes exactly one JSON reply to c.
+// It answers 200 with the first api.ProgressResponse whose Status is
+// "success"; other progress messages are ignored. A gin.H message, a message
+// of any other type, or the channel closing before a success message each
+// produce a 500 reply with an "error" field.
+func waitForStream(c *gin.Context, ch chan interface{}) {
+	c.Header("Content-Type", "application/json")
+
+	// fail writes the 500 reply shared by every error path below.
+	fail := func(text string) {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": text})
+	}
+
+	for msg := range ch {
+		switch v := msg.(type) {
+		case api.ProgressResponse:
+			if v.Status != "success" {
+				// Intermediate progress update: keep waiting.
+				continue
+			}
+			c.JSON(http.StatusOK, v)
+			return
+		case gin.H:
+			text, isString := v["error"].(string)
+			if !isString {
+				text = "unexpected error format in progress response"
+			}
+			fail(text)
+			return
+		default:
+			fail("unexpected progress response")
+			return
+		}
+	}
+
+	// The channel was closed without ever delivering a success message.
+	fail("unexpected end of progress response")
+}
+
 func streamResponse(c *gin.Context, ch chan any) {
 	c.Header("Content-Type", "application/x-ndjson")
 	c.Stream(func(w io.Writer) bool {