optional parameter to not stream response (#639)

* update streaming request accept header
* add optional stream param to request bodies

parent 77295f716e
commit 274d5a5fdf

4 changed files with 94 additions and 18 deletions
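Taken together, the changes let a caller opt out of streaming by sending `"stream": false` in the request body. As a rough sketch of the client side of that contract (not part of this commit; it assumes the `/api/generate` route and the new `DefaultHost` introduced below):

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// "stream": false asks the server for a single JSON object
	// instead of a stream of newline-delimited partial responses.
	body := []byte(`{"model": "llama2:7b", "prompt": "Why is the sky blue?", "stream": false}`)

	resp, err := http.Post("http://127.0.0.1:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// With streaming disabled, one Decode call reads the whole reply.
	var out struct {
		Response string `json:"response"`
		Done     bool   `json:"done"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println(out.Response)
}
```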
api/client.go

````diff
@@ -17,6 +17,10 @@ import (
 	"github.com/jmorganca/ollama/version"
 )
 
+const DefaultHost = "127.0.0.1:11434"
+
+var envHost = os.Getenv("OLLAMA_HOST")
+
 type Client struct {
 	base *url.URL
 	http http.Client
@@ -143,7 +147,7 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 	}
 
 	request.Header.Set("Content-Type", "application/json")
-	request.Header.Set("Accept", "application/json")
+	request.Header.Set("Accept", "application/x-ndjson")
 	request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
 
 	response, err := c.http.Do(request)
````
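The `Accept` header change reflects that streamed replies are newline-delimited JSON (NDJSON), one object per line. A minimal sketch of consuming such a stream by hand, assuming the `/api/generate` route; the field names mirror `GenerateResponse` from the types below:

```go
package main

import (
	"bufio"
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	body := []byte(`{"model": "llama2:7b", "prompt": "Why is the sky blue?"}`)
	req, err := http.NewRequest("POST", "http://127.0.0.1:11434/api/generate", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	// Mirrors the header change above: streamed replies are
	// newline-delimited JSON, one object per line.
	req.Header.Set("Accept", "application/x-ndjson")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Each line is a standalone JSON object; decode them one at a time.
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		var chunk struct {
			Response string `json:"response"`
			Done     bool   `json:"done"`
		}
		if err := json.Unmarshal(scanner.Bytes(), &chunk); err != nil {
			panic(err)
		}
		fmt.Print(chunk.Response)
		if chunk.Done {
			break
		}
	}
}
```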
api/types.go (10 changes)

````diff
@@ -37,6 +37,7 @@ type GenerateRequest struct {
 	System   string `json:"system"`
 	Template string `json:"template"`
 	Context  []int  `json:"context,omitempty"`
+	Stream   *bool  `json:"stream,omitempty"`
 
 	Options map[string]interface{} `json:"options"`
 }
@@ -55,6 +56,7 @@ type EmbeddingResponse struct {
 type CreateRequest struct {
 	Name   string `json:"name"`
 	Path   string `json:"path"`
+	Stream *bool  `json:"stream,omitempty"`
 }
 
 type DeleteRequest struct {
@@ -81,6 +83,9 @@ type CopyRequest struct {
 type PullRequest struct {
 	Name     string `json:"name"`
 	Insecure bool   `json:"insecure,omitempty"`
+	Username string `json:"username"`
+	Password string `json:"password"`
+	Stream   *bool  `json:"stream,omitempty"`
 }
 
 type ProgressResponse struct {
@@ -93,6 +98,9 @@ type ProgressResponse struct {
 type PushRequest struct {
 	Name     string `json:"name"`
 	Insecure bool   `json:"insecure,omitempty"`
+	Username string `json:"username"`
+	Password string `json:"password"`
+	Stream   *bool  `json:"stream,omitempty"`
 }
 
 type ListResponse struct {
@@ -113,7 +121,7 @@ type TokenResponse struct {
 type GenerateResponse struct {
 	Model     string    `json:"model"`
 	CreatedAt time.Time `json:"created_at"`
-	Response  string    `json:"response,omitempty"`
+	Response  string    `json:"response"`
 
 	Done    bool  `json:"done"`
 	Context []int `json:"context,omitempty"`
````
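`Stream` is a `*bool` rather than a `bool` so the API can tell "not set" apart from an explicit `false`: with `omitempty`, a plain `bool` set to `false` is the zero value and would be dropped from the JSON, leaving the server on its default (streaming) path. A small illustration of the difference, using a trimmed copy of the struct for the sketch:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Trimmed copy of the GenerateRequest shape above, for illustration only.
type GenerateRequest struct {
	Model  string `json:"model"`
	Prompt string `json:"prompt"`
	Stream *bool  `json:"stream,omitempty"`
}

func main() {
	// Nil pointer: the field is omitted entirely, so the server keeps
	// its default (streaming) behavior.
	a, _ := json.Marshal(GenerateRequest{Model: "llama2:7b", Prompt: "hi"})
	fmt.Println(string(a)) // {"model":"llama2:7b","prompt":"hi"}

	// An explicit false survives omitempty, which a plain bool could
	// never express.
	f := false
	b, _ := json.Marshal(GenerateRequest{Model: "llama2:7b", Prompt: "hi", Stream: &f})
	fmt.Println(string(b)) // {"model":"llama2:7b","prompt":"hi","stream":false}
}
```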
docs/api.md (15 changes)

````diff
@@ -12,7 +12,6 @@
 - [Push a Model](#push-a-model)
 - [Generate Embeddings](#generate-embeddings)
 
-
 ## Conventions
 
 ### Model names
@@ -40,12 +39,13 @@ Generate a response for a given prompt with a provided model. This is a streamin
 - `model`: (required) the [model name](#model-names)
 - `prompt`: the prompt to generate a response for
 
-Advanced parameters:
+Advanced parameters (optional):
 
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `system`: system prompt to (overrides what is defined in the `Modelfile`)
 - `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
 - `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
+- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 
 ### Request
 
@@ -80,6 +80,7 @@ The final response in the stream also includes additional data about the generat
 - `eval_count`: number of tokens the response
 - `eval_duration`: time in nanoseconds spent generating the response
 - `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
+- `response`: empty if the response was streamed, if not streamed, this will contain the full response
 
 To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration`.
 
@@ -87,6 +88,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 {
   "model": "llama2:7b",
   "created_at": "2023-08-04T19:22:45.499127Z",
+  "response": "",
   "context": [1, 2, 3],
   "done": true,
   "total_duration": 5589157167,
@@ -112,6 +114,7 @@ Create a model from a [`Modelfile`](./modelfile.md)
 
 - `name`: name of the model to create
 - `path`: path to the Modelfile
+- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
 
 ### Request
 
@@ -245,6 +248,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
 
 - `name`: name of the model to pull
 - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.
+- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
 
 ### Request
 
@@ -276,6 +280,7 @@ Upload a model to a model library. Requires registering for ollama.ai and adding
 
 - `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
 - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
+- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
 
 ### Request
 
@@ -297,7 +302,8 @@ and then:
 
 ```json
 {
-  "status":"starting upload","digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
+  "status": "starting upload",
+  "digest": "sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
   "total": 1928429856
 }
 ```
@@ -308,7 +314,8 @@ Then there is a series of uploading responses:
 {
   "status": "starting upload",
   "digest": "sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
-  "total":1928429856}
+  "total": 1928429856
+}
 ```
 
 Finally, when the upload is complete:
````
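The same `stream` parameter applies uniformly to the create, pull, and push endpoints. A sketch of a non-streaming pull, assuming the `/api/pull` route; per `waitForStream` in the server changes below, the reply is the single final status object rather than a progress stream:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// With "stream": false the pull endpoint replies with one final
	// status object (e.g. {"status":"success"}) instead of a stream
	// of progress objects.
	body := []byte(`{"name": "llama2:7b", "stream": false}`)
	resp, err := http.Post("http://127.0.0.1:11434/api/pull", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var status struct {
		Status string `json:"status"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&status); err != nil {
		panic(err)
	}
	fmt.Println(status.Status)
}
```

Create and push behave the same way from the caller's perspective: one JSON body back, success or error.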
server/routes.go

````diff
@@ -240,6 +240,23 @@ func GenerateHandler(c *gin.Context) {
 		}
 	}()
 
+	if req.Stream != nil && !*req.Stream {
+		var response api.GenerateResponse
+		generated := ""
+		for resp := range ch {
+			if r, ok := resp.(api.GenerateResponse); ok {
+				generated += r.Response
+				response = r
+			} else {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+				return
+			}
+		}
+		response.Response = generated
+		c.JSON(http.StatusOK, response)
+		return
+	}
+
 	streamResponse(c, ch)
 }
 
@@ -309,6 +326,11 @@ func PullModelHandler(c *gin.Context) {
 		}
 	}()
 
+	if req.Stream != nil && !*req.Stream {
+		waitForStream(c, ch)
+		return
+	}
+
 	streamResponse(c, ch)
 }
 
@@ -336,6 +358,11 @@ func PushModelHandler(c *gin.Context) {
 		}
 	}()
 
+	if req.Stream != nil && !*req.Stream {
+		waitForStream(c, ch)
+		return
+	}
+
 	streamResponse(c, ch)
 }
 
@@ -363,6 +390,11 @@ func CreateModelHandler(c *gin.Context) {
 		}
 	}()
 
+	if req.Stream != nil && !*req.Stream {
+		waitForStream(c, ch)
+		return
+	}
+
 	streamResponse(c, ch)
 }
 
@@ -603,6 +635,31 @@ func Serve(ln net.Listener, allowOrigins []string) error {
 	return s.Serve(ln)
 }
 
+func waitForStream(c *gin.Context, ch chan interface{}) {
+	c.Header("Content-Type", "application/json")
+	for resp := range ch {
+		switch r := resp.(type) {
+		case api.ProgressResponse:
+			if r.Status == "success" {
+				c.JSON(http.StatusOK, r)
+				return
+			}
+		case gin.H:
+			if errorMsg, ok := r["error"].(string); ok {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
+				return
+			} else {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in progress response"})
+				return
+			}
+		default:
+			c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected progress response"})
+			return
+		}
+	}
+	c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected end of progress response"})
+}
+
 func streamResponse(c *gin.Context, ch chan any) {
 	c.Header("Content-Type", "application/x-ndjson")
 	c.Stream(func(w io.Writer) bool {
````
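For reference, the aggregation `GenerateHandler` performs when `stream` is false reduces to a simple pattern: drain the channel, concatenate the partial responses, and keep the last message, since it carries the completion flag and final stats. A self-contained sketch of that pattern, with the type trimmed to what the pattern touches:

```go
package main

import "fmt"

// Stand-in for api.GenerateResponse, trimmed to the fields the
// pattern touches.
type generateResponse struct {
	Response string
	Done     bool
}

func main() {
	ch := make(chan generateResponse)
	go func() {
		defer close(ch)
		for _, tok := range []string{"The ", "sky ", "is ", "blue."} {
			ch <- generateResponse{Response: tok}
		}
		// The final message has an empty Response but carries the
		// completion flag (and, in the real type, timing stats).
		ch <- generateResponse{Done: true}
	}()

	var final generateResponse
	generated := ""
	for r := range ch {
		generated += r.Response
		final = r
	}
	final.Response = generated
	fmt.Printf("%+v\n", final) // {Response:The sky is blue. Done:true}
}
```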