optional parameter to not stream response (#639)
* update streaming request accept header * add optional stream param to request bodies
This commit is contained in:
parent
77295f716e
commit
274d5a5fdf
4 changed files with 94 additions and 18 deletions
|
@ -17,6 +17,10 @@ import (
|
|||
"github.com/jmorganca/ollama/version"
|
||||
)
|
||||
|
||||
const DefaultHost = "127.0.0.1:11434"
|
||||
|
||||
var envHost = os.Getenv("OLLAMA_HOST")
|
||||
|
||||
type Client struct {
|
||||
base *url.URL
|
||||
http http.Client
|
||||
|
@ -143,7 +147,7 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
|
|||
}
|
||||
|
||||
request.Header.Set("Content-Type", "application/json")
|
||||
request.Header.Set("Accept", "application/json")
|
||||
request.Header.Set("Accept", "application/x-ndjson")
|
||||
request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
|
||||
|
||||
response, err := c.http.Do(request)
|
||||
|
|
10
api/types.go
10
api/types.go
|
@ -37,6 +37,7 @@ type GenerateRequest struct {
|
|||
System string `json:"system"`
|
||||
Template string `json:"template"`
|
||||
Context []int `json:"context,omitempty"`
|
||||
Stream *bool `json:"stream,omitempty"`
|
||||
|
||||
Options map[string]interface{} `json:"options"`
|
||||
}
|
||||
|
@ -55,6 +56,7 @@ type EmbeddingResponse struct {
|
|||
type CreateRequest struct {
|
||||
Name string `json:"name"`
|
||||
Path string `json:"path"`
|
||||
Stream *bool `json:"stream,omitempty"`
|
||||
}
|
||||
|
||||
type DeleteRequest struct {
|
||||
|
@ -81,6 +83,9 @@ type CopyRequest struct {
|
|||
type PullRequest struct {
|
||||
Name string `json:"name"`
|
||||
Insecure bool `json:"insecure,omitempty"`
|
||||
Username string `json:"username"`
|
||||
Password string `json:"password"`
|
||||
Stream *bool `json:"stream,omitempty"`
|
||||
}
|
||||
|
||||
type ProgressResponse struct {
|
||||
|
@ -93,6 +98,9 @@ type ProgressResponse struct {
|
|||
type PushRequest struct {
|
||||
Name string `json:"name"`
|
||||
Insecure bool `json:"insecure,omitempty"`
|
||||
Username string `json:"username"`
|
||||
Password string `json:"password"`
|
||||
Stream *bool `json:"stream,omitempty"`
|
||||
}
|
||||
|
||||
type ListResponse struct {
|
||||
|
@ -113,7 +121,7 @@ type TokenResponse struct {
|
|||
type GenerateResponse struct {
|
||||
Model string `json:"model"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
Response string `json:"response,omitempty"`
|
||||
Response string `json:"response"`
|
||||
|
||||
Done bool `json:"done"`
|
||||
Context []int `json:"context,omitempty"`
|
||||
|
|
23
docs/api.md
23
docs/api.md
|
@ -12,7 +12,6 @@
|
|||
- [Push a Model](#push-a-model)
|
||||
- [Generate Embeddings](#generate-embeddings)
|
||||
|
||||
|
||||
## Conventions
|
||||
|
||||
### Model names
|
||||
|
@ -40,12 +39,13 @@ Generate a response for a given prompt with a provided model. This is a streamin
|
|||
- `model`: (required) the [model name](#model-names)
|
||||
- `prompt`: the prompt to generate a response for
|
||||
|
||||
Advanced parameters:
|
||||
Advanced parameters (optional):
|
||||
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||
- `system`: system prompt to (overrides what is defined in the `Modelfile`)
|
||||
- `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
|
||||
- `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
|
||||
- `stream`: if `false` the response will be be returned as a single response object, rather than a stream of objects
|
||||
|
||||
### Request
|
||||
|
||||
|
@ -80,6 +80,7 @@ The final response in the stream also includes additional data about the generat
|
|||
- `eval_count`: number of tokens the response
|
||||
- `eval_duration`: time in nanoseconds spent generating the response
|
||||
- `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
|
||||
- `response`: empty if the response was streamed, if not streamed, this will contain the full response
|
||||
|
||||
To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration`.
|
||||
|
||||
|
@ -87,6 +88,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
|
|||
{
|
||||
"model": "llama2:7b",
|
||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||
"response": "",
|
||||
"context": [1, 2, 3],
|
||||
"done": true,
|
||||
"total_duration": 5589157167,
|
||||
|
@ -112,6 +114,7 @@ Create a model from a [`Modelfile`](./modelfile.md)
|
|||
|
||||
- `name`: name of the model to create
|
||||
- `path`: path to the Modelfile
|
||||
- `stream`: (optional) if `false` the response will be be returned as a single response object, rather than a stream of objects
|
||||
|
||||
### Request
|
||||
|
||||
|
@ -245,6 +248,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
|
|||
|
||||
- `name`: name of the model to pull
|
||||
- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.
|
||||
- `stream`: (optional) if `false` the response will be be returned as a single response object, rather than a stream of objects
|
||||
|
||||
### Request
|
||||
|
||||
|
@ -276,6 +280,7 @@ Upload a model to a model library. Requires registering for ollama.ai and adding
|
|||
|
||||
- `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
|
||||
- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
|
||||
- `stream`: (optional) if `false` the response will be be returned as a single response object, rather than a stream of objects
|
||||
|
||||
### Request
|
||||
|
||||
|
@ -290,15 +295,16 @@ curl -X POST http://localhost:11434/api/push -d '{
|
|||
Streaming response that starts with:
|
||||
|
||||
```json
|
||||
{"status":"retrieving manifest"}
|
||||
{ "status": "retrieving manifest" }
|
||||
```
|
||||
|
||||
and then:
|
||||
|
||||
```json
|
||||
{
|
||||
"status":"starting upload","digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
|
||||
"total":1928429856
|
||||
"status": "starting upload",
|
||||
"digest": "sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
|
||||
"total": 1928429856
|
||||
}
|
||||
```
|
||||
|
||||
|
@ -306,9 +312,10 @@ Then there is a series of uploading responses:
|
|||
|
||||
```json
|
||||
{
|
||||
"status":"starting upload",
|
||||
"digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
|
||||
"total":1928429856}
|
||||
"status": "starting upload",
|
||||
"digest": "sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
|
||||
"total": 1928429856
|
||||
}
|
||||
```
|
||||
|
||||
Finally, when the upload is complete:
|
||||
|
|
|
@ -240,6 +240,23 @@ func GenerateHandler(c *gin.Context) {
|
|||
}
|
||||
}()
|
||||
|
||||
if req.Stream != nil && !*req.Stream {
|
||||
var response api.GenerateResponse
|
||||
generated := ""
|
||||
for resp := range ch {
|
||||
if r, ok := resp.(api.GenerateResponse); ok {
|
||||
generated += r.Response
|
||||
response = r
|
||||
} else {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
}
|
||||
response.Response = generated
|
||||
c.JSON(http.StatusOK, response)
|
||||
return
|
||||
}
|
||||
|
||||
streamResponse(c, ch)
|
||||
}
|
||||
|
||||
|
@ -309,6 +326,11 @@ func PullModelHandler(c *gin.Context) {
|
|||
}
|
||||
}()
|
||||
|
||||
if req.Stream != nil && !*req.Stream {
|
||||
waitForStream(c, ch)
|
||||
return
|
||||
}
|
||||
|
||||
streamResponse(c, ch)
|
||||
}
|
||||
|
||||
|
@ -336,6 +358,11 @@ func PushModelHandler(c *gin.Context) {
|
|||
}
|
||||
}()
|
||||
|
||||
if req.Stream != nil && !*req.Stream {
|
||||
waitForStream(c, ch)
|
||||
return
|
||||
}
|
||||
|
||||
streamResponse(c, ch)
|
||||
}
|
||||
|
||||
|
@ -363,6 +390,11 @@ func CreateModelHandler(c *gin.Context) {
|
|||
}
|
||||
}()
|
||||
|
||||
if req.Stream != nil && !*req.Stream {
|
||||
waitForStream(c, ch)
|
||||
return
|
||||
}
|
||||
|
||||
streamResponse(c, ch)
|
||||
}
|
||||
|
||||
|
@ -603,6 +635,31 @@ func Serve(ln net.Listener, allowOrigins []string) error {
|
|||
return s.Serve(ln)
|
||||
}
|
||||
|
||||
func waitForStream(c *gin.Context, ch chan interface{}) {
|
||||
c.Header("Content-Type", "application/json")
|
||||
for resp := range ch {
|
||||
switch r := resp.(type) {
|
||||
case api.ProgressResponse:
|
||||
if r.Status == "success" {
|
||||
c.JSON(http.StatusOK, r)
|
||||
return
|
||||
}
|
||||
case gin.H:
|
||||
if errorMsg, ok := r["error"].(string); ok {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
|
||||
return
|
||||
} else {
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in progress response"})
|
||||
return
|
||||
}
|
||||
default:
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected progress response"})
|
||||
return
|
||||
}
|
||||
}
|
||||
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected end of progress response"})
|
||||
}
|
||||
|
||||
func streamResponse(c *gin.Context, ch chan any) {
|
||||
c.Header("Content-Type", "application/x-ndjson")
|
||||
c.Stream(func(w io.Writer) bool {
|
||||
|
|
Loading…
Reference in a new issue