optional parameter to not stream response (#639)

* update streaming request accept header
* add optional stream param to request bodies

parent 77295f716e
commit 274d5a5fdf

4 changed files with 94 additions and 18 deletions
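Taken together, the changes let a caller opt out of streaming by sending `"stream": false` in the request body. As a rough sketch of the client side of that contract (not part of this commit; it assumes the `/api/generate` route and the new `DefaultHost` introduced below):

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// "stream": false asks the server for a single JSON object
	// instead of a stream of newline-delimited partial responses.
	body := []byte(`{"model": "llama2:7b", "prompt": "Why is the sky blue?", "stream": false}`)

	resp, err := http.Post("http://127.0.0.1:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// With streaming disabled, one Decode call reads the whole reply.
	var out struct {
		Response string `json:"response"`
		Done     bool   `json:"done"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println(out.Response)
}
```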
api/client.go

````diff
@@ -17,6 +17,10 @@ import (
 	"github.com/jmorganca/ollama/version"
 )
 
+const DefaultHost = "127.0.0.1:11434"
+
+var envHost = os.Getenv("OLLAMA_HOST")
+
 type Client struct {
 	base *url.URL
 	http http.Client
@@ -143,7 +147,7 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
 	}
 
 	request.Header.Set("Content-Type", "application/json")
-	request.Header.Set("Accept", "application/json")
+	request.Header.Set("Accept", "application/x-ndjson")
 	request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
 
 	response, err := c.http.Do(request)
````
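The `Accept` header change reflects that streamed replies are newline-delimited JSON (NDJSON), one object per line. A minimal sketch of consuming such a stream by hand, assuming the `/api/generate` route; the field names mirror `GenerateResponse` from the types below:

```go
package main

import (
	"bufio"
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	body := []byte(`{"model": "llama2:7b", "prompt": "Why is the sky blue?"}`)
	req, err := http.NewRequest("POST", "http://127.0.0.1:11434/api/generate", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	// Mirrors the header change above: streamed replies are
	// newline-delimited JSON, one object per line.
	req.Header.Set("Accept", "application/x-ndjson")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Each line is a standalone JSON object; decode them one at a time.
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		var chunk struct {
			Response string `json:"response"`
			Done     bool   `json:"done"`
		}
		if err := json.Unmarshal(scanner.Bytes(), &chunk); err != nil {
			panic(err)
		}
		fmt.Print(chunk.Response)
		if chunk.Done {
			break
		}
	}
}
```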
api/types.go (10 changes)

````diff
@@ -37,6 +37,7 @@ type GenerateRequest struct {
 	System   string `json:"system"`
 	Template string `json:"template"`
 	Context  []int  `json:"context,omitempty"`
+	Stream   *bool  `json:"stream,omitempty"`
 
 	Options map[string]interface{} `json:"options"`
 }
@@ -55,6 +56,7 @@ type EmbeddingResponse struct {
 type CreateRequest struct {
 	Name   string `json:"name"`
 	Path   string `json:"path"`
+	Stream *bool  `json:"stream,omitempty"`
 }
 
 type DeleteRequest struct {
@@ -81,6 +83,9 @@ type CopyRequest struct {
 type PullRequest struct {
 	Name     string `json:"name"`
 	Insecure bool   `json:"insecure,omitempty"`
+	Username string `json:"username"`
+	Password string `json:"password"`
+	Stream   *bool  `json:"stream,omitempty"`
 }
 
 type ProgressResponse struct {
@@ -93,6 +98,9 @@ type ProgressResponse struct {
 type PushRequest struct {
 	Name     string `json:"name"`
 	Insecure bool   `json:"insecure,omitempty"`
+	Username string `json:"username"`
+	Password string `json:"password"`
+	Stream   *bool  `json:"stream,omitempty"`
 }
 
 type ListResponse struct {
@@ -113,7 +121,7 @@ type TokenResponse struct {
 type GenerateResponse struct {
 	Model     string    `json:"model"`
 	CreatedAt time.Time `json:"created_at"`
-	Response  string    `json:"response,omitempty"`
+	Response  string    `json:"response"`
 
 	Done    bool  `json:"done"`
 	Context []int `json:"context,omitempty"`
````
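`Stream` is a `*bool` rather than a `bool` so the API can tell "not set" apart from an explicit `false`: with `omitempty`, a plain `bool` set to `false` is the zero value and would be dropped from the JSON, leaving the server on its default (streaming) path. A small illustration of the difference, using a trimmed copy of the struct for the sketch:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Trimmed copy of the GenerateRequest shape above, for illustration only.
type GenerateRequest struct {
	Model  string `json:"model"`
	Prompt string `json:"prompt"`
	Stream *bool  `json:"stream,omitempty"`
}

func main() {
	// Nil pointer: the field is omitted entirely, so the server keeps
	// its default (streaming) behavior.
	a, _ := json.Marshal(GenerateRequest{Model: "llama2:7b", Prompt: "hi"})
	fmt.Println(string(a)) // {"model":"llama2:7b","prompt":"hi"}

	// An explicit false survives omitempty, which a plain bool could
	// never express.
	f := false
	b, _ := json.Marshal(GenerateRequest{Model: "llama2:7b", Prompt: "hi", Stream: &f})
	fmt.Println(string(b)) // {"model":"llama2:7b","prompt":"hi","stream":false}
}
```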
docs/api.md (15 changes)

````diff
@@ -12,7 +12,6 @@
 - [Push a Model](#push-a-model)
 - [Generate Embeddings](#generate-embeddings)
 
-
 ## Conventions
 
 ### Model names
@@ -40,12 +39,13 @@ Generate a response for a given prompt with a provided model. This is a streamin
 - `model`: (required) the [model name](#model-names)
 - `prompt`: the prompt to generate a response for
 
-Advanced parameters:
+Advanced parameters (optional):
 
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `system`: system prompt to (overrides what is defined in the `Modelfile`)
 - `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
 - `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
+- `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 
 ### Request
 
@@ -80,6 +80,7 @@ The final response in the stream also includes additional data about the generat
 - `eval_count`: number of tokens the response
 - `eval_duration`: time in nanoseconds spent generating the response
 - `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
+- `response`: empty if the response was streamed, if not streamed, this will contain the full response
 
 To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` / `eval_duration`.
 
@@ -87,6 +88,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 {
   "model": "llama2:7b",
   "created_at": "2023-08-04T19:22:45.499127Z",
+  "response": "",
   "context": [1, 2, 3],
   "done": true,
   "total_duration": 5589157167,
@@ -112,6 +114,7 @@ Create a model from a [`Modelfile`](./modelfile.md)
 
 - `name`: name of the model to create
 - `path`: path to the Modelfile
+- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
 
 ### Request
 
@@ -245,6 +248,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
 
 - `name`: name of the model to pull
 - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.
+- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
 
 ### Request
 
@@ -276,6 +280,7 @@ Upload a model to a model library. Requires registering for ollama.ai and adding
 
 - `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
 - `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
+- `stream`: (optional) if `false` the response will be returned as a single response object, rather than a stream of objects
 
 ### Request
 
@@ -297,7 +302,8 @@ and then:
 
 ```json
 {
-  "status":"starting upload","digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
+  "status": "starting upload",
+  "digest": "sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
   "total": 1928429856
 }
 ```
@@ -308,7 +314,8 @@ Then there is a series of uploading responses:
 {
   "status": "starting upload",
   "digest": "sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
-  "total":1928429856}
+  "total": 1928429856
+}
 ```
 
 Finally, when the upload is complete:
````
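The same `stream` parameter applies uniformly to the create, pull, and push endpoints. A sketch of a non-streaming pull, assuming the `/api/pull` route; per `waitForStream` in the server changes below, the reply is the single final status object rather than a progress stream:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// With "stream": false the pull endpoint replies with one final
	// status object (e.g. {"status":"success"}) instead of a stream
	// of progress objects.
	body := []byte(`{"name": "llama2:7b", "stream": false}`)
	resp, err := http.Post("http://127.0.0.1:11434/api/pull", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var status struct {
		Status string `json:"status"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&status); err != nil {
		panic(err)
	}
	fmt.Println(status.Status)
}
```

Create and push behave the same way from the caller's perspective: one JSON body back, success or error.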
server/routes.go

````diff
@@ -240,6 +240,23 @@ func GenerateHandler(c *gin.Context) {
 		}
 	}()
 
+	if req.Stream != nil && !*req.Stream {
+		var response api.GenerateResponse
+		generated := ""
+		for resp := range ch {
+			if r, ok := resp.(api.GenerateResponse); ok {
+				generated += r.Response
+				response = r
+			} else {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
+				return
+			}
+		}
+		response.Response = generated
+		c.JSON(http.StatusOK, response)
+		return
+	}
+
 	streamResponse(c, ch)
 }
 
@@ -309,6 +326,11 @@ func PullModelHandler(c *gin.Context) {
 		}
 	}()
 
+	if req.Stream != nil && !*req.Stream {
+		waitForStream(c, ch)
+		return
+	}
+
 	streamResponse(c, ch)
 }
 
@@ -336,6 +358,11 @@ func PushModelHandler(c *gin.Context) {
 		}
 	}()
 
+	if req.Stream != nil && !*req.Stream {
+		waitForStream(c, ch)
+		return
+	}
+
 	streamResponse(c, ch)
 }
 
@@ -363,6 +390,11 @@ func CreateModelHandler(c *gin.Context) {
 		}
 	}()
 
+	if req.Stream != nil && !*req.Stream {
+		waitForStream(c, ch)
+		return
+	}
+
 	streamResponse(c, ch)
 }
 
@@ -603,6 +635,31 @@ func Serve(ln net.Listener, allowOrigins []string) error {
 	return s.Serve(ln)
 }
 
+func waitForStream(c *gin.Context, ch chan interface{}) {
+	c.Header("Content-Type", "application/json")
+	for resp := range ch {
+		switch r := resp.(type) {
+		case api.ProgressResponse:
+			if r.Status == "success" {
+				c.JSON(http.StatusOK, r)
+				return
+			}
+		case gin.H:
+			if errorMsg, ok := r["error"].(string); ok {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
+				return
+			} else {
+				c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in progress response"})
+				return
+			}
+		default:
+			c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected progress response"})
+			return
+		}
+	}
+	c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected end of progress response"})
+}
+
 func streamResponse(c *gin.Context, ch chan any) {
 	c.Header("Content-Type", "application/x-ndjson")
 	c.Stream(func(w io.Writer) bool {
````
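For reference, the aggregation `GenerateHandler` performs when `stream` is false reduces to a simple pattern: drain the channel, concatenate the partial responses, and keep the last message, since it carries the completion flag and final stats. A self-contained sketch of that pattern, with the type trimmed to what the pattern touches:

```go
package main

import "fmt"

// Stand-in for api.GenerateResponse, trimmed to the fields the
// pattern touches.
type generateResponse struct {
	Response string
	Done     bool
}

func main() {
	ch := make(chan generateResponse)
	go func() {
		defer close(ch)
		for _, tok := range []string{"The ", "sky ", "is ", "blue."} {
			ch <- generateResponse{Response: tok}
		}
		// The final message has an empty Response but carries the
		// completion flag (and, in the real type, timing stats).
		ch <- generateResponse{Done: true}
	}()

	var final generateResponse
	generated := ""
	for r := range ch {
		generated += r.Response
		final = r
	}
	final.Response = generated
	fmt.Printf("%+v\n", final) // {Response:The sky is blue. Done:true}
}
```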