optional parameter to not stream response (#639)
* update streaming request accept header * add optional stream param to request bodies
This commit is contained in:
parent
77295f716e
commit
274d5a5fdf
4 changed files with 94 additions and 18 deletions
|
@ -17,6 +17,10 @@ import (
|
||||||
"github.com/jmorganca/ollama/version"
|
"github.com/jmorganca/ollama/version"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const DefaultHost = "127.0.0.1:11434"
|
||||||
|
|
||||||
|
var envHost = os.Getenv("OLLAMA_HOST")
|
||||||
|
|
||||||
type Client struct {
|
type Client struct {
|
||||||
base *url.URL
|
base *url.URL
|
||||||
http http.Client
|
http http.Client
|
||||||
|
@ -143,7 +147,7 @@ func (c *Client) stream(ctx context.Context, method, path string, data any, fn f
|
||||||
}
|
}
|
||||||
|
|
||||||
request.Header.Set("Content-Type", "application/json")
|
request.Header.Set("Content-Type", "application/json")
|
||||||
request.Header.Set("Accept", "application/json")
|
request.Header.Set("Accept", "application/x-ndjson")
|
||||||
request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
|
request.Header.Set("User-Agent", fmt.Sprintf("ollama/%s (%s %s) Go/%s", version.Version, runtime.GOARCH, runtime.GOOS, runtime.Version()))
|
||||||
|
|
||||||
response, err := c.http.Do(request)
|
response, err := c.http.Do(request)
|
||||||
|
|
14
api/types.go
14
api/types.go
|
@ -37,6 +37,7 @@ type GenerateRequest struct {
|
||||||
System string `json:"system"`
|
System string `json:"system"`
|
||||||
Template string `json:"template"`
|
Template string `json:"template"`
|
||||||
Context []int `json:"context,omitempty"`
|
Context []int `json:"context,omitempty"`
|
||||||
|
Stream *bool `json:"stream,omitempty"`
|
||||||
|
|
||||||
Options map[string]interface{} `json:"options"`
|
Options map[string]interface{} `json:"options"`
|
||||||
}
|
}
|
||||||
|
@ -53,8 +54,9 @@ type EmbeddingResponse struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
type CreateRequest struct {
|
type CreateRequest struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Path string `json:"path"`
|
Path string `json:"path"`
|
||||||
|
Stream *bool `json:"stream,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type DeleteRequest struct {
|
type DeleteRequest struct {
|
||||||
|
@ -81,6 +83,9 @@ type CopyRequest struct {
|
||||||
type PullRequest struct {
|
type PullRequest struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Insecure bool `json:"insecure,omitempty"`
|
Insecure bool `json:"insecure,omitempty"`
|
||||||
|
Username string `json:"username"`
|
||||||
|
Password string `json:"password"`
|
||||||
|
Stream *bool `json:"stream,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type ProgressResponse struct {
|
type ProgressResponse struct {
|
||||||
|
@ -93,6 +98,9 @@ type ProgressResponse struct {
|
||||||
type PushRequest struct {
|
type PushRequest struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Insecure bool `json:"insecure,omitempty"`
|
Insecure bool `json:"insecure,omitempty"`
|
||||||
|
Username string `json:"username"`
|
||||||
|
Password string `json:"password"`
|
||||||
|
Stream *bool `json:"stream,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type ListResponse struct {
|
type ListResponse struct {
|
||||||
|
@ -113,7 +121,7 @@ type TokenResponse struct {
|
||||||
type GenerateResponse struct {
|
type GenerateResponse struct {
|
||||||
Model string `json:"model"`
|
Model string `json:"model"`
|
||||||
CreatedAt time.Time `json:"created_at"`
|
CreatedAt time.Time `json:"created_at"`
|
||||||
Response string `json:"response,omitempty"`
|
Response string `json:"response"`
|
||||||
|
|
||||||
Done bool `json:"done"`
|
Done bool `json:"done"`
|
||||||
Context []int `json:"context,omitempty"`
|
Context []int `json:"context,omitempty"`
|
||||||
|
|
35
docs/api.md
35
docs/api.md
|
@ -12,7 +12,6 @@
|
||||||
- [Push a Model](#push-a-model)
|
- [Push a Model](#push-a-model)
|
||||||
- [Generate Embeddings](#generate-embeddings)
|
- [Generate Embeddings](#generate-embeddings)
|
||||||
|
|
||||||
|
|
||||||
## Conventions
|
## Conventions
|
||||||
|
|
||||||
### Model names
|
### Model names
|
||||||
|
@ -40,12 +39,13 @@ Generate a response for a given prompt with a provided model. This is a streamin
|
||||||
- `model`: (required) the [model name](#model-names)
|
- `model`: (required) the [model name](#model-names)
|
||||||
- `prompt`: the prompt to generate a response for
|
- `prompt`: the prompt to generate a response for
|
||||||
|
|
||||||
Advanced parameters:
|
Advanced parameters (optional):
|
||||||
|
|
||||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||||
- `system`: system prompt to use (overrides what is defined in the `Modelfile`)
|
- `system`: system prompt to use (overrides what is defined in the `Modelfile`)
|
||||||
- `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
|
- `template`: the full prompt or prompt template (overrides what is defined in the `Modelfile`)
|
||||||
- `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
|
- `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
|
||||||
|
- `stream`: if `false`, the response will be returned as a single response object rather than a stream of objects
|
||||||
|
|
||||||
### Request
|
### Request
|
||||||
|
|
||||||
|
@ -80,6 +80,7 @@ The final response in the stream also includes additional data about the generat
|
||||||
- `eval_count`: number of tokens in the response
|
- `eval_count`: number of tokens in the response
|
||||||
- `eval_duration`: time in nanoseconds spent generating the response
|
- `eval_duration`: time in nanoseconds spent generating the response
|
||||||
- `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
|
- `context`: an encoding of the conversation used in this response, this can be sent in the next request to keep a conversational memory
|
||||||
|
- `response`: empty if the response was streamed; if not streamed, this will contain the full response
|
||||||
|
|
||||||
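To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` by `eval_duration` and multiply by 10^9, since `eval_duration` is measured in nanoseconds.
|
To calculate how fast the response is generated in tokens per second (token/s), divide `eval_count` by `eval_duration` and multiply by 10^9, since `eval_duration` is measured in nanoseconds.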
|
||||||
|
|
||||||
|
@ -87,6 +88,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
|
||||||
{
|
{
|
||||||
"model": "llama2:7b",
|
"model": "llama2:7b",
|
||||||
"created_at": "2023-08-04T19:22:45.499127Z",
|
"created_at": "2023-08-04T19:22:45.499127Z",
|
||||||
|
"response": "",
|
||||||
"context": [1, 2, 3],
|
"context": [1, 2, 3],
|
||||||
"done": true,
|
"done": true,
|
||||||
"total_duration": 5589157167,
|
"total_duration": 5589157167,
|
||||||
|
@ -112,6 +114,7 @@ Create a model from a [`Modelfile`](./modelfile.md)
|
||||||
|
|
||||||
- `name`: name of the model to create
|
- `name`: name of the model to create
|
||||||
- `path`: path to the Modelfile
|
- `path`: path to the Modelfile
|
||||||
|
- `stream`: (optional) if `false`, the response will be returned as a single response object rather than a stream of objects
|
||||||
|
|
||||||
### Request
|
### Request
|
||||||
|
|
||||||
|
@ -179,7 +182,7 @@ Show details about a model including modelfile, template, parameters, license, a
|
||||||
|
|
||||||
### Request
|
### Request
|
||||||
|
|
||||||
```shell
|
```shell
|
||||||
curl http://localhost:11434/api/show -d '{
|
curl http://localhost:11434/api/show -d '{
|
||||||
"name": "llama2:7b"
|
"name": "llama2:7b"
|
||||||
}'
|
}'
|
||||||
|
@ -189,10 +192,10 @@ curl http://localhost:11434/api/show -d '{
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"license": "<contents of license block>",
|
"license": "<contents of license block>",
|
||||||
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
|
"modelfile": "# Modelfile generated by \"ollama show\"\n# To build a new Modelfile based on this one, replace the FROM line with:\n# FROM llama2:latest\n\nFROM /Users/username/.ollama/models/blobs/sha256:8daa9615cce30c259a9555b1cc250d461d1bc69980a274b44d7eda0be78076d8\nTEMPLATE \"\"\"[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] \"\"\"\nSYSTEM \"\"\"\"\"\"\nPARAMETER stop [INST]\nPARAMETER stop [/INST]\nPARAMETER stop <<SYS>>\nPARAMETER stop <</SYS>>\n",
|
||||||
"parameters": "stop [INST]\nstop [/INST]\nstop <<SYS>>\nstop <</SYS>>",
|
"parameters": "stop [INST]\nstop [/INST]\nstop <<SYS>>\nstop <</SYS>>",
|
||||||
"template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
|
"template": "[INST] {{ if and .First .System }}<<SYS>>{{ .System }}<</SYS>>\n\n{{ end }}{{ .Prompt }} [/INST] "
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -245,6 +248,7 @@ Download a model from the ollama library. Cancelled pulls are resumed from where
|
||||||
|
|
||||||
- `name`: name of the model to pull
|
- `name`: name of the model to pull
|
||||||
- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.
|
- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pulling from your own library during development.
|
||||||
|
- `stream`: (optional) if `false`, the response will be returned as a single response object rather than a stream of objects
|
||||||
|
|
||||||
### Request
|
### Request
|
||||||
|
|
||||||
|
@ -275,7 +279,8 @@ Upload a model to a model library. Requires registering for ollama.ai and adding
|
||||||
### Parameters
|
### Parameters
|
||||||
|
|
||||||
- `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
|
- `name`: name of the model to push in the form of `<namespace>/<model>:<tag>`
|
||||||
- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
|
- `insecure`: (optional) allow insecure connections to the library. Only use this if you are pushing to your library during development.
|
||||||
|
- `stream`: (optional) if `false`, the response will be returned as a single response object rather than a stream of objects
|
||||||
|
|
||||||
### Request
|
### Request
|
||||||
|
|
||||||
|
@ -290,15 +295,16 @@ curl -X POST http://localhost:11434/api/push -d '{
|
||||||
Streaming response that starts with:
|
Streaming response that starts with:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{"status":"retrieving manifest"}
|
{ "status": "retrieving manifest" }
|
||||||
```
|
```
|
||||||
|
|
||||||
and then:
|
and then:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"status":"starting upload","digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
|
"status": "starting upload",
|
||||||
"total":1928429856
|
"digest": "sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
|
||||||
|
"total": 1928429856
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -306,9 +312,10 @@ Then there is a series of uploading responses:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"status":"starting upload",
|
"status": "starting upload",
|
||||||
"digest":"sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
|
"digest": "sha256:bc07c81de745696fdf5afca05e065818a8149fb0c77266fb584d9b2cba3711ab",
|
||||||
"total":1928429856}
|
"total": 1928429856
|
||||||
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
Finally, when the upload is complete:
|
Finally, when the upload is complete:
|
||||||
|
|
|
@ -240,6 +240,23 @@ func GenerateHandler(c *gin.Context) {
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
if req.Stream != nil && !*req.Stream {
|
||||||
|
var response api.GenerateResponse
|
||||||
|
generated := ""
|
||||||
|
for resp := range ch {
|
||||||
|
if r, ok := resp.(api.GenerateResponse); ok {
|
||||||
|
generated += r.Response
|
||||||
|
response = r
|
||||||
|
} else {
|
||||||
|
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
response.Response = generated
|
||||||
|
c.JSON(http.StatusOK, response)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
streamResponse(c, ch)
|
streamResponse(c, ch)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -309,6 +326,11 @@ func PullModelHandler(c *gin.Context) {
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
if req.Stream != nil && !*req.Stream {
|
||||||
|
waitForStream(c, ch)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
streamResponse(c, ch)
|
streamResponse(c, ch)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -336,6 +358,11 @@ func PushModelHandler(c *gin.Context) {
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
if req.Stream != nil && !*req.Stream {
|
||||||
|
waitForStream(c, ch)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
streamResponse(c, ch)
|
streamResponse(c, ch)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -363,6 +390,11 @@ func CreateModelHandler(c *gin.Context) {
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
if req.Stream != nil && !*req.Stream {
|
||||||
|
waitForStream(c, ch)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
streamResponse(c, ch)
|
streamResponse(c, ch)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -603,6 +635,31 @@ func Serve(ln net.Listener, allowOrigins []string) error {
|
||||||
return s.Serve(ln)
|
return s.Serve(ln)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func waitForStream(c *gin.Context, ch chan interface{}) {
|
||||||
|
c.Header("Content-Type", "application/json")
|
||||||
|
for resp := range ch {
|
||||||
|
switch r := resp.(type) {
|
||||||
|
case api.ProgressResponse:
|
||||||
|
if r.Status == "success" {
|
||||||
|
c.JSON(http.StatusOK, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
case gin.H:
|
||||||
|
if errorMsg, ok := r["error"].(string); ok {
|
||||||
|
c.JSON(http.StatusInternalServerError, gin.H{"error": errorMsg})
|
||||||
|
return
|
||||||
|
} else {
|
||||||
|
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected error format in progress response"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected progress response"})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
c.JSON(http.StatusInternalServerError, gin.H{"error": "unexpected end of progress response"})
|
||||||
|
}
|
||||||
|
|
||||||
func streamResponse(c *gin.Context, ch chan any) {
|
func streamResponse(c *gin.Context, ch chan any) {
|
||||||
c.Header("Content-Type", "application/x-ndjson")
|
c.Header("Content-Type", "application/x-ndjson")
|
||||||
c.Stream(func(w io.Writer) bool {
|
c.Stream(func(w io.Writer) bool {
|
||||||
|
|
Loading…
Reference in a new issue