JSON mode: add `"format"` as an API parameter (#1051)
* add `"format": "json"` as an API parameter --------- Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
parent 5b39503bcd
commit 5cba29b9d6

5 changed files with 97 additions and 9 deletions
@@ -38,6 +38,7 @@ type GenerateRequest struct {
 	Context []int  `json:"context,omitempty"`
 	Stream  *bool  `json:"stream,omitempty"`
 	Raw     bool   `json:"raw,omitempty"`
+	Format  string `json:"format"`
 
 	Options map[string]interface{} `json:"options"`
 }
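As a usage sketch (not part of the commit): a minimal standalone client that exercises the new field, assuming an Ollama server is listening on the default `localhost:11434` with a `llama2` model available. It defines its own request type rather than importing the repo's `api` package.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// generateRequest mirrors the subset of GenerateRequest used here,
// including the Format field added by this commit.
type generateRequest struct {
	Model  string `json:"model"`
	Prompt string `json:"prompt"`
	Format string `json:"format,omitempty"`
	Stream *bool  `json:"stream,omitempty"`
}

func main() {
	stream := false
	body, err := json.Marshal(generateRequest{
		Model:  "llama2",
		Prompt: "What color is the sky at different times of the day? Respond using JSON",
		Format: "json", // new parameter added by this commit
		Stream: &stream,
	})
	if err != nil {
		panic(err)
	}

	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Only the field we care about from the response envelope.
	var out struct {
		Response string `json:"response"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println(out.Response) // a string containing JSON
}
```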
docs/api.md | 64
@@ -38,6 +38,7 @@ Generate a response for a given prompt with a provided model. This is a streaming
 
 - `model`: (required) the [model name](#model-names)
 - `prompt`: the prompt to generate a response for
+- `format`: the format to return a response in. Currently the only accepted value is `json`
 
 Advanced parameters (optional):
 
@@ -48,13 +49,17 @@ Advanced parameters (optional):
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `raw`: if `true` no formatting will be applied to the prompt and no context will be returned. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API, and are managing history yourself.
 
+### JSON mode
+
+Enable JSON mode by setting the `format` parameter to `json` and specifying the model should use JSON in the `prompt`. This will structure the response as valid JSON. See the JSON mode [example](#request-json-mode) below.
+
 ### Examples
 
 #### Request
 
 ```shell
 curl -X POST http://localhost:11434/api/generate -d '{
-  "model": "llama2:7b",
+  "model": "llama2",
   "prompt": "Why is the sky blue?"
 }'
 ```
@@ -65,7 +70,7 @@ A stream of JSON objects is returned:
 
 ```json
 {
-  "model": "llama2:7b",
+  "model": "llama2",
   "created_at": "2023-08-04T08:52:19.385406455-07:00",
   "response": "The",
   "done": false
@@ -89,7 +94,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 
 ```json
 {
-  "model": "llama2:7b",
+  "model": "llama2",
   "created_at": "2023-08-04T19:22:45.499127Z",
   "response": "",
   "context": [1, 2, 3],
@@ -105,7 +110,7 @@ To calculate how fast the response is generated in tokens per second (token/s),
 }
 ```
 
-#### Request
+#### Request (No streaming)
 
 ```shell
 curl -X POST http://localhost:11434/api/generate -d '{
@@ -137,7 +142,7 @@ If `stream` is set to `false`, the response will be a single JSON object:
 }
 ```
 
-#### Request
+#### Request (Raw mode)
 
 In some cases you may wish to bypass the templating system and provide a full prompt. In this case, you can use the `raw` parameter to disable formatting and context.
 
@@ -167,7 +172,54 @@ curl -X POST http://localhost:11434/api/generate -d '{
 }
 ```
 
-#### Request
+#### Request (JSON mode)
 
+```shell
+curl -X POST http://localhost:11434/api/generate -d '{
+  "model": "llama2",
+  "prompt": "What color is the sky at different times of the day? Respond using JSON",
+  "format": "json",
+  "stream": false
+}'
+```
+
+#### Response
+
+```json
+{
+  "model": "llama2",
+  "created_at": "2023-11-09T21:07:55.186497Z",
+  "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n",
+  "done": true,
+  "total_duration": 4661289125,
+  "load_duration": 1714434500,
+  "prompt_eval_count": 36,
+  "prompt_eval_duration": 264132000,
+  "eval_count": 75,
+  "eval_duration": 2112149000
+}
+```
+
+The value of `response` will be a string containing JSON similar to:
+
+```json
+{
+  "morning": {
+    "color": "blue"
+  },
+  "noon": {
+    "color": "blue-gray"
+  },
+  "afternoon": {
+    "color": "warm gray"
+  },
+  "evening": {
+    "color": "orange"
+  }
+}
+```
+
+#### Request (With options)
+
 If you want to set custom options for the model at runtime rather than in the Modelfile, you can do so with the `options` parameter. This example sets every available option, but you can set any of them individually and omit the ones you do not want to override.
 
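Because `response` is itself a string of JSON, a client decodes twice: once for the API envelope and once for the model output. A minimal sketch of that second step (illustrative only, not part of the commit), using the example payload above:

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// The "response" field from the JSON mode example above:
	// a string whose contents are themselves JSON.
	raw := "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n"

	// Unmarshal the inner document into a typed structure.
	var skyColors map[string]struct {
		Color string `json:"color"`
	}
	if err := json.Unmarshal([]byte(raw), &skyColors); err != nil {
		panic(err)
	}
	fmt.Println(skyColors["evening"].Color) // orange
}
```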
llm/llama.go | 34
@@ -27,6 +27,34 @@ import (
 	"github.com/jmorganca/ollama/format"
 )
 
+const jsonGrammar = `
+root   ::= object
+value  ::= object | array | string | number | ("true" | "false" | "null") ws
+
+object ::=
+  "{" ws (
+            string ":" ws value
+    ("," ws string ":" ws value)*
+  )? "}" ws
+
+array  ::=
+  "[" ws (
+            value
+    ("," ws value)*
+  )? "]" ws
+
+string ::=
+  "\"" (
+    [^"\\] |
+    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
+  )* "\"" ws
+
+number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+
+# Optional space: by convention, applied in this grammar after literal chars when allowed
+ws ::= ([ \t\n] ws)?
+`
+
 //go:embed llama.cpp/*/build/*/bin/*
 var llamaCppEmbed embed.FS
 
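The constant appears to be a JSON grammar in llama.cpp's GBNF format, with `root ::= object` restricting completions to a single JSON object rather than any JSON value. A rough way to express that invariant on the Go side (an illustrative snippet, not code from this commit):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// rootIsObject roughly checks the property the grammar above is meant to
// guarantee for model output: the text parses as a JSON object at the root.
func rootIsObject(s string) bool {
	var obj map[string]json.RawMessage
	return json.Unmarshal([]byte(s), &obj) == nil
}

func main() {
	fmt.Println(rootIsObject(`{"morning": {"color": "blue"}}`)) // true
	fmt.Println(rootIsObject(`"blue"`))                         // false: valid JSON, but not an object
}
```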
@@ -497,7 +525,7 @@ type prediction struct {
 
 const maxBufferSize = 512 * format.KiloByte
 
-func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, fn func(api.GenerateResponse)) error {
+func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string, format string, fn func(api.GenerateResponse)) error {
 	prevConvo, err := llm.Decode(ctx, prevContext)
 	if err != nil {
 		return err
@@ -532,6 +560,10 @@ func (llm *llama) Predict(ctx context.Context, prevContext []int, prompt string,
 		"stop":   llm.Stop,
 	}
 
+	if format == "json" {
+		request["grammar"] = jsonGrammar
+	}
+
 	// Handling JSON marshaling with special characters unescaped.
 	buffer := &bytes.Buffer{}
 	enc := json.NewEncoder(buffer)
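To make the wire format concrete, here is a small self-contained sketch of how the request map gains a `grammar` field and is then encoded (illustrative only; the real code uses the full `jsonGrammar` constant, and the hunk does not show the encoder's escaping setup, so the `SetEscapeHTML(false)` call below is an assumption suggested by the comment about unescaped characters):

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
)

func main() {
	// Illustrative subset of the prediction request built in Predict above.
	request := map[string]interface{}{
		"prompt": "Respond using JSON",
		"stream": true,
	}

	format := "json"
	if format == "json" {
		// The GBNF grammar constrains llama.cpp's sampling to JSON objects.
		request["grammar"] = "root ::= object ..." // stands in for jsonGrammar
	}

	// Encode without HTML escaping so grammar/prompt characters are sent as-is
	// (assumed; the diff only shows the buffer and encoder being created).
	buffer := &bytes.Buffer{}
	enc := json.NewEncoder(buffer)
	enc.SetEscapeHTML(false)
	if err := enc.Encode(request); err != nil {
		panic(err)
	}
	fmt.Print(buffer.String())
}
```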
@@ -14,7 +14,7 @@ import (
 )
 
 type LLM interface {
-	Predict(context.Context, []int, string, func(api.GenerateResponse)) error
+	Predict(context.Context, []int, string, string, func(api.GenerateResponse)) error
 	Embedding(context.Context, string) ([]float64, error)
 	Encode(context.Context, string) ([]int, error)
 	Decode(context.Context, []int) (string, error)
@@ -163,6 +163,9 @@ func GenerateHandler(c *gin.Context) {
 	case req.Model == "":
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "model is required"})
 		return
+	case len(req.Format) > 0 && req.Format != "json":
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "format must be json"})
+		return
 	case req.Raw && (req.Template != "" || req.System != "" || len(req.Context) > 0):
 		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "raw mode does not support template, system, or context"})
 		return
@@ -231,7 +234,7 @@ func GenerateHandler(c *gin.Context) {
 			ch <- r
 		}
 
-		if err := loaded.runner.Predict(c.Request.Context(), req.Context, prompt, fn); err != nil {
+		if err := loaded.runner.Predict(c.Request.Context(), req.Context, prompt, req.Format, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()