This commit is contained in:
baalajimaestro 2024-01-17 12:10:00 +05:30
commit 833126bbd3
Signed by: baalajimaestro
GPG key ID: F93C394FE9BBAFD5
14 changed files with 259 additions and 42 deletions

View file

@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
## [0.2.29]
- feat: Update llama.cpp to ggerganov/llama.cpp@4483396751c79dea540808b9cb9238245d06da2b
- feat: Add split_mode option by @abetlen in 84615adbc6855c8384807c42f0130f9a1763f99d
- feat: Implement GGUF metadata KV overrides by @phiharri in #1011
- fix: Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor by @yieldthought in #1012
- fix: Fix low_level_api_chat_cpp example to match current API by @aniljava in #1086
- fix: Fix Pydantic model parsing by @DeNeutoy in #1087
## [0.2.28]
- feat: Update llama.cpp to ggerganov/llama.cpp@6efb8eb30e7025b168f3fda3ff83b9b386428ad6

View file

@ -14,6 +14,7 @@ This package provides:
- High-level Python API for text completion
- OpenAI-like API
- [LangChain compatibility](https://python.langchain.com/docs/integrations/llms/llamacpp)
- [LlamaIndex compatibility](https://docs.llamaindex.ai/en/stable/examples/llm/llama_2_llama_cpp.html)
- OpenAI compatible web server
- [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
- [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)

View file

@ -106,7 +106,7 @@ def gpt_params_parse(argv = None):
parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta")
parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model")
parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt")
parser.add_argument("-p", "--prompt", type=str, default=None, help="initial prompt",dest="prompt")
parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file")
parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session")
parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix")

View file

@ -62,7 +62,7 @@ specified) expect poor results""", file=sys.stderr)
self.multibyte_fix = []
# model load
self.lparams = llama_cpp.llama_context_default_params()
self.lparams = llama_cpp.llama_model_default_params()
self.lparams.n_ctx = self.params.n_ctx
self.lparams.n_parts = self.params.n_parts
self.lparams.seed = self.params.seed
@ -72,7 +72,11 @@ specified) expect poor results""", file=sys.stderr)
self.model = llama_cpp.llama_load_model_from_file(
self.params.model.encode("utf8"), self.lparams)
self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.lparams)
# Context Params.
self.cparams = llama_cpp.llama_context_default_params()
self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.cparams)
if (not self.ctx):
raise RuntimeError(f"error: failed to load model '{self.params.model}'")
@ -244,7 +248,7 @@ n_keep = {self.params.n_keep}
# tokenize a prompt
def _tokenize(self, prompt, bos=True):
_arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))()
_n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos)
_n = llama_cpp.llama_tokenize(self.model, prompt.encode("utf8", errors="ignore"), len(prompt), _arr, len(_arr), bos, False)
return _arr[:_n]
def set_color(self, c):
@ -304,7 +308,7 @@ n_keep = {self.params.n_keep}
self.n_past += n_eval"""
if (llama_cpp.llama_eval(
self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads
self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past
) != 0):
raise Exception("Failed to llama_eval!")
@ -332,7 +336,7 @@ n_keep = {self.params.n_keep}
id = 0
logits = llama_cpp.llama_get_logits(self.ctx)
n_vocab = llama_cpp.llama_n_vocab(self.ctx)
n_vocab = llama_cpp.llama_n_vocab(self.model)
# Apply params.logit_bias map
for key, value in self.params.logit_bias.items():
@ -349,12 +353,20 @@ n_keep = {self.params.n_keep}
last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx)
_arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:])
llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p,
_arr,
last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty))
llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p,
_arr,
last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))
llama_cpp.llama_sample_repetition_penalties(
ctx=self.ctx,
candidates=candidates_p,
last_tokens_data = _arr,
penalty_last_n = last_n_repeat,
penalty_repeat = llama_cpp.c_float(self.params.repeat_penalty),
penalty_freq = llama_cpp.c_float(self.params.frequency_penalty),
penalty_present = llama_cpp.c_float(self.params.presence_penalty),
)
# NOT PRESENT IN CURRENT VERSION ?
# llama_cpp.llama_sample_frequency_and_presence_penalti(self.ctx, candidates_p,
# _arr,
# last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty))
if not self.params.penalize_nl:
logits[llama_cpp.llama_token_nl()] = nl_logit
@ -473,7 +485,7 @@ n_keep = {self.params.n_keep}
def token_to_str(self, token_id: int) -> bytes:
size = 32
buffer = (ctypes.c_char * size)()
n = llama_cpp.llama_token_to_piece_with_model(
n = llama_cpp.llama_token_to_piece(
self.model, llama_cpp.llama_token(token_id), buffer, size)
assert n <= size
return bytes(buffer[:n])
@ -532,6 +544,9 @@ n_keep = {self.params.n_keep}
print(i,end="",flush=True)
self.params.input_echo = False
# Using string instead of tokens to check for antiprompt,
# It is more reliable than tokens for interactive mode.
generated_str = ""
while self.params.interactive:
self.set_color(util.CONSOLE_COLOR_USER_INPUT)
if (self.params.instruct):
@ -546,6 +561,10 @@ n_keep = {self.params.n_keep}
try:
for i in self.output():
print(i,end="",flush=True)
generated_str += i
for ap in self.params.antiprompt:
if generated_str.endswith(ap):
raise KeyboardInterrupt
except KeyboardInterrupt:
self.set_color(util.CONSOLE_COLOR_DEFAULT)
if not self.params.instruct:
@ -561,7 +580,7 @@ if __name__ == "__main__":
time_now = datetime.now()
prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
Transcript below contains only the recorded dialog between two, without any annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
The transcript only includes text, it does not include markup like HTML and Markdown.
@ -576,7 +595,10 @@ The transcript only includes text, it does not include markup like HTML and Mark
{USER_NAME}: Name a color.
{AI_NAME}: Blue
{USER_NAME}: """
params = gpt_params_parse()
if params.prompt is None and params.file is None:
params.prompt = prompt
with LLaMAInteract(params) as m:
m.interact()

View file

@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *
__version__ = "0.2.28"
__version__ = "0.2.29"

View file

@ -1,11 +1,15 @@
import os
import sys
import sys, traceback
# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor
outnull_file = open(os.devnull, "w")
errnull_file = open(os.devnull, "w")
class suppress_stdout_stderr(object):
# NOTE: these must be "saved" here to avoid exceptions when using
# this context manager inside of a __del__ method
open = open
sys = sys
os = os
@ -21,9 +25,6 @@ class suppress_stdout_stderr(object):
if not hasattr(self.sys.stdout, 'fileno') or not hasattr(self.sys.stderr, 'fileno'):
return self # Return the instance without making changes
self.outnull_file = self.open(self.os.devnull, "w")
self.errnull_file = self.open(self.os.devnull, "w")
self.old_stdout_fileno_undup = self.sys.stdout.fileno()
self.old_stderr_fileno_undup = self.sys.stderr.fileno()
@ -33,11 +34,11 @@ class suppress_stdout_stderr(object):
self.old_stdout = self.sys.stdout
self.old_stderr = self.sys.stderr
self.os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
self.os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)
self.os.dup2(outnull_file.fileno(), self.old_stdout_fileno_undup)
self.os.dup2(errnull_file.fileno(), self.old_stderr_fileno_undup)
self.sys.stdout = self.outnull_file
self.sys.stderr = self.errnull_file
self.sys.stdout = outnull_file
self.sys.stderr = errnull_file
return self
def __exit__(self, *_):
@ -54,6 +55,3 @@ class suppress_stdout_stderr(object):
self.os.close(self.old_stdout_fileno)
self.os.close(self.old_stderr_fileno)
self.outnull_file.close()
self.errnull_file.close()

View file

@ -730,11 +730,13 @@ class Llama:
*,
# Model Params
n_gpu_layers: int = 0,
split_mode: int = llama_cpp.LLAMA_SPLIT_LAYER,
main_gpu: int = 0,
tensor_split: Optional[List[float]] = None,
vocab_only: bool = False,
use_mmap: bool = True,
use_mlock: bool = False,
kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None,
# Context Params
seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
n_ctx: int = 512,
@ -798,11 +800,13 @@ class Llama:
Args:
model_path: Path to the model.
n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
main_gpu: The GPU that is used for scratch and small tensors.
split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
vocab_only: Only load the vocabulary no weights.
use_mmap: Use mmap if possible.
use_mlock: Force the system to keep the model in RAM.
kv_overrides: Key-value overrides for the model.
seed: RNG seed, -1 for random
n_ctx: Text context, 0 = from model
n_batch: Prompt processing maximum batch size
@ -848,6 +852,7 @@ class Llama:
self.model_params.n_gpu_layers = (
0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers
) # 0x7FFFFFFF is INT32 max, will be auto set to all layers
self.model_params.split_mode = split_mode
self.model_params.main_gpu = main_gpu
self.tensor_split = tensor_split
self._c_tensor_split = None
@ -866,6 +871,34 @@ class Llama:
self.model_params.use_mmap = use_mmap if lora_path is None else False
self.model_params.use_mlock = use_mlock
self.kv_overrides = kv_overrides
if kv_overrides is not None:
n_overrides = len(kv_overrides)
self._kv_overrides_array = llama_cpp.llama_model_kv_override * (n_overrides + 1)
self._kv_overrides_array_keys = []
for k, v in kv_overrides.items():
key_buf = ctypes.create_string_buffer(k.encode("utf-8"))
self._kv_overrides_array_keys.append(key_buf)
self._kv_overrides_array[i].key = key_buf
if isinstance(v, int):
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_INT
self._kv_overrides_array[i].value.int_value = v
elif isinstance(v, float):
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_FLOAT
self._kv_overrides_array[i].value.float_value = v
elif isinstance(v, bool):
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_BOOL
self._kv_overrides_array[i].value.bool_value = v
else:
raise ValueError(f"Unknown value type for {k}: {v}")
self._kv_overrides_array_sentinel_key = b'\0'
# null array sentinel
self._kv_overrides_array[n_overrides].key = self._kv_overrides_array_sentinel_key
self.model_params.kv_overrides = self._kv_overrides_array
self.n_batch = min(n_ctx, n_batch) # ???
self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1)
self.n_threads_batch = n_threads_batch or max(
@ -2143,11 +2176,13 @@ class Llama:
model_path=self.model_path,
# Model Params
n_gpu_layers=self.model_params.n_gpu_layers,
split_mode=self.model_params.split_mode,
main_gpu=self.model_params.main_gpu,
tensor_split=self.tensor_split,
vocab_only=self.model_params.vocab_only,
use_mmap=self.model_params.use_mmap,
use_mlock=self.model_params.use_mlock,
kv_overrides=self.kv_overrides,
# Context Params
seed=self.context_params.seed,
n_ctx=self.context_params.n_ctx,
@ -2185,11 +2220,13 @@ class Llama:
model_path=state["model_path"],
# Model Params
n_gpu_layers=state["n_gpu_layers"],
split_mode=state["split_mode"],
main_gpu=state["main_gpu"],
tensor_split=state["tensor_split"],
vocab_only=state["vocab_only"],
use_mmap=state["use_mmap"],
use_mlock=state["use_mlock"],
kv_overrides=state["kv_overrides"],
# Context Params
seed=state["seed"],
n_ctx=state["n_ctx"],

View file

@ -229,6 +229,7 @@ LLAMA_SPLIT_NONE = 0
LLAMA_SPLIT_LAYER = 1
LLAMA_SPLIT_ROW = 2
# typedef struct llama_token_data {
# llama_token id; // token id
# float logit; // log-odds of the token
@ -395,6 +396,7 @@ class llama_model_kv_override(Structure):
# // override key-value pairs of the model meta data
# const struct llama_model_kv_override * kv_overrides;
# // Keep the booleans together to avoid misalignment during copy-by-value.
# bool vocab_only; // only load the vocabulary, no weights
# bool use_mmap; // use mmap if possible
@ -526,6 +528,7 @@ It might not exist for progress report where '.' is output repeatedly."""
# bool quantize_output_tensor; // quantize output.weight
# bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
# bool pure; // disable k-quant mixtures and quantize all tensors to the same type
# void * imatrix; // pointer to importance matrix data
# } llama_model_quantize_params;
class llama_model_quantize_params(Structure):
"""Parameters for llama_model_quantize
@ -537,6 +540,7 @@ class llama_model_quantize_params(Structure):
quantize_output_tensor (bool): quantize output.weight
only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
pure (bool): disable k-quant mixtures and quantize all tensors to the same type
imatrix (ctypes.c_void_p): pointer to importance matrix data
"""
_fields_ = [
@ -545,6 +549,8 @@ class llama_model_quantize_params(Structure):
("allow_requantize", c_bool),
("quantize_output_tensor", c_bool),
("only_copy", c_bool),
("pure", c_bool),
("imatrix", c_void_p),
]
@ -1956,14 +1962,39 @@ _lib.llama_sample_repetition_penalties.restype = None
# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
# LLAMA_API void llama_sample_classifier_free_guidance(
# /// @param logits Logits extracted from the original generation context.
# /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
# /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
# LLAMA_API void llama_sample_apply_guidance(
# struct llama_context * ctx,
# float * logits,
# float * logits_guidance,
# float scale);
def llama_sample_apply_guidance(
ctx: llama_context_p,
logits, # type: _Pointer[c_float]
logits_guidance, # type: _Pointer[c_float]
scale: Union[c_float, float],
):
"""Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806"""
return _lib.llama_sample_apply_guidance(ctx, logits, logits_guidance, scale)
_lib.llama_sample_apply_guidance.argtypes = [
llama_context_p,
c_float_p,
c_float_p,
c_float,
]
_lib.llama_sample_apply_guidance.restype = None
# LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
# struct llama_context * ctx,
# llama_token_data_array * candidates,
# struct llama_context * guidance_ctx,
# float scale);
# float scale),
# "use llama_sample_apply_guidance() instead");
def llama_sample_classifier_free_guidance(
ctx: llama_context_p,
candidates, # type: _Pointer[llama_token_data_array]

View file

@ -1433,7 +1433,6 @@ class SchemaConverter:
def visit(self, schema: Dict[str, Any], name: str) -> str:
schema_type: Optional[str] = schema.get("type") # type: ignore
assert isinstance(schema_type, str), f"Unrecognized schema: {schema}"
rule_name = name or "root"
if "$defs" in schema:

View file

@ -197,7 +197,36 @@ async def authenticate(
@router.post(
"/v1/completions", summary="Completion", dependencies=[Depends(authenticate)]
"/v1/completions",
summary="Completion",
dependencies=[Depends(authenticate)],
response_model= Union[
llama_cpp.CreateCompletionResponse,
str,
],
responses={
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"anyOf": [
{"$ref": "#/components/schemas/CreateCompletionResponse"}
],
"title": "Completion response, when stream=False",
}
},
"text/event-stream":{
"schema": {
"type": "string",
"title": "Server Side Streaming response, when stream=True. " +
"See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", # noqa: E501
"example": """data: {... see CreateCompletionResponse ...} \\n\\n data: ... \\n\\n ... data: [DONE]"""
}
}
},
}
},
)
@router.post(
"/v1/engines/copilot-codex/completions",
@ -280,7 +309,33 @@ async def create_embedding(
@router.post(
"/v1/chat/completions", summary="Chat", dependencies=[Depends(authenticate)]
"/v1/chat/completions", summary="Chat", dependencies=[Depends(authenticate)],
response_model= Union[
llama_cpp.ChatCompletion, str
],
responses={
"200": {
"description": "Successful Response",
"content": {
"application/json": {
"schema": {
"anyOf": [
{"$ref": "#/components/schemas/CreateChatCompletionResponse"}
],
"title": "Completion response, when stream=False",
}
},
"text/event-stream":{
"schema": {
"type": "string",
"title": "Server Side Streaming response, when stream=True" +
"See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", # noqa: E501
"example": """data: {... see CreateChatCompletionResponse ...} \\n\\n data: ... \\n\\n ... data: [DONE]"""
}
}
},
}
},
)
async def create_chat_completion(
request: Request,

View file

@ -1,6 +1,6 @@
from __future__ import annotations
from typing import Optional, Union, List
from typing import Dict, Optional, Union, List
import llama_cpp
@ -72,6 +72,23 @@ class LlamaProxy:
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None
if settings.kv_overrides is not None:
assert isinstance(settings.kv_overrides, list)
kv_overrides = {}
for kv in settings.kv_overrides:
key, value = kv.split("=")
if ":" in value:
value_type, value = value.split(":")
if value_type == "bool":
kv_overrides[key] = value.lower() in ["true", "1"]
elif value_type == "int":
kv_overrides[key] = int(value)
elif value_type == "float":
kv_overrides[key] = float(value)
else:
raise ValueError(f"Unknown value type {value_type}")
_model = llama_cpp.Llama(
model_path=settings.model,
# Model Params
@ -81,6 +98,7 @@ class LlamaProxy:
vocab_only=settings.vocab_only,
use_mmap=settings.use_mmap,
use_mlock=settings.use_mlock,
kv_overrides=kv_overrides,
# Context Params
seed=settings.seed,
n_ctx=settings.n_ctx,

View file

@ -28,6 +28,10 @@ class ModelSettings(BaseSettings):
ge=-1,
description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.",
)
split_mode: int = Field(
default=llama_cpp.LLAMA_SPLIT_LAYER,
description="The split mode to use.",
)
main_gpu: int = Field(
default=0,
ge=0,
@ -48,11 +52,15 @@ class ModelSettings(BaseSettings):
default=llama_cpp.llama_mlock_supported(),
description="Use mlock.",
)
kv_overrides: Optional[List[str]] = Field(
default=None,
description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.",
)
# Context Params
seed: int = Field(
default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."
)
n_ctx: int = Field(default=2048, ge=1, description="The context size.")
n_ctx: int = Field(default=2048, ge=0, description="The context size.")
n_batch: int = Field(
default=512, ge=1, description="The batch size to use per eval."
)

View file

@ -1,4 +1,5 @@
import llama_cpp
import json
tree = """
leaf ::= "."
@ -6,8 +7,46 @@ node ::= leaf | "(" node node ")"
root ::= node
"""
def test_grammar_from_string():
grammar = llama_cpp.LlamaGrammar.from_string(tree)
assert grammar._n_rules == 3
assert grammar._start_rule_index == 2
assert grammar.grammar is not None
def test_composed_pydantic_grammar():
"""
from pydantic import BaseModel
class A(BaseModel):
a: int
class B(BaseModel):
a: A
b: int
"""
# This schema corresponds to the grammar in the comment above.
# We don't use the pydantic models directly to avoid the dependency.
schema = {
"$defs": {
"A": {
"properties": {"a": {"title": "A", "type": "integer"}},
"required": ["a"],
"title": "A",
"type": "object",
}
},
"properties": {
"a": {"$ref": "#/$defs/A"},
"b": {"title": "B", "type": "integer"},
},
"required": ["a", "b"],
"title": "B",
"type": "object",
}
grammar = llama_cpp.LlamaGrammar.from_json_schema(json.dumps(schema))
assert grammar.grammar is not None

2
vendor/llama.cpp vendored

@ -1 +1 @@
Subproject commit 76484fbfd355df388f71d6edaa98e1692a74de7e
Subproject commit 5c999609013a30c06e6fd28be8db5c2074bcc196