commit eebae1a368

11 changed files with 63 additions and 237 deletions

CHANGELOG.md

@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
+## [0.2.53]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@cb49e0f8c906e5da49e9f6d64a57742a9a241c6a
+- fix: eos/bos_token set correctly for Jinja2ChatFormatter and automatic chat formatter by @CISC in #1230
+
 ## [0.2.52]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@a33e6a0d2a66104ea9a906bdbf8a94d050189d91

llama_cpp/__init__.py

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.52"
+__version__ = "0.2.53"
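
A quick sanity check after upgrading is to read the version string back from the installed package; a minimal sketch:

    import llama_cpp

    # Should print "0.2.53" once this release is installed.
    print(llama_cpp.__version__)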

llama_cpp/_internals.py

@@ -357,21 +357,6 @@ class _LlamaContext:
             penalty_present,
         )
 
-    def sample_classifier_free_guidance(
-        self,
-        candidates: "_LlamaTokenDataArray",
-        guidance_ctx: "_LlamaContext",
-        scale: float,
-    ):
-        assert self.ctx is not None
-        assert guidance_ctx.ctx is not None
-        llama_cpp.llama_sample_classifier_free_guidance(
-            self.ctx,
-            llama_cpp.byref(candidates.candidates),
-            guidance_ctx.ctx,
-            scale,
-        )
-
     def sample_softmax(self, candidates: "_LlamaTokenDataArray"):
         assert self.ctx is not None
         llama_cpp.llama_sample_softmax(

@@ -720,7 +705,7 @@ class _LlamaSamplingContext:
         return ctx_main.model.detokenize(self.prev[-n:]).decode("utf-8")
 
     def sample(
-        self, ctx_main: _LlamaContext, ctx_cfg: Optional[_LlamaContext] = None, idx: int = 0, logits_array: Optional[npt.NDArray[np.single]] = None
+        self, ctx_main: _LlamaContext, idx: int = 0, logits_array: Optional[npt.NDArray[np.single]] = None
     ):
         n_vocab = ctx_main.model.n_vocab()
         id: int = 0

@@ -741,11 +726,6 @@ class _LlamaSamplingContext:
             ) # TODO: Only create this once
             token_data_array.copy_logits(logits_array)
 
-        if ctx_cfg is not None:
-            ctx_main.sample_classifier_free_guidance(
-                token_data_array, ctx_cfg, self.params.cfg_scale
-            )
-
         # apply penalties
         if len(self.prev) > 0:
             nl_token = ctx_main.model.token_nl()
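
For anything that called this internal sampling helper directly, the classifier-free-guidance arguments are simply gone; a minimal sketch of the new call shape, assuming an initialized `_LlamaSamplingContext` named `sampling_ctx` and a `_LlamaContext` named `ctx_main` (both names are placeholders):

    # Internal API: sample() no longer takes ctx_cfg / cfg_scale.
    token = sampling_ctx.sample(ctx_main, idx=0)
    sampling_ctx.prev.append(token)  # keep the repetition-penalty window up to date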

llama_cpp/llama.py

@@ -408,8 +408,8 @@ class Llama:
             except:
                 bos_token_id = self.token_bos()
 
-            eos_token = self.detokenize([eos_token_id]).decode("utf-8")
-            bos_token = self.detokenize([bos_token_id]).decode("utf-8")
+            eos_token = self._model.token_get_text(eos_token_id)
+            bos_token = self._model.token_get_text(bos_token_id)
 
             if self.verbose:
                 print(f"Using chat template: {template}", file=sys.stderr)
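
The practical difference behind this fix: `detokenize()` runs ids back through the tokenizer and may drop or normalize special tokens, while `token_get_text()` returns the literal vocabulary entry. A hedged illustration, assuming a loaded `Llama` instance named `llm` (hypothetical):

    eos_id = llm.token_eos()
    # Literal vocabulary text of the token, e.g. "</s>" on Llama-2-style models.
    eos_text = llm._model.token_get_text(eos_id)
    # Round-tripping through detokenize() can yield something different (even an empty string).
    eos_bytes = llm.detokenize([eos_id])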

llama_cpp/llama_cpp.py

@@ -111,6 +111,7 @@ if TYPE_CHECKING:
 
 F = TypeVar("F", bound=Callable[..., Any])
 
+
 def ctypes_function_for_shared_library(lib: ctypes.CDLL):
     def ctypes_function(
         name: str, argtypes: List[Any], restype: Any, enabled: bool = True

@@ -264,6 +265,7 @@ LLAMA_TOKEN_TYPE_BYTE = 6
 # LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
 
 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };

@@ -295,6 +297,7 @@ LLAMA_FTYPE_MOSTLY_IQ3_S = 26
 LLAMA_FTYPE_MOSTLY_IQ3_M = 27
 LLAMA_FTYPE_MOSTLY_IQ2_S = 28
 LLAMA_FTYPE_MOSTLY_IQ2_M = 29
+LLAMA_FTYPE_MOSTLY_IQ4_XS = 30
 LLAMA_FTYPE_GUESSED = 1024
 
 # enum llama_rope_scaling_type {

@@ -548,6 +551,7 @@ class llama_model_params(ctypes.Structure):
 #     float yarn_beta_fast;   // YaRN low correction dim
 #     float yarn_beta_slow;   // YaRN high correction dim
 #     uint32_t yarn_orig_ctx; // YaRN original context size
+#     float defrag_thold;     // defragment the KV cache if holes/size > thold, < 0 disabled (default)
 
 #     ggml_backend_sched_eval_callback cb_eval;
 #     void * cb_eval_user_data;

@@ -580,6 +584,7 @@ class llama_context_params(ctypes.Structure):
         yarn_beta_fast (float): YaRN low correction dim
         yarn_beta_slow (float): YaRN high correction dim
         yarn_orig_ctx (int): YaRN original context size
+        defrag_thold (float): defragment the KV cache if holes/size > thold, < 0 disabled (default)
         cb_eval (ggml_backend_sched_eval_callback): callback for scheduling eval
         cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
         type_k (int): data type for K cache

@@ -605,6 +610,7 @@ class llama_context_params(ctypes.Structure):
         ("yarn_beta_fast", ctypes.c_float),
         ("yarn_beta_slow", ctypes.c_float),
         ("yarn_orig_ctx", ctypes.c_uint32),
+        ("defrag_thold", ctypes.c_float),
         ("cb_eval", ggml_backend_sched_eval_callback),
         ("cb_eval_user_data", ctypes.c_void_p),
         ("type_k", ctypes.c_int),
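
The new field is reachable through the low-level context params; a minimal sketch (the threshold value is arbitrary):

    import llama_cpp

    params = llama_cpp.llama_context_default_params()
    # Defragment the KV cache once holes/size exceeds the threshold; < 0 disables it.
    params.defrag_thold = 0.1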

@@ -933,18 +939,6 @@ def llama_supports_gpu_offload() -> bool:
     ...
 
 
-# LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
-@ctypes_function("llama_mmap_supported", [], ctypes.c_bool)
-def llama_mmap_supported() -> bool:
-    ...
-
-
-# LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
-@ctypes_function("llama_mlock_supported", [], ctypes.c_bool)
-def llama_mlock_supported() -> bool:
-    ...
-
-
 # LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 @ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes)
 def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]:
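
Callers of the removed wrappers should use the non-deprecated capability checks, which is exactly what the server settings change further below does; a minimal sketch:

    import llama_cpp

    # Replacements for the removed llama_mmap_supported() / llama_mlock_supported().
    can_mmap = llama_cpp.llama_supports_mmap()
    can_mlock = llama_cpp.llama_supports_mlock()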

@@ -1153,47 +1147,6 @@ def llama_model_quantize(
     ...
 
 
-# // Apply a LoRA adapter to a loaded model
-# // path_base_model is the path to a higher quality model to use as a base for
-# // the layers modified by the adapter. Can be NULL to use the current loaded model.
-# // The model needs to be reloaded before applying a new adapter, otherwise the adapter
-# // will be applied on top of the previous one
-# // Returns 0 on success
-# LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
-#     struct llama_context * ctx,
-#     const char * path_lora,
-#     float scale,
-#     const char * path_base_model,
-#     int32_t n_threads),
-#     "use llama_model_apply_lora_from_file instead");
-@ctypes_function(
-    "llama_apply_lora_from_file",
-    [
-        llama_context_p_ctypes,
-        ctypes.c_char_p,
-        ctypes.c_float,
-        ctypes.c_char_p,
-        ctypes.c_int32,
-    ],
-    ctypes.c_int32,
-)
-def llama_apply_lora_from_file(
-    ctx: llama_context_p,
-    path_lora: Union[ctypes.c_char_p, bytes],
-    scale: Union[ctypes.c_float, float],
-    path_base_model: Union[ctypes.c_char_p, bytes],
-    n_threads: Union[ctypes.c_int32, int],
-    /,
-) -> int:
-    """Apply a LoRA adapter to a loaded model
-    path_base_model is the path to a higher quality model to use as a base for
-    the layers modified by the adapter. Can be NULL to use the current loaded model.
-    The model needs to be reloaded before applying a new adapter, otherwise the adapter
-    will be applied on top of the previous one
-    Returns 0 on success"""
-    ...
-
-
 # LLAMA_API int32_t llama_model_apply_lora_from_file(
 #     const struct llama_model * model,
 #     const char * path_lora,

@@ -1215,7 +1168,7 @@ def llama_model_apply_lora_from_file(
     model: llama_model_p,
     path_lora: Union[ctypes.c_char_p, bytes],
     scale: Union[ctypes.c_float, float],
-    path_base_model: Union[ctypes.c_char_p, bytes],
+    path_base_model: Union[ctypes.c_char_p, bytes, None],
     n_threads: Union[ctypes.c_int32, int],
     /,
 ) -> int:
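
With the deprecated per-context variant gone, LoRA adapters go through the model-level binding whose signature is touched up just above; a hedged sketch, where the adapter path is hypothetical and `model` is an already loaded `llama_model_p`:

    # Returns 0 on success; path_base_model may now be passed as None.
    rc = llama_cpp.llama_model_apply_lora_from_file(
        model, b"./adapter.bin", 1.0, None, 4
    )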

@@ -1642,72 +1595,6 @@ def llama_save_session_file(
 # //
 
 
-# // Run the llama inference to obtain the logits and probabilities for the next token(s).
-# // tokens + n_tokens is the provided batch of new tokens to process
-# // n_past is the number of tokens to use from previous eval calls
-# // Returns 0 on success
-# // DEPRECATED: use llama_decode() instead
-# LLAMA_API DEPRECATED(int llama_eval(
-#     struct llama_context * ctx,
-#     llama_token * tokens,
-#     int32_t n_tokens,
-#     int32_t n_past),
-#     "use llama_decode() instead");
-@ctypes_function(
-    "llama_eval",
-    [
-        llama_context_p_ctypes,
-        llama_token_p,
-        ctypes.c_int32,
-        ctypes.c_int32,
-    ],
-    ctypes.c_int,
-)
-def llama_eval(
-    ctx: llama_context_p,
-    tokens: CtypesArray[llama_token],
-    n_tokens: Union[ctypes.c_int, int],
-    n_past: Union[ctypes.c_int, int],
-    /,
-) -> int:
-    """Run the llama inference to obtain the logits and probabilities for the next token(s).
-    tokens + n_tokens is the provided batch of new tokens to process
-    n_past is the number of tokens to use from previous eval calls
-    Returns 0 on success
-    DEPRECATED: use llama_decode() instead"""
-    ...
-
-
-# // Same as llama_eval, but use float matrix input directly.
-# // DEPRECATED: use llama_decode() instead
-# LLAMA_API DEPRECATED(int llama_eval_embd(
-#     struct llama_context * ctx,
-#     float * embd,
-#     int32_t n_tokens,
-#     int32_t n_past),
-#     "use llama_decode() instead");
-@ctypes_function(
-    "llama_eval_embd",
-    [
-        llama_context_p_ctypes,
-        ctypes.POINTER(ctypes.c_float),
-        ctypes.c_int32,
-        ctypes.c_int32,
-    ],
-    ctypes.c_int,
-)
-def llama_eval_embd(
-    ctx: llama_context_p,
-    embd: CtypesArray[ctypes.c_float],
-    n_tokens: Union[ctypes.c_int, int],
-    n_past: Union[ctypes.c_int, int],
-    /,
-) -> int:
-    """Same as llama_eval, but use float matrix input directly.
-    DEPRECATED: use llama_decode() instead"""
-    ...
-
-
 # // Return batch for single sequence of tokens starting at pos_0
 # //
 # // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
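
Code still calling `llama_eval`/`llama_eval_embd` should move to the batch API; a hedged low-level sketch, assuming `ctx` is a valid context and `tokens` is a ctypes array holding `n_tokens` ids that continue a sequence of length `n_past`:

    # llama_batch_get_one wraps a single sequence; llama_decode replaces llama_eval.
    batch = llama_cpp.llama_batch_get_one(tokens, n_tokens, n_past, 0)
    rc = llama_cpp.llama_decode(ctx, batch)  # 0 on success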

@@ -2242,35 +2129,6 @@ def llama_sample_apply_guidance(
     ...
 
 
-# LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
-#     struct llama_context * ctx,
-#     llama_token_data_array * candidates,
-#     struct llama_context * guidance_ctx,
-#     float scale),
-#     "use llama_sample_apply_guidance() instead");
-@ctypes_function(
-    "llama_sample_classifier_free_guidance",
-    [
-        llama_context_p_ctypes,
-        llama_token_data_array_p,
-        llama_context_p_ctypes,
-        ctypes.c_float,
-    ],
-    None,
-)
-def llama_sample_classifier_free_guidance(
-    ctx: llama_context_p,
-    candidates: Union[
-        CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array]
-    ],
-    guidance_ctx: llama_context_p,
-    scale: Union[ctypes.c_float, float],
-    /,
-):
-    """Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806"""
-    ...
-
-
 # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 # LLAMA_API void llama_sample_softmax(
 #     struct llama_context * ctx,

@@ -2469,28 +2327,6 @@ def llama_sample_temp(
     ...
 
 
-# LLAMA_API DEPRECATED(void llama_sample_temperature(
-#     struct llama_context * ctx,
-#     llama_token_data_array * candidates,
-#     float temp),
-#     "use llama_sample_temp instead");
-@ctypes_function(
-    "llama_sample_temperature",
-    [llama_context_p_ctypes, llama_token_data_array_p, ctypes.c_float],
-    None,
-)
-def llama_sample_temperature(
-    ctx: llama_context_p,
-    candidates: Union[
-        CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array]
-    ],
-    temp: Union[ctypes.c_float, float],
-    /,
-):
-    """use llama_sample_temp instead"""
-    ...
-
-
 # /// @details Apply constraints from grammar
 # LLAMA_API void llama_sample_grammar(
 #     struct llama_context * ctx,
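
The replacement keeps the same argument order; a hedged sketch, assuming `ctx` and a populated `llama_token_data_array` named `candidates` already exist:

    # Same call shape as the removed llama_sample_temperature binding.
    llama_cpp.llama_sample_temp(ctx, llama_cpp.byref(candidates), 0.8)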

llama_cpp/server/app.py

@@ -200,7 +200,7 @@ async def authenticate(
     "/v1/completions",
     summary="Completion",
     dependencies=[Depends(authenticate)],
-    response_model= Union[
+    response_model=Union[
         llama_cpp.CreateCompletionResponse,
         str,
     ],

@@ -216,15 +216,15 @@ async def authenticate(
                         "title": "Completion response, when stream=False",
                     }
                 },
-                "text/event-stream":{
+                "text/event-stream": {
                     "schema": {
                         "type": "string",
-                        "title": "Server Side Streaming response, when stream=True. " +
-                        "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", # noqa: E501
-                        "example": """data: {... see CreateCompletionResponse ...} \\n\\n data: ... \\n\\n ... data: [DONE]"""
-                    }
+                        "title": "Server Side Streaming response, when stream=True. "
+                        + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format",  # noqa: E501
+                        "example": """data: {... see CreateCompletionResponse ...} \\n\\n data: ... \\n\\n ... data: [DONE]""",
                     }
                 },
+            },
             }
         },
     )

@@ -290,7 +290,7 @@ async def create_completion(
                 inner_send_chan=send_chan,
                 iterator=iterator(),
             ),
-            sep='\n',
+            sep="\n",
         )
     else:
         return iterator_or_completion

@@ -310,10 +310,10 @@ async def create_embedding(
 
 
 @router.post(
-    "/v1/chat/completions", summary="Chat", dependencies=[Depends(authenticate)],
-    response_model= Union[
-        llama_cpp.ChatCompletion, str
-    ],
+    "/v1/chat/completions",
+    summary="Chat",
+    dependencies=[Depends(authenticate)],
+    response_model=Union[llama_cpp.ChatCompletion, str],
     responses={
         "200": {
             "description": "Successful Response",

@@ -321,20 +321,22 @@ async def create_embedding(
                 "application/json": {
                     "schema": {
                         "anyOf": [
-                            {"$ref": "#/components/schemas/CreateChatCompletionResponse"}
+                            {
+                                "$ref": "#/components/schemas/CreateChatCompletionResponse"
+                            }
                         ],
                         "title": "Completion response, when stream=False",
                     }
                 },
-                "text/event-stream":{
+                "text/event-stream": {
                     "schema": {
                         "type": "string",
-                        "title": "Server Side Streaming response, when stream=True" +
-                        "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format", # noqa: E501
-                        "example": """data: {... see CreateChatCompletionResponse ...} \\n\\n data: ... \\n\\n ... data: [DONE]"""
-                    }
+                        "title": "Server Side Streaming response, when stream=True"
+                        + "See SSE format: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format",  # noqa: E501
+                        "example": """data: {... see CreateChatCompletionResponse ...} \\n\\n data: ... \\n\\n ... data: [DONE]""",
                     }
                 },
+            },
             }
         },
     )

@@ -383,7 +385,7 @@ async def create_chat_completion(
                 inner_send_chan=send_chan,
                 iterator=iterator(),
             ),
-            sep='\n',
+            sep="\n",
         )
     else:
         return iterator_or_completion

llama_cpp/server/errors.py

@@ -22,6 +22,7 @@ from llama_cpp.server.types import (
     CreateChatCompletionRequest,
 )
 
+
 class ErrorResponse(TypedDict):
     """OpenAI style error response"""
 

@@ -207,4 +208,3 @@ class RouteErrorHandler(APIRoute):
         )
 
         return custom_route_handler
-

llama_cpp/server/model.py

@@ -88,15 +88,15 @@ class LlamaProxy:
             assert (
                 settings.hf_tokenizer_config_path is not None
             ), "hf_tokenizer_config_path must be set for hf-tokenizer-config"
-            chat_handler = (
-                llama_cpp.llama_chat_format.hf_tokenizer_config_to_chat_completion_handler(
+            chat_handler = llama_cpp.llama_chat_format.hf_tokenizer_config_to_chat_completion_handler(
                 json.load(open(settings.hf_tokenizer_config_path))
             )
-            )
 
         tokenizer: Optional[llama_cpp.BaseLlamaTokenizer] = None
         if settings.hf_pretrained_model_name_or_path is not None:
-            tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained(settings.hf_pretrained_model_name_or_path)
+            tokenizer = llama_tokenizer.LlamaHFTokenizer.from_pretrained(
+                settings.hf_pretrained_model_name_or_path
+            )
 
         draft_model = None
         if settings.draft_model is not None:

@@ -126,12 +126,15 @@ class LlamaProxy:
         kwargs = {}
 
         if settings.hf_model_repo_id is not None:
-            create_fn = functools.partial(llama_cpp.Llama.from_pretrained, repo_id=settings.hf_model_repo_id, filename=settings.model)
+            create_fn = functools.partial(
+                llama_cpp.Llama.from_pretrained,
+                repo_id=settings.hf_model_repo_id,
+                filename=settings.model,
+            )
         else:
             create_fn = llama_cpp.Llama
             kwargs["model_path"] = settings.model
 
         _model = create_fn(
             **kwargs,
             # Model Params
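
For reference, the high-level entry point the server now wires up through `functools.partial`; a hedged usage sketch with hypothetical repo and filename values:

    llm = llama_cpp.Llama.from_pretrained(
        repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",  # hypothetical Hugging Face repo id
        filename="*q4_0.gguf",                  # pattern matched against files in the repo
        verbose=False,
    )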

llama_cpp/server/settings.py

@@ -45,11 +45,11 @@ class ModelSettings(BaseSettings):
         default=False, description="Whether to only return the vocabulary."
     )
     use_mmap: bool = Field(
-        default=llama_cpp.llama_mmap_supported(),
+        default=llama_cpp.llama_supports_mmap(),
         description="Use mmap.",
     )
     use_mlock: bool = Field(
-        default=llama_cpp.llama_mlock_supported(),
+        default=llama_cpp.llama_supports_mlock(),
         description="Use mlock.",
     )
     kv_overrides: Optional[List[str]] = Field(

@@ -74,7 +74,9 @@ class ModelSettings(BaseSettings):
         ge=0,
         description="The number of threads to use when batch processing.",
     )
-    rope_scaling_type: int = Field(default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED)
+    rope_scaling_type: int = Field(
+        default=llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED
+    )
     rope_freq_base: float = Field(default=0.0, description="RoPE base frequency")
     rope_freq_scale: float = Field(
         default=0.0, description="RoPE frequency scaling factor"

@@ -193,6 +195,4 @@ class Settings(ServerSettings, ModelSettings):
 class ConfigFileSettings(ServerSettings):
     """Configuration file format settings."""
 
-    models: List[ModelSettings] = Field(
-        default=[], description="Model configs"
-    )
+    models: List[ModelSettings] = Field(default=[], description="Model configs")

vendor/llama.cpp (vendored submodule)

@@ -1 +1 @@
-Subproject commit a33e6a0d2a66104ea9a906bdbf8a94d050189d91
+Subproject commit 08c5ee87e4cceb603ecceac90734fcdade57311b