Update llama.cpp

parent 7a536e86c2
commit cdf59768f5

4 changed files with 16 additions and 5 deletions
@@ -83,6 +83,7 @@ class Llama:
         # NOTE: These parameters are likely to change in the future.
         n_ctx: int = 512,
         n_parts: int = -1,
+        n_gpu_layers: int = 0,
         seed: int = 1337,
         f16_kv: bool = True,
         logits_all: bool = False,
@@ -129,6 +130,7 @@ class Llama:
         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
         self.params.n_parts = n_parts
+        self.params.n_gpu_layers = n_gpu_layers
         self.params.seed = seed
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
@@ -1081,6 +1083,7 @@ class Llama:
             model_path=self.model_path,
             n_ctx=self.params.n_ctx,
             n_parts=self.params.n_parts,
+            n_gpu_layers=self.params.n_gpu_layers,
             seed=self.params.seed,
             f16_kv=self.params.f16_kv,
             logits_all=self.params.logits_all,
@@ -1100,6 +1103,7 @@ class Llama:
             model_path=state["model_path"],
             n_ctx=state["n_ctx"],
             n_parts=state["n_parts"],
+            n_gpu_layers=state["n_gpu_layers"],
             seed=state["seed"],
             f16_kv=state["f16_kv"],
             logits_all=state["logits_all"],
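Taken together, these hunks expose the new n_gpu_layers option on the high-level Llama class, forward it to the underlying context parameters, and carry it through pickling. A minimal usage sketch (the model path is a placeholder, and offloading only has an effect when llama.cpp was built with GPU support):

    from llama_cpp import Llama

    llm = Llama(
        model_path="./models/ggml-model-q4_0.bin",  # placeholder path
        n_gpu_layers=32,  # layers to keep in VRAM; defaults to 0 (CPU only)
    )
    out = llm("Q: Name the planets in the solar system. A:", max_tokens=32)
    print(out["choices"][0]["text"])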
@@ -68,7 +68,7 @@ _lib_base_name = "llama"
 _lib = _load_shared_library(_lib_base_name)

 # C types
-LLAMA_FILE_VERSION = c_int(1)
+LLAMA_FILE_VERSION = c_int(2)
 LLAMA_FILE_MAGIC = b"ggjt"
 LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml"
 LLAMA_SESSION_MAGIC = b"ggsn"
@@ -109,6 +109,7 @@ class llama_context_params(Structure):
     _fields_ = [
         ("n_ctx", c_int),  # text context
         ("n_parts", c_int),  # -1 for default
+        ("n_gpu_layers", c_int),  # number of layers to store in VRAM
         ("seed", c_int),  # RNG seed, 0 for random
         ("f16_kv", c_bool),  # use fp16 for KV cache
         (
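At the binding level the new member is an ordinary ctypes field, so it can be set on a default parameter block before a context is created. A small sketch:

    import llama_cpp

    params = llama_cpp.llama_context_default_params()
    params.n_gpu_layers = 32  # layers to store in VRAM; 0 keeps everything on the CPU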
@@ -135,7 +136,7 @@ LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(
     4
 )  # tok_embeddings.weight and output.weight are F16
-LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5)  # except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5)  # except 1d tensors
 # LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8)  # except 1d tensors
@@ -259,9 +260,9 @@ _lib.llama_get_state_size.restype = c_size_t
 # Destination needs to have allocated enough memory.
 # Returns the number of bytes copied
 def llama_copy_state_data(
-    ctx: llama_context_p, dest  # type: Array[c_uint8]
+    ctx: llama_context_p, dst  # type: Array[c_uint8]
 ) -> int:
-    return _lib.llama_copy_state_data(ctx, dest)
+    return _lib.llama_copy_state_data(ctx, dst)


 _lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p]
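The dest to dst rename only changes the wrapper's parameter name, presumably to match the upstream header; the call pattern is unchanged. A rough sketch of taking a state snapshot through these bindings (placeholder model path; assumes the llama_init_from_file and llama_free wrappers exposed by this module):

    import ctypes
    import llama_cpp

    params = llama_cpp.llama_context_default_params()
    ctx = llama_cpp.llama_init_from_file(b"./models/ggml-model-q4_0.bin", params)  # placeholder path

    size = llama_cpp.llama_get_state_size(ctx)            # bytes needed for a full snapshot
    buf = (ctypes.c_uint8 * size)()                       # caller-allocated destination buffer
    n_copied = llama_cpp.llama_copy_state_data(ctx, buf)  # returns the number of bytes copied
    assert n_copied <= size

    llama_cpp.llama_free(ctx)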
@@ -17,6 +17,11 @@ class Settings(BaseSettings):
         description="The path to the model to use for generating completions."
     )
     n_ctx: int = Field(default=2048, ge=1, description="The context size.")
+    n_gpu_layers: int = Field(
+        default=0,
+        ge=0,
+        description="The number of layers to put on the GPU. The rest will be on the CPU.",
+    )
     n_batch: int = Field(
         default=512, ge=1, description="The batch size to use per eval."
     )
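Since Settings is a pydantic BaseSettings, the new field should also be settable through the environment before the app is created; a sketch with assumed variable names and a placeholder model path:

    import os

    os.environ["MODEL"] = "./models/ggml-model-q4_0.bin"  # placeholder path; variable name assumed
    os.environ["N_GPU_LAYERS"] = "32"                      # read when Settings() is instantiated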
@@ -80,6 +85,7 @@ def create_app(settings: Optional[Settings] = None):
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
+        n_gpu_layers=settings.n_gpu_layers,
         f16_kv=settings.f16_kv,
         use_mlock=settings.use_mlock,
         use_mmap=settings.use_mmap,
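The option can also be supplied programmatically when embedding the server; a sketch assuming the module path llama_cpp.server.app and a locally installed uvicorn:

    import uvicorn
    from llama_cpp.server.app import Settings, create_app  # module path assumed

    settings = Settings(model="./models/ggml-model-q4_0.bin", n_gpu_layers=32)  # placeholder path
    app = create_app(settings=settings)
    uvicorn.run(app, host="0.0.0.0", port=8000)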
vendor/llama.cpp (vendored submodule)
@@ -1 +1 @@
-Subproject commit b608b55a3ea8e4760c617418538465449175bdb8
+Subproject commit 08737ef720f0510c7ec2aa84d7f70c691073c35d