Update llama.cpp

Author: Andrei Betlen
Date: 2023-05-14 00:04:22 -04:00
Parent: 7a536e86c2
Commit: cdf59768f5
4 changed files with 16 additions and 5 deletions

llama_cpp/llama.py

```diff
@@ -83,6 +83,7 @@ class Llama:
         # NOTE: These parameters are likely to change in the future.
         n_ctx: int = 512,
         n_parts: int = -1,
+        n_gpu_layers: int = 0,
         seed: int = 1337,
         f16_kv: bool = True,
         logits_all: bool = False,
@@ -129,6 +130,7 @@ class Llama:
         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
         self.params.n_parts = n_parts
+        self.params.n_gpu_layers = n_gpu_layers
         self.params.seed = seed
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
```
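The new `n_gpu_layers` argument defaults to 0, so CPU-only behavior is unchanged unless a caller opts in. A minimal usage sketch (the model path is hypothetical, and offloading only takes effect in a GPU-enabled llama.cpp build):

```python
from llama_cpp import Llama

# Hypothetical path; n_gpu_layers=32 asks llama.cpp to keep the first 32
# transformer layers in VRAM. With the default of 0, everything stays on CPU.
llm = Llama(model_path="./models/7B/ggml-model-q4_0.bin", n_gpu_layers=32)
```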
```diff
@@ -1081,6 +1083,7 @@ class Llama:
             model_path=self.model_path,
             n_ctx=self.params.n_ctx,
             n_parts=self.params.n_parts,
+            n_gpu_layers=self.params.n_gpu_layers,
             seed=self.params.seed,
             f16_kv=self.params.f16_kv,
             logits_all=self.params.logits_all,
@@ -1100,6 +1103,7 @@ class Llama:
             model_path=state["model_path"],
             n_ctx=state["n_ctx"],
             n_parts=state["n_parts"],
+            n_gpu_layers=state["n_gpu_layers"],
             seed=state["seed"],
             f16_kv=state["f16_kv"],
             logits_all=state["logits_all"],
```
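These two hunks (in what appear to be the class's `__getstate__`/`__setstate__` methods) keep the pickled state in sync with the constructor, so a round-tripped instance is rebuilt with the same offload setting. A hedged sketch, again with a hypothetical path; note that unpickling reloads the full model:

```python
import pickle
from llama_cpp import Llama

llm = Llama(model_path="./models/7B/ggml-model-q4_0.bin", n_gpu_layers=32)
clone = pickle.loads(pickle.dumps(llm))   # exercises __getstate__/__setstate__
assert clone.params.n_gpu_layers == 32    # the new field survives the round trip
```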

llama_cpp/llama_cpp.py

```diff
@@ -68,7 +68,7 @@ _lib_base_name = "llama"
 _lib = _load_shared_library(_lib_base_name)
 
 # C types
-LLAMA_FILE_VERSION = c_int(1)
+LLAMA_FILE_VERSION = c_int(2)
 LLAMA_FILE_MAGIC = b"ggjt"
 LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml"
 LLAMA_SESSION_MAGIC = b"ggsn"
```
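Bumping `LLAMA_FILE_VERSION` from 1 to 2 tracks the upstream ggjt file-format revision, which is also why the vendored submodule pointer moves below. A minimal sketch of how the magic and version could be checked, assuming the usual llama.cpp header layout of two little-endian uint32 values (the path is hypothetical):

```python
import struct

GGJT_MAGIC = 0x67676A74  # the bytes spell "ggjt"

def read_ggjt_version(path: str):
    # Sketch: a ggjt model file starts with a uint32 magic followed by a
    # uint32 version, both little-endian.
    with open(path, "rb") as f:
        magic, version = struct.unpack("<II", f.read(8))
    return magic, version

magic, version = read_ggjt_version("./models/7B/ggml-model-q4_0.bin")
if magic == GGJT_MAGIC and version != 2:
    raise ValueError(f"model is ggjt v{version}, but this build expects v2")
```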
```diff
@@ -109,6 +109,7 @@ class llama_context_params(Structure):
     _fields_ = [
         ("n_ctx", c_int),  # text context
         ("n_parts", c_int),  # -1 for default
+        ("n_gpu_layers", c_int),  # number of layers to store in VRAM
         ("seed", c_int),  # RNG seed, 0 for random
         ("f16_kv", c_bool),  # use fp16 for KV cache
         (
```
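Because ctypes maps `_fields_` to memory strictly by declaration order, the new entry must occupy the same slot that `n_gpu_layers` has in the C `llama_context_params` struct. An illustrative, standalone demonstration of why the position matters:

```python
import ctypes

# Illustrative only: ctypes lays fields out in declaration order, so placing
# n_gpu_layers anywhere but its C position would shift every later field and
# silently corrupt seed, f16_kv, and the rest.
class Params(ctypes.Structure):
    _fields_ = [
        ("n_ctx", ctypes.c_int),
        ("n_parts", ctypes.c_int),
        ("n_gpu_layers", ctypes.c_int),
        ("seed", ctypes.c_int),
    ]

p = Params(512, -1, 32, 1337)
assert Params.seed.offset == 12  # seed moved from byte offset 8 to 12
```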
```diff
@@ -135,7 +136,7 @@ LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(
     4
 )  # tok_embeddings.weight and output.weight are F16
-LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5)  # except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5)  # except 1d tensors
 # LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8)  # except 1d tensors
```
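`LLAMA_FTYPE_MOSTLY_Q4_2` is commented out to mirror upstream, which dropped the Q4_2 format, so codes 5 and 6 are now holes in the numbering. An illustrative helper (not part of the bindings) for turning the remaining codes into readable names:

```python
# Illustrative only: map ftype codes to names, e.g. for error messages.
# Codes 5 and 6 are deliberately absent (Q4_2 and Q4_3 were removed upstream).
LLAMA_FTYPE_NAMES = {
    0: "all F32",
    1: "mostly F16",
    2: "mostly Q4_0",
    3: "mostly Q4_1",
    4: "mostly Q4_1, some F16",
    7: "mostly Q8_0",
    8: "mostly Q5_0",
}

def ftype_name(ftype: int) -> str:
    return LLAMA_FTYPE_NAMES.get(ftype, f"unknown ftype {ftype}")
```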
```diff
@@ -259,9 +260,9 @@ _lib.llama_get_state_size.restype = c_size_t
 # Destination needs to have allocated enough memory.
 # Returns the number of bytes copied
 def llama_copy_state_data(
-    ctx: llama_context_p, dest  # type: Array[c_uint8]
+    ctx: llama_context_p, dst  # type: Array[c_uint8]
 ) -> int:
-    return _lib.llama_copy_state_data(ctx, dest)
+    return _lib.llama_copy_state_data(ctx, dst)
 
 
 _lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p]
```
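The rename from `dest` to `dst` matches the upstream parameter name; the calling pattern is unchanged. A minimal sketch of the intended use, assuming `ctx` is a live `llama_context_p` obtained elsewhere (e.g. from `llama_init_from_file`):

```python
import ctypes
import llama_cpp

# Sketch: size the buffer first, then copy the context state into it.
# `ctx` is assumed to be a valid llama_context_p created elsewhere.
size = llama_cpp.llama_get_state_size(ctx)
buf = (ctypes.c_uint8 * size)()
n_copied = llama_cpp.llama_copy_state_data(ctx, buf)
assert n_copied <= size  # the function returns the number of bytes copied
```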

llama_cpp/server/app.py

```diff
@@ -17,6 +17,11 @@ class Settings(BaseSettings):
         description="The path to the model to use for generating completions."
     )
     n_ctx: int = Field(default=2048, ge=1, description="The context size.")
+    n_gpu_layers: int = Field(
+        default=0,
+        ge=0,
+        description="The number of layers to put on the GPU. The rest will be on the CPU.",
+    )
     n_batch: int = Field(
         default=512, ge=1, description="The batch size to use per eval."
     )
```
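Because `Settings` extends pydantic's `BaseSettings`, the new field can also come from the environment (matching is case-insensitive by default), so a deployment can opt into offloading without code changes. A hedged sketch, with the module path and model path assumed:

```python
import os
from llama_cpp.server.app import Settings  # module path assumed from this commit

os.environ["N_GPU_LAYERS"] = "20"                             # read by BaseSettings
settings = Settings(model="./models/7B/ggml-model-q4_0.bin")  # hypothetical path
assert settings.n_gpu_layers == 20
```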
```diff
@@ -80,6 +85,7 @@ def create_app(settings: Optional[Settings] = None):
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
+        n_gpu_layers=settings.n_gpu_layers,
         f16_kv=settings.f16_kv,
         use_mlock=settings.use_mlock,
         use_mmap=settings.use_mmap,
```
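With the setting threaded through `create_app`, the server's `Llama` instance is constructed with the requested offload. A minimal launch sketch under the same assumptions:

```python
import uvicorn
from llama_cpp.server.app import create_app, Settings  # module path assumed

# Build the ASGI app with explicit settings so Llama gets n_gpu_layers=20,
# then serve it locally (host/port are illustrative).
app = create_app(settings=Settings(model="./models/7B/ggml-model-q4_0.bin", n_gpu_layers=20))
uvicorn.run(app, host="127.0.0.1", port=8000)
```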

vendor/llama.cpp (vendored submodule)

```diff
@@ -1 +1 @@
-Subproject commit b608b55a3ea8e4760c617418538465449175bdb8
+Subproject commit 08737ef720f0510c7ec2aa84d7f70c691073c35d
```