diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2a6f7cf..21c0875 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -238,6 +238,7 @@ class Llama: n_ctx: Maximum context size. n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. seed: Random seed. -1 for random. + n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded. f16_kv: Use half-precision for key/value cache. logits_all: Return logits for all tokens, not just the last token. vocab_only: Only load the vocabulary no weights. @@ -266,7 +267,7 @@ class Llama: self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx - self.params.n_gpu_layers = n_gpu_layers + self.params.n_gpu_layers = 0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers # 0x7FFFFFFF is INT32 max, will be auto set to all layers self.params.seed = seed self.params.f16_kv = f16_kv self.params.logits_all = logits_all diff --git a/llama_cpp/py.typed b/llama_cpp/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/setup.py b/setup.py index 8e6139d..74040d5 100644 --- a/setup.py +++ b/setup.py @@ -15,6 +15,7 @@ setup( author_email="abetlen@gmail.com", license="MIT", package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"}, + package_data={"llama_cpp": ["py.typed"]}, packages=["llama_cpp", "llama_cpp.server"], install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], extras_require={