Merge pull request #499 from abetlen/v0.2-wip

llama-cpp-python v0.2.0
2023-09-12 19:04:18 -04:00 · 2023-09-12 19:04:18 -04:00 · 3feeafed1a
commit 3feeafed1a
parent c7901f1141 bcef9ab2d9
22 changed files with 240 additions and 2017 deletions
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@ -26,7 +26,8 @@ jobs:
      - name: Install dependencies
        run: |
-          python -m pip install --upgrade pip pytest cmake scikit-build setuptools
+          python -m pip install --upgrade pip
          python -m pip install -e .[all]
      - name: Build wheels
        run: python -m cibuildwheel --output-dir wheelhouse
@ -46,10 +47,11 @@ jobs:
      - uses: actions/setup-python@v3
      - name: Install dependencies
        run: |
-          python -m pip install --upgrade pip pytest cmake scikit-build setuptools
+          python -m pip install --upgrade pip build
          python -m pip install -e .[all]
      - name: Build source distribution
        run: |
-          python setup.py sdist
+          python -m build --sdist
      - uses: actions/upload-artifact@v3
        with:
          path: ./dist/*.tar.gz
--- a/.github/workflows/publish-to-test.yaml
+++ b/.github/workflows/publish-to-test.yaml
@ -19,10 +19,11 @@ jobs:
        python-version: "3.8"
    - name: Install dependencies
      run: |
-        python -m pip install --upgrade pip pytest cmake scikit-build setuptools
+        python3 -m pip install --upgrade pip build
        python3 -m pip install -e .[all]
    - name: Build source distribution
      run: |
-        python setup.py sdist
+        python3 -m build --sdist
    - name: Publish to Test PyPI
      uses: pypa/gh-action-pypi-publish@release/v1
      with:
--- a/.github/workflows/publish.yaml
+++ b/.github/workflows/publish.yaml
@ -19,10 +19,11 @@ jobs:
        python-version: "3.8"
    - name: Install dependencies
      run: |
-        python -m pip install --upgrade pip pytest cmake scikit-build setuptools
+        python3 -m pip install --upgrade pip build
        python3 -m pip install -e .[all]
    - name: Build source distribution
      run: |
-        python setup.py sdist
+        python3 -m build --sdist
    - name: Publish distribution to PyPI
      # TODO: move to tag based releases
      # if: startsWith(github.ref, 'refs/tags')
--- a/.github/workflows/test-pypi.yaml
+++ b/.github/workflows/test-pypi.yaml
@ -18,7 +18,7 @@ jobs:
      - name: Install dependencies
        run: |
          python3 -m pip install --upgrade pip
-          python3 -m pip install --verbose llama-cpp-python[server,test]
+          python3 -m pip install --verbose llama-cpp-python[all]
      - name: Test with pytest
        run: |
          python3 -c "import llama_cpp"
@ -38,7 +38,7 @@ jobs:
      - name: Install dependencies
        run: |
          python3 -m pip install --upgrade pip
-          python3 -m pip install --verbose llama-cpp-python[server,test]
+          python3 -m pip install --verbose llama-cpp-python[all]
      - name: Test with pytest
        run: |
          python3 -c "import llama_cpp"
@ -58,7 +58,7 @@ jobs:
      - name: Install dependencies
        run: |
          python3 -m pip install --upgrade pip
-          python3 -m pip install --verbose llama-cpp-python[server,test]
+          python3 -m pip install --verbose llama-cpp-python[all]
      - name: Test with pytest
        run: |
          python3 -c "import llama_cpp"
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@ -14,7 +14,7 @@ jobs:
    runs-on: ubuntu-latest
    strategy:
      matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
      - uses: actions/checkout@v3
@ -26,18 +26,18 @@ jobs:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
-          python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings
+          python3 -m pip install --upgrade pip
-          pip install . -v
+          python3 -m pip install .[all] -v
      - name: Test with pytest
        run: |
-          pytest
+          python3 -m pytest
  build-windows:
    runs-on: windows-latest
    strategy:
      matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
      - uses: actions/checkout@v3
@ -49,18 +49,18 @@ jobs:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
-          python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings
+          python3 -m pip install --upgrade pip
-          pip install . -v
+          python3 -m pip install .[all] -v
      - name: Test with pytest
        run: |
-          pytest
+          python3 -m pytest
  build-macos:
    runs-on: macos-latest
    strategy:
      matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
      - uses: actions/checkout@v3
@ -72,8 +72,8 @@ jobs:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
-          python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings
+          python3 -m pip install --upgrade pip
-          pip install . -v
+          python3 -m pip install .[all] --verbose
      - name: Test with pytest
        run: |
-          pytest
+          python3 -m pytest
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -2,33 +2,37 @@ cmake_minimum_required(VERSION 3.4...3.22)
 project(llama_cpp)
-option(FORCE_CMAKE "Force CMake build of Python bindings" OFF)
+option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python package" ON)
 set(FORCE_CMAKE $ENV{FORCE_CMAKE})
-if (UNIX AND NOT FORCE_CMAKE)
+if (LLAMA_BUILD)
    add_custom_command(
        OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so
        COMMAND make libllama.so
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp
    )
    add_custom_target(
        run ALL
        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so
    )
    install(
        FILES ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so
        DESTINATION llama_cpp
    )
 else()
    set(BUILD_SHARED_LIBS "On")
    if (APPLE)
        # Need to disable these llama.cpp flags on Apple
        # otherwise users may encounter invalid instruction errors
        set(LLAMA_AVX "Off" CACHE BOOL "llama: enable AVX" FORCE)
        set(LLAMA_AVX2 "Off" CACHE BOOL "llama: enable AVX2" FORCE)
        set(LLAMA_FMA "Off" CACHE BOOL "llama: enable FMA" FORCE)
        set(LLAMA_F16C "Off" CACHE BOOL "llama: enable F16C" FORCE)
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -mtune=native")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -mtune=native")
    endif()
    add_subdirectory(vendor/llama.cpp)
    install(
        TARGETS llama 
-        LIBRARY DESTINATION llama_cpp
+        LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
-        RUNTIME DESTINATION llama_cpp
+        RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
-        ARCHIVE DESTINATION llama_cpp
+        ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
-        FRAMEWORK DESTINATION llama_cpp
+        FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
-        RESOURCE DESTINATION llama_cpp
+        RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
    )
    # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374
    install(
        TARGETS llama 
        LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
        RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
        ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
        FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
        RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
    )
 endif()
--- a/Makefile
+++ b/Makefile
@ -5,26 +5,30 @@ update:
 update.vendor:
 	cd vendor/llama.cpp && git pull origin master
 deps:
 	python3 -m pip install pip
 	python3 -m pip install -e ".[all]"
 build:
-	python3 setup.py develop
+	python3 -m pip install -e .
 build.cuda:
-	CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 setup.py develop
+	CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3 -m pip install -e .
 build.opencl:
-	CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 setup.py develop
+	CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install -e .
 build.openblas:
-	CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 setup.py develop
+	CMAKE_ARGS="-DLLAMA_OPENBLAS=on" python3 -m pip install -e .
 build.blis:
-	CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop
+	CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" python3 -m pip install -e .
 build.metal:
-	CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 python3 setup.py develop
+	CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install -e .
 build.sdist:
-	python3 setup.py sdist
+	python3 -m build --sdist
 deploy.pypi:
 	python3 -m twine upload dist/*
--- a/README.md
+++ b/README.md
@ -1,4 +1,4 @@
-# 🦙 Python Bindings for `llama.cpp`
+# 🦙 Python Bindings for [`llama.cpp`](https://github.com/ggerganov/llama.cpp)
 [![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest)
 [![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml)
@ -48,7 +48,6 @@ Otherwise, while installing it will build the llama.cpp x86 version which will b
 ### Installation with Hardware Acceleration
 `llama.cpp` supports multiple BLAS backends for faster processing.
 Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend.
 To install with OpenBLAS, set the `LLAMA_BLAS` and `LLAMA_BLAS_VENDOR` environment variables before installing:
@ -208,24 +207,26 @@ If you find any issues with the documentation, please open an issue or submit a
 This package is under active development and I welcome any contributions.
-To get started, clone the repository and install the package in development mode:
+To get started, clone the repository and install the package in editable / development mode:
 ```bash
 git clone --recurse-submodules https://github.com/abetlen/llama-cpp-python.git
 cd llama-cpp-python
 # Upgrade pip (required for editable mode)
 pip install --upgrade pip
 # Install with pip
 pip install -e .
 # if you want to use the fastapi / openapi server
 pip install -e .[server]
-# If you're a poetry user, installing will also include a virtual environment
+# to install all optional dependencies
-poetry install --all-extras
+pip install -e .[all]
 . .venv/bin/activate
-# Will need to be re-run any time vendor/llama.cpp is updated
+# to clear the local build cache
-python3 setup.py develop
+make clean
 ```
 # How does this compare to other Python bindings of `llama.cpp`?
--- a/docker/cuda_simple/Dockerfile
+++ b/docker/cuda_simple/Dockerfile
@ -21,7 +21,7 @@ ENV LLAMA_CUBLAS=1
 RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
 # Install llama-cpp-python (build with cuda)
-RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
 # Run the server
 CMD python3 -m llama_cpp.server
--- a/docker/simple/Dockerfile
+++ b/docker/simple/Dockerfile
@ -19,9 +19,9 @@ RUN mkdir /app
 WORKDIR /app
 COPY . /app
-RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
+RUN python3 -m pip install --upgrade pip
-RUN make build && make clean
+RUN make deps && make build && make clean
 # Set environment variable for the host
 ENV HOST=0.0.0.0
--- a/docs/index.md
+++ b/docs/index.md
@ -82,9 +82,12 @@ To get started, clone the repository and install the package in development mode
 ```bash
 git clone git@github.com:abetlen/llama-cpp-python.git
 cd llama-cpp-python
 git submodule update --init --recursive
 # Will need to be re-run any time vendor/llama.cpp is updated
-python3 setup.py develop
+
 pip install --upgrade pip
 pip install -e .[all]
 ```
 ## License
--- a/docs/install/macos.md
+++ b/docs/install/macos.md
@ -30,7 +30,7 @@ conda activate llama
    *(you needed xcode installed in order pip to build/compile the C++ code)*
 ```
 pip uninstall llama-cpp-python -y
-CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir
+CMAKE_ARGS="-DLLAMA_METAL=on" pip install -U llama-cpp-python --no-cache-dir
 pip install 'llama-cpp-python[server]'
 # you should now have llama-cpp-python v0.1.62 or higher installed
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
-from .version import __version__
+__version__ = "0.2.0"
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@ -182,21 +182,27 @@ class LlamaState:
        self.llama_state_size = llama_state_size
-LogitsProcessor = Callable[[List[int], List[float]], List[float]]
+LogitsProcessor = Callable[
    [npt.NDArray[np.intc], npt.NDArray[np.single]], npt.NDArray[np.single]
 ]
 class LogitsProcessorList(List[LogitsProcessor]):
-    def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]:
+    def __call__(
        self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
    ) -> npt.NDArray[np.single]:
        for processor in self:
            scores = processor(input_ids, scores)
        return scores
-StoppingCriteria = Callable[[List[int], List[float]], bool]
+StoppingCriteria = Callable[[npt.NDArray[np.intc], npt.NDArray[np.single]], bool]
 class StoppingCriteriaList(List[StoppingCriteria]):
-    def __call__(self, input_ids: List[int], logits: List[float]) -> bool:
+    def __call__(
        self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single]
    ) -> bool:
        return any([stopping_criteria(input_ids, logits) for stopping_criteria in self])
@ -281,11 +287,12 @@ class Llama:
        self._p_tensor_split = None
        if self.tensor_split is not None:
-            FloatArray = (ctypes.c_float * len(self.tensor_split))(*self.tensor_split)
+            # Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
-            self._p_tensor_split = ctypes.POINTER(ctypes.c_float)(
+            # LLAMA_MAX_DEVICES is a plain int (see llama_cpp.py), so it has no `.value`
+            FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES
-                FloatArray
+            self._c_tensor_split = FloatArray(
                *tensor_split
            )  # keep a reference to the array so it is not gc'd
-            self.params.tensor_split = self._p_tensor_split
+            self.params.tensor_split = self._c_tensor_split
        self.params.rope_freq_base = rope_freq_base
        self.params.rope_freq_scale = rope_freq_scale
@ -535,11 +542,7 @@ class Llama:
        logits: npt.NDArray[np.single] = self._scores[-1, :]
        if logits_processor is not None:
-            logits = np.array(
+            logits[:] = logits_processor(self._input_ids, logits)
                logits_processor(self._input_ids.tolist(), logits.tolist()),
                dtype=np.single,
            )
            self._scores[-1, :] = logits
        nl_logit = logits[self._token_nl]
        candidates = self._candidates
@ -778,7 +781,7 @@ class Llama:
                grammar=grammar,
            )
            if stopping_criteria is not None and stopping_criteria(
-                self._input_ids.tolist(), self._scores[-1, :].tolist()
+                self._input_ids, self._scores[-1, :]
            ):
                return
            tokens_or_none = yield token
@ -1105,7 +1108,7 @@ class Llama:
                break
        if stopping_criteria is not None and stopping_criteria(
-            self._input_ids.tolist(), self._scores[-1, :].tolist()
+            self._input_ids, self._scores[-1, :]
        ):
            text = self.detokenize(completion_tokens)
            finish_reason = "stop"
@ -1525,6 +1528,8 @@ class Llama:
    def create_chat_completion(
        self,
        messages: List[ChatCompletionMessage],
        functions: Optional[List[ChatCompletionFunction]] = None,
        function_call: Optional[Union[str, ChatCompletionFunctionCall]] = None,
        temperature: float = 0.2,
        top_p: float = 0.95,
        top_k: int = 40,
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@ -87,8 +87,8 @@ c_size_t_p = POINTER(c_size_t)
 # llama.h bindings
 GGML_USE_CUBLAS = hasattr(_lib, "ggml_init_cublas")
-GGML_CUDA_MAX_DEVICES = ctypes.c_int(16)
+GGML_CUDA_MAX_DEVICES = 16
-LLAMA_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else ctypes.c_int(1)
+LLAMA_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else 1
 # define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 LLAMA_DEFAULT_SEED = ctypes.c_int(0xFFFFFFFF)
@ -335,13 +335,13 @@ llama_grammar_p = c_void_p
 #     // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
 #     LLAMA_GRETYPE_CHAR_ALT       = 6,
 # };
-LLAMA_GRETYPE_END = c_int(0)
+LLAMA_GRETYPE_END = 0
-LLAMA_GRETYPE_ALT = c_int(1)
+LLAMA_GRETYPE_ALT = 1
-LLAMA_GRETYPE_RULE_REF = c_int(2)
+LLAMA_GRETYPE_RULE_REF = 2
-LLAMA_GRETYPE_CHAR = c_int(3)
+LLAMA_GRETYPE_CHAR = 3
-LLAMA_GRETYPE_CHAR_NOT = c_int(4)
+LLAMA_GRETYPE_CHAR_NOT = 4
-LLAMA_GRETYPE_CHAR_RNG_UPPER = c_int(5)
+LLAMA_GRETYPE_CHAR_RNG_UPPER = 5
-LLAMA_GRETYPE_CHAR_ALT = c_int(6)
+LLAMA_GRETYPE_CHAR_ALT = 6
 # typedef struct llama_grammar_element {
@ -407,7 +407,7 @@ _lib.llama_model_quantize_default_params.restype = llama_model_quantize_params
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
 # LLAMA_API void llama_backend_init(bool numa);
-def llama_backend_init(numa: c_bool):
+def llama_backend_init(numa: Union[c_bool, bool]):
    return _lib.llama_backend_init(numa)
@ -651,9 +651,9 @@ _lib.llama_model_quantize.restype = c_int
 #                          int   n_threads);
 def llama_apply_lora_from_file(
    ctx: llama_context_p,
-    path_lora: c_char_p,
+    path_lora: Union[c_char_p, bytes],
-    path_base_model: c_char_p,
+    path_base_model: Union[c_char_p, bytes],
-    n_threads: c_int,
+    n_threads: Union[c_int, int],
 ) -> int:
    return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads)
@ -671,7 +671,7 @@ def llama_model_apply_lora_from_file(
    model: llama_model_p,
    path_lora: Union[c_char_p, bytes],
    path_base_model: Union[c_char_p, bytes],
-    n_threads: c_int,
+    n_threads: Union[c_int, int],
 ) -> int:
    return _lib.llama_model_apply_lora_from_file(
        model, path_lora, path_base_model, n_threads
@ -751,7 +751,7 @@ def llama_load_session_file(
    ctx: llama_context_p,
    path_session: bytes,
    tokens_out,  # type: Array[llama_token]
-    n_token_capacity: c_size_t,
+    n_token_capacity: Union[c_size_t, int],
    n_token_count_out,  # type: _Pointer[c_size_t]
 ) -> int:
    return _lib.llama_load_session_file(
@ -774,7 +774,7 @@ def llama_save_session_file(
    ctx: llama_context_p,
    path_session: bytes,
    tokens,  # type: Array[llama_token]
-    n_token_count: c_size_t,
+    n_token_count: Union[c_size_t, int],
 ) -> int:
    return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count)
@ -801,9 +801,9 @@ _lib.llama_save_session_file.restype = c_size_t
 def llama_eval(
    ctx: llama_context_p,
    tokens,  # type: Array[llama_token]
-    n_tokens: c_int,
+    n_tokens: Union[c_int, int],
-    n_past: c_int,
+    n_past: Union[c_int, int],
-    n_threads: c_int,
+    n_threads: Union[c_int, int],
 ) -> int:
    return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads)
@ -822,9 +822,9 @@ _lib.llama_eval.restype = c_int
 def llama_eval_embd(
    ctx: llama_context_p,
    embd,  # type: Array[c_float]
-    n_tokens: c_int,
+    n_tokens: Union[c_int, int],
-    n_past: c_int,
+    n_past: Union[c_int, int],
-    n_threads: c_int,
+    n_threads: Union[c_int, int],
 ) -> int:
    return _lib.llama_eval_embd(ctx, embd, n_tokens, n_past, n_threads)
@ -1042,8 +1042,8 @@ _lib.llama_token_to_piece_with_model.restype = c_int
 #                                 size_t    start_rule_index);
 def llama_grammar_init(
    rules,  # type: Array[llama_grammar_element_p] # type: ignore
-    n_rules: c_size_t,
+    n_rules: Union[c_size_t, int],
-    start_rule_index: c_size_t,
+    start_rule_index: Union[c_size_t, int],
 ) -> llama_grammar_p:
    return _lib.llama_grammar_init(rules, n_rules, start_rule_index)
@ -1084,8 +1084,8 @@ def llama_sample_repetition_penalty(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    last_tokens_data,  # type: Array[llama_token]
-    last_tokens_size: c_int,
+    last_tokens_size: Union[c_int, int],
-    penalty: c_float,
+    penalty: Union[c_float, float],
 ):
    return _lib.llama_sample_repetition_penalty(
        ctx, candidates, last_tokens_data, last_tokens_size, penalty
@ -1108,9 +1108,9 @@ def llama_sample_frequency_and_presence_penalties(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    last_tokens_data,  # type: Array[llama_token]
-    last_tokens_size: c_int,
+    last_tokens_size: Union[c_int, int],
-    alpha_frequency: c_float,
+    alpha_frequency: Union[c_float, float],
-    alpha_presence: c_float,
+    alpha_presence: Union[c_float, float],
 ):
    return _lib.llama_sample_frequency_and_presence_penalties(
        ctx,
@ -1146,7 +1146,7 @@ def llama_sample_classifier_free_guidance(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    guidance_ctx: llama_context_p,
-    scale: c_float,
+    scale: Union[c_float, float],
 ):
    return _lib.llama_sample_classifier_free_guidance(
        ctx, candidates, guidance_ctx, scale
@ -1182,8 +1182,8 @@ _lib.llama_sample_softmax.restype = None
 def llama_sample_top_k(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
-    k: c_int,
+    k: Union[c_int, int],
-    min_keep: c_size_t,
+    min_keep: Union[c_size_t, int],
 ):
    return _lib.llama_sample_top_k(ctx, candidates, k, min_keep)
@ -1202,8 +1202,8 @@ _lib.llama_sample_top_k.restype = None
 def llama_sample_top_p(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
-    p: c_float,
+    p: Union[c_float, float],
-    min_keep: c_size_t,
+    min_keep: Union[c_size_t, int],
 ):
    return _lib.llama_sample_top_p(ctx, candidates, p, min_keep)
@ -1222,8 +1222,8 @@ _lib.llama_sample_top_p.restype = None
 def llama_sample_tail_free(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
-    z: c_float,
+    z: Union[c_float, float],
-    min_keep: c_size_t,
+    min_keep: Union[c_size_t, int],
 ):
    return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep)
@ -1242,8 +1242,8 @@ _lib.llama_sample_tail_free.restype = None
 def llama_sample_typical(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
-    p: c_float,
+    p: Union[c_float, float],
-    min_keep: c_size_t,
+    min_keep: Union[c_size_t, int],
 ):
    return _lib.llama_sample_typical(ctx, candidates, p, min_keep)
@ -1261,7 +1261,7 @@ _lib.llama_sample_typical.restype = None
 def llama_sample_temperature(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
-    temp: c_float,
+    temp: Union[c_float, float],
 ):
    return _lib.llama_sample_temperature(ctx, candidates, temp)
@ -1301,9 +1301,9 @@ _lib.llama_sample_grammar.restype = None
 def llama_sample_token_mirostat(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
-    tau: c_float,
+    tau: Union[c_float, float],
-    eta: c_float,
+    eta: Union[c_float, float],
-    m: c_int,
+    m: Union[c_int, int],
    mu,  # type: _Pointer[c_float]
 ) -> int:
    return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu)
@ -1329,8 +1329,8 @@ _lib.llama_sample_token_mirostat.restype = llama_token
 def llama_sample_token_mirostat_v2(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
-    tau: c_float,
+    tau: Union[c_float, float],
-    eta: c_float,
+    eta: Union[c_float, float],
    mu,  # type: _Pointer[c_float]
 ) -> int:
    return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu)
@ -1531,5 +1531,5 @@ _lib.llama_dump_timing_info_yaml.restype = None
 _llama_initialized = False
 if not _llama_initialized:
-    llama_backend_init(c_bool(False))
+    llama_backend_init(False)
    _llama_initialized = True
--- a/llama_cpp/llama_types.py
+++ b/llama_cpp/llama_types.py
@ -63,6 +63,16 @@ class ChatCompletionMessage(TypedDict):
    user: NotRequired[str]
 class ChatCompletionFunction(TypedDict):
    name: str
    description: NotRequired[str]
    parameters: Dict[str, Any]  # TODO: make this more specific
 class ChatCompletionFunctionCall(TypedDict):
    name: str
 class ChatCompletionChoice(TypedDict):
    index: int
    message: ChatCompletionMessage
@ -77,9 +87,11 @@ class ChatCompletion(TypedDict):
    choices: List[ChatCompletionChoice]
    usage: CompletionUsage
 class ChatCompletionChunkDeltaEmpty(TypedDict):
    pass
 class ChatCompletionChunkDelta(TypedDict):
    role: NotRequired[Literal["assistant"]]
    content: NotRequired[str]
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@ -19,6 +19,9 @@ from pydantic import BaseModel, Field
 from pydantic_settings import BaseSettings
 from sse_starlette.sse import EventSourceResponse
 import numpy as np
 import numpy.typing as npt
 class Settings(BaseSettings):
    model: str = Field(
@ -38,11 +41,13 @@ class Settings(BaseSettings):
        default=None,
        description="Split layers across multiple GPUs in proportion.",
    )
-    rope_freq_base: float = Field(default=10000, ge=1, description="RoPE base frequency")
+    rope_freq_base: float = Field(
-    rope_freq_scale: float = Field(default=1.0, description="RoPE frequency scaling factor")
+        default=10000, ge=1, description="RoPE base frequency"
    seed: int = Field(
        default=1337, description="Random seed. -1 for random."
    )
    rope_freq_scale: float = Field(
        default=1.0, description="RoPE frequency scaling factor"
    )
    seed: int = Field(default=1337, description="Random seed. -1 for random.")
    n_batch: int = Field(
        default=512, ge=1, description="The batch size to use per eval."
    )
@ -559,9 +564,9 @@ def make_logit_bias_processor(
                to_bias[input_id] = score
    def logit_bias_processor(
-        input_ids: List[int],
+        input_ids: npt.NDArray[np.intc],
-        scores: List[float],
+        scores: npt.NDArray[np.single],
-    ) -> List[float]:
+    ) -> npt.NDArray[np.single]:
        new_scores = [None] * len(scores)
        for input_id, score in enumerate(scores):
            new_scores[input_id] = score + to_bias.get(input_id, 0.0)
@ -594,9 +599,11 @@ async def create_completion(
    kwargs = body.model_dump(exclude=exclude)
    if body.logit_bias is not None:
-        kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([
+        kwargs["logits_processor"] = llama_cpp.LogitsProcessorList(
            [
                make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type),
-        ])
+            ]
        )
    iterator_or_completion: Union[llama_cpp.Completion, Iterator[
        llama_cpp.CompletionChunk
@ -663,6 +670,14 @@ class CreateChatCompletionRequest(BaseModel):
    messages: List[ChatCompletionRequestMessage] = Field(
        default=[], description="A list of messages to generate completions for."
    )
    functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field(
        default=None,
        description="A list of functions to apply to the generated completions.",
    )
    function_call: Optional[Union[str, llama_cpp.ChatCompletionFunctionCall]] = Field(
        default=None,
        description="A function to apply to the generated completions.",
    )
    max_tokens: int = max_tokens_field
    temperature: float = temperature_field
    top_p: float = top_p_field
@ -721,9 +736,11 @@ async def create_chat_completion(
    kwargs = body.model_dump(exclude=exclude)
    if body.logit_bias is not None:
-        kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([
+        kwargs["logits_processor"] = llama_cpp.LogitsProcessorList(
            [
                make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type),
-        ])
+            ]
        )
    iterator_or_completion: Union[llama_cpp.ChatCompletion, Iterator[
        llama_cpp.ChatCompletionChunk
--- a/llama_cpp/version.py
+++ b/llama_cpp/version.py
@ -1 +0,0 @@
 __version__ = "0.1.85"
--- a/poetry.lock
+++ b/poetry.lock
--- a/poetry.toml
+++ b/poetry.toml
@ -1,3 +0,0 @@
 [virtualenvs]
 in-project = true
 prefer-active-python = true
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,48 +1,63 @@
-[tool.poetry]
+[build-system]
 requires = ["scikit-build-core>=0.5.0"]
 build-backend = "scikit_build_core.build"
 [project]
 name = "llama_cpp_python"
-version = "0.1.85"
+dynamic = ["version"]
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen <abetlen@gmail.com>"]
 license = "MIT"
 readme = "README.md"
-homepage = "https://github.com/abetlen/llama-cpp-python"
+license = { text = "MIT" }
-repository = "https://github.com/abetlen/llama-cpp-python"
+authors = [
-packages = [{include = "llama_cpp"}]
+    { name = "Andrei Betlen", email = "abetlen@gmail.com" },
-include = [
+]
-    "LICENSE.md",
+dependencies = [
    "typing-extensions>=4.5.0",
    "numpy>=1.20.0",
    "diskcache>=5.6.1",
 ]
 requires-python = ">=3.8"
 [project.optional-dependencies]
 server = [
    "uvicorn>=0.22.0",
    "fastapi>=0.100.0",
    "pydantic-settings>=2.0.1",
    "sse-starlette>=1.6.1",
 ]
 test = [
    "pytest>=7.4.0",
    "httpx>=0.24.1",
 ]
 dev = [
    "black>=23.3.0",
    "twine>=4.0.2",
    "mkdocs>=1.4.3",
    "mkdocstrings[python]>=0.22.0",
    "mkdocs-material>=9.1.18",
    "pytest>=7.4.0",
    "httpx>=0.24.1",
 ]
 all = [
    "llama_cpp_python[server,test,dev]",
 ]
-[tool.poetry.dependencies]
+[tool.scikit-build]
-python = "^3.8.1"
+wheel.packages = ["llama_cpp"]
-typing-extensions = "^4.7.1"
+cmake.verbose = true
-numpy = "^1.24.4"
+cmake.minimum-version = "3.12"
-diskcache = "^5.6.3"
+minimum-version = "0.5"
-uvicorn = { version = "^0.23.2", optional = true }
+ninja.make-fallback = false
 fastapi = { version = ">=0.100.0", optional = true }
 sse-starlette = { version = ">=1.6.1", optional = true }
 pydantic-settings = { version = ">=2.0.1", optional = true }
-[tool.poetry.group.dev.dependencies]
+[tool.scikit-build.metadata.version]
-black = "^23.9.1"
+provider = "scikit_build_core.metadata.regex"
-twine = "^4.0.2"
+input = "llama_cpp/__init__.py"
 mkdocs = "^1.5.2"
 mkdocstrings = {extras = ["python"], version = "^0.23.0"}
 mkdocs-material = "^9.3.1"
 pytest = "^7.4.2"
 httpx = "^0.25.0"
 scikit-build = "0.17.6"
-[tool.poetry.extras]
+[project.urls]
-server = ["uvicorn", "fastapi", "pydantic-settings", "sse-starlette"]
+Homepage = "https://github.com/abetlen/llama-cpp-python"
 Issues = "https://github.com/abetlen/llama-cpp-python/issues"
 [tool.pytest.ini_options]
 addopts = "--ignore=vendor"
 [build-system]
 requires = [
    "setuptools>=42",
    "scikit-build>=0.13",
    "cmake>=3.18",
    "ninja",
 ]
 build-backend = "setuptools.build_meta"
--- a/setup.py
+++ b/setup.py
@ -1,35 +0,0 @@
 from skbuild import setup
 from pathlib import Path
 this_directory = Path(__file__).parent
 long_description = (this_directory / "README.md").read_text(encoding="utf-8")
 exec(open('llama_cpp/version.py').read())
 setup(
    name="llama_cpp_python",
    description="A Python wrapper for llama.cpp",
    long_description=long_description,
    long_description_content_type="text/markdown",
    version=__version__,
    author="Andrei Betlen",
    author_email="abetlen@gmail.com",
    license="MIT",
    package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"},
    package_data={"llama_cpp": ["py.typed"]},
    packages=["llama_cpp", "llama_cpp.server"],
    install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"],
    extras_require={
        "server": ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"],
    },
    python_requires=">=3.7",
    classifiers=[
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
    ],
 )