Compare commits


No commits in common. "5b4ad6f4d17180c40f62bb88a6c998446f9ac79c" and "1d177aaaefb2843d50b0b2a58dee84ace7eab054" have entirely different histories.

20 changed files with 193 additions and 807 deletions

View file

@@ -8,12 +8,8 @@ updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/" # Location of package manifests
schedule:
interval: "daily"
interval: "weekly"
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "daily"
- package-ecosystem: "docker"
directory: "/"
schedule:
interval: "daily"
interval: "weekly"

View file

@@ -29,7 +29,7 @@ jobs:
python -m pip install -e .[all]
- name: Build wheels
uses: pypa/cibuildwheel@v2.18.0
uses: pypa/cibuildwheel@v2.17.0
env:
# disable repair
CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -56,7 +56,7 @@ jobs:
platforms: linux/arm64
- name: Build wheels
uses: pypa/cibuildwheel@v2.18.0
uses: pypa/cibuildwheel@v2.17.0
env:
CIBW_SKIP: "*musllinux* pp*"
CIBW_REPAIR_WHEEL_COMMAND: ""

View file

@@ -7,61 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Unreleased]
## [0.2.75]
- feat: Update llama.cpp to ggerganov/llama.cpp@13ad16af1231ab2d245d35df3295bcfa23de1305
- fix: segfault for models without eos / bos tokens by @abetlen in d99a6ba607a4885fb00e63e967964aa41bdbbbcb
- feat: add MinTokensLogitProcessor and min_tokens argument to server by @twaka in #1333
- misc: Remove unnecessary metadata lookups by @CISC in #1448
## [0.2.74]
- feat: Update llama.cpp to ggerganov/llama.cpp@b228aba91ac2cd9eb90e9d423ba1d0d20e0117e2
- fix: Enable CUDA backend for llava by @abetlen in 7f59856fa6f3e23f07e12fc15aeb9359dc6c3bb4
- docs: Fix typo in README.md by @yupbank in #1444
## [0.2.73]
- feat: Update llama.cpp to ggerganov/llama.cpp@25c6e82e7a1ad25a42b0894e87d9b5c557409516
- fix: Clear kv cache at beginning of image chat formats to avoid bug when image is evaluated first by @abetlen in ac55d0a175115d1e719672ce1cb1bec776c738b1
## [0.2.72]
- fix(security): Remote Code Execution by Server-Side Template Injection in Model Metadata by @retr0reg in b454f40a9a1787b2b5659cd2cb00819d983185df
- fix(security): Update remaining jinja chat templates to use immutable sandbox by @CISC in #1441
## [0.2.71]
- feat: Update llama.cpp to ggerganov/llama.cpp@911b3900dded9a1cfe0f0e41b82c7a29baf3a217
- fix: Make leading bos_token optional for image chat formats, fix nanollava system message by @abetlen in 77122638b4153e31d9f277b3d905c2900b536632
- fix: free last image embed in llava chat handler by @abetlen in 3757328b703b2cd32dcbd5853271e3a8c8599fe7
## [0.2.70]
- feat: Update llama.cpp to ggerganov/llama.cpp@c0e6fbf8c380718102bd25fcb8d2e55f8f9480d1
- feat: fill-in-middle support by @CISC in #1386
- fix: adding missing args in create_completion for functionary chat handler by @skalade in #1430
- docs: update README.md @eltociear in #1432
- fix: chat_format log where auto-detected format prints None by @balvisio in #1434
- feat(server): Add support for setting root_path by @abetlen in 0318702cdc860999ee70f277425edbbfe0e60419
- feat(ci): Add docker checks and check deps more frequently by @Smartappli in #1426
- fix: detokenization case where first token does not start with a leading space by @noamgat in #1375
- feat: Implement streaming for Functionary v2 + Bug fixes by @jeffrey-fong in #1419
- fix: Use memmove to copy str_value kv_override by @abetlen in 9f7a85571ae80d3b6ddbd3e1bae407b9f1e3448a
- feat(server): Remove temperature bounds checks for server by @abetlen in 0a454bebe67d12a446981eb16028c168ca5faa81
- fix(server): Propagate flash_attn to model load by @dthuerck in #1424
## [0.2.69]
- feat: Update llama.cpp to ggerganov/llama.cpp@6ecf3189e00a1e8e737a78b6d10e1d7006e050a2
- feat: Add llama-3-vision-alpha chat format by @abetlen in 31b1d95a6c19f5b615a3286069f181a415f872e8
- fix: Change default verbose value of verbose in image chat format handlers to True to match Llama by @abetlen in 4f01c452b6c738dc56eacac3758119b12c57ea94
- fix: Suppress all logs when verbose=False, use hardcoded fileno's to work in colab notebooks by @abetlen in f116175a5a7c84569c88cad231855c1e6e59ff6e
- fix: UTF-8 handling with grammars by @jsoma in #1415
## [0.2.68]
- feat: Update llama.cpp to ggerganov/llama.cpp@77e15bec6217a39be59b9cc83d6b9afb6b0d8167
- feat: Update llama.cpp to ggerganov/llama.cpp@
- feat: Add option to enable flash_attn to Lllama params and ModelSettings by @abetlen in 22d77eefd2edaf0148f53374d0cac74d0e25d06e
- fix(ci): Fix build-and-release.yaml by @Smartappli in #1413

View file

@@ -51,9 +51,8 @@ if (LLAMA_BUILD)
)
if (LLAVA_BUILD)
if (LLAMA_CUBLAS OR LLAMA_CUDA)
if (LLAMA_CUBLAS)
add_compile_definitions(GGML_USE_CUBLAS)
add_compile_definitions(GGML_USE_CUDA)
endif()
if (LLAMA_METAL)

View file

@@ -16,7 +16,7 @@ build.debug:
CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" python3 -m pip install --verbose --config-settings=cmake.verbose=true --config-settings=logging.level=INFO --config-settings=install.strip=false --editable .
build.cuda:
CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .
CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3 -m pip install --verbose -e .
build.opencl:
CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .

View file

@@ -516,7 +516,7 @@ chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
llm = Llama(
model_path="./path/to/llava/llama-model.gguf",
chat_handler=chat_handler,
n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
)
llm.create_chat_completion(
messages = [
@@ -547,10 +547,10 @@ llm = Llama.from_pretrained(
repo_id="vikhyatk/moondream2",
filename="*text-model*",
chat_handler=chat_handler,
n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
)
response = llm.create_chat_completion(
respoonse = llm.create_chat_completion(
messages = [
{
"role": "user",

View file

@@ -1,19 +0,0 @@
This is an example of doing LLM inference with [Ray](https://docs.ray.io/en/latest/index.html) and [Ray Serve](https://docs.ray.io/en/latest/serve/index.html).
First, install the requirements:
```bash
$ pip install -r requirements.txt
```
Deploy a GGUF model to Ray Serve with the following command:
```bash
$ serve run llm:llm_builder model_path='../models/mistral-7b-instruct-v0.2.Q4_K_M.gguf'
```
This will start an API endpoint at `http://localhost:8000/`. You can query the model like this:
```bash
$ curl -k -d '{"prompt": "tell me a joke", "max_tokens": 128}' -X POST http://localhost:8000
```
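The same request can also be made from Python; a minimal client sketch, assuming the Ray Serve deployment above is listening on `http://localhost:8000/`:
```python
# Minimal Python client for the Ray Serve endpoint described above.
# Assumes the deployment started with `serve run llm:llm_builder ...` is running locally.
import json
import urllib.request

payload = json.dumps({"prompt": "tell me a joke", "max_tokens": 128}).encode("utf-8")
req = urllib.request.Request("http://localhost:8000/", data=payload, method="POST")
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))
```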

View file

@@ -1,20 +0,0 @@
from starlette.requests import Request
from typing import Dict
from ray import serve
from ray.serve import Application
from llama_cpp import Llama
@serve.deployment
class LlamaDeployment:
def __init__(self, model_path: str):
self._llm = Llama(model_path=model_path)
async def __call__(self, http_request: Request) -> Dict:
input_json = await http_request.json()
prompt = input_json["prompt"]
max_tokens = input_json.get("max_tokens", 64)
return self._llm(prompt, max_tokens=max_tokens)
def llm_builder(args: Dict[str, str]) -> Application:
return LlamaDeployment.bind(args["model_path"])

View file

@@ -1,3 +0,0 @@
ray[serve]
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
llama-cpp-python

View file

@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *
__version__ = "0.2.75"
__version__ = "0.2.68"

View file

@@ -203,7 +203,7 @@ class _LlamaModel:
# NOTE: Llama1 models automatically added a space at the start of the prompt
# this line removes a leading space if the first token is a beginning of sentence token
return (
output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b' ' else output
output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
)
# Extra
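For reference, a small standalone sketch of the two behaviours (the BOS id and byte strings are made up): the newer guard only strips the first byte when it really is a leading space.
```python
BOS = 1  # placeholder BOS token id

def strip_old(tokens, output):  # 0.2.68 behaviour: always drop the first byte after BOS
    return output[1:] if len(tokens) > 0 and tokens[0] == BOS else output

def strip_new(tokens, output):  # 0.2.75 behaviour: drop it only when it is actually a space
    return output[1:] if len(tokens) > 0 and tokens[0] == BOS and output[0:1] == b" " else output

print(strip_old([BOS, 42], b"Hello"))  # b'ello' -- first byte lost even though it was not a space
print(strip_new([BOS, 42], b"Hello"))  # b'Hello'
```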

View file

@@ -262,12 +262,7 @@ class Llama:
raise ValueError(f"Value for {k} is too long: {v}")
v_bytes = v_bytes.ljust(128, b"\0")
self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
# copy min(v_bytes, 128) to str_value
ctypes.memmove(
self._kv_overrides_array[i].value.str_value,
v_bytes,
min(len(v_bytes), 128),
)
self._kv_overrides_array[i].value.str_value[:128] = v_bytes
else:
raise ValueError(f"Unknown value type for {k}: {v}")
@@ -378,7 +373,6 @@ class Llama:
self.chat_format = chat_format
self.chat_handler = chat_handler
self._chat_handlers: Dict[str, llama_chat_format.LlamaChatCompletionHandler] = {}
self.draft_model = draft_model
@@ -410,33 +404,10 @@ class Llama:
if self.verbose:
print(f"Model metadata: {self.metadata}", file=sys.stderr)
eos_token_id = self.token_eos()
bos_token_id = self.token_bos()
eos_token = self._model.token_get_text(eos_token_id) if eos_token_id != -1 else ""
bos_token = self._model.token_get_text(bos_token_id) if bos_token_id != -1 else ""
# Unfortunately the llama.cpp API does not return metadata arrays, so we can't get template names from tokenizer.chat_templates
template_choices = dict((name[10:], template) for name, template in self.metadata.items() if name.startswith("tokenizer.chat_template."))
if "tokenizer.chat_template" in self.metadata:
template_choices["chat_template.default"] = self.metadata["tokenizer.chat_template"]
if self.verbose and template_choices:
print(f"Available chat formats from metadata: {', '.join(template_choices.keys())}", file=sys.stderr)
for name, template in template_choices.items():
self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter(
template=template,
eos_token=eos_token,
bos_token=bos_token,
stop_token_ids=[eos_token_id],
).to_chat_handler()
if (
self.chat_format is None
and self.chat_handler is None
and "chat_template.default" in template_choices
and "tokenizer.chat_template" in self.metadata
):
chat_format = llama_chat_format.guess_chat_format_from_gguf_metadata(
self.metadata
@@ -447,17 +418,35 @@ class Llama:
if self.verbose:
print(f"Guessed chat format: {chat_format}", file=sys.stderr)
else:
template = self.metadata["tokenizer.chat_template"]
try:
eos_token_id = int(self.metadata["tokenizer.ggml.eos_token_id"])
except:
eos_token_id = self.token_eos()
try:
bos_token_id = int(self.metadata["tokenizer.ggml.bos_token_id"])
except:
bos_token_id = self.token_bos()
eos_token = self._model.token_get_text(eos_token_id)
bos_token = self._model.token_get_text(bos_token_id)
if self.verbose:
print(f"Using gguf chat template: {template_choices['chat_template.default']}", file=sys.stderr)
print(f"Using gguf chat template: {template}", file=sys.stderr)
print(f"Using chat eos_token: {eos_token}", file=sys.stderr)
print(f"Using chat bos_token: {bos_token}", file=sys.stderr)
self.chat_format = "chat_template.default"
self.chat_handler = llama_chat_format.Jinja2ChatFormatter(
template=template,
eos_token=eos_token,
bos_token=bos_token,
stop_token_ids=[eos_token_id],
).to_chat_handler()
if self.chat_format is None and self.chat_handler is None:
self.chat_format = "llama-2"
if self.verbose:
print(f"Using fallback chat format: {self.chat_format}", file=sys.stderr)
print(f"Using fallback chat format: {chat_format}", file=sys.stderr)
@property
def ctx(self) -> llama_cpp.llama_context_p:
@@ -961,54 +950,19 @@ class Llama:
completion_id: str = f"cmpl-{str(uuid.uuid4())}"
created: int = int(time.time())
prefix_token_id: int = self._model.token_prefix()
middle_token_id: int = self._model.token_middle()
suffix_token_id: int = self._model.token_suffix()
# If prompt is empty, initialize completion with BOS token to avoid
# detokenization including a space at the beginning of the completion
completion_tokens: List[int] = [] if len(prompt) > 0 else [self.token_bos()]
# Add blank space to start of prompt to match OG llama tokenizer
prompt_tokens: List[int] = (
(
[prefix_token_id]
if prefix_token_id >= 0 and suffix is not None
else []
)
+
(
(
self.tokenize(prompt.encode("utf-8"), add_bos=(prefix_token_id < 0 or suffix is None), special=(prefix_token_id < 0 or suffix is None))
self.tokenize(prompt.encode("utf-8"), special=True)
if prompt != ""
else (
[]
if prefix_token_id >= 0 and suffix is not None
else [self.token_bos()]
)
)
if isinstance(prompt, str)
else prompt
)
+
(
(
[suffix_token_id]
+
(
self.tokenize(suffix.encode("utf-8"), add_bos=False, special=False)
if suffix
else []
)
)
if suffix_token_id >= 0 and suffix is not None
else []
)
+
(
[middle_token_id]
if middle_token_id >= 0 and suffix is not None
else []
)
)
text: bytes = b""
returned_tokens: int = 0
stop = (
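The branchy expression above reduces to the following fill-in-middle layout when the model defines the special tokens and a suffix is supplied; a rough sketch with placeholder token ids:
```python
def fim_prompt(prefix_ids, suffix_ids, pre=32007, suf=32008, mid=32009):
    # prefix marker + prompt tokens, then suffix marker + suffix tokens,
    # then the middle marker where generation continues (ids are placeholders)
    return [pre] + prefix_ids + [suf] + suffix_ids + [mid]

print(fim_prompt([10, 11, 12], [20, 21]))
# [32007, 10, 11, 12, 32008, 20, 21, 32009]
```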
@@ -1387,7 +1341,7 @@ class Llama:
if echo:
text_str = prompt + text_str
if suffix_token_id < 0 and suffix is not None:
if suffix is not None:
text_str = text_str + suffix
logprobs_or_none: Optional[CompletionLogprobs] = None
@@ -1725,7 +1679,7 @@ class Llama:
Returns:
Generated chat completion or a stream of chat completion chunks.
"""
handler = self.chat_handler or self._chat_handlers.get(self.chat_format) or llama_chat_format.get_chat_completion_handler(
handler = self.chat_handler or llama_chat_format.get_chat_completion_handler(
self.chat_format
)
return handler(
@@ -2084,19 +2038,3 @@ class StoppingCriteriaList(List[StoppingCriteria]):
self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single]
) -> bool:
return any([stopping_criteria(input_ids, logits) for stopping_criteria in self])
class MinTokensLogitsProcessor(LogitsProcessor):
def __init__(self, min_tokens: int, token_eos: int):
self.min_tokens = min_tokens
self.token_eos = token_eos
self.prompt_tokens = None
def __call__(
self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
) -> npt.NDArray[np.single]:
if self.prompt_tokens is None:
self.prompt_tokens = len(input_ids)
if len(input_ids) - self.prompt_tokens < self.min_tokens:
scores[self.token_eos] = -np.inf
return scores
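For context, a hedged sketch of how this processor is attached, mirroring the server wiring removed further below (the model path is a placeholder):
```python
from llama_cpp import Llama, LogitsProcessorList, MinTokensLogitsProcessor

llm = Llama(model_path="./models/model.gguf")  # placeholder path

# Suppress EOS until at least 16 completion tokens have been generated.
min_tokens = LogitsProcessorList([MinTokensLogitsProcessor(16, llm.token_eos())])
out = llm("Q: Name the planets. A:", max_tokens=64, logits_processor=min_tokens)
```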

View file

@@ -11,7 +11,6 @@ from contextlib import ExitStack
from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, Union, Protocol, cast
import jinja2
from jinja2.sandbox import ImmutableSandboxedEnvironment
import numpy as np
import numpy.typing as npt
@@ -192,7 +191,7 @@ class Jinja2ChatFormatter(ChatFormatter):
self.add_generation_prompt = add_generation_prompt
self.stop_token_ids = set(stop_token_ids) if stop_token_ids is not None else None
self._environment = ImmutableSandboxedEnvironment(
self._environment = jinja2.Environment(
loader=jinja2.BaseLoader(),
trim_blocks=True,
lstrip_blocks=True,
@@ -685,7 +684,8 @@ def hf_tokenizer_config_to_chat_formatter(
assert isinstance(tokenizer_config["eos_token"], str)
eos_token = tokenizer_config["eos_token"]
env = ImmutableSandboxedEnvironment(
env = jinja2.Environment(
loader=jinja2.BaseLoader(),
trim_blocks=True,
lstrip_blocks=True,
).from_string(chat_template)
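This sandbox switch is the SSTI hardening referenced in the 0.2.72 changelog entries above; a brief illustration of the difference (the template string is illustrative only):
```python
import jinja2
from jinja2.sandbox import ImmutableSandboxedEnvironment, SecurityError

# An SSTI-style template expression: reach str through a dunder attribute and call it.
template_str = "{{ ''.__class__('pwned') }}"

print(jinja2.Environment().from_string(template_str).render())  # -> pwned

try:
    ImmutableSandboxedEnvironment().from_string(template_str).render()
except SecurityError as exc:
    print("sandbox blocked it:", exc)
```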
@@ -1894,8 +1894,6 @@ def functionary_v1_v2_chat_handler(
function_call = (
tool_choice if isinstance(tool_choice, str) else tool_choice["function"]
)
elif function_call is not None:
pass
else:
function_call = "auto"
@@ -1932,10 +1930,11 @@ def functionary_v1_v2_chat_handler(
logits_processor=logits_processor,
grammar=grammar,
)
if stream is False:
completion_or_completion_chunks["choices"][0]["text"] = completion_or_completion_chunks["choices"][0]["text"].lstrip()
return _convert_completion_to_chat(completion_or_completion_chunks, stream=stream) # type: ignore
assert stream is False # TODO: support stream mode
def get_grammar(function_call):
function_body = None
for function in functions or []:
@@ -1969,7 +1968,7 @@ def functionary_v1_v2_chat_handler(
return grammar
def create_completion(prompt, stop, grammar):
def create_completion(stop):
completion = cast(llama_types.Completion, llama.create_completion(
prompt=prompt,
temperature=temperature,
@@ -1977,7 +1976,7 @@ def functionary_v1_v2_chat_handler(
top_k=top_k,
min_p=min_p,
typical_p=typical_p,
stream=stream,
stream=False,
stop=stop,
max_tokens=max_tokens,
presence_penalty=presence_penalty,
@@ -1998,315 +1997,6 @@ def functionary_v1_v2_chat_handler(
function_calls, function_bodies = [], []
completion_tokens = 0
def generate_streaming(tools, functions, function_call, prompt):
assert version == "v2", "Streaming for v1 is not supported"
chunk_id, chunk_created = None, None
# If tool_choice/function_call is provided
if isinstance(function_call, dict):
prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
grammar = get_grammar(function_call["name"])
stops = [STOP_TOKEN, FROM_TOKEN]
tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)])
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
completion_text = ""
first = True
for chunk in completion:
# Yield the tool/function name first
if first:
if tools is not None:
func_call_dict = {
"tool_calls": [
{
"index": 0,
"id": "call_" + tool_id,
"type": "function",
"function": {"name": function_call["name"], "arguments": ""},
}
]
}
else:
func_call_dict = {"function_call": {"name": function_call["name"], "arguments": ""}}
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk["id"],
object="chat.completion.chunk",
created=chunk["created"],
model=chunk["model"],
choices=[
{"index": 0, "logprobs": None, "delta": {"role": None, "content": None, **func_call_dict}}
],
)
first = False
if tools is not None:
func_call_dict = {
"tool_calls": [
{
"index": 0,
"id": "call_" + tool_id,
"type": "function",
"function": {
"name": None,
"arguments": chunk["choices"][0]["text"].rstrip(),
},
}
]
}
else:
func_call_dict = {"function_call": {"name": None, "arguments": chunk["choices"][0]["text"].rstrip()}}
if len(chunk["choices"][0]["text"].rstrip()) > 0:
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk["id"],
object="chat.completion.chunk",
created=chunk["created"],
model=chunk["model"],
choices=[
{
"index": 0,
"logprobs": chunk["choices"][0]["logprobs"],
"delta": {
"role": None,
"content": None,
**func_call_dict,
},
}
],
)
# Yield tool_call/function_call stop message
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk["id"],
object="chat.completion.chunk",
created=chunk["created"],
model=chunk["model"],
choices=[
{
"index": 0,
"finish_reason": "tool_calls" if tools is not None else "function_call",
"logprobs": None,
"delta": {
"role": None, "content": None, "function_call": None, "tool_calls": None
},
}
],
)
# If "auto" or no tool_choice/function_call
elif isinstance(function_call, str) and function_call == "auto":
tool_index = 0
while True:
# Generate function name first
grammar = None
stops = CONTENT_TOKEN
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
completion_text = ""
for chunk in completion:
completion_text += chunk["choices"][0]["text"]
if chunk_id is None:
chunk_id = chunk["id"]
if chunk_created is None:
chunk_created = chunk["created"]
function_name = completion_text.strip()
if function_name == "all":
prompt += "all\n<|content|>"
# Yield the first empty message for content
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk_id,
model=chunk["model"],
created=chunk_created,
object="chat.completion.chunk",
choices=[
{
"index": 0,
"delta": {"role": "assistant", "content": ""},
"logprobs": None,
"finish_reason": None,
}
],
)
else:
prompt += f"{function_name}\n<|content|>"
grammar = get_grammar(function_name)
tool_id = "".join([random.choice(string.ascii_letters + string.digits) for _ in range(24)])
if tools is not None:
func_call_dict = {
"tool_calls": [
{
"index": tool_index,
"id": "call_" + tool_id,
"type": "function",
"function": {"name": function_name, "arguments": ""},
}
]
}
else:
func_call_dict = {"function_call": {"name": function_name, "arguments": ""}}
# Stream function name
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk_id,
object="chat.completion.chunk",
created=chunk_created,
model=chunk["model"],
choices=[
{
"index": 0,
"logprobs": chunk["choices"][0]["logprobs"],
"delta": {
"role": "assistant",
"content": None,
**func_call_dict,
},
}
],
)
# Generate content
stops = [RECIPIENT_TOKEN, STOP_TOKEN]
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
if function_name == "all":
completion_text = ""
stop_sequence, buffer, is_end = "\n<|from|>assistant\n<|recipient|>", [], False
for i, chunk in enumerate(completion):
completion_text += chunk["choices"][0]["text"]
if is_end:
buffer.append(chunk["choices"][0]["text"].strip(" "))
if stop_sequence.startswith("".join(buffer)):
continue
else:
buffer.pop()
while len(buffer) > 0:
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk_id,
object="chat.completion.chunk",
created=chunk_created,
model=chunk["model"],
choices=[
{
"index": 0,
"logprobs": chunk["choices"][0]["logprobs"],
"delta": {
"role": "assistant", "content": buffer.pop(0)
},
}
],
)
is_end = False
elif chunk["choices"][0]["text"] == "\n":
is_end = True
buffer.append(chunk["choices"][0]["text"].strip(" "))
continue
if len(buffer) == 0 and len(chunk["choices"][0]["text"]) > 0:
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk_id,
object="chat.completion.chunk",
created=chunk_created,
model=chunk["model"],
choices=[
{
"index": 0,
"logprobs": chunk["choices"][0]["logprobs"],
"delta": {
"role": "assistant",
"content": chunk["choices"][0]["text"] if i > 0 else chunk["choices"][0]["text"].lstrip()
},
}
],
)
# Check whether the model wants to generate another turn
if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
if completion_text.endswith("\n<|from|>assistant\n"):
cleaned_completion_text = completion_text[:-len("\n<|from|>assistant\n")].strip()
elif completion_text.endswith("\n<|from|> assistant\n"):
cleaned_completion_text = completion_text[:-len("\n<|from|> assistant\n")].strip()
else:
cleaned_completion_text = completion_text.strip()
prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
else:
# Yield stop message
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk_id,
model=chunk["model"],
created=chunk_created,
object="chat.completion.chunk",
choices=[
{
"index": 0,
"delta": {},
"logprobs": None,
"finish_reason": "stop",
}
],
)
break
else:
# Check whether the model wants to generate another turn
completion_text = ""
for chunk in completion:
completion_text += chunk["choices"][0]["text"]
if len(chunk["choices"][0]["text"].rstrip()) > 0:
if tools is not None:
func_call_dict = {
"tool_calls": [
{
"index": tool_index,
"id": "call_" + tool_id,
"type": "function",
"function": {
"name": None,
"arguments": chunk["choices"][0]["text"].rstrip(),
},
}
]
}
else:
func_call_dict = {"function_call": {"name": None, "arguments": chunk["choices"][0]["text"].rstrip()}}
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk_id,
object="chat.completion.chunk",
created=chunk_created,
model=chunk["model"],
choices=[
{
"index": 0,
"logprobs": chunk["choices"][0]["logprobs"],
"delta": {
"role": None,
"content": None,
**func_call_dict,
},
}
],
)
prompt += completion_text.strip()
grammar = None
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
completion_text += "".join([chunk["choices"][0]["text"] for chunk in completion])
if ("<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text) and tools is not None:
prompt += "\n<|from|>assistant\n<|recipient|>"
tool_index += 1
else:
# Yield tool_call/function_call stop message
yield llama_types.CreateChatCompletionStreamResponse(
id="chat" + chunk_id,
object="chat.completion.chunk",
created=chunk_created,
model=chunk["model"],
choices=[
{
"index": 0,
"finish_reason": "tool_calls" if tools is not None else "function_call",
"logprobs": None,
"delta": {
"role": None, "content": None, "function_call": None, "tool_calls": None
},
}
],
)
break
if stream is not False:
return generate_streaming(
tools=tools, functions=functions, function_call=function_call, prompt=prompt
)
else:
if version == "v1":
# If no or "auto" tool_choice/function_call
if isinstance(function_call, str) and function_call == "auto":
@@ -2322,7 +2012,7 @@ def functionary_v1_v2_chat_handler(
prompt = prompt
stops = ["\n", END_ASSISTANT_TOKEN]
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
completion_tokens += completion["usage"]["completion_tokens"]
@@ -2349,7 +2039,7 @@ def functionary_v1_v2_chat_handler(
completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()
)
grammar = get_grammar(function_calls[-1])
completion = create_completion(prompt=prompt, stop=END_FUNCTION_CALL_TOKEN, grammar=grammar)
completion = create_completion(stop=END_FUNCTION_CALL_TOKEN)
completion_tokens += completion["usage"]["completion_tokens"]
function_bodies.append(completion["choices"][0]["text"].strip())
# If the prompt involves a function call, just append generated parameters to function_bodies
@@ -2363,7 +2053,7 @@ def functionary_v1_v2_chat_handler(
function_calls.append(function_call)
grammar = get_grammar(function_call)
stops = [STOP_TOKEN, FROM_TOKEN]
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
completion_tokens += completion["usage"]["completion_tokens"]
function_bodies.append(completion_text.strip())
@@ -2373,7 +2063,7 @@ def functionary_v1_v2_chat_handler(
# Generate function name first
grammar = None
stops = CONTENT_TOKEN
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
completion_tokens += completion["usage"]["completion_tokens"]
function_name = completion_text.strip()
@@ -2386,7 +2076,7 @@ def functionary_v1_v2_chat_handler(
grammar = get_grammar(function_call)
# Generate content
stops = [RECIPIENT_TOKEN, STOP_TOKEN]
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
completion = create_completion(stop=stops)
completion_text = completion["choices"][0]["text"]
completion_tokens += completion["usage"]["completion_tokens"]
if function_name == "all":
@@ -2413,7 +2103,7 @@ def functionary_v1_v2_chat_handler(
# Check whether the model wants to generate another turn
prompt += completion_text.strip()
grammar = None
completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
completion = create_completion(stop=stops)
completion_tokens += completion["usage"]["completion_tokens"]
if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
prompt += "\n<|from|>assistant\n<|recipient|>"
@@ -2475,7 +2165,7 @@ def functionary_v1_v2_chat_handler(
class Llava15ChatHandler:
DEFAULT_SYSTEM_MESSAGE: Optional[str] = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
DEFAULT_SYSTEM_MESSAGE = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
CHAT_FORMAT = (
"{% for message in messages %}"
@@ -2598,31 +2288,18 @@ class Llava15ChatHandler:
assert self.clip_ctx is not None
system_prompt = _get_system_message(messages)
if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None:
if system_prompt == "":
messages = [llama_types.ChatCompletionRequestSystemMessage(role="system", content=self.DEFAULT_SYSTEM_MESSAGE)] + messages
image_urls = self.get_image_urls(messages)
template = ImmutableSandboxedEnvironment(
trim_blocks=True,
lstrip_blocks=True,
).from_string(self.CHAT_FORMAT)
text = template.render(
messages=messages,
add_generation_prompt=True,
eos_token=llama.detokenize([llama.token_eos()]),
bos_token=llama.detokenize([llama.token_bos()]),
)
template = jinja2.Template(self.CHAT_FORMAT)
text = template.render(messages=messages, add_generation_prompt=True)
split_text = self.split_text_on_image_urls(text, image_urls)
def embed_image_bytes(image_bytes: bytes):
if self._last_image_embed is not None and self._last_image_hash is not None and hash(image_bytes) == self._last_image_hash:
return self._last_image_embed
with suppress_stdout_stderr(disable=self.verbose):
# Free the previous image embed
if self._last_image_embed is not None:
self._llava_cpp.llava_image_embed_free(self._last_image_embed)
self._last_image_embed = None
self._last_image_hash = None
embed = (
self._llava_cpp.llava_image_embed_make_with_bytes(
self.clip_ctx,
@@ -2637,10 +2314,9 @@ class Llava15ChatHandler:
# Evaluate prompt
llama.reset()
llama._ctx.kv_cache_clear()
for type_, value in split_text:
for i, (type_, value) in enumerate(split_text):
if type_ == "text":
tokens = llama.tokenize(value.encode("utf8"), add_bos=False, special=True)
tokens = llama.tokenize(value.encode("utf8"), add_bos=i == 0)
if llama.n_tokens + len(tokens) > llama.n_ctx():
raise ValueError("Prompt exceeds n_ctx") # TODO: Fix
llama.eval(tokens)
@@ -2658,8 +2334,6 @@ class Llava15ChatHandler:
llama.n_batch,
n_past_p,
)
# Required to avoid issues with hf tokenizer
llama.input_ids[llama.n_tokens : n_past.value] = -1
llama.n_tokens = n_past.value
# Get prompt tokens to avoid a cache miss
@@ -3049,7 +2723,6 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
# Answer the question<|im_end|><|im_start|>user
# <image>
# What is the picture about?<|im_end|><|im_start|>assistant
DEFAULT_SYSTEM_MESSAGE = "Answer the question"
CHAT_FORMAT = (
"{% for message in messages %}"
@@ -3098,66 +2771,6 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
"{% endif %}"
)
class Llama3VisionAlpha(Llava15ChatHandler):
# question = "<image>" + q
# prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
DEFAULT_SYSTEM_MESSAGE = None
CHAT_FORMAT = (
"{% for message in messages %}"
"<|start_header_id|>"
"{% if message.role == 'user' %}"
"user<|end_header_id|>\n\n"
"{% if message.content is iterable %}"
# <image>
"{% for content in message.content %}"
"{% if content.type == 'image_url' %}"
"{% if content.image_url is string %}"
"{{ content.image_url }}"
"{% endif %}"
"{% if content.image_url is mapping %}"
"{{ content.image_url.url }}"
"{% endif %}"
"{% endif %}"
"{% endfor %}"
# Question:
"{% for content in message.content %}"
"{% if content.type == 'text' %}"
"{{ content.text }}"
"{% endif %}"
"{% endfor %}"
"{% endif %}"
# Question:
"{% if message.content is string %}"
"{{ message.content }}"
"{% endif %}"
"{% endif %}"
# Answer:
"{% if message.role == 'assistant' %}"
"assistant<|end_header_id|>\n\n"
"{{ message.content }}"
"{% endif %}"
"<|eot_id|>"
"{% endfor %}"
# Generation prompt
"{% if add_generation_prompt %}"
"<|start_header_id|>assistant<|end_header_id|>\n\n"
"{% endif %}"
)
@register_chat_completion_handler("chatml-function-calling")
def chatml_function_calling(
@@ -3245,7 +2858,8 @@ def chatml_function_calling(
"{% endfor %}"
"{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
)
template_renderer = ImmutableSandboxedEnvironment(
template_renderer = jinja2.Environment(
loader=jinja2.BaseLoader(),
autoescape=jinja2.select_autoescape(["html", "xml"]),
undefined=jinja2.StrictUndefined,
).from_string(function_calling_template)

View file

@@ -294,11 +294,6 @@ LLAMA_VOCAB_TYPE_WPM = 3
# LLAMA_VOCAB_PRE_TYPE_MPT = 5,
# LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
# LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
# LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
# LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
# LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10,
# LLAMA_VOCAB_PRE_TYPE_OLMO = 11,
# LLAMA_VOCAB_PRE_TYPE_DBRX = 12,
# };
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -308,11 +303,6 @@ LLAMA_VOCAB_PRE_TYPE_FALCON = 4
LLAMA_VOCAB_PRE_TYPE_MPT = 5
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7
LLAMA_VOCAB_PRE_TYPE_REFACT = 8
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10
LLAMA_VOCAB_PRE_TYPE_OLMO = 11
LLAMA_VOCAB_PRE_TYPE_DBRX = 12
# // note: these values should be synchronized with ggml_rope
@@ -381,7 +371,6 @@ LLAMA_TOKEN_TYPE_BYTE = 6
# LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
# LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
# LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
# };
@@ -414,8 +403,6 @@ LLAMA_FTYPE_MOSTLY_IQ3_M = 27
LLAMA_FTYPE_MOSTLY_IQ2_S = 28
LLAMA_FTYPE_MOSTLY_IQ2_M = 29
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30
LLAMA_FTYPE_MOSTLY_IQ1_M = 31
LLAMA_FTYPE_MOSTLY_BF16 = 32
LLAMA_FTYPE_GUESSED = 1024
# enum llama_rope_scaling_type {
@@ -507,7 +494,7 @@ class llama_token_data_array(ctypes.Structure):
llama_token_data_array_p = ctypes.POINTER(llama_token_data_array)
# typedef bool (*llama_progress_callback)(float progress, void * user_data);
# typedef bool (*llama_progress_callback)(float progress, void *ctx);
llama_progress_callback = ctypes.CFUNCTYPE(
ctypes.c_bool, ctypes.c_float, ctypes.c_void_p
)
@@ -648,9 +635,6 @@ class llama_model_kv_override(ctypes.Structure):
# // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
# const float * tensor_split;
# // comma separated list of RPC servers to use for offloading
# const char * rpc_servers;
# // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
# // If the provided progress_callback returns true, model loading continues.
# // If it returns false, model loading is immediately aborted.
@@ -677,7 +661,6 @@ class llama_model_params(ctypes.Structure):
split_mode (int): how to split the model across multiple GPUs
main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
tensor_split (ctypes.Array[ctypes.ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
rpc_servers (ctypes.c_char_p): comma separated list of RPC servers to use for offloading
progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
progress_callback_user_data (ctypes.ctypes.c_void_p): context pointer passed to the progress callback
kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
@@ -691,7 +674,6 @@ class llama_model_params(ctypes.Structure):
split_mode: int
main_gpu: int
tensor_split: CtypesArray[ctypes.c_float]
rpc_servers: ctypes.c_char_p
progress_callback: Callable[[float, ctypes.c_void_p], bool]
progress_callback_user_data: ctypes.c_void_p
kv_overrides: CtypesArray[llama_model_kv_override]
@@ -705,7 +687,6 @@ class llama_model_params(ctypes.Structure):
("split_mode", ctypes.c_int),
("main_gpu", ctypes.c_int32),
("tensor_split", ctypes.POINTER(ctypes.c_float)),
("rpc_servers", ctypes.c_char_p),
("progress_callback", llama_progress_callback),
("progress_callback_user_data", ctypes.c_void_p),
("kv_overrides", ctypes.POINTER(llama_model_kv_override)),

View file

@@ -132,7 +132,6 @@ def create_app(
middleware=middleware,
title="🦙 llama.cpp Python API",
version=llama_cpp.__version__,
root_path=server_settings.root_path,
)
app.add_middleware(
CORSMiddleware,
@@ -275,7 +274,6 @@ async def create_completion(
"best_of",
"logit_bias_type",
"user",
"min_tokens",
}
kwargs = body.model_dump(exclude=exclude)
@@ -289,15 +287,6 @@ async def create_completion(
if body.grammar is not None:
kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
if body.min_tokens > 0:
_min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
[llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
)
if "logits_processor" not in kwargs:
kwargs["logits_processor"] = _min_tokens_logits_processor
else:
kwargs["logits_processor"].extend(_min_tokens_logits_processor)
iterator_or_completion: Union[
llama_cpp.CreateCompletionResponse,
Iterator[llama_cpp.CreateCompletionStreamResponse],
@@ -455,7 +444,6 @@ async def create_chat_completion(
"n",
"logit_bias_type",
"user",
"min_tokens",
}
kwargs = body.model_dump(exclude=exclude)
llama = llama_proxy(body.model)
@@ -469,15 +457,6 @@ async def create_chat_completion(
if body.grammar is not None:
kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
if body.min_tokens > 0:
_min_tokens_logits_processor = llama_cpp.LogitsProcessorList(
[llama_cpp.MinTokensLogitsProcessor(body.min_tokens, llama.token_eos())]
)
if "logits_processor" not in kwargs:
kwargs["logits_processor"] = _min_tokens_logits_processor
else:
kwargs["logits_processor"].extend(_min_tokens_logits_processor)
iterator_or_completion: Union[
llama_cpp.ChatCompletion, Iterator[llama_cpp.ChatCompletionChunk]
] = await run_in_threadpool(llama.create_chat_completion, **kwargs)

View file

@@ -140,20 +140,6 @@ class LlamaProxy:
chat_handler = llama_cpp.llama_chat_format.NanoLlavaChatHandler(
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
elif settings.chat_format == "llama-3-vision-alpha":
assert settings.clip_model_path is not None, "clip model not found"
if settings.hf_model_repo_id is not None:
chat_handler = (
llama_cpp.llama_chat_format.Llama3VisionAlpha.from_pretrained(
repo_id=settings.hf_model_repo_id,
filename=settings.clip_model_path,
verbose=settings.verbose,
)
)
else:
chat_handler = llama_cpp.llama_chat_format.Llama3VisionAlpha(
clip_model_path=settings.clip_model_path, verbose=settings.verbose
)
elif settings.chat_format == "hf-autotokenizer":
assert (
settings.hf_pretrained_model_name_or_path is not None
@@ -242,7 +228,6 @@ class LlamaProxy:
logits_all=settings.logits_all,
embedding=settings.embedding,
offload_kqv=settings.offload_kqv,
flash_attn=settings.flash_attn,
# Sampling Params
last_n_tokens_size=settings.last_n_tokens_size,
# LoRA Params

View file

@@ -215,10 +215,6 @@ class ServerSettings(BaseSettings):
default=False,
description="Disable EventSource pings (may be needed for some clients).",
)
root_path: str = Field(
default="",
description="The root path for the server. Useful when running behind a reverse proxy.",
)
class Settings(ServerSettings, ModelSettings):

View file

@@ -16,14 +16,10 @@ max_tokens_field = Field(
default=16, ge=1, description="The maximum number of tokens to generate."
)
min_tokens_field = Field(
default=0,
ge=0,
description="The minimum number of tokens to generate. It may return fewer tokens if another condition is met (e.g. max_tokens, stop).",
)
temperature_field = Field(
default=0.8,
ge=0.0,
le=2.0,
description="Adjust the randomness of the generated text.\n\n"
+ "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.",
)
@@ -117,7 +113,6 @@ class CreateCompletionRequest(BaseModel):
max_tokens: Optional[int] = Field(
default=16, ge=0, description="The maximum number of tokens to generate."
)
min_tokens: int = min_tokens_field
temperature: float = temperature_field
top_p: float = top_p_field
min_p: float = min_p_field
@@ -213,7 +208,6 @@ class CreateChatCompletionRequest(BaseModel):
default=None,
description="The maximum number of tokens to generate. Defaults to inf",
)
min_tokens: int = min_tokens_field
logprobs: Optional[bool] = Field(
default=False,
description="Whether to output the logprobs or not. Default is True"

View file

@@ -44,9 +44,7 @@ releases=$(echo $releases | tr ' ' '\n' | grep -E $pattern)
# For each release, get all assets
for release in $releases; do
assets=$(curl -s https://api.github.com/repos/abetlen/llama-cpp-python/releases/tags/$release | jq -r .assets)
# Get release version from release ie v0.1.0-cu121 -> v0.1.0
release_version=$(echo $release | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+")
echo " <h2>$release_version</h2>" >> index.html
echo " <h2>$release</h2>" >> index.html
for asset in $(echo $assets | jq -r .[].browser_download_url); do
if [[ $asset == *".whl" ]]; then
echo " <a href=\"$asset\">$asset</a>" >> index.html

vendor/llama.cpp (vendored submodule, 2 changes)

@@ -1 +1 @@
Subproject commit 05834841dcb4f922983ea976539c70472272df9a
Subproject commit f364eb6fb5d46118a76fa045f487318de4c24961