diff --git a/.gitmodules b/.gitmodules
index 6267b09..7edf097 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,3 @@
 [submodule "vendor/llama.cpp"]
 	path = vendor/llama.cpp
-	url = git@github.com:ggerganov/llama.cpp.git
+	url = https://github.com/ggerganov/llama.cpp.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..6eb04cd
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,12 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+### Added
+
+- Added first version of the changelog
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bda2388..16932b1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,4 +28,4 @@ else()
         LIBRARY DESTINATION llama_cpp
         RUNTIME DESTINATION llama_cpp
     )
-endif(UNIX)
+endif()
diff --git a/README.md b/README.md
index ae633f4..7487345 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,8 @@ This package provides:
   - OpenAI-like API
   - LangChain compatibility
 
+Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python).
+
 ## Installation from PyPI (recommended)
 
 Install from PyPI (requires a c compiler):
@@ -26,6 +28,18 @@ pip install llama-cpp-python
 The above command will attempt to install the package and build build `llama.cpp` from source.
 This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system.
 
+If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different  compiler options, please add the following flags to ensure that the package is rebuilt correctly:
+
+```bash
+pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir
+```
+
+Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example:
+```
+wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
+bash Miniforge3-MacOSX-arm64.sh
+```
+Otherwise, while installing it will build the llama.ccp x86 version which will be 10x slower on Apple Silicon (M1) Mac.
 
 ### Installation with OpenBLAS / cuBLAS / CLBlast
 
@@ -35,19 +49,19 @@ Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and ins
 To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing:
 
 ```bash
-LLAMA_OPENBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
 
 To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing:
 
 ```bash
-LLAMA_CUBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
 
 To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing:
 
 ```bash
-LLAMA_CLBLAST=1 FORCE_CMAKE=1 pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
 
 
@@ -102,7 +116,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the
 A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:
 
 ```bash
-docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
+docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
 ```
 
 ## Low-level API
@@ -120,7 +134,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize
 >>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params)
 >>> max_tokens = params.n_ctx
 # use ctypes arrays for array params
->>> tokens = (llama_cppp.llama_token * int(max_tokens))()
+>>> tokens = (llama_cpp.llama_token * int(max_tokens))()
 >>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True))
 >>> llama_cpp.llama_free(ctx)
 ```
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 0000000..f0ef5f7
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,51 @@
+# Define the image argument and provide a default value
+ARG IMAGE=python:3-slim-bullseye
+
+# Use the image as specified
+FROM ${IMAGE}
+
+# Re-declare the ARG after FROM
+ARG IMAGE
+
+# Update and upgrade the existing packages 
+RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
+    python3 \
+    python3-pip \
+    ninja-build \
+    build-essential
+
+RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette
+
+# Perform the conditional installations based on the image
+RUN echo "Image: ${IMAGE}" && \
+    if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \
+    echo "OpenBLAS install:" && \
+    apt-get install -y --no-install-recommends libopenblas-dev && \
+    LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \
+else \
+    echo "CuBLAS install:" && \
+    LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \
+fi
+
+# Clean up apt cache
+RUN rm -rf /var/lib/apt/lists/*
+
+# Set a working directory for better clarity
+WORKDIR /app
+
+# Copy files to the app directory
+RUN echo "Installing model...this can take some time..."
+COPY ./model.bin /app/model.bin
+COPY ./start_server.sh /app/start_server.sh
+
+# Make the server start script executable
+RUN chmod +x /app/start_server.sh
+
+# Set environment variable for the host
+ENV HOST=0.0.0.0
+
+# Expose a port for the server
+EXPOSE 8000
+
+# Run the server start script
+CMD ["/bin/sh", "/app/start_server.sh"]
diff --git a/Dockerfile.cuda b/docker/Dockerfile.cuda_simple
similarity index 78%
rename from Dockerfile.cuda
rename to docker/Dockerfile.cuda_simple
index a852f3c..dda7a9f 100644
--- a/Dockerfile.cuda
+++ b/docker/Dockerfile.cuda_simple
@@ -1,4 +1,5 @@
-FROM nvidia/cuda:12.1.1-devel-ubuntu20.04
+ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
+FROM ${CUDA_IMAGE}
 
 # We need to set the host to 0.0.0.0 to allow outside access
 ENV HOST 0.0.0.0
@@ -12,4 +13,4 @@ RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fa
 RUN LLAMA_CUBLAS=1 python3 setup.py develop
 
 # Run the server
-CMD python3 -m llama_cpp.server
\ No newline at end of file
+CMD python3 -m llama_cpp.server
diff --git a/Dockerfile b/docker/Dockerfile.openblas_simple
similarity index 100%
rename from Dockerfile
rename to docker/Dockerfile.openblas_simple
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 0000000..100bcbd
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,46 @@
+# Dockerfiles for building the llama-cpp-python server
+- `Dockerfile.openblas_simple` - a simple Dockerfile for non-GPU OpenBLAS
+- `Dockerfile.cuda_simple` - a simple Dockerfile for CUDA accelerated CuBLAS
+- `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke]( https://huggingface.co/TheBloke)
+- `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`
+ 
+# Get model from Hugging Face
+`python3 ./hug_model.py`
+
+You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
+```
+docker $ ls -lh *.bin
+-rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>.q5_1.bin
+lrwxrwxrwx 1 user user   24 May 23 18:30 model.bin -> <downloaded-model-file>.q5_1.bin
+```
+**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least
+**TWICE** as much disk space as the size of the model:
+
+| Model |  Quantized size |
+|------:|----------------:|
+|    7B |            5 GB |
+|   13B |           10 GB |
+|   30B |           25 GB |
+|   65B |           50 GB |
+
+**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
+
+# Install Docker Server
+
+**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
+
+[Install Docker Engine](https://docs.docker.com/engine/install)
+
+# Use OpenBLAS
+Use if you don't have a NVidia GPU. Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS:
+## Build:
+`docker build --build-arg -t openblas .`
+## Run:
+`docker run --cap-add SYS_RESOURCE -t openblas`
+
+# Use CuBLAS
+Requires a NVidia GPU with sufficient VRAM (approximately as much as the size above) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
+## Build:
+`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .`
+## Run:
+`docker run --cap-add SYS_RESOURCE -t cublas`
diff --git a/docker/hug_model.py b/docker/hug_model.py
new file mode 100644
index 0000000..848a1aa
--- /dev/null
+++ b/docker/hug_model.py
@@ -0,0 +1,116 @@
+import requests
+import json
+import os
+import struct
+
+def make_request(url, params=None):
+    print(f"Making request to {url}...")
+    response = requests.get(url, params=params)
+    if response.status_code == 200:
+        return json.loads(response.text)
+    else:
+        print(f"Request failed with status code {response.status_code}")
+        return None
+
+def check_magic_and_version(filename):
+    with open(filename, 'rb') as f:
+        # Read the first 6 bytes from the file
+        data = f.read(6)
+
+    # Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int
+    # and the next 2 bytes as a little-endian unsigned short
+    magic, version = struct.unpack('<I H', data)
+
+    print(f"magic: 0x{magic:08x}, version: 0x{version:04x}, file: {filename}")
+
+    return magic, version
+
+def download_file(url, destination):
+    print(f"Downloading {url} to {destination}...")
+    response = requests.get(url, stream=True)
+    if response.status_code == 200:
+        with open(destination, 'wb') as f:
+            total_downloaded = 0
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:  # filter out keep-alive new chunks
+                    f.write(chunk)
+                    total_downloaded += len(chunk)
+                    if total_downloaded >= 10485760:  # 10 MB
+                        print('.', end='', flush=True)
+                        total_downloaded = 0
+        print("\nDownload complete.")
+        
+        # Creating a symbolic link from destination to "model.bin"
+        if os.path.isfile("model.bin"):
+            os.remove("model.bin")  # remove the existing link if any
+        os.symlink(destination, "model.bin")
+    else:
+        print(f"Download failed with status code {response.status_code}")
+
+def get_user_choice(model_list):
+    # Print the enumerated list
+    print("\n")
+    for i, (model_id, rfilename) in enumerate(model_list):
+        print(f"{i+1}: Model ID: {model_id}, RFilename: {rfilename}")
+
+    # Get user's choice
+    choice = input("Choose a model to download by entering the corresponding number: ")
+    try:
+        index = int(choice) - 1
+        if 0 <= index < len(model_list):
+            # Return the chosen model
+            return model_list[index]
+        else:
+            print("Invalid choice.")
+    except ValueError:
+        print("Invalid input. Please enter a number corresponding to a model.")
+    except IndexError:
+        print("Invalid choice. Index out of range.")
+    
+    return None
+
+import argparse
+
+def main():
+    # Create an argument parser
+    parser = argparse.ArgumentParser(description='Process the model version.')
+    parser.add_argument('-v', '--version', type=int, default=0x0003,
+                        help='an integer for the version to be used')
+
+    # Parse the arguments
+    args = parser.parse_args()
+
+    # Define the parameters
+    params = {
+        "author": "TheBloke",  # Filter by author
+        "tags": "llama"
+    }
+
+    models = make_request('https://huggingface.co/api/models', params=params)
+    if models is None:
+        return
+
+    model_list = []
+    # Iterate over the models
+    for model in models:
+        model_id = model['id']
+        model_info = make_request(f'https://huggingface.co/api/models/{model_id}')
+        if model_info is None:
+            continue
+
+        for sibling in model_info.get('siblings', []):
+            rfilename = sibling.get('rfilename')
+            if rfilename and 'q5_1' in rfilename:
+                model_list.append((model_id, rfilename))
+
+    model_choice = get_user_choice(model_list)
+    if model_choice is not None:
+        model_id, rfilename = model_choice
+        url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
+        download_file(url, rfilename)
+        _, version = check_magic_and_version(rfilename)
+        if version != args.version:
+            print(f"Warning: Expected version {args.version}, but found different version in the file.")
+
+if __name__ == '__main__':
+    main()
diff --git a/docker/start_server.sh b/docker/start_server.sh
new file mode 100755
index 0000000..176bd87
--- /dev/null
+++ b/docker/start_server.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+# For mmap support
+ulimit -l unlimited
+
+if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
+    python3 -B -m llama_cpp.server --model /app/model.bin
+else
+    # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM
+    python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000
+fi
diff --git a/docs/index.md b/docs/index.md
index c36adff..99b1f59 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -112,8 +112,12 @@ python3 setup.py develop
         show_root_heading: true
 
 ::: llama_cpp.LlamaCache
+    options:
+        show_root_heading: true
 
 ::: llama_cpp.LlamaState
+    options:
+        show_root_heading: true
 
 ::: llama_cpp.llama_cpp
     options:
diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py
index 8773cb1..f5d51a3 100644
--- a/examples/low_level_api/low_level_api_chat_cpp.py
+++ b/examples/low_level_api/low_level_api_chat_cpp.py
@@ -368,10 +368,10 @@ n_keep = {self.params.n_keep}
 						id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu))
 					else:
 						# Temperature sampling
-						llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k)
-						llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z))
-						llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p))
-						llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p))
+						llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1))
+						llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z), min_keep=llama_cpp.c_size_t(1))
+						llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p), min_keep=llama_cpp.c_size_t(1))
+						llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p), min_keep=llama_cpp.c_size_t(1))
 						llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
 						id = llama_cpp.llama_sample_token(self.ctx, candidates_p)
 				# print("`{}`".format(candidates_p.size))
@@ -382,12 +382,15 @@ n_keep = {self.params.n_keep}
 				# replace end of text token with newline token when in interactive mode
 				if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct):
 					id = self.llama_token_newline[0]
+					self.embd.append(id)
 					if (self.use_antiprompt()):
 						# tokenize and inject first reverse prompt
 						self.embd_inp += self.first_antiprompt[0]
-
-				# add it to the context
-				self.embd.append(id)
+						for id in self.first_antiprompt[0]:
+							self.embd.append(id)
+				else:
+					# add it to the context
+					self.embd.append(id)
 
 				# echo this to console
 				self.output_echo = True
@@ -493,7 +496,7 @@ n_keep = {self.params.n_keep}
 			# Contains multi-byte UTF8
 			for num, pattern in [(2, 192), (3, 224), (4, 240)]:
 				# Bitwise AND check
-				if pattern & int.from_bytes(cur_char) == pattern:
+				if pattern & int.from_bytes(cur_char, 'little') == pattern:
 					self.multibyte_fix = [cur_char] + ([None] * (num-1))
 
 			# Stop incomplete bytes from passing
diff --git a/examples/notebooks/Guidance.ipynb b/examples/notebooks/Guidance.ipynb
new file mode 100644
index 0000000..045856e
--- /dev/null
+++ b/examples/notebooks/Guidance.ipynb
@@ -0,0 +1,89 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div id=\"guidance-stop-button-faa51639-e0a4-43c6-a6d4-d5853c2ec764\" style=\"cursor: pointer; margin: 0px; display: none; float: right; padding: 3px; border-radius: 4px 4px 4px 4px; border: 0px solid rgba(127, 127, 127, 1); padding-left: 10px; padding-right: 10px; font-size: 13px; background-color: rgba(127, 127, 127, 0.25);\">Stop program</div><div id=\"guidance-content-faa51639-e0a4-43c6-a6d4-d5853c2ec764\"><pre style='margin: 0px; padding: 0px; padding-left: 8px; margin-left: -8px; border-radius: 0px; border-left: 1px solid rgba(127, 127, 127, 0.2); white-space: pre-wrap; font-family: ColfaxAI, Arial; font-size: 15px; line-height: 23px;'>Tweak this proverb to apply to model instructions instead.\n",
+       "\n",
+       "<span style='background-color: rgba(0, 138.56128016, 250.76166089, 0.25); display: inline;' title='{{proverb}}'>Where there is no guidance, a people falls,\n",
+       "but in an abundance of counselors there is safety.</span>\n",
+       "- <span style='background-color: rgba(0, 138.56128016, 250.76166089, 0.25); display: inline;' title='{{book}}'>Proverbs</span> <span style='background-color: rgba(0, 138.56128016, 250.76166089, 0.25); display: inline;' title='{{chapter}}'>11</span>:<span style='background-color: rgba(0, 138.56128016, 250.76166089, 0.25); display: inline;' title='{{verse}}'>14</span>\n",
+       "\n",
+       "UPDATED\n",
+       "Where there is no guidance<span style='background-color: rgba(0, 165, 0, 0.25); opacity: 1.0; display: inline;' title='{{gen &#x27;rewrite&#x27; stop=&quot;\\n-&quot;}}'> for assembling a model, people will struggle,\n",
+       "but with clear instructions, the process becomes safe and successful.</span>\n",
+       "- GPT <span style='background-color: rgba(0, 165, 0, 0.25); opacity: 1.0; display: inline;' title='{{gen &#x27;chapter&#x27;}}'>2 (updated)</span>:<span style='background-color: rgba(0, 165, 0, 0.25); opacity: 1.0; display: inline;' title='{{gen &#x27;verse&#x27;}}'> Proverbs 11:14</span></pre></div>\n",
+       "<script type=\"text/javascript\">(()=>{var t={296:(t,e,n)=>{var i=NaN,o=\"[object Symbol]\",r=/^\\s+|\\s+$/g,a=/^[-+]0x[0-9a-f]+$/i,s=/^0b[01]+$/i,c=/^0o[0-7]+$/i,d=parseInt,u=\"object\"==typeof n.g&&n.g&&n.g.Object===Object&&n.g,l=\"object\"==typeof self&&self&&self.Object===Object&&self,f=u||l||Function(\"return this\")(),h=Object.prototype.toString,p=Math.max,m=Math.min,g=function(){return f.Date.now()};function b(t){var e=typeof t;return!!t&&(\"object\"==e||\"function\"==e)}function y(t){if(\"number\"==typeof t)return t;if(function(t){return\"symbol\"==typeof t||function(t){return!!t&&\"object\"==typeof t}(t)&&h.call(t)==o}(t))return i;if(b(t)){var e=\"function\"==typeof t.valueOf?t.valueOf():t;t=b(e)?e+\"\":e}if(\"string\"!=typeof t)return 0===t?t:+t;t=t.replace(r,\"\");var n=s.test(t);return n||c.test(t)?d(t.slice(2),n?2:8):a.test(t)?i:+t}t.exports=function(t,e,n){var i,o,r,a,s,c,d=0,u=!1,l=!1,f=!0;if(\"function\"!=typeof t)throw new TypeError(\"Expected a function\");function h(e){var n=i,r=o;return i=o=void 0,d=e,a=t.apply(r,n)}function v(t){var n=t-c;return void 0===c||n>=e||n<0||l&&t-d>=r}function _(){var t=g();if(v(t))return w(t);s=setTimeout(_,function(t){var n=e-(t-c);return l?m(n,r-(t-d)):n}(t))}function w(t){return s=void 0,f&&i?h(t):(i=o=void 0,a)}function j(){var t=g(),n=v(t);if(i=arguments,o=this,c=t,n){if(void 0===s)return function(t){return d=t,s=setTimeout(_,e),u?h(t):a}(c);if(l)return s=setTimeout(_,e),h(c)}return void 0===s&&(s=setTimeout(_,e)),a}return e=y(e)||0,b(n)&&(u=!!n.leading,r=(l=\"maxWait\"in n)?p(y(n.maxWait)||0,e):r,f=\"trailing\"in n?!!n.trailing:f),j.cancel=function(){void 0!==s&&clearTimeout(s),d=0,i=c=o=s=void 0},j.flush=function(){return void 0===s?a:w(g())},j}},777:t=>{var e,n,i=Math.max,o=(e=function(t,e){return function(t,e,n){if(\"function\"!=typeof t)throw new TypeError(\"Expected a function\");return setTimeout((function(){t.apply(void 0,n)}),1)}(t,0,e)},n=i(void 0===n?e.length-1:n,0),function(){for(var t=arguments,o=-1,r=i(t.length-n,0),a=Array(r);++o<r;)a[o]=t[n+o];o=-1;for(var s=Array(n+1);++o<n;)s[o]=t[o];return s[n]=a,function(t,e,n){switch(n.length){case 0:return t.call(e);case 1:return t.call(e,n[0]);case 2:return t.call(e,n[0],n[1]);case 3:return t.call(e,n[0],n[1],n[2])}return t.apply(e,n)}(e,this,s)});t.exports=o}},e={};function n(i){var o=e[i];if(void 0!==o)return o.exports;var r=e[i]={exports:{}};return t[i](r,r.exports,n),r.exports}n.n=t=>{var e=t&&t.__esModule?()=>t.default:()=>t;return n.d(e,{a:e}),e},n.d=(t,e)=>{for(var i in e)n.o(e,i)&&!n.o(t,i)&&Object.defineProperty(t,i,{enumerable:!0,get:e[i]})},n.g=function(){if(\"object\"==typeof globalThis)return globalThis;try{return this||new Function(\"return this\")()}catch(t){if(\"object\"==typeof window)return window}}(),n.o=(t,e)=>Object.prototype.hasOwnProperty.call(t,e),(()=>{\"use strict\";const t=t=>{const e=new Set;do{for(const n of Reflect.ownKeys(t))e.add([t,n])}while((t=Reflect.getPrototypeOf(t))&&t!==Object.prototype);return e};function e(e,{include:n,exclude:i}={}){const o=t=>{const e=e=>\"string\"==typeof e?t===e:e.test(t);return n?n.some(e):!i||!i.some(e)};for(const[n,i]of t(e.constructor.prototype)){if(\"constructor\"===i||!o(i))continue;const t=Reflect.getOwnPropertyDescriptor(n,i);t&&\"function\"==typeof t.value&&(e[i]=e[i].bind(e))}return e}var i=n(777),o=n.n(i),r=n(296),a=n.n(r);class s{constructor(t,n){e(this),this.interfaceId=t,this.callbackMap={},this.data={},this.pendingData={},this.jcomm=new c(\"guidance_interface_target_\"+this.interfaceId,this.updateData,\"open\"),this.debouncedSendPendingData500=a()(this.sendPendingData,500),this.debouncedSendPendingData1000=a()(this.sendPendingData,1e3),n&&o()(n)}send(t,e){this.addPendingData(t,e),this.sendPendingData()}sendEvent(t){for(const e of Object.keys(t))this.addPendingData(e,t[e]);this.sendPendingData()}debouncedSendEvent500(t){for(const e of Object.keys(t))this.addPendingData(e,t[e]);this.debouncedSendPendingData500()}debouncedSend500(t,e){this.addPendingData(t,e),this.debouncedSendPendingData500()}debouncedSend1000(t,e){this.addPendingData(t,e),this.debouncedSendPendingData1000()}addPendingData(t,e){Array.isArray(t)||(t=[t]);for(const n in t)this.pendingData[t[n]]=e}updateData(t){t=JSON.parse(t.data);for(const e in t)this.data[e]=t[e];for(const e in t)e in this.callbackMap&&this.callbackMap[e](this.data[e])}subscribe(t,e){this.callbackMap[t]=e,o()((e=>this.callbackMap[t](this.data[t])))}sendPendingData(){this.jcomm.send_data(this.pendingData),this.pendingData={}}}class c{constructor(t,e,n=\"open\"){this._fire_callback=this._fire_callback.bind(this),this._register=this._register.bind(this),this.jcomm=void 0,this.callback=e,void 0!==window.Jupyter?\"register\"===n?Jupyter.notebook.kernel.comm_manager.register_target(t,this._register):(this.jcomm=Jupyter.notebook.kernel.comm_manager.new_comm(t),this.jcomm.on_msg(this._fire_callback)):void 0!==window._mgr&&(\"register\"===n?window._mgr.widgetManager.proxyKernel.registerCommTarget(t,this._register):(this.jcomm=window._mgr.widgetManager.proxyKernel.createComm(t),this.jcomm.open({},\"\"),this.jcomm.onMsg=this._fire_callback))}send_data(t){void 0!==this.jcomm?this.jcomm.send(t):console.error(\"Jupyter comm module not yet loaded! So we can't send the message.\")}_register(t,e){this.jcomm=t,this.jcomm.on_msg(this._fire_callback)}_fire_callback(t){this.callback(t.content.data)}}class d{constructor(t,n){e(this),this.id=t,this.comm=new s(t),this.comm.subscribe(\"append\",this.appendData),this.comm.subscribe(\"replace\",this.replaceData),this.comm.subscribe(\"event\",this.eventOccurred),this.element=document.getElementById(\"guidance-content-\"+t),this.stop_button=document.getElementById(\"guidance-stop-button-\"+t),this.stop_button.onclick=()=>this.comm.send(\"event\",\"stop\")}appendData(t){t&&(this.stop_button.style.display=\"inline-block\",this.element.innerHTML+=t)}replaceData(t){t&&(this.stop_button.style.display=\"inline-block\",this.element.innerHTML=t)}eventOccurred(t){\"complete\"===t&&(this.stop_button.style.display=\"none\")}}window._guidanceDisplay=function(t,e){return new d(t,e)}})()})();; window._guidanceDisplay(\"faa51639-e0a4-43c6-a6d4-d5853c2ec764\");</script>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "os.environ[\"OPENAI_API_KEY\"] = \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n",
+    "os.environ[\"OPENAI_API_BASE\"] = \"http://100.64.159.73:8000/v1\"\n",
+    "os.environ[\"OPENAI_API_HOST\"] = \"http://100.64.159.73:8000\"\n",
+    "\n",
+    "import guidance\n",
+    "\n",
+    "# set the default language model used to execute guidance programs\n",
+    "guidance.llm = guidance.llms.OpenAI(\"text-davinci-003\", caching=False)\n",
+    "\n",
+    "# define a guidance program that adapts a proverb\n",
+    "program = guidance(\"\"\"Tweak this proverb to apply to model instructions instead.\n",
+    "\n",
+    "{{proverb}}\n",
+    "- {{book}} {{chapter}}:{{verse}}\n",
+    "\n",
+    "UPDATED\n",
+    "Where there is no guidance{{gen 'rewrite' stop=\"\\\\n-\"}}\n",
+    "- GPT {{gen 'chapter'}}:{{gen 'verse'}}\"\"\")\n",
+    "\n",
+    "# execute the program on a specific proverb\n",
+    "executed_program = program(\n",
+    "    proverb=\"Where there is no guidance, a people falls,\\nbut in an abundance of counselors there is safety.\",\n",
+    "    book=\"Proverbs\",\n",
+    "    chapter=11,\n",
+    "    verse=14\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 7be51e1..012bb86 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -4,7 +4,17 @@ import uuid
 import time
 import math
 import multiprocessing
-from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple
+from typing import (
+    List,
+    Optional,
+    Union,
+    Generator,
+    Sequence,
+    Iterator,
+    Deque,
+    Tuple,
+    Callable,
+)
 from collections import deque, OrderedDict
 
 from . import llama_cpp
@@ -15,9 +25,7 @@ class LlamaCache:
     """Cache for a llama.cpp model."""
 
     def __init__(self, capacity_bytes: int = (2 << 30)):
-        self.cache_state: OrderedDict[
-            Tuple[llama_cpp.llama_token, ...], "LlamaState"
-        ] = OrderedDict()
+        self.cache_state: OrderedDict[Tuple[int, ...], "LlamaState"] = OrderedDict()
         self.capacity_bytes = capacity_bytes
 
     @property
@@ -26,8 +34,8 @@ class LlamaCache:
 
     def _find_longest_prefix_key(
         self,
-        key: Tuple[llama_cpp.llama_token, ...],
-    ) -> Optional[Tuple[llama_cpp.llama_token, ...]]:
+        key: Tuple[int, ...],
+    ) -> Optional[Tuple[int, ...]]:
         min_len = 0
         min_key = None
         keys = (
@@ -39,7 +47,7 @@ class LlamaCache:
                 min_key = k
         return min_key
 
-    def __getitem__(self, key: Sequence[llama_cpp.llama_token]) -> "LlamaState":
+    def __getitem__(self, key: Sequence[int]) -> "LlamaState":
         key = tuple(key)
         _key = self._find_longest_prefix_key(key)
         if _key is None:
@@ -48,10 +56,10 @@ class LlamaCache:
         self.cache_state.move_to_end(_key)
         return value
 
-    def __contains__(self, key: Sequence[llama_cpp.llama_token]) -> bool:
+    def __contains__(self, key: Sequence[int]) -> bool:
         return self._find_longest_prefix_key(tuple(key)) is not None
 
-    def __setitem__(self, key: Sequence[llama_cpp.llama_token], value: "LlamaState"):
+    def __setitem__(self, key: Sequence[int], value: "LlamaState"):
         key = tuple(key)
         if key in self.cache_state:
             del self.cache_state[key]
@@ -63,7 +71,7 @@ class LlamaCache:
 class LlamaState:
     def __init__(
         self,
-        eval_tokens: Deque[llama_cpp.llama_token],
+        eval_tokens: Deque[int],
         eval_logits: Deque[List[float]],
         llama_state,  # type: llama_cpp.Array[llama_cpp.c_uint8]
         llama_state_size: int,
@@ -74,6 +82,24 @@ class LlamaState:
         self.llama_state_size = llama_state_size
 
 
+LogitsProcessor = Callable[[List[int], List[float]], List[float]]
+
+
+class LogitsProcessorList(List[LogitsProcessor]):
+    def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]:
+        for processor in self:
+            scores = processor(input_ids, scores)
+        return scores
+
+
+StoppingCriteria = Callable[[List[int], List[float]], bool]
+
+
+class StoppingCriteriaList(List[StoppingCriteria]):
+    def __call__(self, input_ids: List[int], logits: List[float]) -> bool:
+        return any([stopping_criteria(input_ids, logits) for stopping_criteria in self])
+
+
 class Llama:
     """High-level Python wrapper for a llama.cpp model."""
 
@@ -83,6 +109,7 @@ class Llama:
         # NOTE: These parameters are likely to change in the future.
         n_ctx: int = 512,
         n_parts: int = -1,
+        n_gpu_layers: int = 0,
         seed: int = 1337,
         f16_kv: bool = True,
         logits_all: bool = False,
@@ -128,7 +155,7 @@ class Llama:
 
         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
-        self.params.n_parts = n_parts
+        self.params.n_gpu_layers = n_gpu_layers
         self.params.seed = seed
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
@@ -139,7 +166,7 @@ class Llama:
 
         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
-        self.eval_tokens: Deque[llama_cpp.llama_token] = deque(maxlen=n_ctx)
+        self.eval_tokens: Deque[int] = deque(maxlen=n_ctx)
         self.eval_logits: Deque[List[float]] = deque(maxlen=n_ctx if logits_all else 1)
 
         self.cache: Optional[LlamaCache] = None
@@ -149,6 +176,10 @@ class Llama:
         self.lora_base = lora_base
         self.lora_path = lora_path
 
+        ### DEPRECATED ###
+        self.n_parts = n_parts
+        ### DEPRECATED ###
+
         if not os.path.exists(model_path):
             raise ValueError(f"Model path does not exist: {model_path}")
 
@@ -174,7 +205,30 @@ class Llama:
         if self.verbose:
             print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
 
-    def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]:
+        self._n_vocab = self.n_vocab()
+        self._n_ctx = self.n_ctx()
+        data = (llama_cpp.llama_token_data * self._n_vocab)(
+            *[
+                llama_cpp.llama_token_data(
+                    id=llama_cpp.llama_token(i),
+                    logit=llama_cpp.c_float(0.0),
+                    p=llama_cpp.c_float(0.0),
+                )
+                for i in range(self._n_vocab)
+            ]
+        )
+        size = llama_cpp.c_size_t(self._n_vocab)
+        sorted = False
+        candidates = llama_cpp.llama_token_data_array(
+            data=data,
+            size=size,
+            sorted=sorted,
+        )
+        self._candidates = candidates
+        self._token_nl = Llama.token_nl()
+        self._token_eos = Llama.token_eos()
+
+    def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         """Tokenize a string.
 
         Args:
@@ -187,20 +241,32 @@ class Llama:
             A list of tokens.
         """
         assert self.ctx is not None
-        n_ctx = llama_cpp.llama_n_ctx(self.ctx)
-        tokens = (llama_cpp.llama_token * int(n_ctx))()
+        n_ctx = self._n_ctx
+        tokens = (llama_cpp.llama_token * n_ctx)()
         n_tokens = llama_cpp.llama_tokenize(
             self.ctx,
             text,
             tokens,
-            n_ctx,
-            llama_cpp.c_bool(True),
+            llama_cpp.c_int(n_ctx),
+            llama_cpp.c_bool(add_bos),
         )
-        if int(n_tokens) < 0:
-            raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}')
+        if n_tokens < 0:
+            n_tokens = abs(n_tokens)
+            tokens = (llama_cpp.llama_token * n_tokens)()
+            n_tokens = llama_cpp.llama_tokenize(
+                self.ctx,
+                text,
+                tokens,
+                llama_cpp.c_int(n_tokens),
+                llama_cpp.c_bool(add_bos),
+            )
+            if n_tokens < 0:
+                raise RuntimeError(
+                    f'Failed to tokenize: text="{text}" n_tokens={n_tokens}'
+                )
         return list(tokens[:n_tokens])
 
-    def detokenize(self, tokens: List[llama_cpp.llama_token]) -> bytes:
+    def detokenize(self, tokens: List[int]) -> bytes:
         """Detokenize a list of tokens.
 
         Args:
@@ -212,7 +278,9 @@ class Llama:
         assert self.ctx is not None
         output = b""
         for token in tokens:
-            output += llama_cpp.llama_token_to_str(self.ctx, token)
+            output += llama_cpp.llama_token_to_str(
+                self.ctx, llama_cpp.llama_token(token)
+            )
         return output
 
     def set_cache(self, cache: Optional[LlamaCache]):
@@ -228,14 +296,14 @@ class Llama:
         self.eval_tokens.clear()
         self.eval_logits.clear()
 
-    def eval(self, tokens: Sequence[llama_cpp.llama_token]):
+    def eval(self, tokens: Sequence[int]):
         """Evaluate a list of tokens.
 
         Args:
             tokens: The list of tokens to evaluate.
         """
         assert self.ctx is not None
-        n_ctx = int(llama_cpp.llama_n_ctx(self.ctx))
+        n_ctx = self._n_ctx
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
             n_past = min(n_ctx - len(batch), len(self.eval_tokens))
@@ -247,18 +315,16 @@ class Llama:
                 n_past=llama_cpp.c_int(n_past),
                 n_threads=llama_cpp.c_int(self.n_threads),
             )
-            if int(return_code) != 0:
+            if return_code != 0:
                 raise RuntimeError(f"llama_eval returned {return_code}")
             # Save tokens
             self.eval_tokens.extend(batch)
             # Save logits
             rows = n_tokens if self.params.logits_all else 1
-            n_vocab = llama_cpp.llama_n_vocab(self.ctx)
-            cols = int(n_vocab)
+            n_vocab = self._n_vocab
+            cols = n_vocab
             logits_view = llama_cpp.llama_get_logits(self.ctx)
-            logits: List[List[float]] = [
-                [logits_view[i * cols + j] for j in range(cols)] for i in range(rows)
-            ]
+            logits = [logits_view[i * cols : (i + 1) * cols] for i in range(rows)]
             self.eval_logits.extend(logits)
 
     def _sample(
@@ -275,28 +341,33 @@ class Llama:
         mirostat_mode: llama_cpp.c_int,
         mirostat_tau: llama_cpp.c_float,
         mirostat_eta: llama_cpp.c_float,
+        penalize_nl: bool = True,
+        logits_processor: Optional[LogitsProcessorList] = None,
     ):
         assert self.ctx is not None
         assert len(self.eval_logits) > 0
-        n_vocab = int(llama_cpp.llama_n_vocab(self.ctx))
+        n_vocab = self._n_vocab
+        n_ctx = self._n_ctx
+        top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k
+        last_n_tokens_size = (
+            llama_cpp.c_int(n_ctx)
+            if last_n_tokens_size.value < 0
+            else last_n_tokens_size
+        )
         logits = self.eval_logits[-1]
-        data = (llama_cpp.llama_token_data * n_vocab)(
-            *[
-                llama_cpp.llama_token_data(
-                    id=llama_cpp.llama_token(i),
-                    logit=logits[i],
-                    p=llama_cpp.c_float(0.0),
-                )
-                for i in range(n_vocab)
-            ]
-        )
-        size = llama_cpp.c_size_t(n_vocab)
-        sorted = False
-        candidates = llama_cpp.llama_token_data_array(
-            data=data,
-            size=size,
-            sorted=sorted,
-        )
+
+        if logits_processor is not None:
+            logits = logits_processor(list(self.eval_tokens), logits)
+            self.eval_logits[-1] = logits
+
+        nl_logit = logits[self._token_nl]
+        candidates = self._candidates
+        for i, logit in enumerate(logits):
+            candidates.data[i].id = llama_cpp.llama_token(i)
+            candidates.data[i].logit = llama_cpp.c_float(logit)
+            candidates.data[i].p = llama_cpp.c_float(0.0)
+        candidates.sorted = llama_cpp.c_bool(False)
+        candidates.size = llama_cpp.c_size_t(n_vocab)
         llama_cpp.llama_sample_repetition_penalty(
             ctx=self.ctx,
             last_tokens_data=last_n_tokens_data,
@@ -312,6 +383,8 @@ class Llama:
             alpha_frequency=frequency_penalty,
             alpha_presence=presence_penalty,
         )
+        if not penalize_nl:
+            candidates.data[self._token_nl].logit = llama_cpp.c_float(nl_logit)
         if temp.value == 0.0:
             return llama_cpp.llama_sample_token_greedy(
                 ctx=self.ctx,
@@ -394,6 +467,8 @@ class Llama:
         mirostat_mode: int = 0,
         mirostat_eta: float = 0.1,
         mirostat_tau: float = 5.0,
+        penalize_nl: bool = True,
+        logits_processor: Optional[LogitsProcessorList] = None,
     ):
         """Sample a token from the model.
 
@@ -425,24 +500,27 @@ class Llama:
             mirostat_mode=llama_cpp.c_int(mirostat_mode),
             mirostat_tau=llama_cpp.c_float(mirostat_tau),
             mirostat_eta=llama_cpp.c_float(mirostat_eta),
+            penalize_nl=penalize_nl,
+            logits_processor=logits_processor,
         )
 
     def generate(
         self,
-        tokens: Sequence[llama_cpp.llama_token],
-        top_k: int,
-        top_p: float,
-        temp: float,
-        repeat_penalty: float,
+        tokens: Sequence[int],
+        top_k: int = 40,
+        top_p: float = 0.95,
+        temp: float = 0.80,
+        repeat_penalty: float = 1.1,
         reset: bool = True,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
+        tfs_z: float = 1.0,
         mirostat_mode: int = 0,
         mirostat_tau: float = 5.0,
         mirostat_eta: float = 0.1,
-    ) -> Generator[
-        llama_cpp.llama_token, Optional[Sequence[llama_cpp.llama_token]], None
-    ]:
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+    ) -> Generator[int, Optional[Sequence[int]], None]:
         """Create a generator of tokens from a prompt.
 
         Examples:
@@ -495,16 +573,24 @@ class Llama:
                 repeat_penalty=repeat_penalty,
                 frequency_penalty=frequency_penalty,
                 presence_penalty=presence_penalty,
+                tfs_z=tfs_z,
                 mirostat_mode=mirostat_mode,
                 mirostat_tau=mirostat_tau,
                 mirostat_eta=mirostat_eta,
+                logits_processor=logits_processor,
             )
+            if stopping_criteria is not None and stopping_criteria(
+                list(self.eval_tokens), self.eval_logits[-1]
+            ):
+                return
             tokens_or_none = yield token
             tokens = [token]
             if tokens_or_none is not None:
                 tokens.extend(tokens_or_none)
 
-    def create_embedding(self, input: str) -> Embedding:
+    def create_embedding(
+        self, input: Union[str, List[str]], model: Optional[str] = None
+    ) -> Embedding:
         """Embed a string.
 
         Args:
@@ -514,6 +600,7 @@ class Llama:
             An embedding object.
         """
         assert self.ctx is not None
+        model_name: str = model if model is not None else self.model_path
 
         if self.params.embedding == False:
             raise RuntimeError(
@@ -523,30 +610,40 @@ class Llama:
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)
 
-        tokens = self.tokenize(input.encode("utf-8"))
-        self.reset()
-        self.eval(tokens)
-        n_tokens = len(tokens)
-        embedding = llama_cpp.llama_get_embeddings(self.ctx)[
-            : llama_cpp.llama_n_embd(self.ctx)
-        ]
+        if isinstance(input, str):
+            inputs = [input]
+        else:
+            inputs = input
 
+        data: List[EmbeddingData] = []
+        total_tokens = 0
+        for index, input in enumerate(inputs):
+            tokens = self.tokenize(input.encode("utf-8"))
+            self.reset()
+            self.eval(tokens)
+            n_tokens = len(tokens)
+            total_tokens += n_tokens
+            embedding = llama_cpp.llama_get_embeddings(self.ctx)[
+                : llama_cpp.llama_n_embd(self.ctx)
+            ]
+
+            data.append(
+                {
+                    "object": "embedding",
+                    "embedding": embedding,
+                    "index": index,
+                }
+            )
         if self.verbose:
             llama_cpp.llama_print_timings(self.ctx)
 
         return {
             "object": "list",
-            "data": [
-                {
-                    "object": "embedding",
-                    "embedding": embedding,
-                    "index": 0,
-                }
-            ],
-            "model": self.model_path,
+            "data": data,
+            "model": model_name,
             "usage": {
-                "prompt_tokens": n_tokens,
-                "total_tokens": n_tokens,
+                "prompt_tokens": total_tokens,
+                "total_tokens": total_tokens,
             },
         }
 
@@ -570,35 +667,39 @@ class Llama:
         top_p: float = 0.95,
         logprobs: Optional[int] = None,
         echo: bool = False,
-        stop: Optional[List[str]] = [],
+        stop: Optional[Union[str, List[str]]] = [],
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         repeat_penalty: float = 1.1,
         top_k: int = 40,
         stream: bool = False,
+        tfs_z: float = 1.0,
         mirostat_mode: int = 0,
         mirostat_tau: float = 5.0,
         mirostat_eta: float = 0.1,
+        model: Optional[str] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
     ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]:
         assert self.ctx is not None
+
         completion_id: str = f"cmpl-{str(uuid.uuid4())}"
         created: int = int(time.time())
-        completion_tokens: List[llama_cpp.llama_token] = []
+        completion_tokens: List[int] = []
         # Add blank space to start of prompt to match OG llama tokenizer
-        prompt_tokens: List[llama_cpp.llama_token] = self.tokenize(
-            b" " + prompt.encode("utf-8")
-        )
+        prompt_tokens: List[int] = self.tokenize(b" " + prompt.encode("utf-8"))
         text: bytes = b""
-        returned_characters: int = 0
-        stop = stop if stop is not None else []
+        returned_tokens: int = 0
+        stop = (
+            stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else []
+        )
+        model_name: str = model if model is not None else self.model_path
 
         if self.verbose:
             llama_cpp.llama_reset_timings(self.ctx)
 
-        if len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
-            raise ValueError(
-                f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
-            )
+        if len(prompt_tokens) + max_tokens > self._n_ctx:
+            raise ValueError(f"Requested tokens exceed context window of {self._n_ctx}")
 
         if stop != []:
             stop_sequences = [s.encode("utf-8") for s in stop]
@@ -634,14 +735,17 @@ class Llama:
             top_k=top_k,
             top_p=top_p,
             temp=temperature,
+            tfs_z=tfs_z,
             mirostat_mode=mirostat_mode,
             mirostat_tau=mirostat_tau,
             mirostat_eta=mirostat_eta,
             frequency_penalty=frequency_penalty,
             presence_penalty=presence_penalty,
             repeat_penalty=repeat_penalty,
+            stopping_criteria=stopping_criteria,
+            logits_processor=logits_processor,
         ):
-            if token == llama_cpp.llama_token_eos():
+            if token == self._token_eos:
                 text = self.detokenize(completion_tokens)
                 finish_reason = "stop"
                 break
@@ -671,63 +775,189 @@ class Llama:
                 break
 
             if stream:
-                start = returned_characters
-                longest = 0
                 # We want to avoid yielding any characters from
                 # the generated text if they are part of a stop
                 # sequence.
+                first_stop_position = 0
                 for s in stop_sequences:
                     for i in range(len(s), 0, -1):
                         if all_text.endswith(s[:i]):
-                            if i > longest:
-                                longest = i
+                            if i > first_stop_position:
+                                first_stop_position = i
                             break
-                text = all_text[: len(all_text) - longest]
-                returned_characters += len(text[start:])
-                yield {
-                    "id": completion_id,
-                    "object": "text_completion",
-                    "created": created,
-                    "model": self.model_path,
-                    "choices": [
-                        {
-                            "text": text[start:].decode("utf-8", errors="ignore"),
-                            "index": 0,
-                            "logprobs": None,
-                            "finish_reason": None,
+
+                token_end_position = 0
+                remaining_tokens = completion_tokens[returned_tokens:]
+                remaining_length = len(self.detokenize(remaining_tokens))
+                for token in remaining_tokens:
+                    token_end_position += len(self.detokenize([token]))
+                    # Check if stop sequence is in the token
+                    if token_end_position >= (
+                        remaining_length - first_stop_position - 1
+                    ):
+                        break
+                    logprobs_or_none: Optional[CompletionLogprobs] = None
+                    if logprobs is not None:
+                        token_str = self.detokenize([token]).decode(
+                            "utf-8", errors="ignore"
+                        )
+                        text_offset = len(prompt) + len(
+                            self.detokenize(completion_tokens[:returned_tokens])
+                        )
+                        token_offset = len(prompt_tokens) + returned_tokens
+                        logits = self.eval_logits[token_offset - 1]
+                        current_logprobs = Llama.logits_to_logprobs(logits)
+                        sorted_logprobs = list(
+                            sorted(
+                                zip(current_logprobs, range(len(current_logprobs))),
+                                reverse=True,
+                            )
+                        )
+                        top_logprob = {
+                            self.detokenize([i]).decode(
+                                "utf-8", errors="ignore"
+                            ): logprob
+                            for logprob, i in sorted_logprobs[:logprobs]
                         }
-                    ],
-                }
+                        top_logprob.update({token_str: current_logprobs[int(token)]})
+                        logprobs_or_none = {
+                            "tokens": [
+                                self.detokenize([token]).decode(
+                                    "utf-8", errors="ignore"
+                                )
+                            ],
+                            "text_offset": [text_offset],
+                            "token_logprobs": [sorted_logprobs[int(token)][0]],
+                            "top_logprobs": [top_logprob],
+                        }
+                    returned_tokens += 1
+                    yield {
+                        "id": completion_id,
+                        "object": "text_completion",
+                        "created": created,
+                        "model": model_name,
+                        "choices": [
+                            {
+                                "text": self.detokenize([token]).decode(
+                                    "utf-8", errors="ignore"
+                                ),
+                                "index": 0,
+                                "logprobs": logprobs_or_none,
+                                "finish_reason": None,
+                            }
+                        ],
+                    }
 
             if len(completion_tokens) >= max_tokens:
                 text = self.detokenize(completion_tokens)
                 finish_reason = "length"
                 break
 
+        if stopping_criteria is not None and stopping_criteria(
+            list(self.eval_tokens), self.eval_logits[-1]
+        ):
+            text = self.detokenize(completion_tokens)
+            finish_reason = "stop"
+
+        if self.verbose:
+            llama_cpp.llama_print_timings(self.ctx)
+
+        if stream:
+            remaining_tokens = completion_tokens[returned_tokens:]
+            all_text = self.detokenize(remaining_tokens)
+            any_stop = [s for s in stop_sequences if s in all_text]
+            if len(any_stop) > 0:
+                end = min(all_text.index(stop) for stop in any_stop)
+            else:
+                end = len(all_text)
+
+            token_end_position = 0
+            for token in remaining_tokens:
+                token_end_position += len(self.detokenize([token]))
+
+                logprobs_or_none: Optional[CompletionLogprobs] = None
+                if logprobs is not None:
+                    token_str = self.detokenize([token]).decode(
+                        "utf-8", errors="ignore"
+                    )
+                    text_offset = len(prompt) + len(
+                        self.detokenize(completion_tokens[:returned_tokens])
+                    )
+                    token_offset = len(prompt_tokens) + returned_tokens - 1
+                    logits = self.eval_logits[token_offset]
+                    current_logprobs = Llama.logits_to_logprobs(logits)
+                    sorted_logprobs = list(
+                        sorted(
+                            zip(current_logprobs, range(len(current_logprobs))),
+                            reverse=True,
+                        )
+                    )
+                    top_logprob = {
+                        self.detokenize([i]).decode("utf-8", errors="ignore"): logprob
+                        for logprob, i in sorted_logprobs[:logprobs]
+                    }
+                    top_logprob.update({token_str: current_logprobs[int(token)]})
+                    logprobs_or_none = {
+                        "tokens": [
+                            self.detokenize([token]).decode("utf-8", errors="ignore")
+                        ],
+                        "text_offset": [text_offset],
+                        "token_logprobs": [sorted_logprobs[int(token)][0]],
+                        "top_logprobs": [top_logprob],
+                    }
+
+                if token_end_position >= end:
+                    last_text = self.detokenize([token])
+                    if token_end_position == end - 1:
+                        break
+                    returned_tokens += 1
+                    yield {
+                        "id": completion_id,
+                        "object": "text_completion",
+                        "created": created,
+                        "model": model_name,
+                        "choices": [
+                            {
+                                "text": last_text[
+                                    : len(last_text) - (token_end_position - end)
+                                ].decode("utf-8", errors="ignore"),
+                                "index": 0,
+                                "logprobs": logprobs_or_none,
+                                "finish_reason": finish_reason,
+                            }
+                        ],
+                    }
+                    break
+                returned_tokens += 1
+                yield {
+                    "id": completion_id,
+                    "object": "text_completion",
+                    "created": created,
+                    "model": model_name,
+                    "choices": [
+                        {
+                            "text": self.detokenize([token]).decode(
+                                "utf-8", errors="ignore"
+                            ),
+                            "index": 0,
+                            "logprobs": logprobs_or_none,
+                            "finish_reason": finish_reason
+                            if returned_tokens == len(completion_tokens)
+                            else None,
+                        }
+                    ],
+                }
+            if self.cache:
+                if self.verbose:
+                    print("Llama._create_completion: cache save", file=sys.stderr)
+                self.cache[prompt_tokens + completion_tokens] = self.save_state()
+            return
+
         if self.cache:
             if self.verbose:
                 print("Llama._create_completion: cache save", file=sys.stderr)
             self.cache[prompt_tokens + completion_tokens] = self.save_state()
 
-        if stream:
-            yield {
-                "id": completion_id,
-                "object": "text_completion",
-                "created": created,
-                "model": self.model_path,
-                "choices": [
-                    {
-                        "text": text[returned_characters:].decode(
-                            "utf-8", errors="ignore"
-                        ),
-                        "index": 0,
-                        "logprobs": None,
-                        "finish_reason": finish_reason,
-                    }
-                ],
-            }
-            return
-
         text_str = text.decode("utf-8", errors="ignore")
 
         if echo:
@@ -738,13 +968,19 @@ class Llama:
 
         logprobs_or_none: Optional[CompletionLogprobs] = None
         if logprobs is not None:
-            text_offset = 0
+            text_offset = 0 if echo else len(prompt)
+            token_offset = 0 if echo else len(prompt_tokens[1:])
             text_offsets: List[int] = []
-            token_logprobs: List[float] = []
+            token_logprobs: List[Optional[float]] = []
             tokens: List[str] = []
-            top_logprobs: List[Dict[str, float]] = []
+            top_logprobs: List[Optional[Dict[str, float]]] = []
+
+            if echo:
+                # Remove leading BOS token
+                all_tokens = prompt_tokens[1:] + completion_tokens
+            else:
+                all_tokens = completion_tokens
 
-            all_tokens = prompt_tokens + completion_tokens
             all_token_strs = [
                 self.detokenize([token]).decode("utf-8", errors="ignore")
                 for token in all_tokens
@@ -752,7 +988,7 @@ class Llama:
             all_logprobs = [
                 Llama.logits_to_logprobs(list(map(float, row)))
                 for row in self.eval_logits
-            ]
+            ][token_offset:]
             for token, token_str, logprobs_token in zip(
                 all_tokens, all_token_strs, all_logprobs
             ):
@@ -765,14 +1001,18 @@ class Llama:
                     )
                 )
                 token_logprobs.append(sorted_logprobs[int(token)][0])
-                top_logprob = {
-                    self.detokenize([llama_cpp.llama_token(i)]).decode(
-                        "utf-8", errors="ignore"
-                    ): logprob
+                top_logprob: Optional[Dict[str, float]] = {
+                    self.detokenize([i]).decode("utf-8", errors="ignore"): logprob
                     for logprob, i in sorted_logprobs[:logprobs]
                 }
-                top_logprob.update({token_str: sorted_logprobs[int(token)][0]})
+                top_logprob.update({token_str: logprobs_token[int(token)]})
                 top_logprobs.append(top_logprob)
+            # Weird idosincracy of the OpenAI API where
+            # token_logprobs and top_logprobs are null for
+            # the first token.
+            if echo and len(all_tokens) > 0:
+                token_logprobs[0] = None
+                top_logprobs[0] = None
             logprobs_or_none = {
                 "tokens": tokens,
                 "text_offset": text_offsets,
@@ -780,14 +1020,11 @@ class Llama:
                 "top_logprobs": top_logprobs,
             }
 
-        if self.verbose:
-            llama_cpp.llama_print_timings(self.ctx)
-
         yield {
             "id": completion_id,
             "object": "text_completion",
             "created": created,
-            "model": self.model_path,
+            "model": model_name,
             "choices": [
                 {
                     "text": text_str,
@@ -812,15 +1049,19 @@ class Llama:
         top_p: float = 0.95,
         logprobs: Optional[int] = None,
         echo: bool = False,
-        stop: Optional[List[str]] = [],
+        stop: Optional[Union[str, List[str]]] = [],
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         repeat_penalty: float = 1.1,
         top_k: int = 40,
         stream: bool = False,
+        tfs_z: float = 1.0,
         mirostat_mode: int = 0,
         mirostat_tau: float = 5.0,
         mirostat_eta: float = 0.1,
+        model: Optional[str] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
     ) -> Union[Completion, Iterator[CompletionChunk]]:
         """Generate text from a prompt.
 
@@ -858,9 +1099,13 @@ class Llama:
             repeat_penalty=repeat_penalty,
             top_k=top_k,
             stream=stream,
+            tfs_z=tfs_z,
             mirostat_mode=mirostat_mode,
             mirostat_tau=mirostat_tau,
             mirostat_eta=mirostat_eta,
+            model=model,
+            stopping_criteria=stopping_criteria,
+            logits_processor=logits_processor,
         )
         if stream:
             chunks: Iterator[CompletionChunk] = completion_or_chunks
@@ -877,15 +1122,19 @@ class Llama:
         top_p: float = 0.95,
         logprobs: Optional[int] = None,
         echo: bool = False,
-        stop: Optional[List[str]] = [],
+        stop: Optional[Union[str, List[str]]] = [],
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
         repeat_penalty: float = 1.1,
         top_k: int = 40,
         stream: bool = False,
+        tfs_z: float = 1.0,
         mirostat_mode: int = 0,
         mirostat_tau: float = 5.0,
         mirostat_eta: float = 0.1,
+        model: Optional[str] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
     ) -> Union[Completion, Iterator[CompletionChunk]]:
         """Generate text from a prompt.
 
@@ -923,9 +1172,13 @@ class Llama:
             repeat_penalty=repeat_penalty,
             top_k=top_k,
             stream=stream,
+            tfs_z=tfs_z,
             mirostat_mode=mirostat_mode,
             mirostat_tau=mirostat_tau,
             mirostat_eta=mirostat_eta,
+            model=model,
+            stopping_criteria=stopping_criteria,
+            logits_processor=logits_processor,
         )
 
     def _convert_text_completion_to_chat(
@@ -993,14 +1246,16 @@ class Llama:
         top_p: float = 0.95,
         top_k: int = 40,
         stream: bool = False,
-        stop: Optional[List[str]] = [],
+        stop: Optional[Union[str, List[str]]] = [],
         max_tokens: int = 256,
         presence_penalty: float = 0.0,
         frequency_penalty: float = 0.0,
         repeat_penalty: float = 1.1,
+        tfs_z: float = 1.0,
         mirostat_mode: int = 0,
         mirostat_tau: float = 5.0,
         mirostat_eta: float = 0.1,
+        model: Optional[str] = None,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         """Generate a chat completion from a list of messages.
 
@@ -1017,7 +1272,9 @@ class Llama:
         Returns:
             Generated chat completion or a stream of chat completion chunks.
         """
-        stop = stop if stop is not None else []
+        stop = (
+            stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else []
+        )
         chat_history = "".join(
             f'### {"Human" if message["role"] == "user" else "Assistant"}:{message["content"]}'
             for message in messages
@@ -1035,9 +1292,11 @@ class Llama:
             repeat_penalty=repeat_penalty,
             presence_penalty=presence_penalty,
             frequency_penalty=frequency_penalty,
+            tfs_z=tfs_z,
             mirostat_mode=mirostat_mode,
             mirostat_tau=mirostat_tau,
             mirostat_eta=mirostat_eta,
+            model=model,
         )
         if stream:
             chunks: Iterator[CompletionChunk] = completion_or_chunks  # type: ignore
@@ -1056,7 +1315,7 @@ class Llama:
             verbose=self.verbose,
             model_path=self.model_path,
             n_ctx=self.params.n_ctx,
-            n_parts=self.params.n_parts,
+            n_gpu_layers=self.params.n_gpu_layers,
             seed=self.params.seed,
             f16_kv=self.params.f16_kv,
             logits_all=self.params.logits_all,
@@ -1069,6 +1328,9 @@ class Llama:
             n_threads=self.n_threads,
             lora_base=self.lora_base,
             lora_path=self.lora_path,
+            ### DEPRECATED ###
+            n_parts=self.n_parts,
+            ### DEPRECATED ###
         )
 
     def __setstate__(self, state):
@@ -1076,6 +1338,7 @@ class Llama:
             model_path=state["model_path"],
             n_ctx=state["n_ctx"],
             n_parts=state["n_parts"],
+            n_gpu_layers=state["n_gpu_layers"],
             seed=state["seed"],
             f16_kv=state["f16_kv"],
             logits_all=state["logits_all"],
@@ -1120,16 +1383,41 @@ class Llama:
         if llama_cpp.llama_set_state_data(self.ctx, state.llama_state) != state_size:
             raise RuntimeError("Failed to set llama state data")
 
+    def n_ctx(self) -> int:
+        """Return the context window size."""
+        assert self.ctx is not None
+        return llama_cpp.llama_n_ctx(self.ctx)
+
+    def n_embd(self) -> int:
+        """Return the embedding size."""
+        assert self.ctx is not None
+        return llama_cpp.llama_n_embd(self.ctx)
+
+    def n_vocab(self) -> int:
+        """Return the vocabulary size."""
+        assert self.ctx is not None
+        return llama_cpp.llama_n_vocab(self.ctx)
+
+    def tokenizer(self) -> "LlamaTokenizer":
+        """Return the tokenizer for this model."""
+        assert self.ctx is not None
+        return LlamaTokenizer(self)
+
     @staticmethod
-    def token_eos() -> llama_cpp.llama_token:
+    def token_eos() -> int:
         """Return the end-of-sequence token."""
         return llama_cpp.llama_token_eos()
 
     @staticmethod
-    def token_bos() -> llama_cpp.llama_token:
+    def token_bos() -> int:
         """Return the beginning-of-sequence token."""
         return llama_cpp.llama_token_bos()
 
+    @staticmethod
+    def token_nl() -> int:
+        """Return the newline token."""
+        return llama_cpp.llama_token_nl()
+
     @staticmethod
     def logits_to_logprobs(logits: List[float]) -> List[float]:
         exps = [math.exp(float(x)) for x in logits]
@@ -1137,9 +1425,7 @@ class Llama:
         return [math.log(x / sum_exps) for x in exps]
 
     @staticmethod
-    def longest_token_prefix(
-        a: Sequence[llama_cpp.llama_token], b: Sequence[llama_cpp.llama_token]
-    ):
+    def longest_token_prefix(a: Sequence[int], b: Sequence[int]):
         longest_prefix = 0
         for _a, _b in zip(a, b):
             if _a == _b:
@@ -1147,3 +1433,20 @@ class Llama:
             else:
                 break
         return longest_prefix
+
+
+class LlamaTokenizer:
+    def __init__(self, llama: Llama):
+        self.llama = llama
+
+    def encode(self, text: str, add_bos: bool = True) -> List[int]:
+        return self.llama.tokenize(
+            text.encode("utf-8", errors="ignore"), add_bos=add_bos
+        )
+
+    def decode(self, tokens: List[int]) -> str:
+        return self.llama.detokenize(tokens).decode("utf-8", errors="ignore")
+
+    @classmethod
+    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
+        return cls(Llama(model_path=path, vocab_only=True))
diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index e60558c..541ee00 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -44,15 +44,20 @@ def _load_shared_library(lib_base_name: str):
         _base_path = _lib.parent.resolve()
         _lib_paths = [_lib.resolve()]
 
+    cdll_args = dict()  # type: ignore
     # Add the library directory to the DLL search path on Windows (if needed)
     if sys.platform == "win32" and sys.version_info >= (3, 8):
         os.add_dll_directory(str(_base_path))
+        if "CUDA_PATH" in os.environ:
+            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
+            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
+        cdll_args["winmode"] = 0
 
     # Try to load the shared library, handling potential errors
     for _lib_path in _lib_paths:
         if _lib_path.exists():
             try:
-                return ctypes.CDLL(str(_lib_path))
+                return ctypes.CDLL(str(_lib_path), **cdll_args)
             except Exception as e:
                 raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}")
 
@@ -67,31 +72,61 @@ _lib_base_name = "llama"
 # Load the library
 _lib = _load_shared_library(_lib_base_name)
 
-# C types
-LLAMA_FILE_VERSION = c_int(1)
-LLAMA_FILE_MAGIC = b"ggjt"
-LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml"
-LLAMA_SESSION_MAGIC = b"ggsn"
+# Misc
+c_float_p = POINTER(c_float)
+c_uint8_p = POINTER(c_uint8)
+c_size_t_p = POINTER(c_size_t)
+
+# llama.h bindings
+
+# #define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74)
+# #define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
+LLAMA_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61)
+# #define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
+LLAMA_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66)
+# #define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
+LLAMA_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C)
+# #define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+LLAMA_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E)
+
+# #define LLAMA_FILE_VERSION           3
+LLAMA_FILE_VERSION = c_int(3)
+LLAMA_FILE_MAGIC = LLAMA_FILE_MAGIC_GGJT
+LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML
+LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_VERSION = c_int(1)
 
+# struct llama_context;
 llama_context_p = c_void_p
 
 
+# typedef int llama_token;
 llama_token = c_int
 llama_token_p = POINTER(llama_token)
 
 
+# typedef struct llama_token_data {
+#     llama_token id; // token id
+#     float logit;    // log-odds of the token
+#     float p;        // probability of the token
+# } llama_token_data;
 class llama_token_data(Structure):
     _fields_ = [
-        ("id", llama_token),  # token id
-        ("logit", c_float),  # log-odds of the token
-        ("p", c_float),  # probability of the token
+        ("id", llama_token),
+        ("logit", c_float),
+        ("p", c_float),
     ]
 
 
 llama_token_data_p = POINTER(llama_token_data)
 
 
+# typedef struct llama_token_data_array {
+#     llama_token_data * data;
+#     size_t size;
+#     bool sorted;
+# } llama_token_data_array;
 class llama_token_data_array(Structure):
     _fields_ = [
         ("data", llama_token_data_p),
@@ -102,53 +137,72 @@ class llama_token_data_array(Structure):
 
 llama_token_data_array_p = POINTER(llama_token_data_array)
 
+# typedef void (*llama_progress_callback)(float progress, void *ctx);
 llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
 
 
+# struct llama_context_params {
+#     int n_ctx;        // text context
+#     int n_gpu_layers; // number of layers to store in VRAM
+#     int seed;         // RNG seed, -1 for random
+
+#     bool f16_kv;     // use fp16 for KV cache
+#     bool logits_all; // the llama_eval() call computes all logits, not just the last one
+#     bool vocab_only; // only load the vocabulary, no weights
+#     bool use_mmap;   // use mmap if possible
+#     bool use_mlock;  // force system to keep model in RAM
+#     bool embedding;  // embedding mode only
+
+
+#     // called with a progress value between 0 and 1, pass NULL to disable
+#     llama_progress_callback progress_callback;
+#     // context pointer passed to the progress callback
+#     void * progress_callback_user_data;
+# };
 class llama_context_params(Structure):
     _fields_ = [
-        ("n_ctx", c_int),  # text context
-        ("n_parts", c_int),  # -1 for default
-        ("seed", c_int),  # RNG seed, 0 for random
-        ("f16_kv", c_bool),  # use fp16 for KV cache
+        ("n_ctx", c_int),
+        ("n_gpu_layers", c_int),
+        ("seed", c_int),
+        ("f16_kv", c_bool),
         (
             "logits_all",
             c_bool,
-        ),  # the llama_eval() call computes all logits, not just the last one
-        ("vocab_only", c_bool),  # only load the vocabulary, no weights
-        ("use_mmap", c_bool),  # use mmap if possible
-        ("use_mlock", c_bool),  # force system to keep model in RAM
-        ("embedding", c_bool),  # embedding mode only
-        # called with a progress value between 0 and 1, pass NULL to disable
+        ),
+        ("vocab_only", c_bool),
+        ("use_mmap", c_bool),
+        ("use_mlock", c_bool),
+        ("embedding", c_bool),
         ("progress_callback", llama_progress_callback),
-        # context pointer passed to the progress callback
         ("progress_callback_user_data", c_void_p),
     ]
 
 
 llama_context_params_p = POINTER(llama_context_params)
 
+# enum llama_ftype {
+#     LLAMA_FTYPE_ALL_F32              = 0,
+#     LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+#     // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+#     // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+#     LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+#     LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
+# };
 LLAMA_FTYPE_ALL_F32 = c_int(0)
-LLAMA_FTYPE_MOSTLY_F16 = c_int(1)  # except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2)  # except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3)  # except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(
-    4
-)  # tok_embeddings.weight and output.weight are F16
-LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5)  # except 1d tensors
-# LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6)  # except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7)  # except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8)  # except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9)  # except 1d tensors
-
-# Misc
-c_float_p = POINTER(c_float)
-c_uint8_p = POINTER(c_uint8)
-c_size_t_p = POINTER(c_size_t)
-
-# Functions
+LLAMA_FTYPE_MOSTLY_F16 = c_int(1)
+LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2)
+LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3)
+LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4)
+LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7)
+LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8)
+LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9)
 
 
+# LLAMA_API struct llama_context_params llama_context_default_params();
 def llama_context_default_params() -> llama_context_params:
     return _lib.llama_context_default_params()
 
@@ -157,6 +211,7 @@ _lib.llama_context_default_params.argtypes = []
 _lib.llama_context_default_params.restype = llama_context_params
 
 
+# LLAMA_API bool llama_mmap_supported();
 def llama_mmap_supported() -> bool:
     return _lib.llama_mmap_supported()
 
@@ -165,6 +220,7 @@ _lib.llama_mmap_supported.argtypes = []
 _lib.llama_mmap_supported.restype = c_bool
 
 
+# LLAMA_API bool llama_mlock_supported();
 def llama_mlock_supported() -> bool:
     return _lib.llama_mlock_supported()
 
@@ -173,9 +229,33 @@ _lib.llama_mlock_supported.argtypes = []
 _lib.llama_mlock_supported.restype = c_bool
 
 
-# Various functions for loading a ggml llama model.
-# Allocate (almost) all memory needed for the model.
-# Return NULL on failure
+# // TODO: not great API - very likely to change
+# // Initialize the llama + ggml backend
+# // Call once at the start of the program
+# LLAMA_API void llama_init_backend();
+def llama_init_backend():
+    return _lib.llama_init_backend()
+
+
+_lib.llama_init_backend.argtypes = []
+_lib.llama_init_backend.restype = None
+
+
+# LLAMA_API int64_t llama_time_us();
+def llama_time_us() -> int:
+    return _lib.llama_time_us()
+
+
+_lib.llama_time_us.argtypes = []
+_lib.llama_time_us.restype = ctypes.c_int64
+
+
+# // Various functions for loading a ggml llama model.
+# // Allocate (almost) all memory needed for the model.
+# // Return NULL on failure
+# LLAMA_API struct llama_context * llama_init_from_file(
+#                             const char * path_model,
+#         struct llama_context_params   params);
 def llama_init_from_file(
     path_model: bytes, params: llama_context_params
 ) -> llama_context_p:
@@ -187,8 +267,9 @@ _lib.llama_init_from_file.restype = llama_context_p
 
 
 # Frees all allocated memory
+# LLAMA_API void llama_free(struct llama_context * ctx);
 def llama_free(ctx: llama_context_p):
-    _lib.llama_free(ctx)
+    return _lib.llama_free(ctx)
 
 
 _lib.llama_free.argtypes = [llama_context_p]
@@ -198,9 +279,14 @@ _lib.llama_free.restype = None
 # TODO: not great API - very likely to change
 # Returns 0 on success
 # nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
+# LLAMA_API int llama_model_quantize(
+#         const char * fname_inp,
+#         const char * fname_out,
+#     enum llama_ftype   ftype,
+#         int          nthread);
 def llama_model_quantize(
     fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int
-) -> c_int:
+) -> int:
     return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread)
 
 
@@ -214,12 +300,17 @@ _lib.llama_model_quantize.restype = c_int
 # The model needs to be reloaded before applying a new adapter, otherwise the adapter
 # will be applied on top of the previous one
 # Returns 0 on success
+# LLAMA_API int llama_apply_lora_from_file(
+#         struct llama_context * ctx,
+#                   const char * path_lora,
+#                   const char * path_base_model,
+#                          int   n_threads);
 def llama_apply_lora_from_file(
     ctx: llama_context_p,
     path_lora: c_char_p,
     path_base_model: c_char_p,
     n_threads: c_int,
-) -> c_int:
+) -> int:
     return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads)
 
 
@@ -228,7 +319,8 @@ _lib.llama_apply_lora_from_file.restype = c_int
 
 
 # Returns the number of tokens in the KV cache
-def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int:
+# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int:
     return _lib.llama_get_kv_cache_token_count(ctx)
 
 
@@ -237,6 +329,7 @@ _lib.llama_get_kv_cache_token_count.restype = c_int
 
 
 # Sets the current rng seed.
+# LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
 def llama_set_rng_seed(ctx: llama_context_p, seed: c_int):
     return _lib.llama_set_rng_seed(ctx, seed)
 
@@ -247,7 +340,8 @@ _lib.llama_set_rng_seed.restype = None
 
 # Returns the maximum size in bytes of the state (rng, logits, embedding
 # and kv_cache) - will often be smaller after compacting tokens
-def llama_get_state_size(ctx: llama_context_p) -> c_size_t:
+# LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
+def llama_get_state_size(ctx: llama_context_p) -> int:
     return _lib.llama_get_state_size(ctx)
 
 
@@ -258,10 +352,11 @@ _lib.llama_get_state_size.restype = c_size_t
 # Copies the state to the specified destination address.
 # Destination needs to have allocated enough memory.
 # Returns the number of bytes copied
+# LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
 def llama_copy_state_data(
-    ctx: llama_context_p, dest  # type: Array[c_uint8]
+    ctx: llama_context_p, dst  # type: Array[c_uint8]
 ) -> int:
-    return _lib.llama_copy_state_data(ctx, dest)
+    return _lib.llama_copy_state_data(ctx, dst)
 
 
 _lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p]
@@ -270,6 +365,7 @@ _lib.llama_copy_state_data.restype = c_size_t
 
 # Set the state reading from the specified address
 # Returns the number of bytes read
+# LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
 def llama_set_state_data(
     ctx: llama_context_p, src  # type: Array[c_uint8]
 ) -> int:
@@ -281,13 +377,14 @@ _lib.llama_set_state_data.restype = c_size_t
 
 
 # Save/load session file
+# LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
 def llama_load_session_file(
     ctx: llama_context_p,
     path_session: bytes,
     tokens_out,  # type: Array[llama_token]
     n_token_capacity: c_size_t,
     n_token_count_out,  # type: _Pointer[c_size_t]
-) -> c_size_t:
+) -> int:
     return _lib.llama_load_session_file(
         ctx, path_session, tokens_out, n_token_capacity, n_token_count_out
     )
@@ -303,12 +400,13 @@ _lib.llama_load_session_file.argtypes = [
 _lib.llama_load_session_file.restype = c_size_t
 
 
+# LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
 def llama_save_session_file(
     ctx: llama_context_p,
     path_session: bytes,
     tokens,  # type: Array[llama_token]
     n_token_count: c_size_t,
-) -> c_size_t:
+) -> int:
     return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count)
 
 
@@ -325,13 +423,19 @@ _lib.llama_save_session_file.restype = c_size_t
 # tokens + n_tokens is the provided batch of new tokens to process
 # n_past is the number of tokens to use from previous eval calls
 # Returns 0 on success
+# LLAMA_API int llama_eval(
+#         struct llama_context * ctx,
+#            const llama_token * tokens,
+#                          int   n_tokens,
+#                          int   n_past,
+#                          int   n_threads);
 def llama_eval(
     ctx: llama_context_p,
     tokens,  # type: Array[llama_token]
     n_tokens: c_int,
     n_past: c_int,
     n_threads: c_int,
-) -> c_int:
+) -> int:
     return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads)
 
 
@@ -344,13 +448,19 @@ _lib.llama_eval.restype = c_int
 # Returns the number of tokens on success, no more than n_max_tokens
 # Returns a negative number on failure - the number of tokens that would have been returned
 # TODO: not sure if correct
+# LLAMA_API int llama_tokenize(
+#         struct llama_context * ctx,
+#                   const char * text,
+#                  llama_token * tokens,
+#                          int   n_max_tokens,
+#                         bool   add_bos);
 def llama_tokenize(
     ctx: llama_context_p,
     text: bytes,
     tokens,  # type: Array[llama_token]
     n_max_tokens: c_int,
     add_bos: c_bool,
-) -> c_int:
+) -> int:
     return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos)
 
 
@@ -358,7 +468,8 @@ _lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int,
 _lib.llama_tokenize.restype = c_int
 
 
-def llama_n_vocab(ctx: llama_context_p) -> c_int:
+# LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+def llama_n_vocab(ctx: llama_context_p) -> int:
     return _lib.llama_n_vocab(ctx)
 
 
@@ -366,7 +477,8 @@ _lib.llama_n_vocab.argtypes = [llama_context_p]
 _lib.llama_n_vocab.restype = c_int
 
 
-def llama_n_ctx(ctx: llama_context_p) -> c_int:
+# LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+def llama_n_ctx(ctx: llama_context_p) -> int:
     return _lib.llama_n_ctx(ctx)
 
 
@@ -374,7 +486,8 @@ _lib.llama_n_ctx.argtypes = [llama_context_p]
 _lib.llama_n_ctx.restype = c_int
 
 
-def llama_n_embd(ctx: llama_context_p) -> c_int:
+# LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+def llama_n_embd(ctx: llama_context_p) -> int:
     return _lib.llama_n_embd(ctx)
 
 
@@ -387,6 +500,7 @@ _lib.llama_n_embd.restype = c_int
 # Can be mutated in order to change the probabilities of the next token
 # Rows: n_tokens
 # Cols: n_vocab
+# LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 def llama_get_logits(
     ctx: llama_context_p,
 ):  # type: (...) -> Array[float] # type: ignore
@@ -399,6 +513,7 @@ _lib.llama_get_logits.restype = c_float_p
 
 # Get the embeddings for the input
 # shape: [n_embd] (1-dimensional)
+# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 def llama_get_embeddings(
     ctx: llama_context_p,
 ):  # type: (...) -> Array[float] # type: ignore
@@ -410,6 +525,7 @@ _lib.llama_get_embeddings.restype = c_float_p
 
 
 # Token Id -> String. Uses the vocabulary in the provided context
+# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
 def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes:
     return _lib.llama_token_to_str(ctx, token)
 
@@ -420,7 +536,8 @@ _lib.llama_token_to_str.restype = c_char_p
 # Special tokens
 
 
-def llama_token_bos() -> llama_token:
+# LLAMA_API llama_token llama_token_bos();
+def llama_token_bos() -> int:
     return _lib.llama_token_bos()
 
 
@@ -428,7 +545,8 @@ _lib.llama_token_bos.argtypes = []
 _lib.llama_token_bos.restype = llama_token
 
 
-def llama_token_eos() -> llama_token:
+# LLAMA_API llama_token llama_token_eos();
+def llama_token_eos() -> int:
     return _lib.llama_token_eos()
 
 
@@ -436,7 +554,8 @@ _lib.llama_token_eos.argtypes = []
 _lib.llama_token_eos.restype = llama_token
 
 
-def llama_token_nl() -> llama_token:
+# LLAMA_API llama_token llama_token_nl();
+def llama_token_nl() -> int:
     return _lib.llama_token_nl()
 
 
@@ -448,6 +567,7 @@ _lib.llama_token_nl.restype = llama_token
 
 
 # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+# LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
 def llama_sample_repetition_penalty(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
@@ -471,6 +591,7 @@ _lib.llama_sample_repetition_penalty.restype = None
 
 
 # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
 def llama_sample_frequency_and_presence_penalties(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
@@ -501,6 +622,7 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None
 
 
 # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 def llama_sample_softmax(
     ctx: llama_context_p, candidates  # type: _Pointer[llama_token_data]
 ):
@@ -515,6 +637,7 @@ _lib.llama_sample_softmax.restype = None
 
 
 # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
 def llama_sample_top_k(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
@@ -534,6 +657,7 @@ _lib.llama_sample_top_k.restype = None
 
 
 # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
 def llama_sample_top_p(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
@@ -553,6 +677,7 @@ _lib.llama_sample_top_p.restype = None
 
 
 # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
 def llama_sample_tail_free(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
@@ -572,6 +697,7 @@ _lib.llama_sample_tail_free.restype = None
 
 
 # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
 def llama_sample_typical(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
@@ -590,6 +716,7 @@ _lib.llama_sample_typical.argtypes = [
 _lib.llama_sample_typical.restype = None
 
 
+# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 def llama_sample_temperature(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
@@ -612,6 +739,7 @@ _lib.llama_sample_temperature.restype = None
 # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
 # @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
 # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
 def llama_sample_token_mirostat(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
@@ -619,7 +747,7 @@ def llama_sample_token_mirostat(
     eta: c_float,
     m: c_int,
     mu,  # type: _Pointer[c_float]
-) -> llama_token:
+) -> int:
     return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu)
 
 
@@ -639,13 +767,14 @@ _lib.llama_sample_token_mirostat.restype = llama_token
 # @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
 # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
 # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
 def llama_sample_token_mirostat_v2(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
     tau: c_float,
     eta: c_float,
     mu,  # type: _Pointer[c_float]
-) -> llama_token:
+) -> int:
     return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu)
 
 
@@ -660,10 +789,11 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token
 
 
 # @details Selects the token with the highest probability.
+# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
 def llama_sample_token_greedy(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
-) -> llama_token:
+) -> int:
     return _lib.llama_sample_token_greedy(ctx, candidates)
 
 
@@ -675,10 +805,11 @@ _lib.llama_sample_token_greedy.restype = llama_token
 
 
 # @details Randomly selects a token from the candidates based on their probabilities.
+# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 def llama_sample_token(
     ctx: llama_context_p,
     candidates,  # type: _Pointer[llama_token_data_array]
-) -> llama_token:
+) -> int:
     return _lib.llama_sample_token(ctx, candidates)
 
 
@@ -692,6 +823,7 @@ _lib.llama_sample_token.restype = llama_token
 # Performance information
 
 
+# LLAMA_API void llama_print_timings(struct llama_context * ctx);
 def llama_print_timings(ctx: llama_context_p):
     _lib.llama_print_timings(ctx)
 
@@ -700,6 +832,7 @@ _lib.llama_print_timings.argtypes = [llama_context_p]
 _lib.llama_print_timings.restype = None
 
 
+# LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 def llama_reset_timings(ctx: llama_context_p):
     _lib.llama_reset_timings(ctx)
 
@@ -709,9 +842,19 @@ _lib.llama_reset_timings.restype = None
 
 
 # Print system information
+# LLAMA_API const char * llama_print_system_info(void);
 def llama_print_system_info() -> bytes:
     return _lib.llama_print_system_info()
 
 
 _lib.llama_print_system_info.argtypes = []
 _lib.llama_print_system_info.restype = c_char_p
+
+###################################################################################################
+
+
+_llama_initialized = False
+
+if not _llama_initialized:
+    llama_init_backend()
+    _llama_initialized = True
diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py
index bfc7342..7729ced 100644
--- a/llama_cpp/llama_types.py
+++ b/llama_cpp/llama_types.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Dict, Union
+from typing import List, Optional, Dict
 from typing_extensions import TypedDict, NotRequired, Literal
 
 
@@ -22,9 +22,9 @@ class Embedding(TypedDict):
 
 class CompletionLogprobs(TypedDict):
     text_offset: List[int]
-    token_logprobs: List[float]
+    token_logprobs: List[Optional[float]]
     tokens: List[str]
-    top_logprobs: List[Dict[str, float]]
+    top_logprobs: List[Optional[Dict[str, float]]]
 
 
 class CompletionChoice(TypedDict):
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py
index 621b73e..fea3612 100644
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -1,4 +1,5 @@
 import json
+import logging
 import multiprocessing
 from threading import Lock
 from typing import List, Optional, Union, Iterator, Dict
@@ -16,7 +17,16 @@ class Settings(BaseSettings):
     model: str = Field(
         description="The path to the model to use for generating completions."
     )
+    model_alias: Optional[str] = Field(
+        default=None,
+        description="The alias of the model to use for generating completions.",
+    )
     n_ctx: int = Field(default=2048, ge=1, description="The context size.")
+    n_gpu_layers: int = Field(
+        default=0,
+        ge=0,
+        description="The number of layers to put on the GPU. The rest will be on the CPU.",
+    )
     n_batch: int = Field(
         default=512, ge=1, description="The batch size to use per eval."
     )
@@ -59,6 +69,7 @@ class Settings(BaseSettings):
 
 router = APIRouter()
 
+settings: Optional[Settings] = None
 llama: Optional[llama_cpp.Llama] = None
 
 
@@ -80,6 +91,7 @@ def create_app(settings: Optional[Settings] = None):
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
+        n_gpu_layers=settings.n_gpu_layers,
         f16_kv=settings.f16_kv,
         use_mlock=settings.use_mlock,
         use_mmap=settings.use_mmap,
@@ -95,6 +107,12 @@ def create_app(settings: Optional[Settings] = None):
     if settings.cache:
         cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size)
         llama.set_cache(cache)
+
+    def set_settings(_settings: Settings):
+        global settings
+        settings = _settings
+
+    set_settings(settings)
     return app
 
 
@@ -106,6 +124,10 @@ def get_llama():
         yield llama
 
 
+def get_settings():
+    yield settings
+
+
 model_field = Field(description="The model to use for generating completions.")
 
 max_tokens_field = Field(
@@ -152,9 +174,23 @@ repeat_penalty_field = Field(
     + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.",
 )
 
+presence_penalty_field = Field(
+    default=0.0,
+    ge=-2.0,
+    le=2.0,
+    description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
+)
+
+frequency_penalty_field = Field(
+    default=0.0,
+    ge=-2.0,
+    le=2.0,
+    description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
+)
+
 
 class CreateCompletionRequest(BaseModel):
-    prompt: Optional[str] = Field(
+    prompt: Union[str, List[str]] = Field(
         default="", description="The prompt to generate completions for."
     )
     suffix: Optional[str] = Field(
@@ -168,20 +204,20 @@ class CreateCompletionRequest(BaseModel):
         default=False,
         description="Whether to echo the prompt in the generated text. Useful for chatbots.",
     )
-    stop: Optional[List[str]] = stop_field
+    stop: Optional[Union[str, List[str]]] = stop_field
     stream: bool = stream_field
     logprobs: Optional[int] = Field(
         default=None,
         ge=0,
         description="The number of logprobs to generate. If None, no logprobs are generated.",
     )
+    presence_penalty: Optional[float] = presence_penalty_field
+    frequency_penalty: Optional[float] = frequency_penalty_field
 
     # ignored or currently unsupported
     model: Optional[str] = model_field
     n: Optional[int] = 1
     logprobs: Optional[int] = Field(None)
-    presence_penalty: Optional[float] = 0
-    frequency_penalty: Optional[float] = 0
     best_of: Optional[int] = 1
     logit_bias: Optional[Dict[str, float]] = Field(None)
     user: Optional[str] = Field(None)
@@ -209,10 +245,13 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
 def create_completion(
     request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama)
 ):
+    if isinstance(request.prompt, list):
+        assert len(request.prompt) <= 1
+        request.prompt = request.prompt[0] if len(request.prompt) > 0 else ""
+
     completion_or_chunks = llama(
         **request.dict(
             exclude={
-                "model",
                 "n",
                 "best_of",
                 "logit_bias",
@@ -221,15 +260,22 @@ def create_completion(
         )
     )
     if request.stream:
+
+        async def server_sent_events(
+            chunks: Iterator[llama_cpp.CompletionChunk],
+        ):
+            for chunk in chunks:
+                yield dict(data=json.dumps(chunk))
+
         chunks: Iterator[llama_cpp.CompletionChunk] = completion_or_chunks  # type: ignore
-        return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks)
+        return EventSourceResponse(server_sent_events(chunks))
     completion: llama_cpp.Completion = completion_or_chunks  # type: ignore
     return completion
 
 
 class CreateEmbeddingRequest(BaseModel):
     model: Optional[str] = model_field
-    input: str = Field(description="The input to embed.")
+    input: Union[str, List[str]] = Field(description="The input to embed.")
     user: Optional[str]
 
     class Config:
@@ -250,7 +296,7 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)
 def create_embedding(
     request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
 ):
-    return llama.create_embedding(**request.dict(exclude={"model", "user"}))
+    return llama.create_embedding(**request.dict(exclude={"user"}))
 
 
 class ChatCompletionRequestMessage(BaseModel):
@@ -269,12 +315,12 @@ class CreateChatCompletionRequest(BaseModel):
     top_p: float = top_p_field
     stop: Optional[List[str]] = stop_field
     stream: bool = stream_field
+    presence_penalty: Optional[float] = presence_penalty_field
+    frequency_penalty: Optional[float] = frequency_penalty_field
 
     # ignored or currently unsupported
     model: Optional[str] = model_field
     n: Optional[int] = 1
-    presence_penalty: Optional[float] = 0
-    frequency_penalty: Optional[float] = 0
     logit_bias: Optional[Dict[str, float]] = Field(None)
     user: Optional[str] = Field(None)
 
@@ -311,7 +357,6 @@ def create_chat_completion(
     completion_or_chunks = llama.create_chat_completion(
         **request.dict(
             exclude={
-                "model",
                 "n",
                 "logit_bias",
                 "user",
@@ -354,13 +399,16 @@ GetModelResponse = create_model_from_typeddict(ModelList)
 
 @router.get("/v1/models", response_model=GetModelResponse)
 def get_models(
+    settings: Settings = Depends(get_settings),
     llama: llama_cpp.Llama = Depends(get_llama),
 ) -> ModelList:
     return {
         "object": "list",
         "data": [
             {
-                "id": llama.model_path,
+                "id": settings.model_alias
+                if settings.model_alias is not None
+                else llama.model_path,
                 "object": "model",
                 "owned_by": "me",
                 "permissions": [],
diff --git a/poetry.lock b/poetry.lock
index ad59963..50ae0cb 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -463,14 +463,14 @@ socks = ["socksio (>=1.0.0,<2.0.0)"]
 
 [[package]]
 name = "httpx"
-version = "0.24.0"
+version = "0.24.1"
 description = "The next generation HTTP client."
 category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "httpx-0.24.0-py3-none-any.whl", hash = "sha256:447556b50c1921c351ea54b4fe79d91b724ed2b027462ab9a329465d147d5a4e"},
-    {file = "httpx-0.24.0.tar.gz", hash = "sha256:507d676fc3e26110d41df7d35ebd8b3b8585052450f4097401c9be59d928c63e"},
+    {file = "httpx-0.24.1-py3-none-any.whl", hash = "sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd"},
+    {file = "httpx-0.24.1.tar.gz", hash = "sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"},
 ]
 
 [package.dependencies]
@@ -800,14 +800,14 @@ mkdocs = ">=1.1"
 
 [[package]]
 name = "mkdocs-material"
-version = "9.1.11"
+version = "9.1.14"
 description = "Documentation that simply works"
 category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "mkdocs_material-9.1.11-py3-none-any.whl", hash = "sha256:fbc86d50ec2cf34d40d5c4365780f290ceedde23f1a0704323b34e7f16b0c0dd"},
-    {file = "mkdocs_material-9.1.11.tar.gz", hash = "sha256:f5d473eb79d6640a5e668d4b2ab5b9de5e76ae0a0e2d864112df0cfe9016dc1d"},
+    {file = "mkdocs_material-9.1.14-py3-none-any.whl", hash = "sha256:b56a9f955ed32d38333715cbbf68ce38f683bf38610c65094fa4ef2db9f08bcd"},
+    {file = "mkdocs_material-9.1.14.tar.gz", hash = "sha256:1ae74cc5464ef2f64574d4884512efed7f4db386fb9bc6af20fd427d7a702f49"},
 ]
 
 [package.dependencies]
diff --git a/pyproject.toml b/pyproject.toml
index 8aec94c..aacdac0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp_python"
-version = "0.1.48"
+version = "0.1.55"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen <abetlen@gmail.com>"]
 license = "MIT"
@@ -24,9 +24,9 @@ black = "^23.3.0"
 twine = "^4.0.2"
 mkdocs = "^1.4.3"
 mkdocstrings = {extras = ["python"], version = "^0.21.2"}
-mkdocs-material = "^9.1.11"
+mkdocs-material = "^9.1.14"
 pytest = "^7.3.1"
-httpx = "^0.24.0"
+httpx = "^0.24.1"
 scikit-build = "0.13"
 
 [tool.poetry.extras]
diff --git a/setup.py b/setup.py
index f4cbb60..2136d8d 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ setup(
     description="A Python wrapper for llama.cpp",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="0.1.48",
+    version="0.1.55",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",
diff --git a/tests/test_llama.py b/tests/test_llama.py
index b3426b8..941287d 100644
--- a/tests/test_llama.py
+++ b/tests/test_llama.py
@@ -17,7 +17,7 @@ def test_llama():
 # @pytest.mark.skip(reason="need to update sample mocking")
 def test_llama_patch(monkeypatch):
     llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
-    n_vocab = int(llama_cpp.llama_n_vocab(llama.ctx))
+    n_vocab = llama_cpp.llama_n_vocab(llama.ctx)
 
     ## Set up mock function
     def mock_eval(*args, **kwargs):
@@ -107,7 +107,7 @@ def test_llama_pickle():
 
 def test_utf8(monkeypatch):
     llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True)
-    n_vocab = int(llama_cpp.llama_n_vocab(llama.ctx))
+    n_vocab = llama_cpp.llama_n_vocab(llama.ctx)
 
     ## Set up mock function
     def mock_eval(*args, **kwargs):
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index 1b0fd45..66874d4 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit 1b0fd454650ef4d68a980e3225488b79e6e9af25
+Subproject commit 66874d4fbcc7866377246efbcee938e8cc9c7d76