diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
new file mode 100644
index 0000000..208de8c
--- /dev/null
+++ b/llama_cpp/_internals.py
@@ -0,0 +1,770 @@
+from __future__ import annotations
+
+import os
+import ctypes
+
+from typing import (
+    List,
+    Optional,
+    Sequence,
+)
+from dataclasses import dataclass, field
+
+import numpy as np
+import numpy.typing as npt
+
+from .llama_types import *
+from .llama_grammar import LlamaGrammar
+
+import llama_cpp.llama_cpp as llama_cpp
+
+from ._utils import suppress_stdout_stderr
+
+
+# Python wrappers over llama.h structs
+
+
+class _LlamaModel:
+    """Intermediate Python wrapper for a llama.cpp llama_model.
+    NOTE: For stability it's recommended you use the Llama class instead."""
+
+    _llama_free_model = None
+    # NOTE: this must be "saved" here to avoid exceptions when calling __del__
+    _suppress_stdout_stderr = suppress_stdout_stderr
+
+    def __init__(
+        self,
+        *,
+        path_model: str,
+        params: llama_cpp.llama_model_params,
+        verbose: bool = True,
+    ):
+        self.path_model = path_model
+        self.params = params
+        self.verbose = verbose
+
+        self._llama_free_model = llama_cpp._lib.llama_free_model  # type: ignore
+
+        if not os.path.exists(path_model):
+            raise ValueError(f"Model path does not exist: {path_model}")
+
+        with self._suppress_stdout_stderr(disable=self.verbose):
+            self.model = llama_cpp.llama_load_model_from_file(
+                self.path_model.encode("utf-8"), self.params
+            )
+
+    def __del__(self):
+        with self._suppress_stdout_stderr(disable=self.verbose):
+            if self.model is not None and self._llama_free_model is not None:
+                self._llama_free_model(self.model)
+                self.model = None
+
+    def vocab_type(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_vocab_type(self.model)
+
+    def n_vocab(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_n_vocab(self.model)
+
+    def n_ctx_train(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_n_ctx_train(self.model)
+
+    def n_embd(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_n_embd(self.model)
+
+    def rope_freq_scale_train(self) -> float:
+        assert self.model is not None
+        return llama_cpp.llama_rope_freq_scale_train(self.model)
+
+    def desc(self) -> str:
+        assert self.model is not None
+        buf = ctypes.create_string_buffer(1024)
+        llama_cpp.llama_model_desc(self.model, buf, 1024)  # type: ignore
+        return buf.value.decode("utf-8")
+
+    def size(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_model_size(self.model)
+
+    def n_params(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_model_n_params(self.model)
+
+    def get_tensor(self, name: str) -> ctypes.c_void_p:
+        assert self.model is not None
+        return llama_cpp.llama_get_model_tensor(self.model, name.encode("utf-8"))
+
+    def apply_lora_from_file(
+        self,
+        lora_path: str,
+        scale: float,
+        path_base_model: Optional[str],
+        n_threads: int,
+    ):
+        assert self.model is not None
+        return llama_cpp.llama_model_apply_lora_from_file(
+            self.model,
+            lora_path.encode("utf-8"),
+            scale,
+            path_base_model.encode("utf-8")
+            if path_base_model is not None
+            else llama_cpp.c_char_p(0),
+            n_threads,
+        )
+
+    # Vocab
+
+    def token_get_text(self, token: int) -> str:
+        # TODO: Fix
+        assert self.model is not None
+        return llama_cpp.llama_token_get_text(self.model, token).decode("utf-8")
+
+    def token_get_score(self, token: int) -> float:
+        assert self.model is not None
+        return llama_cpp.llama_token_get_score(self.model, token)
+
+    def token_get_type(self, token: int) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_token_get_type(self.model, token)
+
+    # Special tokens
+
+    def token_bos(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_token_bos(self.model)
+
+    def token_eos(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_token_eos(self.model)
+
+    def token_nl(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_token_nl(self.model)
+
+    def token_prefix(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_token_prefix(self.model)
+
+    def token_middle(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_token_middle(self.model)
+
+    def token_suffix(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_token_suffix(self.model)
+
+    def token_eot(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_token_eot(self.model)
+
+    # Tokenization
+
+    def tokenize(self, text: bytes, add_bos: bool, special: bool):
+        assert self.model is not None
+        n_ctx = self.n_ctx_train()
+        tokens = (llama_cpp.llama_token * n_ctx)()
+        n_tokens = llama_cpp.llama_tokenize(
+            self.model, text, len(text), tokens, n_ctx, add_bos, special
+        )
+        if n_tokens < 0:
+            n_tokens = abs(n_tokens)
+            tokens = (llama_cpp.llama_token * n_tokens)()
+            n_tokens = llama_cpp.llama_tokenize(
+                self.model, text, len(text), tokens, n_tokens, add_bos, special
+            )
+            if n_tokens < 0:
+                raise RuntimeError(
+                    f'Failed to tokenize: text="{text}" n_tokens={n_tokens}'
+                )
+        return list(tokens[:n_tokens])
+
+    def token_to_piece(self, token: int) -> bytes:
+        assert self.model is not None
+        buf = ctypes.create_string_buffer(32)
+        llama_cpp.llama_token_to_piece(self.model, token, buf, 32)  # type: ignore
+        return bytes(buf)
+
+    def detokenize(self, tokens: List[int]) -> bytes:
+        assert self.model is not None
+        output = b""
+        size = 32
+        buffer = (ctypes.c_char * size)()
+        for token in tokens:
+            n = llama_cpp.llama_token_to_piece(
+                self.model, llama_cpp.llama_token(token), buffer, size
+            )
+            assert n <= size
+            output += bytes(buffer[:n])
+        # NOTE: Llama1 models automatically added a space at the start of the prompt
+        # this line removes a leading space if the first token is a beginning of sentence token
+        return (
+            output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
+        )
+
+    @staticmethod
+    def default_params():
+        """Get the default llama_model_params."""
+        return llama_cpp.llama_model_default_params()
+
+
+class _LlamaContext:
+    """Intermediate Python wrapper for a llama.cpp llama_context.
+    NOTE: For stability it's recommended you use the Llama class instead."""
+
+    _llama_free = None
+    # NOTE: this must be "saved" here to avoid exceptions when calling __del__
+    _suppress_stdout_stderr = suppress_stdout_stderr
+
+    def __init__(
+        self,
+        *,
+        model: _LlamaModel,
+        params: llama_cpp.llama_context_params,
+        verbose: bool = True,
+    ):
+        self.model = model
+        self.params = params
+        self.verbose = verbose
+
+        self._llama_free = llama_cpp._lib.llama_free  # type: ignore
+
+        with self._suppress_stdout_stderr(disable=self.verbose):
+            self.ctx = llama_cpp.llama_new_context_with_model(
+                self.model.model, self.params
+            )
+
+    def __del__(self):
+        with self._suppress_stdout_stderr(disable=self.verbose):
+            if self.ctx is not None and self._llama_free is not None:
+                self._llama_free(self.ctx)
+                self.ctx = None
+
+    def n_ctx(self) -> int:
+        assert self.ctx is not None
+        return llama_cpp.llama_n_ctx(self.ctx)
+
+    def kv_cache_clear(self):
+        assert self.ctx is not None
+        llama_cpp.llama_kv_cache_clear(self.ctx)
+
+    def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
+        assert self.ctx is not None
+        llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1)
+
+    def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
+        assert self.ctx is not None
+        llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1)
+
+    def kv_cache_seq_keep(self, seq_id: int):
+        assert self.ctx is not None
+        llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id)
+
+    def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
+        assert self.ctx is not None
+        llama_cpp.llama_kv_cache_seq_shift(self.ctx, seq_id, p0, p1, shift)
+
+    def get_state_size(self) -> int:
+        assert self.ctx is not None
+        return llama_cpp.llama_get_state_size(self.ctx)
+
+    # TODO: copy_state_data
+
+    # TODO: set_state_data
+
+    # TODO: llama_load_session_file
+
+    # TODO: llama_save_session_file
+
+    def decode(self, batch: "_LlamaBatch"):
+        assert self.ctx is not None
+        assert batch.batch is not None
+        return_code = llama_cpp.llama_decode(
+            ctx=self.ctx,
+            batch=batch.batch,
+        )
+        if return_code != 0:
+            raise RuntimeError(f"llama_decode returned {return_code}")
+
+    def set_n_threads(self, n_threads: int, n_threads_batch: int):
+        assert self.ctx is not None
+        llama_cpp.llama_set_n_threads(self.ctx, n_threads, n_threads_batch)
+
+    def get_logits(self):
+        assert self.ctx is not None
+        return llama_cpp.llama_get_logits(self.ctx)
+
+    def get_logits_ith(self, i: int):
+        assert self.ctx is not None
+        return llama_cpp.llama_get_logits_ith(self.ctx, i)
+
+    def get_embeddings(self):
+        assert self.ctx is not None
+        return llama_cpp.llama_get_embeddings(self.ctx)
+
+    # Sampling functions
+
+    def set_rng_seed(self, seed: int):
+        assert self.ctx is not None
+        llama_cpp.llama_set_rng_seed(self.ctx, seed)
+
+    def sample_repetition_penalties(
+        self,
+        candidates: "_LlamaTokenDataArray",
+        last_tokens_data: "llama_cpp.Array[llama_cpp.llama_token]",
+        penalty_last_n: int,
+        penalty_repeat: float,
+        penalty_freq: float,
+        penalty_present: float,
+    ):
+        assert self.ctx is not None
+        llama_cpp.llama_sample_repetition_penalties(
+            self.ctx,
+            ctypes.byref(candidates.candidates),  # type: ignore
+            last_tokens_data,
+            penalty_last_n,
+            penalty_repeat,
+            penalty_freq,
+            penalty_present,
+        )
+
+    def sample_classifier_free_guidance(
+        self,
+        candidates: "_LlamaTokenDataArray",
+        guidance_ctx: "_LlamaContext",
+        scale: float,
+    ):
+        assert self.ctx is not None
+        assert guidance_ctx.ctx is not None
+        llama_cpp.llama_sample_classifier_free_guidance(
+            self.ctx,
+            ctypes.byref(candidates.candidates),  # type: ignore
+            guidance_ctx.ctx,
+            scale,
+        )
+
+    def sample_softmax(self, candidates: "_LlamaTokenDataArray"):
+        assert self.ctx is not None
+        llama_cpp.llama_sample_softmax(
+            self.ctx,
+            ctypes.byref(candidates.candidates),  # type: ignore
+        )
+
+    def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int):
+        assert self.ctx is not None
+        llama_cpp.llama_sample_top_k(
+            self.ctx, ctypes.byref(candidates.candidates), k, min_keep  # type: ignore
+        )
+
+    def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
+        assert self.ctx is not None
+        llama_cpp.llama_sample_top_p(
+            self.ctx, ctypes.byref(candidates.candidates), p, min_keep  # type: ignore
+        )
+
+    def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
+        assert self.ctx is not None
+        llama_cpp.llama_sample_min_p(
+            self.ctx, ctypes.byref(candidates.candidates), p, min_keep  # type: ignore
+        )
+
+    def sample_tail_free(
+        self, candidates: "_LlamaTokenDataArray", z: float, min_keep: int
+    ):
+        assert self.ctx is not None
+        llama_cpp.llama_sample_tail_free(
+            self.ctx, ctypes.byref(candidates.candidates), z, min_keep  # type: ignore
+        )
+
+    def sample_typical(
+        self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int
+    ):
+        assert self.ctx is not None
+        llama_cpp.llama_sample_typical(
+            self.ctx, ctypes.byref(candidates.candidates), p, min_keep  # type: ignore
+        )
+
+    def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float):
+        assert self.ctx is not None
+        llama_cpp.llama_sample_temp(
+            self.ctx, ctypes.byref(candidates.candidates), temp  # type: ignore
+        )
+
+    def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar):
+        assert self.ctx is not None
+        assert grammar.grammar is not None
+        llama_cpp.llama_sample_grammar(
+            self.ctx,
+            ctypes.byref(candidates.candidates),  # type: ignore
+            grammar.grammar,
+        )
+
+    def sample_token_mirostat(
+        self,
+        candidates: "_LlamaTokenDataArray",
+        tau: float,
+        eta: float,
+        m: int,
+        mu: ctypes._Pointer[ctypes.c_float],  # type: ignore
+    ) -> int:
+        assert self.ctx is not None
+        return llama_cpp.llama_sample_token_mirostat(
+            self.ctx,
+            ctypes.byref(candidates.candidates),  # type: ignore
+            tau,
+            eta,
+            m,
+            mu,
+        )
+
+    def sample_token_mirostat_v2(
+        self, candidates: "_LlamaTokenDataArray", tau: float, eta: float, mu: ctypes._Pointer[ctypes.c_float]  # type: ignore
+    ) -> int:
+        assert self.ctx is not None
+        return llama_cpp.llama_sample_token_mirostat_v2(
+            self.ctx,
+            ctypes.byref(candidates.candidates),  # type: ignore
+            tau,
+            eta,
+            mu,
+        )
+
+    def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int:
+        assert self.ctx is not None
+        return llama_cpp.llama_sample_token_greedy(
+            self.ctx,
+            ctypes.byref(candidates.candidates),  # type: ignore
+        )
+
+    def sample_token(self, candidates: "_LlamaTokenDataArray") -> int:
+        assert self.ctx is not None
+        return llama_cpp.llama_sample_token(
+            self.ctx,
+            ctypes.byref(candidates.candidates),  # type: ignore
+        )
+
+    # Grammar
+    def grammar_accept_token(self, grammar: LlamaGrammar, token: int):
+        assert self.ctx is not None
+        assert grammar.grammar is not None
+        llama_cpp.llama_grammar_accept_token(self.ctx, grammar.grammar, token)
+
+    def reset_timings(self):
+        assert self.ctx is not None
+        llama_cpp.llama_reset_timings(self.ctx)
+
+    def print_timings(self):
+        assert self.ctx is not None
+        llama_cpp.llama_print_timings(self.ctx)
+
+    # Utility functions
+    @staticmethod
+    def default_params():
+        """Get the default llama_context_params."""
+        return llama_cpp.llama_context_default_params()
+
+
+class _LlamaBatch:
+    _llama_batch_free = None
+    # NOTE: this must be "saved" here to avoid exceptions when calling __del__
+    _suppress_stdout_stderr = suppress_stdout_stderr
+
+    def __init__(
+        self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True
+    ):
+        self.n_tokens = n_tokens
+        self.embd = embd
+        self.n_seq_max = n_seq_max
+        self.verbose = verbose
+
+        self._llama_batch_free = llama_cpp._lib.llama_batch_free  # type: ignore
+
+        with self._suppress_stdout_stderr(disable=self.verbose):
+            self.batch = llama_cpp.llama_batch_init(
+                self.n_tokens, self.embd, self.n_seq_max
+            )
+
+    def __del__(self):
+        with self._suppress_stdout_stderr(disable=self.verbose):
+            if self.batch is not None and self._llama_batch_free is not None:
+                self._llama_batch_free(self.batch)
+                self.batch = None
+
+    def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool):
+        assert self.batch is not None
+        n_tokens = len(batch)
+        self.batch.n_tokens = n_tokens
+        for i in range(n_tokens):
+            self.batch.token[i] = batch[i]
+            self.batch.pos[i] = n_past + i
+            self.batch.seq_id[i][0] = 0
+            self.batch.n_seq_id[i] = 1
+            self.batch.logits[i] = logits_all
+        self.batch.logits[n_tokens - 1] = True
+
+
+class _LlamaTokenDataArray:
+    def __init__(self, *, n_vocab: int):
+        self.n_vocab = n_vocab
+        self.candidates_data = np.array(
+            [],
+            dtype=np.dtype(
+                [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
+            ),
+        )
+        self.candidates_data.resize(3, self.n_vocab, refcheck=False)
+        self.candidates = llama_cpp.llama_token_data_array(
+            data=self.candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p),
+            size=self.n_vocab,
+            sorted=False,
+        )
+        self.default_candidates_data_id = np.arange(self.n_vocab, dtype=np.intc)
+        self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single)
+
+    def copy_logits(self, logits: npt.NDArray[np.single]):
+        self.candidates_data["id"][:] = self.default_candidates_data_id
+        self.candidates_data["logit"][:] = logits
+        self.candidates_data["p"][:] = self.default_candidates_data_p
+        self.candidates.data = self.candidates_data.ctypes.data_as(
+            llama_cpp.llama_token_data_p
+        )
+        self.candidates.sorted = llama_cpp.c_bool(False)
+        self.candidates.size = llama_cpp.c_size_t(self.n_vocab)
+
+
+# Python wrappers over common/common
+def _tokenize(model: _LlamaModel, text: str, add_bos: bool, special: bool) -> list[int]:
+    n_tokens = len(text) + 1 if add_bos else len(text)
+    result = (llama_cpp.llama_token * n_tokens)()
+    n_tokens = llama_cpp.llama_tokenize(
+        model.model,
+        text.encode("utf-8"),
+        len(text),
+        result,
+        n_tokens,
+        add_bos,
+        special,
+    )
+    if n_tokens < 0:
+        result = (llama_cpp.llama_token * -n_tokens)()
+        check = llama_cpp.llama_tokenize(
+            model.model,
+            text.encode("utf-8"),
+            len(text),
+            result,
+            len(result),
+            add_bos,
+            special,
+        )
+        if check != -n_tokens:
+            raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}')
+    else:
+        result = result[:n_tokens]
+    return list(result)
+
+
+def _token_to_piece(model: _LlamaModel, token: int) -> str:
+    assert model.model is not None
+    result = (ctypes.c_char * 8)(0)
+    n_tokens = llama_cpp.llama_token_to_piece(model.model, token, result, len(result))
+    if n_tokens < 0:
+        result = (ctypes.c_char * -n_tokens)(0)
+        check = llama_cpp.llama_token_to_piece(model.model, token, result, len(result))
+        if check != -n_tokens:
+            raise RuntimeError(f"Failed to get piece: token={token}")
+    else:
+        result = result[:n_tokens]
+    return bytes(result).decode("utf-8")
+
+
+def _detokenize_spm(model: _LlamaModel, tokens: List[int]) -> str:
+    bos_id = model.token_bos()
+    result = ""
+    for i, token in enumerate(tokens):
+        piece = _token_to_piece(model, token)
+        if (
+            (tokens[0] == bos_id and i == 1) or (tokens[0] != bos_id and i == 0)
+        ) and piece[0] == " ":
+            piece = piece[1:]
+        result += piece
+    return result
+
+
+def _detokenize_bpe(model: _LlamaModel, tokens: List[int]) -> str:
+    result = ""
+    for token in tokens:
+        piece = _token_to_piece(model, token)
+        result += piece
+    return result
+
+
+def _should_add_bos(model: _LlamaModel) -> bool:
+    assert model.model is not None
+    add_bos = llama_cpp.llama_add_bos_token(model.model)
+    if add_bos != -1:
+        return add_bos != 0
+    else:
+        return llama_cpp.llama_vocab_type(model.model) == llama_cpp.LLAMA_VOCAB_TYPE_SPM
+
+
+# Python wrappers over common/sampling structs
+
+
+@dataclass
+class _LlamaSamplingParams:
+    n_prev: int = 64
+    n_probs: int = 0
+    top_k: int = 40
+    top_p: float = 0.95
+    min_p: float = 0.05
+    tfs_z: float = 1.00
+    typical_p: float = 1.00
+    temp: float = 0.80
+    penalty_last_n: int = 64
+    penalty_repeat: float = 1.10
+    penalty_freq: float = 0.00
+    penalty_present: float = 0.00
+    mirostat: int = 0
+    mirostat_tau: float = 5.00
+    mirostat_eta: float = 0.10
+    penalize_nl: bool = True
+
+    grammar: str = ""
+
+    cfg_negative_prompt: str = ""
+    cfg_scale: float = 1.00
+
+    logit_bias: dict[int, float] = field(default_factory=dict)
+
+
+@dataclass
+class _LlamaSamplingContext:
+    params: _LlamaSamplingParams = field(default_factory=_LlamaSamplingParams)
+    mirostat_mu: ctypes.c_float = field(default_factory=ctypes.c_float)
+    grammar: Optional[LlamaGrammar] = None
+    # NOTE: Missing parsed_grammar
+    prev: list[int] = field(default_factory=list)
+    cur: list[llama_cpp.llama_token_data] = field(default_factory=list)
+
+    def reset(self):
+        self.prev = []
+        self.cur = []
+        if self.grammar is not None:
+            self.grammar.reset()
+
+    def cp(self):
+        return _LlamaSamplingContext(
+            params=self.params,
+            mirostat_mu=self.mirostat_mu,
+            grammar=self.grammar,
+            prev=self.prev.copy(),
+            cur=self.cur.copy(),
+        )
+
+    def last(self) -> Optional[int]:
+        if len(self.prev) > 0:
+            return self.prev[-1]
+        else:
+            return None
+
+    def prev_str(self, ctx_main: _LlamaContext, n: int) -> str:
+        return ctx_main.model.detokenize(self.prev[-n:]).decode("utf-8")
+
+    def sample(
+        self, ctx_main: _LlamaContext, ctx_cfg: Optional[_LlamaContext] = None, idx: int = 0, logits_array: Optional[npt.NDArray[np.single]] = None
+    ):
+        n_vocab = ctx_main.model.n_vocab()
+        id: int = 0
+
+        if logits_array is None:
+            logits = ctx_main.get_logits_ith(idx)
+            logits_array = np.array(
+                ctypes.cast(logits, ctypes.POINTER(ctypes.c_float * n_vocab)).contents,
+                dtype=np.single,
+            )
+
+        # apply logit_bias
+        for token, logit_bias in self.params.logit_bias.items():
+            logits_array[token] += logit_bias
+
+        token_data_array = _LlamaTokenDataArray(
+            n_vocab=n_vocab
+        )  # TODO: Only create this once
+        token_data_array.copy_logits(logits_array)
+
+        if ctx_cfg is not None:
+            ctx_main.sample_classifier_free_guidance(
+                token_data_array, ctx_cfg, self.params.cfg_scale
+            )
+
+        # apply penalties
+        if len(self.prev) > 0:
+            nl_token = ctx_main.model.token_nl()
+            nl_logit = logits_array[nl_token]
+            if self.params.penalty_last_n > 0:
+                ctx_main.sample_repetition_penalties(
+                    token_data_array,
+                    # TODO: Only create this once
+                    (llama_cpp.llama_token * len(self.prev))(*self.prev),  # type: ignore
+                    self.params.penalty_last_n,
+                    self.params.penalty_repeat,
+                    self.params.penalty_freq,
+                    self.params.penalty_present,
+                )
+            if not self.params.penalize_nl:
+                token_data_array.candidates_data["logit"][nl_token] = nl_logit
+
+        if self.grammar is not None:
+            ctx_main.sample_grammar(token_data_array, self.grammar)
+
+        if self.params.temp < 0:
+            ctx_main.sample_softmax(token_data_array)
+            id = token_data_array.candidates_data["id"][0]
+        elif self.params.temp == 0:
+            id = ctx_main.sample_token_greedy(token_data_array)
+        else:
+            if self.params.mirostat == 1:
+                mirostat_m = 100
+                ctx_main.sample_temp(token_data_array, self.params.temp)
+                id = ctx_main.sample_token_mirostat(
+                    token_data_array,
+                    self.params.mirostat_tau,
+                    self.params.mirostat_eta,
+                    mirostat_m,
+                    ctypes.pointer(self.mirostat_mu),
+                )
+            elif self.params.mirostat == 2:
+                ctx_main.sample_temp(token_data_array, self.params.temp)
+                id = ctx_main.sample_token_mirostat_v2(
+                    token_data_array,
+                    self.params.mirostat_tau,
+                    self.params.mirostat_eta,
+                    ctypes.pointer(self.mirostat_mu),
+                )
+            else:
+                min_keep = max(1, self.params.n_probs)
+                ctx_main.sample_top_k(
+                    token_data_array, self.params.top_k, min_keep=min_keep
+                )
+                ctx_main.sample_tail_free(
+                    token_data_array, self.params.tfs_z, min_keep=min_keep
+                )
+                ctx_main.sample_typical(
+                    token_data_array, self.params.typical_p, min_keep=min_keep
+                )
+                ctx_main.sample_top_p(
+                    token_data_array, self.params.top_p, min_keep=min_keep
+                )
+                ctx_main.sample_min_p(
+                    token_data_array, self.params.min_p, min_keep=min_keep
+                )
+                ctx_main.sample_temp(token_data_array, self.params.temp)
+                id = ctx_main.sample_token(token_data_array)
+        return id
+
+    def accept(self, ctx_main: _LlamaContext, id: int, apply_grammar: bool):
+        if apply_grammar and self.grammar is not None:
+            ctx_main.grammar_accept_token(self.grammar, id)
+        self.prev.append(id)
\ No newline at end of file
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index c6b55ab..f4e5dcd 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -32,6 +32,12 @@ import numpy as np
 import numpy.typing as npt
 
 from ._utils import suppress_stdout_stderr
+from ._internals import (
+    _LlamaModel,  # type: ignore
+    _LlamaContext,  # type: ignore
+    _LlamaBatch,  # type: ignore
+    _LlamaTokenDataArray,  # type: ignore
+)
 
 
 class LlamaState:
@@ -74,518 +80,6 @@ class StoppingCriteriaList(List[StoppingCriteria]):
         return any([stopping_criteria(input_ids, logits) for stopping_criteria in self])
 
 
-class _LlamaModel:
-    """Intermediate Python wrapper for a llama.cpp llama_model.
-
-    NOTE: For stability it's recommended you use the Llama class instead."""
-
-    _llama_free_model = None
-    # NOTE: this must be "saved" here to avoid exceptions when calling __del__
-    suppress_stdout_stderr = suppress_stdout_stderr
-
-    def __init__(
-        self,
-        *,
-        path_model: str,
-        params: llama_cpp.llama_model_params,
-        verbose: bool = True,
-    ):
-        self.path_model = path_model
-        self.params = params
-        self.verbose = verbose
-
-        self._llama_free_model = llama_cpp._lib.llama_free_model  # type: ignore
-
-        if not os.path.exists(path_model):
-            raise ValueError(f"Model path does not exist: {path_model}")
-
-        with suppress_stdout_stderr(disable=self.verbose):
-            self.model = llama_cpp.llama_load_model_from_file(
-                self.path_model.encode("utf-8"), self.params
-            )
-
-    def __del__(self):
-        with self.suppress_stdout_stderr(disable=self.verbose):
-            if self.model is not None and self._llama_free_model is not None:
-                self._llama_free_model(self.model)
-                self.model = None
-
-    def vocab_type(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_vocab_type(self.model)
-
-    def n_vocab(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_n_vocab(self.model)
-
-    def n_ctx_train(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_n_ctx_train(self.model)
-
-    def n_embd(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_n_embd(self.model)
-
-    def rope_freq_scale_train(self) -> float:
-        assert self.model is not None
-        return llama_cpp.llama_rope_freq_scale_train(self.model)
-
-    def desc(self) -> str:
-        assert self.model is not None
-        buf = ctypes.create_string_buffer(1024)
-        llama_cpp.llama_model_desc(self.model, buf, 1024)  # type: ignore
-        return buf.value.decode("utf-8")
-
-    def size(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_model_size(self.model)
-
-    def n_params(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_model_n_params(self.model)
-
-    def get_tensor(self, name: str) -> ctypes.c_void_p:
-        assert self.model is not None
-        return llama_cpp.llama_get_model_tensor(self.model, name.encode("utf-8"))
-
-    def apply_lora_from_file(
-        self,
-        lora_path: str,
-        scale: float,
-        path_base_model: Optional[str],
-        n_threads: int,
-    ):
-        assert self.model is not None
-        return llama_cpp.llama_model_apply_lora_from_file(
-            self.model,
-            lora_path.encode("utf-8"),
-            scale,
-            path_base_model.encode("utf-8")
-            if path_base_model is not None
-            else llama_cpp.c_char_p(0),
-            n_threads,
-        )
-
-    # Vocab
-
-    def token_get_text(self, token: int) -> str:
-        # TODO: Fix
-        assert self.model is not None
-        return llama_cpp.llama_token_get_text(self.model, token).decode("utf-8")
-
-    def token_get_score(self, token: int) -> float:
-        assert self.model is not None
-        return llama_cpp.llama_token_get_score(self.model, token)
-
-    def token_get_type(self, token: int) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_token_get_type(self.model, token)
-
-    # Special tokens
-
-    def token_bos(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_token_bos(self.model)
-
-    def token_eos(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_token_eos(self.model)
-
-    def token_nl(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_token_nl(self.model)
-
-    def token_prefix(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_token_prefix(self.model)
-
-    def token_middle(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_token_middle(self.model)
-
-    def token_suffix(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_token_suffix(self.model)
-
-    def token_eot(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_token_eot(self.model)
-
-    # Tokenization
-
-    def tokenize(self, text: bytes, add_bos: bool, special: bool):
-        assert self.model is not None
-        n_ctx = self.n_ctx_train()
-        tokens = (llama_cpp.llama_token * n_ctx)()
-        n_tokens = llama_cpp.llama_tokenize(
-            self.model, text, len(text), tokens, n_ctx, add_bos, special
-        )
-        if n_tokens < 0:
-            n_tokens = abs(n_tokens)
-            tokens = (llama_cpp.llama_token * n_tokens)()
-            n_tokens = llama_cpp.llama_tokenize(
-                self.model, text, len(text), tokens, n_tokens, add_bos, special
-            )
-            if n_tokens < 0:
-                raise RuntimeError(
-                    f'Failed to tokenize: text="{text}" n_tokens={n_tokens}'
-                )
-        return list(tokens[:n_tokens])
-
-    def token_to_piece(self, token: int) -> bytes:
-        assert self.model is not None
-        buf = ctypes.create_string_buffer(32)
-        llama_cpp.llama_token_to_piece(self.model, token, buf, 32)  # type: ignore
-        return bytes(buf)
-
-    def detokenize(self, tokens: List[int]) -> bytes:
-        assert self.model is not None
-        output = b""
-        size = 32
-        buffer = (ctypes.c_char * size)()
-        for token in tokens:
-            n = llama_cpp.llama_token_to_piece(
-                self.model, llama_cpp.llama_token(token), buffer, size
-            )
-            assert n <= size
-            output += bytes(buffer[:n])
-        # NOTE: Llama1 models automatically added a space at the start of the prompt
-        # this line removes a leading space if the first token is a beginning of sentence token
-        return (
-            output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
-        )
-
-    @staticmethod
-    def default_params():
-        """Get the default llama_model_params."""
-        return llama_cpp.llama_model_default_params()
-
-
-class _LlamaContext:
-    """Intermediate Python wrapper for a llama.cpp llama_context.
-
-    NOTE: For stability it's recommended you use the Llama class instead."""
-
-    _llama_free = None
-    # NOTE: this must be "saved" here to avoid exceptions when calling __del__
-    suppress_stdout_stderr = suppress_stdout_stderr
-
-    def __init__(
-        self,
-        *,
-        model: _LlamaModel,
-        params: llama_cpp.llama_context_params,
-        verbose: bool = True,
-    ):
-        self.model = model
-        self.params = params
-        self.verbose = verbose
-
-        self._llama_free = llama_cpp._lib.llama_free  # type: ignore
-
-        with suppress_stdout_stderr(disable=self.verbose):
-            self.ctx = llama_cpp.llama_new_context_with_model(
-                self.model.model, self.params
-            )
-
-    def __del__(self):
-        with self.suppress_stdout_stderr(disable=self.verbose):
-            if self.ctx is not None and self._llama_free is not None:
-                self._llama_free(self.ctx)
-                self.ctx = None
-
-    def n_ctx(self) -> int:
-        assert self.ctx is not None
-        return llama_cpp.llama_n_ctx(self.ctx)
-
-    def kv_cache_clear(self):
-        assert self.ctx is not None
-        llama_cpp.llama_kv_cache_clear(self.ctx)
-
-    def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
-        assert self.ctx is not None
-        llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1)
-
-    def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
-        assert self.ctx is not None
-        llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1)
-
-    def kv_cache_seq_keep(self, seq_id: int):
-        assert self.ctx is not None
-        llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id)
-
-    def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
-        assert self.ctx is not None
-        llama_cpp.llama_kv_cache_seq_shift(self.ctx, seq_id, p0, p1, shift)
-
-    def get_state_size(self) -> int:
-        assert self.ctx is not None
-        return llama_cpp.llama_get_state_size(self.ctx)
-
-    # TODO: copy_state_data
-
-    # TODO: set_state_data
-
-    # TODO: llama_load_session_file
-
-    # TODO: llama_save_session_file
-
-    def decode(self, batch: "_LlamaBatch"):
-        assert self.ctx is not None
-        assert batch.batch is not None
-        return_code = llama_cpp.llama_decode(
-            ctx=self.ctx,
-            batch=batch.batch,
-        )
-        if return_code != 0:
-            raise RuntimeError(f"llama_decode returned {return_code}")
-
-    def set_n_threads(self, n_threads: int, n_threads_batch: int):
-        assert self.ctx is not None
-        llama_cpp.llama_set_n_threads(self.ctx, n_threads, n_threads_batch)
-
-    def get_logits(self):
-        assert self.ctx is not None
-        return llama_cpp.llama_get_logits(self.ctx)
-
-    def get_logits_ith(self, i: int):
-        assert self.ctx is not None
-        return llama_cpp.llama_get_logits_ith(self.ctx, i)
-
-    def get_embeddings(self):
-        assert self.ctx is not None
-        return llama_cpp.llama_get_embeddings(self.ctx)
-
-    # Sampling functions
-
-    def set_rng_seed(self, seed: int):
-        assert self.ctx is not None
-        llama_cpp.llama_set_rng_seed(self.ctx, seed)
-
-    def sample_repetition_penalties(
-        self,
-        candidates: "_LlamaTokenDataArray",
-        last_tokens_data: "llama_cpp.Array[llama_cpp.llama_token]",
-        penalty_last_n: int,
-        penalty_repeat: float,
-        penalty_freq: float,
-        penalty_present: float,
-    ):
-        assert self.ctx is not None
-        llama_cpp.llama_sample_repetition_penalties(
-            self.ctx,
-            ctypes.byref(candidates.candidates),  # type: ignore
-            last_tokens_data,
-            penalty_last_n,
-            penalty_repeat,
-            penalty_freq,
-            penalty_present,
-        )
-
-    def sample_classifier_free_guidance(
-        self,
-        candidates: "_LlamaTokenDataArray",
-        guidance_ctx: "_LlamaContext",
-        scale: float,
-    ):
-        assert self.ctx is not None
-        assert guidance_ctx.ctx is not None
-        llama_cpp.llama_sample_classifier_free_guidance(
-            self.ctx,
-            ctypes.byref(candidates.candidates),  # type: ignore
-            guidance_ctx.ctx,
-            scale,
-        )
-
-    def sample_softmax(self, candidates: "_LlamaTokenDataArray"):
-        assert self.ctx is not None
-        llama_cpp.llama_sample_softmax(
-            self.ctx,
-            ctypes.byref(candidates.candidates),  # type: ignore
-        )
-
-    def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int):
-        assert self.ctx is not None
-        llama_cpp.llama_sample_top_k(
-            self.ctx, ctypes.byref(candidates.candidates), k, min_keep  # type: ignore
-        )
-
-    def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
-        assert self.ctx is not None
-        llama_cpp.llama_sample_top_p(
-            self.ctx, ctypes.byref(candidates.candidates), p, min_keep  # type: ignore
-        )
-
-    def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
-        assert self.ctx is not None
-        llama_cpp.llama_sample_min_p(
-            self.ctx, ctypes.byref(candidates.candidates), p, min_keep  # type: ignore
-        )
-
-    def sample_tail_free(
-        self, candidates: "_LlamaTokenDataArray", z: float, min_keep: int
-    ):
-        assert self.ctx is not None
-        llama_cpp.llama_sample_tail_free(
-            self.ctx, ctypes.byref(candidates.candidates), z, min_keep  # type: ignore
-        )
-
-    def sample_typical(
-        self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int
-    ):
-        assert self.ctx is not None
-        llama_cpp.llama_sample_typical(
-            self.ctx, ctypes.byref(candidates.candidates), p, min_keep  # type: ignore
-        )
-
-    def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float):
-        assert self.ctx is not None
-        llama_cpp.llama_sample_temp(
-            self.ctx, ctypes.byref(candidates.candidates), temp  # type: ignore
-        )
-
-    def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar):
-        assert self.ctx is not None
-        assert grammar.grammar is not None
-        llama_cpp.llama_sample_grammar(
-            self.ctx,
-            ctypes.byref(candidates.candidates),  # type: ignore
-            grammar.grammar,
-        )
-
-    def sample_token_mirostat(
-        self,
-        candidates: "_LlamaTokenDataArray",
-        tau: float,
-        eta: float,
-        m: int,
-        mu: float,
-    ) -> int:
-        assert self.ctx is not None
-        return llama_cpp.llama_sample_token_mirostat(
-            self.ctx,
-            ctypes.byref(candidates.candidates),  # type: ignore
-            tau,
-            eta,
-            m,
-            ctypes.pointer(ctypes.c_float(mu)),
-        )
-
-    def sample_token_mirostat_v2(
-        self, candidates: "_LlamaTokenDataArray", tau: float, eta: float, mu: float
-    ) -> int:
-        assert self.ctx is not None
-        return llama_cpp.llama_sample_token_mirostat_v2(
-            self.ctx,
-            ctypes.byref(candidates.candidates),  # type: ignore
-            tau,
-            eta,
-            ctypes.pointer(ctypes.c_float(mu)),
-        )
-
-    def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int:
-        assert self.ctx is not None
-        return llama_cpp.llama_sample_token_greedy(
-            self.ctx,
-            ctypes.byref(candidates.candidates),  # type: ignore
-        )
-
-    def sample_token(self, candidates: "_LlamaTokenDataArray") -> int:
-        assert self.ctx is not None
-        return llama_cpp.llama_sample_token(
-            self.ctx,
-            ctypes.byref(candidates.candidates),  # type: ignore
-        )
-
-    # Grammar
-    def grammar_accept_token(self, grammar: LlamaGrammar, token: int):
-        assert self.ctx is not None
-        assert grammar.grammar is not None
-        llama_cpp.llama_grammar_accept_token(self.ctx, grammar.grammar, token)
-
-    def reset_timings(self):
-        assert self.ctx is not None
-        llama_cpp.llama_reset_timings(self.ctx)
-
-    def print_timings(self):
-        assert self.ctx is not None
-        llama_cpp.llama_print_timings(self.ctx)
-
-    # Utility functions
-    @staticmethod
-    def default_params():
-        """Get the default llama_context_params."""
-        return llama_cpp.llama_context_default_params()
-
-
-class _LlamaBatch:
-    _llama_batch_free = None
-    # NOTE: this must be "saved" here to avoid exceptions when calling __del__
-    suppress_stdout_stderr = suppress_stdout_stderr
-
-    def __init__(
-        self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True
-    ):
-        self.n_tokens = n_tokens
-        self.embd = embd
-        self.n_seq_max = n_seq_max
-        self.verbose = verbose
-
-        self._llama_batch_free = llama_cpp._lib.llama_batch_free  # type: ignore
-
-        with suppress_stdout_stderr(disable=self.verbose):
-            self.batch = llama_cpp.llama_batch_init(
-                self.n_tokens, self.embd, self.n_seq_max
-            )
-
-    def __del__(self):
-        with self.suppress_stdout_stderr(disable=self.verbose):
-            if self.batch is not None and self._llama_batch_free is not None:
-                self._llama_batch_free(self.batch)
-                self.batch = None
-
-    def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool):
-        assert self.batch is not None
-        n_tokens = len(batch)
-        self.batch.n_tokens = n_tokens
-        for i in range(n_tokens):
-            self.batch.token[i] = batch[i]
-            self.batch.pos[i] = n_past + i
-            self.batch.seq_id[i][0] = 0
-            self.batch.n_seq_id[i] = 1
-            self.batch.logits[i] = logits_all
-        self.batch.logits[n_tokens - 1] = True
-
-
-class _LlamaTokenDataArray:
-    def __init__(self, *, n_vocab: int):
-        self.n_vocab = n_vocab
-        self.candidates_data = np.array(
-            [],
-            dtype=np.dtype(
-                [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
-            ),
-        )
-        self.candidates_data.resize(3, self.n_vocab, refcheck=False)
-        self.candidates = llama_cpp.llama_token_data_array(
-            data=self.candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p),
-            size=self.n_vocab,
-            sorted=False,
-        )
-        self.default_candidates_data_id = np.arange(self.n_vocab, dtype=np.intc)
-        self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single)
-
-    def copy_logits(self, logits: npt.NDArray[np.single]):
-        self.candidates_data["id"][:] = self.default_candidates_data_id
-        self.candidates_data["logit"][:] = logits
-        self.candidates_data["p"][:] = self.default_candidates_data_p
-        self.candidates.data = self.candidates_data.ctypes.data_as(
-            llama_cpp.llama_token_data_p
-        )
-        self.candidates.sorted = llama_cpp.c_bool(False)
-        self.candidates.size = llama_cpp.c_size_t(self.n_vocab)
-
-
 class Llama:
     """High-level Python wrapper for a llama.cpp model."""
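
Reviewer note (illustrative, not part of the patch): the classes this diff introduces compose into a minimal decode loop. The sketch below is written against the private API shown above, whose docstrings already warn that the Llama class should be preferred; the model path is a hypothetical placeholder and the prompt/step counts are arbitrary.

from llama_cpp._internals import (
    _LlamaModel,
    _LlamaContext,
    _LlamaBatch,
    _LlamaSamplingContext,
    _LlamaSamplingParams,
)

# Load a model, then build a context and a reusable batch (sizes are examples).
model = _LlamaModel(path_model="./model.gguf", params=_LlamaModel.default_params())
ctx = _LlamaContext(model=model, params=_LlamaContext.default_params())
batch = _LlamaBatch(n_tokens=ctx.n_ctx(), embd=0, n_seq_max=1)

# temp=0.0 takes the greedy branch in _LlamaSamplingContext.sample().
sampler = _LlamaSamplingContext(params=_LlamaSamplingParams(temp=0.0))

tokens = model.tokenize(b"Hello", add_bos=True, special=False)
n_past = 0
for _ in range(16):
    batch.set_batch(tokens, n_past=n_past, logits_all=False)
    ctx.decode(batch)
    n_past += len(tokens)
    # set_batch() always requests logits for the last position, so sample there.
    token_id = sampler.sample(ctx, idx=len(tokens) - 1)
    sampler.accept(ctx, token_id, apply_grammar=False)
    if token_id == model.token_eos():
        break
    tokens = [token_id]

# sampler.prev holds only the sampled ids, so this prints just the completion.
print(model.detokenize(sampler.prev).decode("utf-8", errors="ignore"))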
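
A second sketch, for the _LlamaTokenDataArray bridge: copy_logits() fills the structured id/logit/p buffer and clears the sorted flag so the llama.cpp samplers can filter and reorder it in place. This assumes the model and ctx objects from the previous sketch; the k and temp values are arbitrary examples.

import numpy as np

from llama_cpp._internals import _LlamaTokenDataArray

n_vocab = model.n_vocab()
arr = _LlamaTokenDataArray(n_vocab=n_vocab)

# get_logits() returns a ctypes float pointer; slicing copies it into Python.
logits = np.array(ctx.get_logits()[:n_vocab], dtype=np.single)
arr.copy_logits(logits)

ctx.sample_top_k(arr, k=40, min_keep=1)  # each call mutates the array in C
ctx.sample_temp(arr, temp=0.8)
token_id = ctx.sample_token(arr)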