Merge pull request #460 from shouyiwang/tensor_split

Add support for llama.cpp's --tensor-split parameter
This commit is contained in:
Andrei 2023-07-14 16:33:54 -04:00 committed by GitHub
commit 82b11c8c16
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 20 additions and 2 deletions

View file

@ -19,6 +19,7 @@ from typing import (
from collections import deque, OrderedDict from collections import deque, OrderedDict
import diskcache import diskcache
import ctypes
from . import llama_cpp from . import llama_cpp
from .llama_types import * from .llama_types import *
@ -26,7 +27,6 @@ from .llama_types import *
import numpy as np import numpy as np
import numpy.typing as npt import numpy.typing as npt
class BaseLlamaCache(ABC): class BaseLlamaCache(ABC):
"""Base cache class for a llama.cpp model.""" """Base cache class for a llama.cpp model."""
@ -207,6 +207,7 @@ class Llama:
n_ctx: int = 512, n_ctx: int = 512,
n_parts: int = -1, n_parts: int = -1,
n_gpu_layers: int = 0, n_gpu_layers: int = 0,
tensor_split: list[float] = None,
seed: int = 1337, seed: int = 1337,
f16_kv: bool = True, f16_kv: bool = True,
logits_all: bool = False, logits_all: bool = False,
@ -248,12 +249,20 @@ class Llama:
Returns: Returns:
A Llama instance. A Llama instance.
""" """
if tensor_split is None:
tensor_split = [0.0] * llama_cpp.LLAMA_MAX_DEVICES.value
# Convert tensor_split to a ctypes float array of length LLAMA_MAX_DEVICES; entries not supplied stay 0.0.
FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value
c_tensor_split = FloatArray(*tensor_split)
self.verbose = verbose self.verbose = verbose
self.model_path = model_path self.model_path = model_path
self.params = llama_cpp.llama_context_default_params() self.params = llama_cpp.llama_context_default_params()
self.params.n_ctx = n_ctx self.params.n_ctx = n_ctx
self.params.n_gpu_layers = n_gpu_layers self.params.n_gpu_layers = n_gpu_layers
self.params.tensor_split = c_tensor_split
self.params.seed = seed self.params.seed = seed
self.params.f16_kv = f16_kv self.params.f16_kv = f16_kv
self.params.logits_all = logits_all self.params.logits_all = logits_all
@ -1490,6 +1499,7 @@ class Llama:
model_path=self.model_path, model_path=self.model_path,
n_ctx=self.params.n_ctx, n_ctx=self.params.n_ctx,
n_gpu_layers=self.params.n_gpu_layers, n_gpu_layers=self.params.n_gpu_layers,
tensor_split=self.params.tensor_split,
seed=self.params.seed, seed=self.params.seed,
f16_kv=self.params.f16_kv, f16_kv=self.params.f16_kv,
logits_all=self.params.logits_all, logits_all=self.params.logits_all,
@ -1514,6 +1524,7 @@ class Llama:
n_ctx=state["n_ctx"], n_ctx=state["n_ctx"],
n_parts=state["n_parts"], n_parts=state["n_parts"],
n_gpu_layers=state["n_gpu_layers"], n_gpu_layers=state["n_gpu_layers"],
tensor_split=state["tensor_split"],
seed=state["seed"], seed=state["seed"],
f16_kv=state["f16_kv"], f16_kv=state["f16_kv"],
logits_all=state["logits_all"], logits_all=state["logits_all"],

View file

@ -31,7 +31,13 @@ class Settings(BaseSettings):
ge=0, ge=0,
description="The number of layers to put on the GPU. The rest will be on the CPU.", description="The number of layers to put on the GPU. The rest will be on the CPU.",
) )
seed: int = Field(default=1337, description="Random seed. -1 for random.") tensor_split: List[float] = Field(
default=None,
description="Split layers across multiple GPUs in proportion.",
)
seed: int = Field(
default=1337, description="Random seed. -1 for random."
)
n_batch: int = Field( n_batch: int = Field(
default=512, ge=1, description="The batch size to use per eval." default=512, ge=1, description="The batch size to use per eval."
) )
@ -111,6 +117,7 @@ def create_app(settings: Optional[Settings] = None):
llama = llama_cpp.Llama( llama = llama_cpp.Llama(
model_path=settings.model, model_path=settings.model,
n_gpu_layers=settings.n_gpu_layers, n_gpu_layers=settings.n_gpu_layers,
tensor_split=settings.tensor_split,
seed=settings.seed, seed=settings.seed,
f16_kv=settings.f16_kv, f16_kv=settings.f16_kv,
use_mlock=settings.use_mlock, use_mlock=settings.use_mlock,