Merge pull request #460 from shouyiwang/tensor_split
Add support for llama.cpp's --tensor-split parameter
This commit is contained in:
commit
82b11c8c16
2 changed files with 20 additions and 2 deletions
|
@ -19,6 +19,7 @@ from typing import (
|
||||||
from collections import deque, OrderedDict
|
from collections import deque, OrderedDict
|
||||||
|
|
||||||
import diskcache
|
import diskcache
|
||||||
|
import ctypes
|
||||||
|
|
||||||
from . import llama_cpp
|
from . import llama_cpp
|
||||||
from .llama_types import *
|
from .llama_types import *
|
||||||
|
@ -26,7 +27,6 @@ from .llama_types import *
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import numpy.typing as npt
|
import numpy.typing as npt
|
||||||
|
|
||||||
|
|
||||||
class BaseLlamaCache(ABC):
|
class BaseLlamaCache(ABC):
|
||||||
"""Base cache class for a llama.cpp model."""
|
"""Base cache class for a llama.cpp model."""
|
||||||
|
|
||||||
|
@ -207,6 +207,7 @@ class Llama:
|
||||||
n_ctx: int = 512,
|
n_ctx: int = 512,
|
||||||
n_parts: int = -1,
|
n_parts: int = -1,
|
||||||
n_gpu_layers: int = 0,
|
n_gpu_layers: int = 0,
|
||||||
|
tensor_split: list[float] = None,
|
||||||
seed: int = 1337,
|
seed: int = 1337,
|
||||||
f16_kv: bool = True,
|
f16_kv: bool = True,
|
||||||
logits_all: bool = False,
|
logits_all: bool = False,
|
||||||
|
@ -248,12 +249,20 @@ class Llama:
|
||||||
Returns:
|
Returns:
|
||||||
A Llama instance.
|
A Llama instance.
|
||||||
"""
|
"""
|
||||||
|
if tensor_split is None:
|
||||||
|
tensor_split = [0.0] * llama_cpp.LLAMA_MAX_DEVICES.value
|
||||||
|
|
||||||
|
# Convert tensor_split to a ctypes float array of length LLAMA_MAX_DEVICES; entries not supplied stay 0.0
|
||||||
|
FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value
|
||||||
|
c_tensor_split = FloatArray(*tensor_split)
|
||||||
|
|
||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
self.model_path = model_path
|
self.model_path = model_path
|
||||||
|
|
||||||
self.params = llama_cpp.llama_context_default_params()
|
self.params = llama_cpp.llama_context_default_params()
|
||||||
self.params.n_ctx = n_ctx
|
self.params.n_ctx = n_ctx
|
||||||
self.params.n_gpu_layers = n_gpu_layers
|
self.params.n_gpu_layers = n_gpu_layers
|
||||||
|
self.params.tensor_split = c_tensor_split
|
||||||
self.params.seed = seed
|
self.params.seed = seed
|
||||||
self.params.f16_kv = f16_kv
|
self.params.f16_kv = f16_kv
|
||||||
self.params.logits_all = logits_all
|
self.params.logits_all = logits_all
|
||||||
|
@ -1490,6 +1499,7 @@ class Llama:
|
||||||
model_path=self.model_path,
|
model_path=self.model_path,
|
||||||
n_ctx=self.params.n_ctx,
|
n_ctx=self.params.n_ctx,
|
||||||
n_gpu_layers=self.params.n_gpu_layers,
|
n_gpu_layers=self.params.n_gpu_layers,
|
||||||
|
tensor_split=self.params.tensor_split,
|
||||||
seed=self.params.seed,
|
seed=self.params.seed,
|
||||||
f16_kv=self.params.f16_kv,
|
f16_kv=self.params.f16_kv,
|
||||||
logits_all=self.params.logits_all,
|
logits_all=self.params.logits_all,
|
||||||
|
@ -1514,6 +1524,7 @@ class Llama:
|
||||||
n_ctx=state["n_ctx"],
|
n_ctx=state["n_ctx"],
|
||||||
n_parts=state["n_parts"],
|
n_parts=state["n_parts"],
|
||||||
n_gpu_layers=state["n_gpu_layers"],
|
n_gpu_layers=state["n_gpu_layers"],
|
||||||
|
tensor_split=state["tensor_split"],
|
||||||
seed=state["seed"],
|
seed=state["seed"],
|
||||||
f16_kv=state["f16_kv"],
|
f16_kv=state["f16_kv"],
|
||||||
logits_all=state["logits_all"],
|
logits_all=state["logits_all"],
|
||||||
|
|
|
@ -31,7 +31,13 @@ class Settings(BaseSettings):
|
||||||
ge=0,
|
ge=0,
|
||||||
description="The number of layers to put on the GPU. The rest will be on the CPU.",
|
description="The number of layers to put on the GPU. The rest will be on the CPU.",
|
||||||
)
|
)
|
||||||
seed: int = Field(default=1337, description="Random seed. -1 for random.")
|
tensor_split: List[float] = Field(
|
||||||
|
default=None,
|
||||||
|
description="Split layers across multiple GPUs in proportion.",
|
||||||
|
)
|
||||||
|
seed: int = Field(
|
||||||
|
default=1337, description="Random seed. -1 for random."
|
||||||
|
)
|
||||||
n_batch: int = Field(
|
n_batch: int = Field(
|
||||||
default=512, ge=1, description="The batch size to use per eval."
|
default=512, ge=1, description="The batch size to use per eval."
|
||||||
)
|
)
|
||||||
|
@ -111,6 +117,7 @@ def create_app(settings: Optional[Settings] = None):
|
||||||
llama = llama_cpp.Llama(
|
llama = llama_cpp.Llama(
|
||||||
model_path=settings.model,
|
model_path=settings.model,
|
||||||
n_gpu_layers=settings.n_gpu_layers,
|
n_gpu_layers=settings.n_gpu_layers,
|
||||||
|
tensor_split=settings.tensor_split,
|
||||||
seed=settings.seed,
|
seed=settings.seed,
|
||||||
f16_kv=settings.f16_kv,
|
f16_kv=settings.f16_kv,
|
||||||
use_mlock=settings.use_mlock,
|
use_mlock=settings.use_mlock,
|
||||||
|
|
Loading…
Reference in a new issue