feat: adding rpc_servers parameter to Llama class (#1477)
* passthru rpc_servers params wip
* enable llama rpc by default
* convert string to byte
* add rpc package
* Revert "enable llama rpc by default"

  This reverts commit 832c6dd56c979514cec5df224bf2d2014dccd790.
* update readme
* Only set rpc_servers when provided
* Add rpc servers to server options

---------

Co-authored-by: Andrei Betlen <abetlen@gmail.com>
This commit is contained in:
parent
6e0642ca19
commit
d634efcdd9
5 changed files with 26 additions and 0 deletions
3
Makefile
3
Makefile
|
@ -45,6 +45,9 @@ build.kompute:
|
|||
build.sycl:
|
||||
CMAKE_ARGS="-DLLAMA_SYCL=on" python3 -m pip install --verbose -e .
|
||||
|
||||
build.rpc:
|
||||
CMAKE_ARGS="-DLLAMA_RPC=on" python3 -m pip install --verbose -e .
|
||||
|
||||
build.sdist:
|
||||
python3 -m build --sdist
|
||||
|
||||
|
|
11
README.md
11
README.md
|
@ -221,6 +221,17 @@ CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pi
|
|||
```
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary>RPC</summary>
|
||||
|
||||
To install with RPC support, pass `-DLLAMA_RPC=on` through the `CMAKE_ARGS` environment variable before installing:
|
||||
|
||||
```bash
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
CMAKE_ARGS="-DLLAMA_RPC=on" pip install llama-cpp-python
|
||||
```
|
||||
</details>
|
||||
|
||||
|
||||
### Windows Notes
|
||||
|
||||
|
|
|
@ -72,6 +72,7 @@ class Llama:
|
|||
split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
|
||||
main_gpu: int = 0,
|
||||
tensor_split: Optional[List[float]] = None,
|
||||
rpc_servers: Optional[str] = None,
|
||||
vocab_only: bool = False,
|
||||
use_mmap: bool = True,
|
||||
use_mlock: bool = False,
|
||||
|
@ -150,6 +151,7 @@ class Llama:
|
|||
split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
|
||||
main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
|
||||
tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
|
||||
rpc_servers: Comma-separated list of RPC servers to use for offloading.
|
||||
vocab_only: Only load the vocabulary no weights.
|
||||
use_mmap: Use mmap if possible.
|
||||
use_mlock: Force the system to keep the model in RAM.
|
||||
|
@ -221,6 +223,11 @@ class Llama:
|
|||
) # 0x7FFFFFFF is INT32 max, will be auto set to all layers
|
||||
self.model_params.split_mode = split_mode
|
||||
self.model_params.main_gpu = main_gpu
|
||||
if rpc_servers is not None:
|
||||
self.model_params.rpc_servers = rpc_servers.encode('utf-8')
|
||||
self._rpc_servers = rpc_servers
|
||||
else:
|
||||
self._rpc_servers = None
|
||||
self.tensor_split = tensor_split
|
||||
self._c_tensor_split = None
|
||||
if self.tensor_split is not None:
|
||||
|
|
|
@ -226,6 +226,7 @@ class LlamaProxy:
|
|||
use_mmap=settings.use_mmap,
|
||||
use_mlock=settings.use_mlock,
|
||||
kv_overrides=kv_overrides,
|
||||
rpc_servers=settings.rpc_servers,
|
||||
# Context Params
|
||||
seed=settings.seed,
|
||||
n_ctx=settings.n_ctx,
|
||||
|
|
|
@ -58,6 +58,10 @@ class ModelSettings(BaseSettings):
|
|||
default=None,
|
||||
description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.",
|
||||
)
|
||||
rpc_servers: Optional[str] = Field(
|
||||
default=None,
|
||||
description="comma seperated list of rpc servers for offloading",
|
||||
)
|
||||
# Context Params
|
||||
seed: int = Field(
|
||||
default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."
|
||||
|
|
Loading…
Reference in a new issue