diff --git a/Makefile b/Makefile index 3796d17..d8fb0cc 100644 --- a/Makefile +++ b/Makefile @@ -45,6 +45,9 @@ build.kompute: build.sycl: CMAKE_ARGS="-DLLAMA_SYCL=on" python3 -m pip install --verbose -e . +build.rpc: + CMAKE_ARGS="-DLLAMA_RPC=on" python3 -m pip install --verbose -e . + build.sdist: python3 -m build --sdist diff --git a/README.md b/README.md index 342e924..0f7abfb 100644 --- a/README.md +++ b/README.md @@ -221,6 +221,17 @@ CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pi ``` +
+RPC + +To install with RPC support, set the `LLAMA_RPC=on` environment variable before installing: + +```bash +source /opt/intel/oneapi/setvars.sh +CMAKE_ARGS="-DLLAMA_RPC=on" pip install llama-cpp-python +``` +
+ ### Windows Notes diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index fb26950..bf3bd65 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -72,6 +72,7 @@ class Llama: split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER, main_gpu: int = 0, tensor_split: Optional[List[float]] = None, + rpc_servers: Optional[str] = None, vocab_only: bool = False, use_mmap: bool = True, use_mlock: bool = False, @@ -150,6 +151,7 @@ class Llama: split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options. main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split. + rpc_servers: Comma separated list of RPC servers to use for offloading vocab_only: Only load the vocabulary no weights. use_mmap: Use mmap if possible. use_mlock: Force the system to keep the model in RAM. @@ -221,6 +223,11 @@ class Llama: ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers self.model_params.split_mode = split_mode self.model_params.main_gpu = main_gpu + if rpc_servers is not None: + self.model_params.rpc_servers = rpc_servers.encode('utf-8') + self._rpc_servers = rpc_servers + else: + self._rpc_servers = None self.tensor_split = tensor_split self._c_tensor_split = None if self.tensor_split is not None: diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 4f83716..d4d4acb 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -226,6 +226,7 @@ class LlamaProxy: use_mmap=settings.use_mmap, use_mlock=settings.use_mlock, kv_overrides=kv_overrides, + rpc_servers=settings.rpc_servers, # Context Params seed=settings.seed, n_ctx=settings.n_ctx, diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index a3e1850..4d924f3 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -58,6 +58,10 @@ class ModelSettings(BaseSettings): default=None, description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.", ) + rpc_servers: Optional[str] = Field( + default=None, + description="comma seperated list of rpc servers for offloading", + ) # Context Params seed: int = Field( default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."