From d634efcdd91a0f79ae0fac13cfdf46eb47d0602d Mon Sep 17 00:00:00 2001
From: nullname
Date: Tue, 4 Jun 2024 22:38:21 +0800
Subject: [PATCH] feat: adding `rpc_servers` parameter to `Llama` class (#1477)

* passthru rpc_servers params wip

* enable llama rpc by default

* convert string to byte

* add rpc package

* Revert "enable llama rpc by default"

This reverts commit 832c6dd56c979514cec5df224bf2d2014dccd790.

* update readme

* Only set rpc_servers when provided

* Add rpc servers to server options

---------

Co-authored-by: Andrei Betlen
---
 Makefile                     |  3 +++
 README.md                    | 11 +++++++++++
 llama_cpp/llama.py           |  7 +++++++
 llama_cpp/server/model.py    |  1 +
 llama_cpp/server/settings.py |  4 ++++
 5 files changed, 26 insertions(+)

diff --git a/Makefile b/Makefile
index 3796d17..d8fb0cc 100644
--- a/Makefile
+++ b/Makefile
@@ -45,6 +45,9 @@ build.kompute:
 build.sycl:
 	CMAKE_ARGS="-DLLAMA_SYCL=on" python3 -m pip install --verbose -e .
 
+build.rpc:
+	CMAKE_ARGS="-DLLAMA_RPC=on" python3 -m pip install --verbose -e .
+
 build.sdist:
 	python3 -m build --sdist
 
diff --git a/README.md b/README.md
index 342e924..0f7abfb 100644
--- a/README.md
+++ b/README.md
@@ -221,6 +221,17 @@ CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pi
 ```
 </details>
 
+<details>
+<summary>RPC</summary>
+
+To install with RPC support, set the `LLAMA_RPC=on` environment variable before installing:
+
+```bash
+source /opt/intel/oneapi/setvars.sh
+CMAKE_ARGS="-DLLAMA_RPC=on" pip install llama-cpp-python
+```
+</details>
+
 ### Windows Notes
 
 <details>
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index fb26950..bf3bd65 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -72,6 +72,7 @@ class Llama:
         split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
+        rpc_servers: Optional[str] = None,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
@@ -150,6 +151,7 @@ class Llama:
             split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
             main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
+            rpc_servers: Comma separated list of RPC servers to use for offloading
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
@@ -221,6 +223,11 @@ class Llama:
         )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
         self.model_params.split_mode = split_mode
         self.model_params.main_gpu = main_gpu
+        if rpc_servers is not None:
+            self.model_params.rpc_servers = rpc_servers.encode('utf-8')
+            self._rpc_servers = rpc_servers
+        else:
+            self._rpc_servers = None
         self.tensor_split = tensor_split
         self._c_tensor_split = None
         if self.tensor_split is not None:
diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index 4f83716..d4d4acb 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -226,6 +226,7 @@ class LlamaProxy:
             use_mmap=settings.use_mmap,
             use_mlock=settings.use_mlock,
             kv_overrides=kv_overrides,
+            rpc_servers=settings.rpc_servers,
             # Context Params
             seed=settings.seed,
             n_ctx=settings.n_ctx,
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index a3e1850..4d924f3 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -58,6 +58,10 @@ class ModelSettings(BaseSettings):
         default=None,
         description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.",
     )
+    rpc_servers: Optional[str] = Field(
+        default=None,
+        description="Comma separated list of RPC servers for offloading",
+    )
     # Context Params
     seed: int = Field(
         default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."
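
For anyone trying the patch out, the sketch below shows how the new `rpc_servers` parameter might be used from Python. It is illustrative only, not part of the patch: the model path and `host:port` addresses are placeholders, and it assumes the package was built with RPC support (`CMAKE_ARGS="-DLLAMA_RPC=on"` or `make build.rpc` above) and that llama.cpp `rpc-server` instances are already listening at those addresses.

```python
# Illustrative sketch (not part of the patch). Assumes a build with LLAMA_RPC=on
# and reachable llama.cpp rpc-server instances; the path and addresses below are
# placeholders.
from llama_cpp import Llama

llm = Llama(
    model_path="./models/my-model.gguf",  # placeholder GGUF path
    n_gpu_layers=-1,                      # offload all layers so they can land on the RPC backends
    rpc_servers="192.168.1.10:50052,192.168.1.11:50052",  # comma separated host:port list
)

out = llm("Q: What is the capital of France? A:", max_tokens=16)
print(out["choices"][0]["text"])
```

As the `llama.py` hunk shows, the string is passed through essentially unchanged: the constructor UTF-8 encodes it and assigns it to `model_params.rpc_servers`, so the accepted format is whatever the underlying llama.cpp RPC backend expects.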
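The server-side hunks wire the same value through `ModelSettings` and `LlamaProxy`. The snippet below is a rough sketch of that flow under the same placeholder assumptions; in practice the value would normally come from the server's config file, environment variables, or CLI arguments, which are generated from the pydantic fields in `ModelSettings`, rather than being constructed by hand.

```python
# Rough sketch of how settings.rpc_servers reaches Llama(...) via LlamaProxy.
# Placeholder path/address; normally driven by CLI or environment configuration.
from llama_cpp.server.settings import ModelSettings
from llama_cpp.server.model import LlamaProxy

settings = ModelSettings(
    model="./models/my-model.gguf",    # placeholder GGUF path (required field)
    n_gpu_layers=-1,
    rpc_servers="192.168.1.10:50052",  # forwarded as Llama(rpc_servers=...) by LlamaProxy
)

proxy = LlamaProxy(models=[settings])
llm = proxy()  # loads the default (only) model with the RPC servers attached
```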