From c283edd7f29acef7c24755da638c418cb69a22f1 Mon Sep 17 00:00:00 2001
From: MillionthOdin16 <102247808+MillionthOdin16@users.noreply.github.com>
Date: Wed, 5 Apr 2023 18:17:29 -0400
Subject: [PATCH] Set n_batch to default value and reduce thread count

Change the batch size to the llama.cpp default of 8. I've seen issues in
llama.cpp where the batch size affects the quality of generations (it
shouldn't, but in case that's still an issue I changed it to the default).

Set the auto-determined number of threads to half the system count. ggml
will sometimes lock cores at 100% while doing nothing; this is being
addressed, but it can make for a bad user experience if cores are pegged
at 100%.
---
 examples/high_level_api/fastapi_server.py | 6 +++---
 llama_cpp/server/__main__.py              | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py
index b7d2565..a649692 100644
--- a/examples/high_level_api/fastapi_server.py
+++ b/examples/high_level_api/fastapi_server.py
@@ -27,10 +27,10 @@ from sse_starlette.sse import EventSourceResponse

 class Settings(BaseSettings):
     model: str
     n_ctx: int = 2048
-    n_batch: int = 2048
-    n_threads: int = os.cpu_count() or 1
+    n_batch: int = 8
+    n_threads: int = int(os.cpu_count() / 2) or 1
     f16_kv: bool = True
-    use_mlock: bool = True
+    use_mlock: bool = False  # This causes a silent failure on platforms that don't support mlock (e.g. Windows); took forever to figure out...
     embedding: bool = True
     last_n_tokens_size: int = 64
diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py
index 67ca115..b474f67 100644
--- a/llama_cpp/server/__main__.py
+++ b/llama_cpp/server/__main__.py
@@ -28,9 +28,9 @@ class Settings(BaseSettings):
     model: str
     n_ctx: int = 2048
     n_batch: int = 8
-    n_threads: int = os.cpu_count() or 1
+    n_threads: int = int(os.cpu_count() / 2) or 1
     f16_kv: bool = True
-    use_mlock: bool = True
+    use_mlock: bool = False  # This causes a silent failure on platforms that don't support mlock (e.g. Windows); took forever to figure out...
     embedding: bool = True
     last_n_tokens_size: int = 64
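
For reference, a slightly more defensive way to express the two new defaults is sketched below. This is not code from the patch or from llama-cpp-python: the default_thread_count helper and the platform check on use_mlock are illustrative assumptions. The point is that os.cpu_count() can return None, in which case int(os.cpu_count() / 2) raises a TypeError, and that the mlock default could be keyed off the platform instead of being disabled everywhere.

    # Sketch only; not part of the patch. Assumes pydantic v1's BaseSettings,
    # which is what the server Settings class in the diff already uses.
    import os
    import platform

    from pydantic import BaseSettings


    def default_thread_count() -> int:
        """Half the logical cores, but never fewer than one thread.

        os.cpu_count() can return None, and int(None / 2) would raise a
        TypeError, so fall back to 2 before halving.
        """
        return max((os.cpu_count() or 2) // 2, 1)


    class Settings(BaseSettings):
        model: str
        n_ctx: int = 2048
        n_batch: int = 8  # llama.cpp default
        n_threads: int = default_thread_count()
        f16_kv: bool = True
        # Only attempt mlock where it is expected to work; this platform list is
        # an assumption, since mlock fails silently where unsupported (e.g. Windows).
        use_mlock: bool = platform.system() in ("Linux", "Darwin")
        embedding: bool = True
        last_n_tokens_size: int = 64

Disabling use_mlock outright, as the patch does, is the simpler and safer default; the platform check above only matters if locking the model in memory is wanted on systems that support it.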