Set n_batch to default values and reduce thread count:
Change the batch size to the llama.cpp default of 8. I've seen issues in llama.cpp where batch size affects the quality of generations (it shouldn't), but in case that is still an issue I changed it to the default. Set the auto-determined number of threads to half the system CPU count. ggml will sometimes lock cores at 100% while doing nothing; this is being addressed, but it can cause a bad experience for the user if cores are pegged at 100%.
This commit is contained in:
parent
b9b6dfd23f
commit
c283edd7f2
2 changed files with 5 additions and 5 deletions
|
@ -27,10 +27,10 @@ from sse_starlette.sse import EventSourceResponse
|
|||
class Settings(BaseSettings):
|
||||
model: str
|
||||
n_ctx: int = 2048
|
||||
n_batch: int = 2048
|
||||
n_threads: int = os.cpu_count() or 1
|
||||
n_batch: int = 8
|
||||
n_threads: int = int(os.cpu_count() / 2) or 1
|
||||
f16_kv: bool = True
|
||||
use_mlock: bool = True
|
||||
use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out...
|
||||
embedding: bool = True
|
||||
last_n_tokens_size: int = 64
|
||||
|
||||
|
|
|
@ -28,9 +28,9 @@ class Settings(BaseSettings):
|
|||
model: str
|
||||
n_ctx: int = 2048
|
||||
n_batch: int = 8
|
||||
n_threads: int = os.cpu_count() or 1
|
||||
n_threads: int = int(os.cpu_count() / 2) or 1
|
||||
f16_kv: bool = True
|
||||
use_mlock: bool = True
|
||||
use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out...
|
||||
embedding: bool = True
|
||||
last_n_tokens_size: int = 64
|
||||
|
||||
|
|
Loading…
Reference in a new issue