From 0ad0e738cd7ed1266b3c210ad54dcd2b70142563 Mon Sep 17 00:00:00 2001 From: Richard Lyons Date: Sun, 18 Aug 2024 01:43:26 +0200 Subject: [PATCH 1/3] Override numParallel only if unset. --- server/sched.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/server/sched.go b/server/sched.go index 9947fd32..4d9c0296 100644 --- a/server/sched.go +++ b/server/sched.go @@ -734,7 +734,9 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL // If multiple Libraries are detected, pick the Library which loads the most layers for the model func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { - *numParallel = 1 + if *numParallel <= 0 { + *numParallel = 1 + } byLibrary := gpus.ByLibrary() if len(byLibrary) <= 1 { return gpus From 9352eeb752531decccc7c6b91a07bc3dd5efa67e Mon Sep 17 00:00:00 2001 From: Richard Lyons Date: Sun, 18 Aug 2024 02:55:01 +0200 Subject: [PATCH 2/3] Reset NumCtx. --- server/sched.go | 1 + 1 file changed, 1 insertion(+) diff --git a/server/sched.go b/server/sched.go index 4d9c0296..3fe6d7fc 100644 --- a/server/sched.go +++ b/server/sched.go @@ -736,6 +736,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { if *numParallel <= 0 { *numParallel = 1 + req.opts.NumCtx = req.origNumCtx } byLibrary := gpus.ByLibrary() if len(byLibrary) <= 1 { From 885cf45087863aa2e064a05da99e8bd07d69970a Mon Sep 17 00:00:00 2001 From: Richard Lyons Date: Sun, 18 Aug 2024 03:07:16 +0200 Subject: [PATCH 3/3] Fix white space. --- server/sched.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/sched.go b/server/sched.go index 3fe6d7fc..9d8c4144 100644 --- a/server/sched.go +++ b/server/sched.go @@ -736,8 +736,8 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList { if *numParallel <= 0 { *numParallel = 1 - req.opts.NumCtx = req.origNumCtx - } + req.opts.NumCtx = req.origNumCtx + } byLibrary := gpus.ByLibrary() if len(byLibrary) <= 1 { return gpus