diff --git a/llm/patches/04-metal.diff b/llm/patches/04-metal.diff new file mode 100644 index 00000000..f8fa7db7 --- /dev/null +++ b/llm/patches/04-metal.diff @@ -0,0 +1,45 @@ +diff --git a/ggml-metal.m b/ggml-metal.m +index 0207b787..b5e9884b 100644 +--- a/ggml-metal.m ++++ b/ggml-metal.m +@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute( + // to the matrix-vector kernel + int ne11_mm_min = 1; + +-#if 0 + // the numbers below are measured on M2 Ultra for 7B and 13B models + // these numbers do not translate to other devices or model sizes + // TODO: need to find a better approach +- if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) { +- switch (src0t) { +- case GGML_TYPE_F16: ne11_mm_min = 2; break; +- case GGML_TYPE_Q8_0: ne11_mm_min = 7; break; +- case GGML_TYPE_Q2_K: ne11_mm_min = 15; break; +- case GGML_TYPE_Q3_K: ne11_mm_min = 7; break; +- case GGML_TYPE_Q4_0: +- case GGML_TYPE_Q4_1: ne11_mm_min = 15; break; +- case GGML_TYPE_Q4_K: ne11_mm_min = 11; break; +- case GGML_TYPE_Q5_0: // not tested yet +- case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet +- case GGML_TYPE_Q5_K: ne11_mm_min = 7; break; +- case GGML_TYPE_Q6_K: ne11_mm_min = 7; break; +- default: ne11_mm_min = 1; break; +- } ++ switch (src0t) { ++ case GGML_TYPE_F16: ne11_mm_min = 2; break; ++ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break; ++ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break; ++ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break; ++ case GGML_TYPE_Q4_0: ++ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break; ++ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break; ++ case GGML_TYPE_Q5_0: // not tested yet ++ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet ++ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break; ++ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break; ++ default: ne11_mm_min = 1; break; + } +-#endif + + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs + // AMD GPU and older A-chips will reuse matrix-vector multiplication 
kernel diff --git a/server/sched.go b/server/sched.go index a8f31d98..3c7ab66e 100644 --- a/server/sched.go +++ b/server/sched.go @@ -46,6 +46,7 @@ type Scheduler struct { // TODO set this to zero after a release or two, to enable multiple models by default var loadedMax = 1 // Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners) var maxQueuedRequests = 10 // TODO configurable +var numParallel = 1 func InitScheduler(ctx context.Context) *Scheduler { maxRunners := os.Getenv("OLLAMA_MAX_LOADED_MODELS") @@ -57,6 +58,14 @@ func InitScheduler(ctx context.Context) *Scheduler { loadedMax = m } } + if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" { + p, err := strconv.Atoi(onp) + if err != nil || p <= 0 { + slog.Error("invalid parallel setting, must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err) + } else { + numParallel = p + } + } sched := &Scheduler{ pendingReqCh: make(chan *LlmRequest, maxQueuedRequests), @@ -81,6 +90,8 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, successCh: make(chan *runnerRef), errCh: make(chan error, 1), } + // scale the context window up, since it is divided evenly across the parallel sequence slots + opts.NumCtx = opts.NumCtx * numParallel select { case s.pendingReqCh <- req: default: