Merge pull request #3914 from dhiltgen/mac_perf
Improve mac parallel performance
This commit is contained in:
commit
9b5a3c5991
2 changed files with 56 additions and 0 deletions
45
llm/patches/04-metal.diff
Normal file
45
llm/patches/04-metal.diff
Normal file
|
@@ -0,0 +1,45 @@
|
||||||
|
diff --git a/ggml-metal.m b/ggml-metal.m
|
||||||
|
index 0207b787..b5e9884b 100644
|
||||||
|
--- a/ggml-metal.m
|
||||||
|
+++ b/ggml-metal.m
|
||||||
|
@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||||
|
// to the matrix-vector kernel
|
||||||
|
int ne11_mm_min = 1;
|
||||||
|
|
||||||
|
-#if 0
|
||||||
|
// the numbers below are measured on M2 Ultra for 7B and 13B models
|
||||||
|
// these numbers do not translate to other devices or model sizes
|
||||||
|
// TODO: need to find a better approach
|
||||||
|
- if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
|
||||||
|
- switch (src0t) {
|
||||||
|
- case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
||||||
|
- case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
||||||
|
- case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
|
||||||
|
- case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
|
||||||
|
- case GGML_TYPE_Q4_0:
|
||||||
|
- case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
|
||||||
|
- case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
|
||||||
|
- case GGML_TYPE_Q5_0: // not tested yet
|
||||||
|
- case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
|
||||||
|
- case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
|
||||||
|
- case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
||||||
|
- default: ne11_mm_min = 1; break;
|
||||||
|
- }
|
||||||
|
+ switch (src0t) {
|
||||||
|
+ case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
||||||
|
+ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
||||||
|
+ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
|
||||||
|
+ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
|
||||||
|
+ case GGML_TYPE_Q4_0:
|
||||||
|
+ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
|
||||||
|
+ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
|
||||||
|
+ case GGML_TYPE_Q5_0: // not tested yet
|
||||||
|
+ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
|
||||||
|
+ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
|
||||||
|
+ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
||||||
|
+ default: ne11_mm_min = 1; break;
|
||||||
|
}
|
||||||
|
-#endif
|
||||||
|
|
||||||
|
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||||
|
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
|
@@ -46,6 +46,7 @@ type Scheduler struct {
|
||||||
// TODO set this to zero after a release or two, to enable multiple models by default
|
// TODO set this to zero after a release or two, to enable multiple models by default
|
||||||
var loadedMax = 1 // Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners)
|
var loadedMax = 1 // Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners)
|
||||||
var maxQueuedRequests = 10 // TODO configurable
|
var maxQueuedRequests = 10 // TODO configurable
|
||||||
|
var numParallel = 1
|
||||||
|
|
||||||
func InitScheduler(ctx context.Context) *Scheduler {
|
func InitScheduler(ctx context.Context) *Scheduler {
|
||||||
maxRunners := os.Getenv("OLLAMA_MAX_LOADED_MODELS")
|
maxRunners := os.Getenv("OLLAMA_MAX_LOADED_MODELS")
|
||||||
|
@@ -57,6 +58,14 @@ func InitScheduler(ctx context.Context) *Scheduler {
|
||||||
loadedMax = m
|
loadedMax = m
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
|
||||||
|
p, err := strconv.Atoi(onp)
|
||||||
|
if err != nil || p <= 0 {
|
||||||
|
slog.Error("invalid parallel setting, must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
|
||||||
|
} else {
|
||||||
|
numParallel = p
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
sched := &Scheduler{
|
sched := &Scheduler{
|
||||||
pendingReqCh: make(chan *LlmRequest, maxQueuedRequests),
|
pendingReqCh: make(chan *LlmRequest, maxQueuedRequests),
|
||||||
|
@@ -81,6 +90,8 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options,
|
||||||
successCh: make(chan *runnerRef),
|
successCh: make(chan *runnerRef),
|
||||||
errCh: make(chan error, 1),
|
errCh: make(chan error, 1),
|
||||||
}
|
}
|
||||||
|
// context split across parallel threads
|
||||||
|
opts.NumCtx = opts.NumCtx * numParallel
|
||||||
select {
|
select {
|
||||||
case s.pendingReqCh <- req:
|
case s.pendingReqCh <- req:
|
||||||
default:
|
default:
|
||||||
|
|
Loading…
Reference in a new issue