diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index 0255ed55..9b1534e4 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -833,10 +833,21 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
 	}
 }
 
+type multiLPath []string
+
+func (m *multiLPath) Set(value string) error {
+	*m = append(*m, value)
+	return nil
+}
+
+func (m *multiLPath) String() string {
+	return strings.Join(*m, ", ")
+}
+
 func (s *Server) loadModel(
 	params llama.ModelParams,
 	mpath string,
-	lpath string,
+	lpath multiLPath,
 	ppath string,
 	kvSize int,
 	flashAttention bool,
@@ -857,10 +868,12 @@ func (s *Server) loadModel(
 		panic(err)
 	}
 
-	if lpath != "" {
-		err := s.model.ApplyLoraFromFile(s.lc, lpath, 1.0, threads)
-		if err != nil {
-			panic(err)
+	if lpath.String() != "" {
+		for _, path := range lpath {
+			err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
+			if err != nil {
+				panic(err)
+			}
 		}
 	}
 
@@ -890,7 +903,6 @@ func main() {
 	mainGpu := flag.Int("main-gpu", 0, "Main GPU")
 	flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
 	kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
-	lpath := flag.String("lora", "", "Path to lora layer file")
 	port := flag.Int("port", 8080, "Port to expose the server on")
 	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
@@ -900,6 +912,9 @@ func main() {
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
 	requirements := flag.Bool("requirements", false, "print json requirement information")
 
+	var lpaths multiLPath
+	flag.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
+
 	flag.Parse()
 	if *requirements {
 		printRequirements(os.Stdout)
@@ -946,7 +961,7 @@ func main() {
 	params := llama.ModelParams{
 		NumGpuLayers: *nGpuLayers,
 		MainGpu:      *mainGpu,
-		UseMmap:      !*noMmap && *lpath == "",
+		UseMmap:      !*noMmap && lpaths.String() == "",
 		UseMlock:     *mlock,
 		TensorSplit:  tensorSplitFloats,
 		Progress: func(progress float32) {
@@ -955,7 +970,7 @@ func main() {
 	}
 
 	server.ready.Add(1)
-	go server.loadModel(params, *mpath, *lpath, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
+	go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
 
 	server.cond = sync.NewCond(&server.mu)
 
diff --git a/llm/server.go b/llm/server.go
index b2405905..2afc5562 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -144,10 +144,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	// Loop through potential servers
 	finalErr := errors.New("no suitable llama servers found")
 
-	if len(adapters) > 1 {
-		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
-	}
-
 	rDir, err := runners.Refresh(build.EmbedFS)
 	if err != nil {
 		return nil, err
@@ -201,8 +197,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	}
 
 	if len(adapters) > 0 {
-		// TODO: applying multiple adapters is not supported by the llama.cpp server yet
-		params = append(params, "--lora", adapters[0])
+		for _, adapter := range adapters {
+			params = append(params, "--lora", adapter)
+		}
 	}
 
 	if len(projectors) > 0 {
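
Note (not part of the diff): the runner change relies on the standard library's flag.Value interface. Because *multiLPath implements Set and String, registering it with flag.Var lets --lora be passed any number of times, with each occurrence appended to the slice. A minimal standalone sketch of that pattern follows; the flag set name and adapter paths are made-up examples.

package main

import (
	"flag"
	"fmt"
	"strings"
)

// multiLPath accumulates every occurrence of a repeated flag.
type multiLPath []string

// Set is called once per occurrence of the flag on the command line.
func (m *multiLPath) Set(value string) error {
	*m = append(*m, value)
	return nil
}

// String is used by the flag package when printing defaults.
func (m *multiLPath) String() string {
	return strings.Join(*m, ", ")
}

func main() {
	var lpaths multiLPath
	fs := flag.NewFlagSet("runner", flag.ExitOnError)
	fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")

	// Equivalent to: runner --lora adapter-a.gguf --lora adapter-b.gguf
	fs.Parse([]string{"--lora", "adapter-a.gguf", "--lora", "adapter-b.gguf"})
	fmt.Println(lpaths) // [adapter-a.gguf adapter-b.gguf]
}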