Support Multiple LoRa Adapters (#7667)

Closes #7627
Authored by ItzCrazyKns on 2024-11-28 00:30:04 +05:30; committed by GitHub
parent 940e62772e
commit e3936d4fb3
2 changed files with 26 additions and 14 deletions


@@ -833,10 +833,21 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
 	}
 }
 
+type multiLPath []string
+
+func (m *multiLPath) Set(value string) error {
+	*m = append(*m, value)
+	return nil
+}
+
+func (m *multiLPath) String() string {
+	return strings.Join(*m, ", ")
+}
+
 func (s *Server) loadModel(
 	params llama.ModelParams,
 	mpath string,
-	lpath string,
+	lpath multiLPath,
 	ppath string,
 	kvSize int,
 	flashAttention bool,
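
Why this works: Go's standard flag package accepts any type that implements the flag.Value interface, i.e. a Set(string) error method and a String() string method. multiLPath satisfies both, so it can be bound to a repeatable flag with flag.Var. A minimal, self-contained sketch of the pattern; the type name repeatable and the demo flag are illustrative, not from the commit:

package main

import (
	"flag"
	"fmt"
	"strings"
)

// repeatable is an illustrative stand-in for multiLPath: any type with
// Set(string) error and String() string satisfies flag.Value.
type repeatable []string

// Set is invoked by the flag package once per occurrence of the flag,
// so repeated flags accumulate instead of overwriting each other.
func (r *repeatable) Set(value string) error {
	*r = append(*r, value)
	return nil
}

// String is used by the flag package when printing defaults and usage.
func (r *repeatable) String() string {
	return strings.Join(*r, ", ")
}

func main() {
	var loras repeatable
	flag.Var(&loras, "lora", "Path to lora layer file (can be specified multiple times)")
	flag.Parse()
	fmt.Println(loras)
}

Running this as `go run . -lora a.gguf -lora b.gguf` prints `[a.gguf b.gguf]`, because Set is called once per occurrence.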
@@ -857,10 +868,12 @@ func (s *Server) loadModel(
 		panic(err)
 	}
 
-	if lpath != "" {
-		err := s.model.ApplyLoraFromFile(s.lc, lpath, 1.0, threads)
-		if err != nil {
-			panic(err)
+	if lpath.String() != "" {
+		for _, path := range lpath {
+			err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
+			if err != nil {
+				panic(err)
+			}
 		}
 	}
 
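
The loop above applies each adapter in the order its -lora flag appeared and aborts the whole load on the first failure. A hedged, runnable sketch of those semantics; applyLora and the .gguf paths are stand-ins, not the commit's API:

package main

import (
	"errors"
	"fmt"
)

// applyLora stands in for s.model.ApplyLoraFromFile in this sketch; the
// real call merges the adapter's low-rank deltas into the loaded weights
// at the given scale.
func applyLora(path string, scale float32) error {
	if path == "" {
		return errors.New("empty adapter path")
	}
	fmt.Printf("applied %s at scale %.1f\n", path, scale)
	return nil
}

func main() {
	// Adapters are applied one after another, in flag order; the first
	// failure aborts the load, mirroring the panic in loadModel above.
	for _, path := range []string{"style.gguf", "domain.gguf"} {
		if err := applyLora(path, 1.0); err != nil {
			panic(err)
		}
	}
}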
@@ -890,7 +903,6 @@ func main() {
 	mainGpu := flag.Int("main-gpu", 0, "Main GPU")
 	flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
 	kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
-	lpath := flag.String("lora", "", "Path to lora layer file")
 	port := flag.Int("port", 8080, "Port to expose the server on")
 	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
@@ -900,6 +912,9 @@ func main() {
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
 	requirements := flag.Bool("requirements", false, "print json requirement information")
+	var lpaths multiLPath
+	flag.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
+
 	flag.Parse()
 
 	if *requirements {
 		printRequirements(os.Stdout)
@@ -946,7 +961,7 @@ func main() {
 	params := llama.ModelParams{
 		NumGpuLayers: *nGpuLayers,
 		MainGpu:      *mainGpu,
-		UseMmap:      !*noMmap && *lpath == "",
+		UseMmap:      !*noMmap && lpaths.String() == "",
 		UseMlock:     *mlock,
 		TensorSplit:  tensorSplitFloats,
 		Progress: func(progress float32) {
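
One reading of the UseMmap change (the rationale is not stated in the commit): a memory-mapped model is a read-only view of the weights file, while applying a LoRA merges deltas into those weights, so any -lora flag forces a normal writable copy. A small sketch of the gate, with len(loraPaths) == 0 standing in for lpaths.String() == "" (equivalent for this check):

package main

import "fmt"

// useMmap mirrors the condition in the hunk above; the names here are
// illustrative, not the commit's.
func useMmap(noMmap bool, loraPaths []string) bool {
	return !noMmap && len(loraPaths) == 0
}

func main() {
	fmt.Println(useMmap(false, nil))                // true: plain load can mmap
	fmt.Println(useMmap(false, []string{"a.gguf"})) // false: adapter needs writable weights
	fmt.Println(useMmap(true, nil))                 // false: user passed -no-mmap
}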
@@ -955,7 +970,7 @@ func main() {
 	}
 
 	server.ready.Add(1)
-	go server.loadModel(params, *mpath, *lpath, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
+	go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
 
 	server.cond = sync.NewCond(&server.mu)


@@ -144,10 +144,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	// Loop through potential servers
 	finalErr := errors.New("no suitable llama servers found")
 
-	if len(adapters) > 1 {
-		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
-	}
-
 	rDir, err := runners.Refresh(build.EmbedFS)
 	if err != nil {
 		return nil, err
@ -201,8 +197,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
} }
if len(adapters) > 0 { if len(adapters) > 0 {
// TODO: applying multiple adapters is not supported by the llama.cpp server yet for _, adapter := range adapters {
params = append(params, "--lora", adapters[0]) params = append(params, "--lora", adapter)
}
} }
if len(projectors) > 0 { if len(projectors) > 0 {
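
Putting the two files together: the server now emits one --lora flag per adapter on the runner's command line, and the runner's multiLPath collects them back into a slice. A self-contained sketch of that round trip, reusing the commit's multiLPath definition; the FlagSet name and adapter paths are illustrative:

package main

import (
	"flag"
	"fmt"
	"strings"
)

type multiLPath []string

func (m *multiLPath) Set(value string) error { *m = append(*m, value); return nil }
func (m *multiLPath) String() string         { return strings.Join(*m, ", ") }

func main() {
	// Server side: one "--lora <path>" pair per adapter, as in the hunk above.
	adapters := []string{"a.gguf", "b.gguf"}
	args := []string{}
	for _, adapter := range adapters {
		args = append(args, "--lora", adapter)
	}

	// Runner side: flag.Var gathers the repeated flag back into the slice.
	fs := flag.NewFlagSet("runner", flag.ContinueOnError)
	var lpaths multiLPath
	fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
	if err := fs.Parse(args); err != nil {
		panic(err)
	}
	fmt.Println(lpaths.String()) // "a.gguf, b.gguf"
}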