parent 940e62772e
commit e3936d4fb3
2 changed files with 26 additions and 14 deletions
@@ -833,10 +833,21 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
 	}
 }
 
+type multiLPath []string
+
+func (m *multiLPath) Set(value string) error {
+	*m = append(*m, value)
+	return nil
+}
+
+func (m *multiLPath) String() string {
+	return strings.Join(*m, ", ")
+}
+
 func (s *Server) loadModel(
 	params llama.ModelParams,
 	mpath string,
-	lpath string,
+	lpath multiLPath,
 	ppath string,
 	kvSize int,
 	flashAttention bool,
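The new multiLPath type works because it implements Go's flag.Value interface (a String() string method and a Set(string) error method), which is what lets the flag.Var registration further down accumulate one path per occurrence of --lora. A minimal, self-contained sketch of the same pattern (names here are illustrative, not part of this commit):

```go
package main

import (
	"flag"
	"fmt"
	"strings"
)

// repeatedFlag collects every value passed for a repeatable string flag.
type repeatedFlag []string

// String and Set satisfy flag.Value, so the type can be registered with flag.Var.
func (r *repeatedFlag) String() string { return strings.Join(*r, ", ") }

func (r *repeatedFlag) Set(v string) error {
	*r = append(*r, v)
	return nil
}

func main() {
	var loras repeatedFlag
	flag.Var(&loras, "lora", "path to a lora layer file (may be repeated)")
	flag.Parse()

	// e.g. `prog -lora a.gguf -lora b.gguf` prints [a.gguf b.gguf]
	fmt.Println([]string(loras))
}
```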
@@ -857,12 +868,14 @@ func (s *Server) loadModel(
 		panic(err)
 	}
 
-	if lpath != "" {
-		err := s.model.ApplyLoraFromFile(s.lc, lpath, 1.0, threads)
-		if err != nil {
-			panic(err)
+	if lpath.String() != "" {
+		for _, path := range lpath {
+			err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
+			if err != nil {
+				panic(err)
+			}
 		}
 	}
 
 	if ppath != "" {
 		var err error
@@ -890,7 +903,6 @@ func main() {
 	mainGpu := flag.Int("main-gpu", 0, "Main GPU")
 	flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
 	kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
-	lpath := flag.String("lora", "", "Path to lora layer file")
 	port := flag.Int("port", 8080, "Port to expose the server on")
 	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
@@ -900,6 +912,9 @@ func main() {
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
 	requirements := flag.Bool("requirements", false, "print json requirement information")
 
+	var lpaths multiLPath
+	flag.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
+
 	flag.Parse()
 	if *requirements {
 		printRequirements(os.Stdout)
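With the flag registered through flag.Var instead of flag.String, the runner accepts --lora more than once and each occurrence appends to lpaths. A hypothetical invocation sketch (the binary name and the model flag are placeholders; only --lora and --ctx-size appear in this diff):

```
./runner --model model.gguf --ctx-size 2048 --lora adapter-a.gguf --lora adapter-b.gguf
```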
@@ -946,7 +961,7 @@ func main() {
 	params := llama.ModelParams{
 		NumGpuLayers: *nGpuLayers,
 		MainGpu:      *mainGpu,
-		UseMmap:      !*noMmap && *lpath == "",
+		UseMmap:      !*noMmap && lpaths.String() == "",
 		UseMlock:     *mlock,
 		TensorSplit:  tensorSplitFloats,
 		Progress: func(progress float32) {
@@ -955,7 +970,7 @@ func main() {
 	}
 
 	server.ready.Add(1)
-	go server.loadModel(params, *mpath, *lpath, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
+	go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
 
 	server.cond = sync.NewCond(&server.mu)
 
@@ -144,10 +144,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	// Loop through potential servers
 	finalErr := errors.New("no suitable llama servers found")
 
-	if len(adapters) > 1 {
-		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
-	}
-
 	rDir, err := runners.Refresh(build.EmbedFS)
 	if err != nil {
 		return nil, err
@@ -201,8 +197,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	}
 
 	if len(adapters) > 0 {
-		// TODO: applying multiple adapters is not supported by the llama.cpp server yet
-		params = append(params, "--lora", adapters[0])
+		for _, adapter := range adapters {
+			params = append(params, "--lora", adapter)
+		}
 	}
 
 	if len(projectors) > 0 {
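With the single-adapter guard removed, NewLlamaServer now appends one --lora argument pair per adapter to the runner's argument list. A rough, standalone sketch of what that loop produces, assuming two hypothetical adapter paths (other flags omitted):

```go
package main

import "fmt"

func main() {
	// Hypothetical adapter paths; in the server they come from the adapters argument.
	adapters := []string{"/tmp/adapter-a.gguf", "/tmp/adapter-b.gguf"}

	params := []string{"--ctx-size", "2048"} // other runner flags omitted
	for _, adapter := range adapters {
		params = append(params, "--lora", adapter)
	}

	fmt.Println(params)
	// [--ctx-size 2048 --lora /tmp/adapter-a.gguf --lora /tmp/adapter-b.gguf]
}
```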