unbound max num gpu layers (#591)

Co-authored-by: Michael Yang <mxyng@pm.me>

commit 86279f4ae3
parent b934bf23e6

4 changed files with 36 additions and 29 deletions
llm/ggml.go (+1 −0)

@@ -77,6 +77,7 @@ type model interface {
 	ModelFamily() string
 	ModelType() string
 	FileType() string
+	NumLayers() int64
 }

 type container interface {
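
Note on the interface change: with NumLayers() added to model, every backend (the GGUF reader below and the legacy ggml llamaModel) must return a layer count or the package no longer compiles. A compile-time guard like the following, placed in the llm package, would surface a missing implementation at build time (illustrative only, not part of this commit):

var (
	// Illustrative guards: fail the build if either concrete type
	// stops satisfying the extended model interface.
	_ model = (*ggufModel)(nil)
	_ model = (*llamaModel)(nil)
)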
llm/gguf.go (+10 −0)

@@ -195,6 +195,16 @@ func (llm *ggufModel) Decode(r io.Reader) error {
 	return nil
 }

+func (llm *ggufModel) NumLayers() int64 {
+	value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
+	if !exists {
+		return 0
+	}
+
+	v := value.(uint32)
+	return int64(v)
+}
+
 func (ggufModel) readU8(r io.Reader) uint8 {
 	var u8 uint8
 	binary.Read(r, binary.LittleEndian, &u8)
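
Reviewer note: NumLayers resolves the metadata key "<family>.block_count" (e.g. "llama.block_count" for a llama-family GGUF), and the bare value.(uint32) assertion will panic if a file stores the count at a different integer width. A width-tolerant variant is sketched below; the helper is hypothetical and assumes kv is the same decoded-metadata map that ggufModel holds:

package llm

import "fmt"

// numLayers is an illustrative, width-tolerant variant of ggufModel.NumLayers:
// it reads "<family>.block_count" from decoded GGUF metadata without panicking
// if the writer used uint64 instead of uint32.
func numLayers(kv map[string]interface{}, family string) int64 {
	value, exists := kv[fmt.Sprintf("%s.block_count", family)]
	if !exists {
		return 0
	}
	switch v := value.(type) {
	case uint32:
		return int64(v)
	case uint64:
		return int64(v)
	}
	return 0
}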
llm/llama.go (+23 −27)

@@ -152,6 +152,10 @@ func (llm *llamaModel) FileType() string {
 	return fileType(llm.hyperparameters.FileType)
 }

+func (llm *llamaModel) NumLayers() int64 {
+	return int64(llm.hyperparameters.NumLayer)
+}
+
 type llamaHyperparameters struct {
 	// NumVocab is the size of the model's vocabulary.
 	NumVocab uint32

@@ -207,13 +211,13 @@ func CheckVRAM() (int, error) {
 	return total, nil
 }

-func NumGPU(opts api.Options) int {
+func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 	if opts.NumGPU != -1 {
 		return opts.NumGPU
 	}
 	n := 1 // default to enable metal on macOS
 	if runtime.GOOS == "linux" {
-		vram, err := CheckVRAM()
+		vramMib, err := CheckVRAM()
 		if err != nil {
 			if err.Error() != "nvidia-smi command failed" {
 				log.Print(err.Error())

@@ -221,33 +225,25 @@ func NumGPU(opts api.Options) int {
 			// nvidia driver not installed or no nvidia GPU found
 			return 0
 		}
-		// TODO: this is a very rough heuristic, better would be to calculate this based on number of layers and context size
-		switch {
-		case vram < 500:
-			log.Printf("WARNING: Low VRAM detected, disabling GPU")
-			n = 0
-		case vram < 1000:
-			n = 4
-		case vram < 2000:
-			n = 8
-		case vram < 4000:
-			n = 12
-		case vram < 8000:
-			n = 16
-		case vram < 12000:
-			n = 24
-		case vram < 16000:
-			n = 32
-		default:
-			n = 48
-		}
-		log.Printf("%d MB VRAM available, loading %d GPU layers", vram, n)
+
+		totalVramBytes := int64(vramMib) * 1024 * 1024 // 1 MiB = 1024^2 bytes
+
+		// Calculate bytes per layer
+		// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
+		bytesPerLayer := fileSizeBytes / numLayer
+
+		// set n to the max number of layers we can fit in VRAM
+		return int(totalVramBytes / bytesPerLayer)
+
+		log.Printf("%d MiB VRAM available, loading up to %d GPU layers", vramMib, n)
 	}
-	return n
+	// default to enable metal on macOS
+	return 1
 }

-func newLlama(model string, adapters []string, runners []ModelRunner, opts api.Options) (*llama, error) {
-	if _, err := os.Stat(model); err != nil {
+func newLlama(model string, adapters []string, runners []ModelRunner, numLayers int64, opts api.Options) (*llama, error) {
+	fileInfo, err := os.Stat(model)
+	if err != nil {
 		return nil, err
 	}

@@ -261,7 +257,7 @@ func newLlama(model string, adapters []string, runners []ModelRunner, opts api.Options) (*llama, error) {
 		"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
 		"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
-		"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(opts)),
+		"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(numLayers, fileInfo.Size(), opts)),
 		"--embedding",
 	}
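
To put numbers on the new heuristic (all figures hypothetical): a 32-layer model in a 3840 MiB file costs about 120 MiB per layer, so 8192 MiB of VRAM now yields 8192/120 ≈ 68 layers, where the old switch capped the same GPU at 24. A self-contained sketch of that arithmetic:

package main

import "fmt"

func main() {
	// Hypothetical figures: a 32-layer model stored in a 3840 MiB file,
	// on a GPU where CheckVRAM reports 8192 MiB.
	const mib int64 = 1024 * 1024
	numLayer := int64(32)
	fileSizeBytes := 3840 * mib
	vramMib := int64(8192)

	totalVramBytes := vramMib * mib
	bytesPerLayer := fileSizeBytes / numLayer // 120 MiB per layer
	layers := int(totalVramBytes / bytesPerLayer)

	// New heuristic: 8192/120 truncates to 68 layers.
	// The old switch chose 24 for any VRAM in [8000, 12000) MB.
	fmt.Printf("loading up to %d GPU layers\n", layers)
}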
llm/llm.go (+2 −2)

@@ -91,9 +91,9 @@ func New(workDir, model string, adapters []string, opts api.Options) (LLM, error) {
 	switch ggml.Name() {
 	case "gguf":
 		opts.NumGQA = 0 // TODO: remove this when llama.cpp runners differ enough to need separate newLlama functions
-		return newLlama(model, adapters, chooseRunners(workDir, "gguf"), opts)
+		return newLlama(model, adapters, chooseRunners(workDir, "gguf"), ggml.NumLayers(), opts)
 	case "ggml", "ggmf", "ggjt", "ggla":
-		return newLlama(model, adapters, chooseRunners(workDir, "ggml"), opts)
+		return newLlama(model, adapters, chooseRunners(workDir, "ggml"), ggml.NumLayers(), opts)
 	default:
 		return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
 	}
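
Reviewer note on the wiring: New decodes the model header once, reads ggml.NumLayers(), and threads the count plus the model file size through newLlama into NumGPU, which renders the --n-gpu-layers flag for the llama.cpp server. Because the result is no longer capped, it can exceed the model's real layer count on a large GPU; llama.cpp offloads at most the layers that exist, so an oversized request degrades to "offload everything", which is the intended unbound behavior. A stand-in sketch of the flow (types and names illustrative, not the real ollama API):

package main

import "fmt"

// Stand-in for the model interface extended in this commit.
type model interface {
	NumLayers() int64
}

// Stand-in GGUF-backed model whose layer count came from <family>.block_count.
type ggufStub struct{ blockCount uint32 }

func (g ggufStub) NumLayers() int64 { return int64(g.blockCount) }

func main() {
	var m model = ggufStub{blockCount: 32}

	// NumGPU may now request more layers than the model has; llama.cpp
	// offloads min(requested, actual), so "unbound" is safe in practice.
	requested := 68 // e.g. the NumGPU result from the worked example above
	fmt.Println("--n-gpu-layers", requested, "(model has", m.NumLayers(), "layers)")
}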