partial offloading
This commit is contained in:
parent
8b2c10061c
commit
7e33a017c0
6 changed files with 98 additions and 86 deletions
|
@ -50,7 +50,7 @@ func HumanBytes(b int64) string {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func HumanBytes2(b int64) string {
|
func HumanBytes2(b uint64) string {
|
||||||
switch {
|
switch {
|
||||||
case b >= MebiByte:
|
case b >= MebiByte:
|
||||||
return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
|
return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
|
||||||
|
|
|
@ -243,7 +243,7 @@ func getCPUMem() (memInfo, error) {
|
||||||
return ret, nil
|
return ret, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func CheckVRAM() (int64, error) {
|
func CheckVRAM() (uint64, error) {
|
||||||
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
|
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
|
||||||
if userLimit != "" {
|
if userLimit != "" {
|
||||||
avail, err := strconv.ParseInt(userLimit, 10, 64)
|
avail, err := strconv.ParseInt(userLimit, 10, 64)
|
||||||
|
@ -251,11 +251,11 @@ func CheckVRAM() (int64, error) {
|
||||||
return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
|
return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
|
||||||
}
|
}
|
||||||
slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
|
slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
|
||||||
return avail, nil
|
return uint64(avail), nil
|
||||||
}
|
}
|
||||||
gpuInfo := GetGPUInfo()
|
gpuInfo := GetGPUInfo()
|
||||||
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
|
if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
|
||||||
return int64(gpuInfo.FreeMemory), nil
|
return gpuInfo.FreeMemory, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination
|
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination
|
||||||
|
|
|
@ -17,7 +17,7 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
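// CheckVRAM returns the VRAM available in bytes: the OLLAMA_MAX_VRAM override when set, otherwise 0 on amd64 (GPU not supported) or the value of getRecommendedMaxVRAM()
|
// CheckVRAM returns the VRAM available in bytes: the OLLAMA_MAX_VRAM override when set, otherwise 0 on amd64 (GPU not supported) or the value of getRecommendedMaxVRAM()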
|
||||||
func CheckVRAM() (int64, error) {
|
func CheckVRAM() (uint64, error) {
|
||||||
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
|
userLimit := os.Getenv("OLLAMA_MAX_VRAM")
|
||||||
if userLimit != "" {
|
if userLimit != "" {
|
||||||
avail, err := strconv.ParseInt(userLimit, 10, 64)
|
avail, err := strconv.ParseInt(userLimit, 10, 64)
|
||||||
|
@ -25,15 +25,14 @@ func CheckVRAM() (int64, error) {
|
||||||
return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
|
return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
|
||||||
}
|
}
|
||||||
slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
|
slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
|
||||||
return avail, nil
|
return uint64(avail), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if runtime.GOARCH == "amd64" {
|
if runtime.GOARCH == "amd64" {
|
||||||
// gpu not supported, this may not be metal
|
// gpu not supported, this may not be metal
|
||||||
return 0, nil
|
return 0, nil
|
||||||
}
|
}
|
||||||
recommendedMaxVRAM := int64(C.getRecommendedMaxVRAM())
|
return uint64(C.getRecommendedMaxVRAM()), nil
|
||||||
return recommendedMaxVRAM, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetGPUInfo() GpuInfo {
|
func GetGPUInfo() GpuInfo {
|
||||||
|
|
|
@ -15,7 +15,7 @@ type GpuInfo struct {
|
||||||
Variant string `json:"variant,omitempty"`
|
Variant string `json:"variant,omitempty"`
|
||||||
|
|
||||||
// MinimumMemory represents the minimum memory required to use the GPU
|
// MinimumMemory represents the minimum memory required to use the GPU
|
||||||
MinimumMemory int64 `json:"-"`
|
MinimumMemory uint64 `json:"-"`
|
||||||
|
|
||||||
// TODO add other useful attributes about the card here for discovery information
|
// TODO add other useful attributes about the card here for discovery information
|
||||||
}
|
}
|
||||||
|
|
77
llm/ggml.go
77
llm/ggml.go
|
@ -324,45 +324,52 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
|
||||||
}, offset, nil
|
}, offset, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (llm GGML) GraphSize(context, batch int) (int64, bool) {
|
func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
|
||||||
embeddingLength := llm.KV().EmbeddingLength()
|
embedding := llm.KV().EmbeddingLength()
|
||||||
headCount := llm.KV().HeadCount()
|
heads := llm.KV().HeadCount()
|
||||||
headCountKV := llm.KV().HeadCountKV()
|
headsKV := llm.KV().HeadCountKV()
|
||||||
vocabLength := len(llm.KV()["tokenizer.ggml.tokens"].([]any))
|
vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
|
||||||
|
|
||||||
layers := llm.Tensors().Layers()
|
|
||||||
|
|
||||||
var attnQKVWeight1 uint64 = 0
|
|
||||||
if t, ok := layers["0"]["attn_qkv.weight"]; ok && len(t.Shape) > 2 {
|
|
||||||
attnQKVWeight1 = t.Shape[1]
|
|
||||||
}
|
|
||||||
|
|
||||||
var ffnGate0Weight1 uint64 = 0
|
|
||||||
if t, ok := layers["0"]["ffn_gate.0.weight"]; ok && len(t.Shape) > 2 {
|
|
||||||
ffnGate0Weight1 = t.Shape[1]
|
|
||||||
}
|
|
||||||
|
|
||||||
switch llm.KV().Architecture() {
|
switch llm.KV().Architecture() {
|
||||||
case "gemma", "command-r":
|
|
||||||
return 4 * int64(batch) * int64(embeddingLength+uint64(vocabLength)), true
|
|
||||||
case "phi2":
|
|
||||||
return max(
|
|
||||||
4*int64(batch)*int64(embeddingLength+uint64(vocabLength)),
|
|
||||||
4*int64(batch)*int64(1+4*embeddingLength+uint64(context)+attnQKVWeight1+uint64(context)*headCount),
|
|
||||||
), true
|
|
||||||
case "qwen2":
|
|
||||||
return max(
|
|
||||||
4*int64(batch)*int64(embeddingLength+uint64(vocabLength)),
|
|
||||||
4*int64(batch)*int64(1+2*embeddingLength+uint64(context)+uint64(context)*headCount),
|
|
||||||
), true
|
|
||||||
case "llama":
|
case "llama":
|
||||||
if ffnGate0Weight1 > 0 {
|
fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
|
||||||
// moe
|
|
||||||
return 4 * int64(batch) * int64(2+3*embeddingLength+uint64(context)+uint64(context)*headCount+2*headCountKV+ffnGate0Weight1), true
|
|
||||||
}
|
|
||||||
|
|
||||||
return 4 * int64(batch) * int64(1+4*embeddingLength+uint64(context)+uint64(context)*headCount), true
|
partialOffload = 4 * batch * embedding
|
||||||
|
partialOffload += max(
|
||||||
|
4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
|
||||||
|
4*batch*(embedding+vocab)+embedding*vocab*105/128,
|
||||||
|
)
|
||||||
|
case "gemma":
|
||||||
|
fullOffload = 4 * batch * (embedding + vocab)
|
||||||
|
partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
|
||||||
|
case "command-r":
|
||||||
|
fullOffload = max(
|
||||||
|
4*batch*(embedding+vocab),
|
||||||
|
4*batch*(2+4*embedding+context*(1+heads)),
|
||||||
|
)
|
||||||
|
|
||||||
|
partialOffload = max(
|
||||||
|
4*batch*(embedding+vocab)+embedding*vocab*105/128,
|
||||||
|
4*batch*(1+2*embedding+context*(1+heads))+ 4*embedding*context+embedding*embedding*9/16,
|
||||||
|
)
|
||||||
|
case "qwen2":
|
||||||
|
fullOffload = max(
|
||||||
|
4*batch*(embedding+vocab),
|
||||||
|
4*batch*(1+2*embedding+context+context*heads),
|
||||||
|
)
|
||||||
|
|
||||||
|
partialOffload = max(
|
||||||
|
4*batch*(embedding+vocab)+embedding*vocab*105/128,
|
||||||
|
4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
|
||||||
|
)
|
||||||
|
case "phi2":
|
||||||
|
fullOffload = max(
|
||||||
|
4*batch*(embedding+vocab),
|
||||||
|
4*batch*(1+4*embedding+context+context*heads),
|
||||||
|
)
|
||||||
|
|
||||||
|
partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0, false
|
return
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,10 +41,6 @@ var cpuOnlyFamilies = []string{
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
|
func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
|
||||||
if _, err := os.Stat(model); err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
f, err := os.Open(model)
|
f, err := os.Open(model)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -65,12 +61,12 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||||
opts.NumCtx = 4
|
opts.NumCtx = 4
|
||||||
}
|
}
|
||||||
|
|
||||||
availableMemory, _ := gpu.CheckVRAM()
|
memoryAvailable, _ := gpu.CheckVRAM()
|
||||||
info := gpu.GetGPUInfo()
|
info := gpu.GetGPUInfo()
|
||||||
|
|
||||||
usedMemory := info.MinimumMemory
|
memoryMinimum := info.MinimumMemory
|
||||||
for _, projector := range projectors {
|
for _, projector := range projectors {
|
||||||
usedMemory += projectorMemoryRequirements(projector)
|
memoryMinimum += projectorMemoryRequirements(projector)
|
||||||
|
|
||||||
// multimodal models require at least 2048 context
|
// multimodal models require at least 2048 context
|
||||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||||
|
@ -79,55 +75,65 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||||
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
|
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
|
||||||
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
|
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
|
||||||
|
|
||||||
graph, ok := ggml.GraphSize(opts.NumCtx, min(opts.NumCtx, opts.NumBatch))
|
graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
|
||||||
if !ok {
|
if graphPartialOffload == 0 {
|
||||||
graph = int64(ggml.KV().GQA()*kv) / 6
|
graphPartialOffload = ggml.KV().GQA() * kv / 6
|
||||||
}
|
}
|
||||||
|
|
||||||
usedMemory += graph
|
if graphFullOffload == 0 {
|
||||||
|
graphFullOffload = graphPartialOffload
|
||||||
if (usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture())) && info.Library != "metal" {
|
|
||||||
info.Library = "cpu"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
requiredMemory := usedMemory
|
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
|
||||||
|
memoryRequiredTotal := memoryMinimum + graphFullOffload
|
||||||
|
|
||||||
tensorLayers := ggml.Tensors().Layers()
|
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
|
||||||
|
memoryRequiredPartial := memoryMinimum + graphPartialOffload
|
||||||
|
|
||||||
var layers int
|
if info.Library != "metal" {
|
||||||
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
|
if memoryRequiredPartial > memoryAvailable || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
|
||||||
layerMemory := int64(tensorLayers[fmt.Sprintf("%d", i)].size() + kv/ggml.KV().BlockCount())
|
info.Library = "cpu"
|
||||||
requiredMemory += layerMemory
|
|
||||||
|
|
||||||
if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
|
|
||||||
usedMemory += layerMemory
|
|
||||||
layers++
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
memOutputLayer := int64(tensorLayers["output"].size())
|
var layerCount int
|
||||||
requiredMemory += memOutputLayer
|
layers := ggml.Tensors().Layers()
|
||||||
|
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
|
||||||
|
memoryLayer := layers[fmt.Sprintf("%d", i)].size()
|
||||||
|
|
||||||
// only offload output layer if all repeating layers are offloaded
|
// KV is proportional to the number of layers
|
||||||
if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
|
memoryLayer += kv / ggml.KV().BlockCount()
|
||||||
usedMemory += memOutputLayer
|
|
||||||
layers++
|
memoryRequiredTotal += memoryLayer
|
||||||
|
if memoryAvailable > memoryRequiredPartial+memoryLayer {
|
||||||
|
memoryRequiredPartial += memoryLayer
|
||||||
|
layerCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
memoryLayerOutput := layers["output"].size()
|
||||||
|
memoryRequiredTotal += memoryLayerOutput
|
||||||
|
if memoryAvailable > memoryRequiredTotal {
|
||||||
|
layerCount = int(ggml.KV().BlockCount()) + 1
|
||||||
|
memoryRequiredPartial = memoryRequiredTotal
|
||||||
|
}
|
||||||
|
|
||||||
|
if opts.NumGPU < 0 {
|
||||||
|
opts.NumGPU = layerCount
|
||||||
}
|
}
|
||||||
|
|
||||||
slog.Info(
|
slog.Info(
|
||||||
"offload to gpu",
|
"offload to gpu",
|
||||||
"layers", layers,
|
"reallayers", opts.NumGPU,
|
||||||
"required", format.HumanBytes2(requiredMemory),
|
"layers", layerCount,
|
||||||
"used", format.HumanBytes2(usedMemory),
|
"required", format.HumanBytes2(memoryRequiredTotal),
|
||||||
"available", format.HumanBytes2(availableMemory),
|
"used", format.HumanBytes2(memoryRequiredPartial),
|
||||||
"kv", format.HumanBytes2(int64(kv)),
|
"available", format.HumanBytes2(memoryAvailable),
|
||||||
"graph", format.HumanBytes2(graph),
|
"kv", format.HumanBytes2(kv),
|
||||||
|
"fulloffload", format.HumanBytes2(graphFullOffload),
|
||||||
|
"partialoffload", format.HumanBytes2(graphPartialOffload),
|
||||||
)
|
)
|
||||||
|
|
||||||
if opts.NumGPU < 0 && info.Library != "cpu" {
|
|
||||||
opts.NumGPU = layers
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(adapters) > 1 {
|
if len(adapters) > 1 {
|
||||||
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
|
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
|
||||||
}
|
}
|
||||||
|
@ -284,7 +290,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||||
return nil, finalErr
|
return nil, finalErr
|
||||||
}
|
}
|
||||||
|
|
||||||
func projectorMemoryRequirements(filename string) int64 {
|
func projectorMemoryRequirements(filename string) uint64 {
|
||||||
file, err := os.Open(filename)
|
file, err := os.Open(filename)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0
|
return 0
|
||||||
|
@ -301,7 +307,7 @@ func projectorMemoryRequirements(filename string) int64 {
|
||||||
mem += layer.size()
|
mem += layer.size()
|
||||||
}
|
}
|
||||||
|
|
||||||
return int64(mem)
|
return mem
|
||||||
}
|
}
|
||||||
|
|
||||||
type ServerStatus int
|
type ServerStatus int
|
||||||
|
|
Loading…
Add table
Reference in a new issue