ollama/gpu/gpu.go

//go:build linux || windows

package gpu

/*
#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
#cgo windows LDFLAGS: -lpthread

#include "gpu_info.h"

*/
import "C"
import (
	"fmt"
	"log"
	"runtime"
	"sync"
	"unsafe"
)

// handles holds the GPU library handles populated by initGPUHandles.
// A nil field means that library was not successfully initialised.
type handles struct {
	cuda *C.cuda_handle_t // set when cuda_init reports no error
	rocm *C.rocm_handle_t // set when cuda_init fails and rocm_init reports no error
}

var (
	// gpuMutex is held by GetGPUInfo for the whole lookup, including the
	// lazy call to initGPUHandles.
	gpuMutex sync.Mutex

	// gpuHandles stays nil until initGPUHandles has run once.
	gpuHandles *handles
)

// CudaComputeMajorMin is the lowest CUDA compute capability major version
// that GetGPUInfo accepts before falling back to CPU mode.
// With our current CUDA compile flags, 5.2 and older will not work properly
const CudaComputeMajorMin = 6

// initGPUHandles probes for a usable GPU library and records the resulting
// handle in gpuHandles. CUDA is tried first; ROCm is only tried when CUDA
// initialisation reports an error. If both fail, gpuHandles is left with two
// nil handles.
//
// Note: gpuMutex must already be held
func initGPUHandles() {
	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
	log.Printf("Detecting GPU type")
	gpuHandles = &handles{}

	var cudaResp C.cuda_init_resp_t
	C.cuda_init(&cudaResp)
	if cudaResp.err == nil {
		log.Printf("Nvidia GPU detected")
		// Copy the handle out of the response so the pointer we keep does
		// not alias the local response struct.
		ch := cudaResp.ch
		gpuHandles.cuda = &ch
		return
	}
	log.Printf("CUDA not detected: %s", C.GoString(cudaResp.err))
	C.free(unsafe.Pointer(cudaResp.err))

	var rocmResp C.rocm_init_resp_t
	C.rocm_init(&rocmResp)
	if rocmResp.err != nil {
		log.Printf("ROCm not detected: %s", C.GoString(rocmResp.err))
		C.free(unsafe.Pointer(rocmResp.err))
		return
	}
	log.Printf("Radeon GPU detected")
	rh := rocmResp.rh
	gpuHandles.rocm = &rh
}

// GetGPUInfo reports which compute library should be used and the memory
// figures returned by the matching C helper.
//
// On first use it initialises the GPU handles; the whole lookup runs with
// gpuMutex held. Library is set to "cuda" when a CUDA handle exists, its VRAM
// lookup succeeds and its compute capability major version is at least
// CudaComputeMajorMin; to "rocm" when a ROCm handle exists and its VRAM lookup
// succeeds; otherwise to "cpu" on Windows or "default" elsewhere, with memory
// taken from cpu_check_ram. If the CPU memory lookup fails, the returned
// GpuInfo carries only Library and the remaining fields keep their zero value.
func GetGPUInfo() GpuInfo {
	// TODO - consider exploring lspci (and equivalent on windows) to check for
	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
	gpuMutex.Lock()
	defer gpuMutex.Unlock()
	if gpuHandles == nil {
		initGPUHandles()
	}

	var memInfo C.mem_info_t
	resp := GpuInfo{}
	switch {
	case gpuHandles.cuda != nil:
		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
		if memInfo.err != nil {
			log.Printf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err))
			C.free(unsafe.Pointer(memInfo.err))
			// memInfo is reused for the CPU lookup below; clear the freed
			// pointer so it can never be logged or freed a second time.
			memInfo.err = nil
		} else {
			// Verify minimum compute capability
			var cc C.cuda_compute_capability_t
			C.cuda_compute_capability(*gpuHandles.cuda, &cc)
			if cc.err != nil {
				log.Printf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err))
				C.free(unsafe.Pointer(cc.err))
			} else if cc.major >= CudaComputeMajorMin {
				log.Printf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor)
				resp.Library = "cuda"
			} else {
				log.Printf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor)
			}
		}
	case gpuHandles.rocm != nil:
		C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
		if memInfo.err != nil {
			log.Printf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))
			C.free(unsafe.Pointer(memInfo.err))
			// Same reasoning as the CUDA path: do not leave a dangling pointer.
			memInfo.err = nil
		} else {
			resp.Library = "rocm"
		}
	}

	// No usable GPU library was selected: fall back to system memory.
	if resp.Library == "" {
		// NOTE(review): memInfo may still hold fields written by a GPU lookup
		// (e.g. count); whether cpu_check_ram overwrites all of them is not
		// visible here - confirm against the C implementation.
		C.cpu_check_ram(&memInfo)
		// In the future we may offer multiple CPU variants to tune CPU features
		if runtime.GOOS == "windows" {
			resp.Library = "cpu"
		} else {
			resp.Library = "default"
		}
	}
	// GPU lookup errors were already handled and cleared above, so a non-nil
	// err at this point can only have been set by cpu_check_ram.
	if memInfo.err != nil {
		log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))
		C.free(unsafe.Pointer(memInfo.err))
		return resp
	}

	resp.DeviceCount = uint32(memInfo.count)
	resp.FreeMemory = uint64(memInfo.free)
	resp.TotalMemory = uint64(memInfo.total)
	return resp
}

// getCPUMem returns the free and total system memory reported by
// cpu_check_ram. If the C helper reports an error, its message is returned as
// a Go error and the returned memInfo is the zero value.
func getCPUMem() (memInfo, error) {
	var ret memInfo
	var info C.mem_info_t
	C.cpu_check_ram(&info)
	if info.err != nil {
		// The deferred free runs after the message has been copied into Go memory.
		defer C.free(unsafe.Pointer(info.err))
		// Pass the C message as an argument rather than as the format string,
		// so a '%' in the message is not interpreted as a formatting verb.
		return ret, fmt.Errorf("%s", C.GoString(info.err))
	}
	ret.FreeMemory = uint64(info.free)
	ret.TotalMemory = uint64(info.total)
	return ret, nil
}

// CheckVRAM returns the amount of GPU memory considered available, i.e. the
// free memory reported by GetGPUInfo minus a reserved overhead. It returns an
// error when the selected library is not "cuda" or "rocm", or when no free
// memory is reported.
//
// The result is negative when the reserved overhead exceeds the reported
// free memory.
func CheckVRAM() (int64, error) {
	// Smallest amount that is ever held back, regardless of the percentage.
	const minOverhead uint64 = 384 * 1024 * 1024

	info := GetGPUInfo()
	usingGPU := info.Library == "cuda" || info.Library == "rocm"
	if info.FreeMemory == 0 || !usingGPU {
		return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
	}

	// Hold back 10% of free VRAM per device, but never less than 384Mi,
	// for overhead that is otherwise unaccounted for.
	reserved := info.FreeMemory * uint64(info.DeviceCount) / 10
	if reserved < minOverhead {
		reserved = minOverhead
	}
	return int64(info.FreeMemory - reserved), nil
}
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`//go:build linux \|\| windows`

			`package gpu`

			`/*`
Refine build to support CPU only If someone checks out the ollama repo and doesn't install the CUDA library, this will ensure they can build a CPU only version 2023-12-14 01:26:47 +00:00			`#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm`
			`#cgo windows LDFLAGS: -lpthread`

Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`#include "gpu_info.h"`

			`*/`
			`import "C"`
			`import (`
			`"fmt"`
			`"log"`
Switch windows build to fully dynamic Refactor where we store build outputs, and support a fully dynamic loading model on windows so the base executable has no special dependencies thus doesn't require a special PATH. 2023-12-23 19:35:44 +00:00			`"runtime"`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`"sync"`
			`"unsafe"`
			`)`

			`type handles struct {`
			`cuda *C.cuda_handle_t`
			`rocm *C.rocm_handle_t`
			`}`

			`var gpuMutex sync.Mutex`
			`var gpuHandles *handles = nil`

Set corret CUDA minimum compute capability version If you attempt to run the current CUDA build on compute capability 5.2 cards, you'll hit the following failure: cuBLAS error 15 at ggml-cuda.cu:7956: the requested functionality is not supported 2024-01-09 19:28:24 +00:00			`// With our current CUDA compile flags, 5.2 and older will not work properly`
			`const CudaComputeMajorMin = 6`
Detect very old CUDA GPUs and fall back to CPU If we try to load the CUDA library on an old GPU, it panics and crashes the server. This checks the compute capability before we load the library so we can gracefully fall back to CPU mode. 2024-01-07 05:40:04 +00:00
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`// Note: gpuMutex must already be held`
			`func initGPUHandles() {`
Refine build to support CPU only If someone checks out the ollama repo and doesn't install the CUDA library, this will ensure they can build a CPU only version 2023-12-14 01:26:47 +00:00			`// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`log.Printf("Detecting GPU type")`
			`gpuHandles = &handles{nil, nil}`
			`var resp C.cuda_init_resp_t`
			`C.cuda_init(&resp)`
			`if resp.err != nil {`
			`log.Printf("CUDA not detected: %s", C.GoString(resp.err))`
			`C.free(unsafe.Pointer(resp.err))`

			`var resp C.rocm_init_resp_t`
			`C.rocm_init(&resp)`
			`if resp.err != nil {`
			`log.Printf("ROCm not detected: %s", C.GoString(resp.err))`
			`C.free(unsafe.Pointer(resp.err))`
			`} else {`
			`log.Printf("Radeon GPU detected")`
			`rocm := resp.rh`
			`gpuHandles.rocm = &rocm`
			`}`
			`} else {`
			`log.Printf("Nvidia GPU detected")`
			`cuda := resp.ch`
			`gpuHandles.cuda = &cuda`
			`}`
			`}`

			`func GetGPUInfo() GpuInfo {`
			`// TODO - consider exploring lspci (and equivalent on windows) to check for`
			`// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries`
			`gpuMutex.Lock()`
			`defer gpuMutex.Unlock()`
			`if gpuHandles == nil {`
			`initGPUHandles()`
			`}`

			`var memInfo C.mem_info_t`
Fix windows system memory lookup This refines the gpu package error handling and fixes a bug with the system memory lookup on windows. 2023-12-22 23:43:31 +00:00			`resp := GpuInfo{}`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`if gpuHandles.cuda != nil {`
			`C.cuda_check_vram(*gpuHandles.cuda, &memInfo)`
Refine build to support CPU only If someone checks out the ollama repo and doesn't install the CUDA library, this will ensure they can build a CPU only version 2023-12-14 01:26:47 +00:00			`if memInfo.err != nil {`
			`log.Printf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err))`
			`C.free(unsafe.Pointer(memInfo.err))`
			`} else {`
Detect very old CUDA GPUs and fall back to CPU If we try to load the CUDA library on an old GPU, it panics and crashes the server. This checks the compute capability before we load the library so we can gracefully fall back to CPU mode. 2024-01-07 05:40:04 +00:00			`// Verify minimum compute capability`
			`var cc C.cuda_compute_capability_t`
			`C.cuda_compute_capability(*gpuHandles.cuda, &cc)`
			`if cc.err != nil {`
			`log.Printf("error looking up CUDA GPU compute capability: %s", C.GoString(cc.err))`
			`C.free(unsafe.Pointer(cc.err))`
			`} else if cc.major >= CudaComputeMajorMin {`
			`log.Printf("CUDA Compute Capability detected: %d.%d", cc.major, cc.minor)`
			`resp.Library = "cuda"`
			`} else {`
			`log.Printf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor)`
			`}`
Refine build to support CPU only If someone checks out the ollama repo and doesn't install the CUDA library, this will ensure they can build a CPU only version 2023-12-14 01:26:47 +00:00			`}`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`} else if gpuHandles.rocm != nil {`
			`C.rocm_check_vram(*gpuHandles.rocm, &memInfo)`
Refine build to support CPU only If someone checks out the ollama repo and doesn't install the CUDA library, this will ensure they can build a CPU only version 2023-12-14 01:26:47 +00:00			`if memInfo.err != nil {`
			`log.Printf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))`
			`C.free(unsafe.Pointer(memInfo.err))`
			`} else {`
Switch windows build to fully dynamic Refactor where we store build outputs, and support a fully dynamic loading model on windows so the base executable has no special dependencies thus doesn't require a special PATH. 2023-12-23 19:35:44 +00:00			`resp.Library = "rocm"`
Refine build to support CPU only If someone checks out the ollama repo and doesn't install the CUDA library, this will ensure they can build a CPU only version 2023-12-14 01:26:47 +00:00			`}`
			`}`
Switch windows build to fully dynamic Refactor where we store build outputs, and support a fully dynamic loading model on windows so the base executable has no special dependencies thus doesn't require a special PATH. 2023-12-23 19:35:44 +00:00			`if resp.Library == "" {`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`C.cpu_check_ram(&memInfo)`
Revamp the dynamic library shim This switches the default llama.cpp to be CPU based, and builds the GPU variants as dynamically loaded libraries which we can select at runtime. This also bumps the ROCm library to version 6 given 5.7 builds don't work on the latest ROCm library that just shipped. 2023-12-20 18:36:01 +00:00			`// In the future we may offer multiple CPU variants to tune CPU features`
Switch windows build to fully dynamic Refactor where we store build outputs, and support a fully dynamic loading model on windows so the base executable has no special dependencies thus doesn't require a special PATH. 2023-12-23 19:35:44 +00:00			`if runtime.GOOS == "windows" {`
			`resp.Library = "cpu"`
			`} else {`
			`resp.Library = "default"`
			`}`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`
			`if memInfo.err != nil {`
Refine build to support CPU only If someone checks out the ollama repo and doesn't install the CUDA library, this will ensure they can build a CPU only version 2023-12-14 01:26:47 +00:00			`log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`C.free(unsafe.Pointer(memInfo.err))`
Refine build to support CPU only If someone checks out the ollama repo and doesn't install the CUDA library, this will ensure they can build a CPU only version 2023-12-14 01:26:47 +00:00			`return resp`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`
calculate overhead based number of gpu devices (#1875) 2024-01-09 20:53:33 +00:00
			`resp.DeviceCount = uint32(memInfo.count)`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`resp.FreeMemory = uint64(memInfo.free)`
			`resp.TotalMemory = uint64(memInfo.total)`
			`return resp`
			`}`

Fix windows system memory lookup This refines the gpu package error handling and fixes a bug with the system memory lookup on windows. 2023-12-22 23:43:31 +00:00			`func getCPUMem() (memInfo, error) {`
			`var ret memInfo`
			`var info C.mem_info_t`
			`C.cpu_check_ram(&info)`
			`if info.err != nil {`
			`defer C.free(unsafe.Pointer(info.err))`
			`return ret, fmt.Errorf(C.GoString(info.err))`
			`}`
			`ret.FreeMemory = uint64(info.free)`
			`ret.TotalMemory = uint64(info.total)`
			`return ret, nil`
			`}`

Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`func CheckVRAM() (int64, error) {`
			`gpuInfo := GetGPUInfo()`
Switch windows build to fully dynamic Refactor where we store build outputs, and support a fully dynamic loading model on windows so the base executable has no special dependencies thus doesn't require a special PATH. 2023-12-23 19:35:44 +00:00			`if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" \|\| gpuInfo.Library == "rocm") {`
update rough cuda overhead estimate to 15% + 384MiB 2024-01-09 16:47:30 +00:00			`// leave 10% or 384Mi of VRAM free for unaccounted for overhead`
calculate overhead based number of gpu devices (#1875) 2024-01-09 20:53:33 +00:00			`overhead := gpuInfo.FreeMemory * uint64(gpuInfo.DeviceCount) / 10`
update rough cuda overhead estimate to 15% + 384MiB 2024-01-09 16:47:30 +00:00			`if overhead < 384*1024*1024 {`
			`overhead = 384 * 1024 * 1024`
			`}`
			`return int64(gpuInfo.FreeMemory - overhead), nil`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`

Offload layers to GPU based on new model size estimates (#1850) * select layers based on estimated model memory usage * always account for scratch vram * dont load +1 layers * better estmation for graph alloc * Update gpu/gpu_darwin.go Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com> * Update llm/llm.go Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com> * Update llm/llm.go * add overhead for cuda memory * Update llm/llm.go Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com> * fix build error on linux * address comments --------- Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com> 2024-01-08 21:42:00 +00:00			`return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`