2023-11-29 19:00:37 +00:00
|
|
|
#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?
|
|
|
|
|
|
|
|
#include "gpu_info_cuda.h"
|
|
|
|
|
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
#ifndef _WIN32
|
|
|
|
const char *cuda_lib_paths[] = {
|
|
|
|
"libnvidia-ml.so",
|
|
|
|
"/usr/local/cuda/lib64/libnvidia-ml.so",
|
2023-12-19 23:52:34 +00:00
|
|
|
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
|
2024-01-06 02:10:34 +00:00
|
|
|
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1",
|
2023-12-16 04:16:02 +00:00
|
|
|
"/usr/lib/wsl/lib/libnvidia-ml.so.1", // TODO Maybe glob?
|
2023-11-29 19:00:37 +00:00
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
#else
|
|
|
|
const char *cuda_lib_paths[] = {
|
|
|
|
"nvml.dll",
|
|
|
|
"",
|
|
|
|
NULL,
|
|
|
|
};
|
|
|
|
#endif
|
|
|
|
|
2024-01-07 05:40:04 +00:00
|
|
|
// Number of entries in the NVML symbol lookup table built in cuda_init().
// Must match the initializer list there; see the TODO about switching to a
// NULL-terminated list instead of a fixed count.
#define CUDA_LOOKUP_SIZE 6
|
2024-01-05 16:25:58 +00:00
|
|
|
|
2023-11-29 19:00:37 +00:00
|
|
|
// Locate and load the NVML shared library, resolve the entry points needed
// for GPU discovery, and initialize NVML.
//
// On success resp->err is NULL and resp->ch holds a live library handle plus
// resolved function pointers. On failure resp->err is set to a malloc'd
// message (caller must free) and resp->ch.handle is NULL.
void cuda_init(cuda_init_resp_t *resp) {
  nvmlReturn_t ret;
  resp->err = NULL;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  // Symbol name -> destination function-pointer slot in resp->ch.
  struct lookup {
    char *s;   // NVML symbol name
    void **p;  // where the resolved address is stored
  } l[CUDA_LOOKUP_SIZE] = {
      {"nvmlInit_v2", (void *)&resp->ch.initFn},
      {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
  };

  // Don't rely on the caller having zeroed resp: the loop condition below
  // reads ch.handle before the first assignment.
  resp->ch.handle = NULL;
  for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
    resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
  }
  if (!resp->ch.handle) {
    // TODO improve error message, as the LOAD_ERR will have typically have the
    // final path that was checked which might be confusing.
    char *msg = LOAD_ERR();
    snprintf(buf, buflen,
             "Unable to load %s library to query for Nvidia GPUs: %s",
             cuda_lib_paths[0], msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list
    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    // Bug fix: the original tested l[i].p — the address of a struct member,
    // which is never NULL — so a failed symbol lookup went undetected and the
    // NULL function pointer was called later. Test the resolved symbol itself.
    if (!*(l[i].p)) {
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      char *msg = LOAD_ERR();
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
               msg);
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }

  ret = (*resp->ch.initFn)();
  if (ret != NVML_SUCCESS) {
    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
    resp->err = strdup(buf);
  }

  return;
}
|
|
|
|
|
|
|
|
// Sum total and free VRAM across every NVML-visible GPU into resp->total /
// resp->free (bytes, per nvmlMemory_t). Requires a handle previously set up
// by cuda_init(). On failure resp->err is set to a malloc'd message (caller
// frees) and the partial totals in resp should not be trusted.
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
  resp->err = NULL;
  nvmlDevice_t device;
  nvmlMemory_t memInfo = {0};
  nvmlReturn_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  if (h.handle == NULL) {
    // Bug fix: error string previously read "sn't" — a typo for "isn't".
    resp->err = strdup("nvml handle isn't initialized");
    return;
  }

  unsigned int devices;
  ret = (*h.getCount)(&devices);
  if (ret != NVML_SUCCESS) {
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  // Accumulate across all devices.
  resp->total = 0;
  resp->free = 0;

  for (i = 0; i < devices; i++) {
    ret = (*h.getHandle)(i, &device);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    ret = (*h.getMemInfo)(device, &memInfo);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    resp->total += memInfo.total;
    resp->free += memInfo.free;
  }
}
|
2024-01-07 05:40:04 +00:00
|
|
|
|
|
|
|
// Query the CUDA compute capability of every NVML-visible GPU and report the
// lowest major.minor found in resp->major / resp->minor, since the weakest
// device limits what the whole system can run. Requires a handle set up by
// cuda_init(). On failure resp->err holds a malloc'd message (caller frees).
void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
  resp->err = NULL;
  resp->major = 0;
  resp->minor = 0;
  nvmlDevice_t device;
  int major = 0;
  int minor = 0;
  nvmlReturn_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  if (h.handle == NULL) {
    resp->err = strdup("nvml handle not initialized");
    return;
  }

  unsigned int devices;
  ret = (*h.getCount)(&devices);
  if (ret != NVML_SUCCESS) {
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; i < devices; i++) {
    ret = (*h.getHandle)(i, &device);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    ret = (*h.getComputeCapability)(device, &major, &minor);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    // Keep the minimum: resp->major == 0 means "nothing recorded yet".
    int unset = (resp->major == 0);
    if (unset || major < resp->major) {
      // Strictly lower major (or first device): take both components.
      resp->major = major;
      resp->minor = minor;
    } else if (major == resp->major && minor < resp->minor) {
      // Same major, lower minor: tighten the minor only.
      resp->minor = minor;
    }
  }
}
|
2023-11-29 19:00:37 +00:00
|
|
|
#endif // __APPLE__
|