ollama/gpu/gpu_info_cuda.c

#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?

#include "gpu_info_cuda.h"

#include <string.h>

#define CUDA_LOOKUP_SIZE 6

// Load the NVML management library found at cuda_lib_path, resolve every
// entry point this file uses into resp->ch, and call nvmlInit_v2.
//
// cuda_lib_path: path handed to LOAD_LIBRARY.
// resp:          output. On success resp->err is NULL and resp->ch.handle is
//                the loaded library. On any failure the library is unloaded,
//                resp->ch.handle is reset to NULL and resp->err is set to a
//                strdup()'d message (heap allocated; presumably released by
//                the caller).
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
  nvmlReturn_t ret;
  resp->err = NULL;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  // Table pairing each NVML symbol name with the slot in resp->ch that
  // receives its address.
  struct lookup {
    char *s;
    void **p;
  } l[CUDA_LOOKUP_SIZE] = {
      {"nvmlInit_v2", (void *)&resp->ch.initFn},
      {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
  };

  resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
  if (!resp->ch.handle) {
    char *msg = LOAD_ERR();
    snprintf(buf, buflen,
             "Unable to load %s library to query for Nvidia GPUs: %s",
             cuda_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; i < CUDA_LOOKUP_SIZE; i++) {  // TODO - fix this to use a null terminated list
    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    // Test the resolved symbol itself; l[i].p is the address of the
    // destination slot and is never NULL, so checking it would let a missing
    // symbol slip through and be called later as a NULL function pointer.
    if (!*l[i].p) {
      // Fetch the loader's error text before unloading, since the unload call
      // may overwrite or clear the pending error state.
      char *msg = LOAD_ERR();
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
               msg);
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }

  ret = (*resp->ch.initFn)();
  if (ret != NVML_SUCCESS) {
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
    resp->err = strdup(buf);
  }

  return;
}

// Query NVML for the number of devices and accumulate their total and free
// memory into resp.
//
// h:    loaded library handle plus the NVML function pointers used below
//       (getCount, getHandle, getMemInfo).
// resp: output. resp->count receives the device count and resp->total /
//       resp->free the sums of memInfo.total / memInfo.free over all devices.
//       On failure resp->err is set to a strdup()'d message and the sums
//       reflect only the devices processed before the error; on success
//       resp->err is NULL.
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
  resp->err = NULL;
  // Give every output a defined value up front so that early error returns
  // never leave the caller looking at indeterminate fields.
  resp->count = 0;
  resp->total = 0;
  resp->free = 0;
  nvmlDevice_t device;
  nvmlMemory_t memInfo = {0};
  nvmlReturn_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  // Unsigned to match the count written by getCount (called with an
  // unsigned int * elsewhere in this file), avoiding a signed/unsigned compare.
  unsigned int i;

  if (h.handle == NULL) {
    resp->err = strdup("nvml handle isn't initialized");
    return;
  }

  ret = (*h.getCount)(&resp->count);
  if (ret != NVML_SUCCESS) {
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; i < resp->count; i++) {
    ret = (*h.getHandle)(i, &device);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "unable to get device handle %u: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    ret = (*h.getMemInfo)(device, &memInfo);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "device memory info lookup failure %u: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    resp->total += memInfo.total;
    resp->free += memInfo.free;
  }
}

// Report the lowest compute capability (major.minor) found across all
// devices NVML enumerates.
//
// h:    loaded library handle plus the NVML function pointers used below
//       (getCount, getHandle, getComputeCapability).
// resp: output. resp->major / resp->minor start at 0 and end up holding the
//       smallest major.minor pair seen. On failure resp->err is set to a
//       strdup()'d message; on success resp->err is NULL. With zero devices
//       both fields stay 0.
void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
  resp->err = NULL;
  resp->major = 0;
  resp->minor = 0;
  nvmlDevice_t device;
  int major = 0;
  int minor = 0;
  nvmlReturn_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  unsigned int devices;
  // Same type as `devices` so the loop bound comparison is not mixed-sign.
  unsigned int i;

  if (h.handle == NULL) {
    resp->err = strdup("nvml handle not initialized");
    return;
  }

  ret = (*h.getCount)(&devices);
  if (ret != NVML_SUCCESS) {
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; i < devices; i++) {
    ret = (*h.getHandle)(i, &device);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "unable to get device handle %u: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    ret = (*h.getComputeCapability)(device, &major, &minor);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "device compute capability lookup failure %u: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }
    // Report the lowest major.minor we detect as that limits our compatibility.
    // resp->major == 0 means nothing has been recorded yet.
    if (resp->major == 0 || resp->major > major ) {
      resp->major = major;
      resp->minor = minor;
    } else if ( resp->major == major && resp->minor > minor ) {
      // Same major version: keep the smaller minor.
      resp->minor = minor;
    }
  }
}
#endif  // __APPLE__
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?`

			`#include "gpu_info_cuda.h"`

			`#include <string.h>`

Detect very old CUDA GPUs and fall back to CPU If we try to load the CUDA library on an old GPU, it panics and crashes the server. This checks the compute capability before we load the library so we can gracefully fall back to CPU mode. 2024-01-07 05:40:04 +00:00			`#define CUDA_LOOKUP_SIZE 6`
gpu: read memory info from all cuda devices (#1802) * gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments 2024-01-05 16:25:58 +00:00
Harden GPU mgmt library lookup When there are multiple management libraries installed on a system not every one will be compatible with the current driver. This change improves our management library algorithm to build up a set of discovered libraries based on glob patterns, and then try all of them until we're able to load one without error. 2024-01-10 22:39:51 +00:00			`void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {`
Refine build to support CPU only If someone checks out the ollama repo and doesn't install the CUDA library, this will ensure they can build a CPU only version 2023-12-14 01:26:47 +00:00			`nvmlReturn_t ret;`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`resp->err = NULL;`
			`const int buflen = 256;`
			`char buf[buflen + 1];`
			`int i;`

			`struct lookup {`
			`char *s;`
			`void **p;`
gpu: read memory info from all cuda devices (#1802) * gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments 2024-01-05 16:25:58 +00:00			`} l[CUDA_LOOKUP_SIZE] = {`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`{"nvmlInit_v2", (void *)&resp->ch.initFn},`
			`{"nvmlShutdown", (void *)&resp->ch.shutdownFn},`
			`{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},`
			`{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},`
gpu: read memory info from all cuda devices (#1802) * gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments 2024-01-05 16:25:58 +00:00			`{"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},`
Detect very old CUDA GPUs and fall back to CPU If we try to load the CUDA library on an old GPU, it panics and crashes the server. This checks the compute capability before we load the library so we can gracefully fall back to CPU mode. 2024-01-07 05:40:04 +00:00			`{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`};`

Harden GPU mgmt library lookup When there are multiple management libraries installed on a system not every one will be compatible with the current driver. This change improves our management library algorithm to build up a set of discovered libraries based on glob patterns, and then try all of them until we're able to load one without error. 2024-01-10 22:39:51 +00:00			`resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`if (!resp->ch.handle) {`
Fix windows system memory lookup This refines the gpu package error handling and fixes a bug with the system memory lookup on windows. 2023-12-22 23:43:31 +00:00			`char *msg = LOAD_ERR();`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`snprintf(buf, buflen,`
			`"Unable to load %s library to query for Nvidia GPUs: %s",`
Harden GPU mgmt library lookup When there are multiple management libraries installed on a system not every one will be compatible with the current driver. This change improves our management library algorithm to build up a set of discovered libraries based on glob patterns, and then try all of them until we're able to load one without error. 2024-01-10 22:39:51 +00:00			`cuda_lib_path, msg);`
Fix windows system memory lookup This refines the gpu package error handling and fixes a bug with the system memory lookup on windows. 2023-12-22 23:43:31 +00:00			`free(msg);`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`resp->err = strdup(buf);`
			`return;`
			`}`

gpu: read memory info from all cuda devices (#1802) * gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments 2024-01-05 16:25:58 +00:00			`for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);`
			`if (!l[i].p) {`
			`UNLOAD_LIBRARY(resp->ch.handle);`
			`resp->ch.handle = NULL;`
Fix windows system memory lookup This refines the gpu package error handling and fixes a bug with the system memory lookup on windows. 2023-12-22 23:43:31 +00:00			`char *msg = LOAD_ERR();`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,`
Fix windows system memory lookup This refines the gpu package error handling and fixes a bug with the system memory lookup on windows. 2023-12-22 23:43:31 +00:00			`msg);`
			`free(msg);`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`resp->err = strdup(buf);`
			`return;`
			`}`
			`}`
Refine build to support CPU only If someone checks out the ollama repo and doesn't install the CUDA library, this will ensure they can build a CPU only version 2023-12-14 01:26:47 +00:00
			`ret = (*resp->ch.initFn)();`
			`if (ret != NVML_SUCCESS) {`
Harden GPU mgmt library lookup When there are multiple management libraries installed on a system not every one will be compatible with the current driver. This change improves our management library algorithm to build up a set of discovered libraries based on glob patterns, and then try all of them until we're able to load one without error. 2024-01-10 22:39:51 +00:00			`UNLOAD_LIBRARY(resp->ch.handle);`
			`resp->ch.handle = NULL;`
Refine build to support CPU only If someone checks out the ollama repo and doesn't install the CUDA library, this will ensure they can build a CPU only version 2023-12-14 01:26:47 +00:00			`snprintf(buf, buflen, "nvml vram init failure: %d", ret);`
			`resp->err = strdup(buf);`
			`}`

Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`return;`
			`}`

			`void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {`
			`resp->err = NULL;`
			`nvmlDevice_t device;`
			`nvmlMemory_t memInfo = {0};`
			`nvmlReturn_t ret;`
			`const int buflen = 256;`
			`char buf[buflen + 1];`
			`int i;`

			`if (h.handle == NULL) {`
			`resp->err = strdup("nvml handle sn't initialized");`
			`return;`
			`}`

calculate overhead based number of gpu devices (#1875) 2024-01-09 20:53:33 +00:00			`ret = (*h.getCount)(&resp->count);`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`if (ret != NVML_SUCCESS) {`
gpu: read memory info from all cuda devices (#1802) * gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments 2024-01-05 16:25:58 +00:00			`snprintf(buf, buflen, "unable to get device count: %d", ret);`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`resp->err = strdup(buf);`
			`return;`
			`}`

gpu: read memory info from all cuda devices (#1802) * gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments 2024-01-05 16:25:58 +00:00			`resp->total = 0;`
			`resp->free = 0;`
calculate overhead based number of gpu devices (#1875) 2024-01-09 20:53:33 +00:00			`for (i = 0; i < resp->count; i++) {`
gpu: read memory info from all cuda devices (#1802) * gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments 2024-01-05 16:25:58 +00:00			`ret = (*h.getHandle)(i, &device);`
			`if (ret != NVML_SUCCESS) {`
			`snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);`
			`resp->err = strdup(buf);`
			`return;`
			`}`

			`ret = (*h.getMemInfo)(device, &memInfo);`
			`if (ret != NVML_SUCCESS) {`
			`snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);`
			`resp->err = strdup(buf);`
			`return;`
			`}`

			`resp->total += memInfo.total;`
			`resp->free += memInfo.free;`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`
			`}`
Detect very old CUDA GPUs and fall back to CPU If we try to load the CUDA library on an old GPU, it panics and crashes the server. This checks the compute capability before we load the library so we can gracefully fall back to CPU mode. 2024-01-07 05:40:04 +00:00
			`void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {`
			`resp->err = NULL;`
			`resp->major = 0;`
			`resp->minor = 0;`
			`nvmlDevice_t device;`
			`int major = 0;`
			`int minor = 0;`
			`nvmlReturn_t ret;`
			`const int buflen = 256;`
			`char buf[buflen + 1];`
			`int i;`

			`if (h.handle == NULL) {`
			`resp->err = strdup("nvml handle not initialized");`
			`return;`
			`}`

			`unsigned int devices;`
			`ret = (*h.getCount)(&devices);`
			`if (ret != NVML_SUCCESS) {`
			`snprintf(buf, buflen, "unable to get device count: %d", ret);`
			`resp->err = strdup(buf);`
			`return;`
			`}`

			`for (i = 0; i < devices; i++) {`
			`ret = (*h.getHandle)(i, &device);`
			`if (ret != NVML_SUCCESS) {`
			`snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);`
			`resp->err = strdup(buf);`
			`return;`
			`}`

			`ret = (*h.getComputeCapability)(device, &major, &minor);`
			`if (ret != NVML_SUCCESS) {`
			`snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);`
			`resp->err = strdup(buf);`
			`return;`
			`}`
			`// Report the lowest major.minor we detect as that limits our compatibility`
			`if (resp->major == 0 || resp->major > major ) {`
			`resp->major = major;`
			`resp->minor = minor;`
			`} else if ( resp->major == major && resp->minor > minor ) {`
			`resp->minor = minor;`
			`}`
			`}`
			`}`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`#endif // __APPLE__`