ollama/gpu/gpu_info_cuda.c

#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?

#include "gpu_info_cuda.h"

#include <string.h>

#define CUDA_LOOKUP_SIZE 12

// Load the NVML management library at cuda_lib_path, resolve every entry
// point this file uses into resp->ch, and initialize NVML.
//
// Parameters:
//   cuda_lib_path - path passed to LOAD_LIBRARY.
//   resp          - output. On success resp->err is NULL and resp->ch holds
//                   the library handle plus the resolved function pointers.
//                   On any failure resp->err is a strdup()-allocated message
//                   and resp->ch.handle is NULL (the library is unloaded).
//                   NOTE(review): presumably the caller frees resp->err -
//                   confirm against the caller.
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
  nvmlReturn_t ret;
  resp->err = NULL;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  // Symbol name -> destination function-pointer slot in resp->ch.
  // The table ends with a NULL name so the lookup loop does not depend on a
  // separately maintained element count.
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"nvmlInit_v2", (void *)&resp->ch.initFn},
      {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
      {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
      {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
      {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
      {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
      {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
      {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
      {NULL, NULL},
  };

  resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
  if (!resp->ch.handle) {
    char *msg = LOAD_ERR();
    snprintf(buf, buflen,
             "Unable to load %s library to query for Nvidia GPUs: %s",
             cuda_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; l[i].s != NULL; i++) {
    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    // Test the resolved pointer itself; l[i].p is the address of the slot and
    // is never NULL, so checking it would never detect a missing symbol.
    if (!*l[i].p) {
      // Capture the loader error before unloading: unloading the library may
      // reset or overwrite the loader's pending error state.
      char *msg = LOAD_ERR();
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
               msg);
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }

  ret = (*resp->ch.initFn)();
  if (ret != NVML_SUCCESS) {
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
    resp->err = strdup(buf);
    // The library has just been unloaded, so none of the resolved function
    // pointers may be called past this point.
    return;
  }

  // Report driver version if we're in verbose mode, ignore errors
  ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
  if (ret != NVML_SUCCESS) {
    LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
  } else {
    LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
  }
}

// Sum total and free memory across every device NVML reports.
//
// Parameters:
//   h    - handle structure populated by cuda_init (library handle plus
//          resolved NVML function pointers and the verbose flag).
//   resp - output. On success resp->err is NULL, resp->count is the device
//          count, and resp->total / resp->free are the sums over all devices.
//          On failure resp->err is a strdup()-allocated message; the totals
//          then only cover the devices processed before the failure.
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
  resp->err = NULL;
  // Give the outputs defined values on every path, including early errors.
  resp->count = 0;
  resp->total = 0;
  resp->free = 0;
  nvmlDevice_t device;
  nvmlMemory_t memInfo = {0};
  nvmlReturn_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  // Unsigned to match the device count filled in by getCount (the sibling
  // cuda_compute_capability passes an unsigned int to the same function).
  unsigned int i;

  if (h.handle == NULL) {
    resp->err = strdup("nvml handle isn't initialized");
    return;
  }

  ret = (*h.getCount)(&resp->count);
  if (ret != NVML_SUCCESS) {
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; i < resp->count; i++) {
    ret = (*h.getHandle)(i, &device);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "unable to get device handle %u: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    ret = (*h.getMemInfo)(device, &memInfo);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "device memory info lookup failure %u: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }
    if (h.verbose) {
      nvmlBrandType_t brand = 0;
      // When in verbose mode, report more information about
      // the card we discover, but don't fail on error.
      // These are NVML calls, so their results are compared against
      // NVML_SUCCESS like every other call in this file.
      ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
      if (ret != NVML_SUCCESS) {
        LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%u] CUDA device name: %s\n", i, buf);
      }
      ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
      if (ret != NVML_SUCCESS) {
        LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%u] CUDA part number: %s\n", i, buf);
      }
      ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
      if (ret != NVML_SUCCESS) {
        LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%u] CUDA S/N: %s\n", i, buf);
      }
      ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
      if (ret != NVML_SUCCESS) {
        LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%u] CUDA vbios version: %s\n", i, buf);
      }
      ret = (*h.nvmlDeviceGetBrand)(device, &brand);
      if (ret != NVML_SUCCESS) {
        LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%u] CUDA brand: %d\n", i, (int)brand);
      }
    }

    // Explicit casts keep the format specifiers correct on platforms where
    // long is 32 bits; the label now matches the value printed (free memory).
    LOG(h.verbose, "[%u] CUDA totalMem %llu\n", i,
        (unsigned long long)memInfo.total);
    LOG(h.verbose, "[%u] CUDA freeMem %llu\n", i,
        (unsigned long long)memInfo.free);

    resp->total += memInfo.total;
    resp->free += memInfo.free;
  }
}

// Determine the lowest compute capability (major.minor) among all devices
// NVML reports, since the weakest device bounds what can be used.
//
// Parameters:
//   h    - handle structure carrying the library handle and the resolved
//          NVML function pointers.
//   resp - output. resp->major / resp->minor start at 0 and end up holding
//          the lowest version seen; resp->err is NULL on success or a
//          strdup()-allocated message on failure.
void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
  enum { MSG_LEN = 256 };
  char msg[MSG_LEN + 1];
  nvmlDevice_t dev;
  nvmlReturn_t rc;
  unsigned int num_devices;
  int dev_major = 0;
  int dev_minor = 0;
  int idx = 0;

  resp->err = NULL;
  resp->major = 0;
  resp->minor = 0;

  // Nothing can be queried without a loaded library.
  if (h.handle == NULL) {
    resp->err = strdup("nvml handle not initialized");
    return;
  }

  rc = (*h.getCount)(&num_devices);
  if (rc != NVML_SUCCESS) {
    snprintf(msg, MSG_LEN, "unable to get device count: %d", rc);
    resp->err = strdup(msg);
    return;
  }

  while (idx < num_devices) {
    rc = (*h.getHandle)(idx, &dev);
    if (rc != NVML_SUCCESS) {
      snprintf(msg, MSG_LEN, "unable to get device handle %d: %d", idx, rc);
      resp->err = strdup(msg);
      return;
    }

    rc = (*h.getComputeCapability)(dev, &dev_major, &dev_minor);
    if (rc != NVML_SUCCESS) {
      snprintf(msg, MSG_LEN, "device compute capability lookup failure %d: %d", idx, rc);
      resp->err = strdup(msg);
      return;
    }

    // Keep the smallest major.minor pair; a recorded major of 0 means no
    // device has been recorded yet.
    if (resp->major == 0 || dev_major < resp->major) {
      resp->major = dev_major;
      resp->minor = dev_minor;
    } else if (dev_major == resp->major && dev_minor < resp->minor) {
      resp->minor = dev_minor;
    }

    idx++;
  }
}
#endif  // __APPLE__
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?`

			`#include "gpu_info_cuda.h"`

			`#include <string.h>`

Report more information about GPUs in verbose mode This adds additional calls to both CUDA and ROCm management libraries to discover additional attributes about the GPU(s) detected in the system, and wires up runtime verbosity selection. When users hit problems with GPUs we can ask them to run with `OLLAMA_DEBUG=1 ollama serve` and share the results. 2024-01-23 00:03:32 +00:00			`#define CUDA_LOOKUP_SIZE 12`
gpu: read memory info from all cuda devices (#1802) * gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments 2024-01-05 16:25:58 +00:00
Harden GPU mgmt library lookup When there are multiple management libraries installed on a system not every one will be compatible with the current driver. This change improves our management library algorithm to build up a set of discovered libraries based on glob patterns, and then try all of them until we're able to load one without error. 2024-01-10 22:39:51 +00:00			`void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {`
Refine build to support CPU only If someone checks out the ollama repo and doesn't install the CUDA library, this will ensure they can build a CPU only version 2023-12-14 01:26:47 +00:00			`nvmlReturn_t ret;`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`resp->err = NULL;`
			`const int buflen = 256;`
			`char buf[buflen + 1];`
			`int i;`

			`struct lookup {`
			`char *s;`
			`void **p;`
gpu: read memory info from all cuda devices (#1802) * gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments 2024-01-05 16:25:58 +00:00			`} l[CUDA_LOOKUP_SIZE] = {`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`{"nvmlInit_v2", (void *)&resp->ch.initFn},`
			`{"nvmlShutdown", (void *)&resp->ch.shutdownFn},`
			`{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},`
			`{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},`
gpu: read memory info from all cuda devices (#1802) * gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments 2024-01-05 16:25:58 +00:00			`{"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},`
Detect very old CUDA GPUs and fall back to CPU If we try to load the CUDA library on an old GPU, it panics and crashes the server. This checks the compute capability before we load the library so we can gracefully fall back to CPU mode. 2024-01-07 05:40:04 +00:00			`{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},`
Report more information about GPUs in verbose mode This adds additional calls to both CUDA and ROCm management libraries to discover additional attributes about the GPU(s) detected in the system, and wires up runtime verbosity selection. When users hit problems with GPUs we can ask them to run with `OLLAMA_DEBUG=1 ollama serve` and share the results. 2024-01-23 00:03:32 +00:00			`{"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},`
			`{"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},`
			`{"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},`
			`{"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},`
			`{"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},`
			`{"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`};`

Harden GPU mgmt library lookup When there are multiple management libraries installed on a system not every one will be compatible with the current driver. This change improves our management library algorithm to build up a set of discovered libraries based on glob patterns, and then try all of them until we're able to load one without error. 2024-01-10 22:39:51 +00:00			`resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`if (!resp->ch.handle) {`
Fix windows system memory lookup This refines the gpu package error handling and fixes a bug with the system memory lookup on windows. 2023-12-22 23:43:31 +00:00			`char *msg = LOAD_ERR();`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`snprintf(buf, buflen,`
			`"Unable to load %s library to query for Nvidia GPUs: %s",`
Harden GPU mgmt library lookup When there are multiple management libraries installed on a system not every one will be compatible with the current driver. This change improves our management library algorithm to build up a set of discovered libraries based on glob patterns, and then try all of them until we're able to load one without error. 2024-01-10 22:39:51 +00:00			`cuda_lib_path, msg);`
Fix windows system memory lookup This refines the gpu package error handling and fixes a bug with the system memory lookup on windows. 2023-12-22 23:43:31 +00:00			`free(msg);`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`resp->err = strdup(buf);`
			`return;`
			`}`

gpu: read memory info from all cuda devices (#1802) * gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments 2024-01-05 16:25:58 +00:00			`for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);`
			`if (!l[i].p) {`
			`UNLOAD_LIBRARY(resp->ch.handle);`
			`resp->ch.handle = NULL;`
Fix windows system memory lookup This refines the gpu package error handling and fixes a bug with the system memory lookup on windows. 2023-12-22 23:43:31 +00:00			`char *msg = LOAD_ERR();`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,`
Fix windows system memory lookup This refines the gpu package error handling and fixes a bug with the system memory lookup on windows. 2023-12-22 23:43:31 +00:00			`msg);`
			`free(msg);`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`resp->err = strdup(buf);`
			`return;`
			`}`
			`}`
Refine build to support CPU only If someone checks out the ollama repo and doesn't install the CUDA library, this will ensure they can build a CPU only version 2023-12-14 01:26:47 +00:00
			`ret = (*resp->ch.initFn)();`
			`if (ret != NVML_SUCCESS) {`
Harden GPU mgmt library lookup When there are multiple management libraries installed on a system not every one will be compatible with the current driver. This change improves our management library algorithm to build up a set of discovered libraries based on glob patterns, and then try all of them until we're able to load one without error. 2024-01-10 22:39:51 +00:00			`UNLOAD_LIBRARY(resp->ch.handle);`
			`resp->ch.handle = NULL;`
Refine build to support CPU only If someone checks out the ollama repo and doesn't install the CUDA library, this will ensure they can build a CPU only version 2023-12-14 01:26:47 +00:00			`snprintf(buf, buflen, "nvml vram init failure: %d", ret);`
			`resp->err = strdup(buf);`
			`}`

Report more information about GPUs in verbose mode This adds additional calls to both CUDA and ROCm management libraries to discover additional attributes about the GPU(s) detected in the system, and wires up runtime verbosity selection. When users hit problems with GPUs we can ask them to run with `OLLAMA_DEBUG=1 ollama serve` and share the results. 2024-01-23 00:03:32 +00:00			`// Report driver version if we're in verbose mode, ignore errors`
			`ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);`
			`if (ret != NVML_SUCCESS) {`
			`LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);`
			`} else {`
			`LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);`
			`}`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`

			`void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {`
			`resp->err = NULL;`
			`nvmlDevice_t device;`
			`nvmlMemory_t memInfo = {0};`
			`nvmlReturn_t ret;`
			`const int buflen = 256;`
			`char buf[buflen + 1];`
			`int i;`

			`if (h.handle == NULL) {`
			`resp->err = strdup("nvml handle sn't initialized");`
			`return;`
			`}`

calculate overhead based number of gpu devices (#1875) 2024-01-09 20:53:33 +00:00			`ret = (*h.getCount)(&resp->count);`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`if (ret != NVML_SUCCESS) {`
gpu: read memory info from all cuda devices (#1802) * gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments 2024-01-05 16:25:58 +00:00			`snprintf(buf, buflen, "unable to get device count: %d", ret);`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`resp->err = strdup(buf);`
			`return;`
			`}`

gpu: read memory info from all cuda devices (#1802) * gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments 2024-01-05 16:25:58 +00:00			`resp->total = 0;`
			`resp->free = 0;`
calculate overhead based number of gpu devices (#1875) 2024-01-09 20:53:33 +00:00			`for (i = 0; i < resp->count; i++) {`
gpu: read memory info from all cuda devices (#1802) * gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments 2024-01-05 16:25:58 +00:00			`ret = (*h.getHandle)(i, &device);`
			`if (ret != NVML_SUCCESS) {`
			`snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);`
			`resp->err = strdup(buf);`
			`return;`
			`}`

			`ret = (*h.getMemInfo)(device, &memInfo);`
			`if (ret != NVML_SUCCESS) {`
			`snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);`
			`resp->err = strdup(buf);`
			`return;`
			`}`
Report more information about GPUs in verbose mode This adds additional calls to both CUDA and ROCm management libraries to discover additional attributes about the GPU(s) detected in the system, and wires up runtime verbosity selection. When users hit problems with GPUs we can ask them to run with `OLLAMA_DEBUG=1 ollama serve` and share the results. 2024-01-23 00:03:32 +00:00			`if (h.verbose) {`
			`nvmlBrandType_t brand = 0;`
			`// When in verbose mode, report more information about`
			`// the card we discover, but don't fail on error`
			`ret = (*h.nvmlDeviceGetName)(device, buf, buflen);`
			`if (ret != RSMI_STATUS_SUCCESS) {`
			`LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);`
			`} else {`
			`LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);`
			`}`
			`ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);`
			`if (ret != RSMI_STATUS_SUCCESS) {`
			`LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);`
			`} else {`
			`LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);`
			`}`
			`ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);`
			`if (ret != RSMI_STATUS_SUCCESS) {`
			`LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);`
			`} else {`
			`LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);`
			`}`
			`ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);`
			`if (ret != RSMI_STATUS_SUCCESS) {`
			`LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);`
			`} else {`
			`LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);`
			`}`
			`ret = (*h.nvmlDeviceGetBrand)(device, &brand);`
			`if (ret != RSMI_STATUS_SUCCESS) {`
			`LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);`
			`} else {`
			`LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);`
			`}`
			`}`

			`LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);`
			`LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.free);`
gpu: read memory info from all cuda devices (#1802) * gpu: read memory info from all cuda devices * add `LOOKUP_SIZE` constant * better constant name * address comments 2024-01-05 16:25:58 +00:00
			`resp->total += memInfo.total;`
			`resp->free += memInfo.free;`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`}`
			`}`
Detect very old CUDA GPUs and fall back to CPU If we try to load the CUDA library on an old GPU, it panics and crashes the server. This checks the compute capability before we load the library so we can gracefully fall back to CPU mode. 2024-01-07 05:40:04 +00:00
			`void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {`
			`resp->err = NULL;`
			`resp->major = 0;`
			`resp->minor = 0;`
			`nvmlDevice_t device;`
			`int major = 0;`
			`int minor = 0;`
			`nvmlReturn_t ret;`
			`const int buflen = 256;`
			`char buf[buflen + 1];`
			`int i;`

			`if (h.handle == NULL) {`
			`resp->err = strdup("nvml handle not initialized");`
			`return;`
			`}`

			`unsigned int devices;`
			`ret = (*h.getCount)(&devices);`
			`if (ret != NVML_SUCCESS) {`
			`snprintf(buf, buflen, "unable to get device count: %d", ret);`
			`resp->err = strdup(buf);`
			`return;`
			`}`

			`for (i = 0; i < devices; i++) {`
			`ret = (*h.getHandle)(i, &device);`
			`if (ret != NVML_SUCCESS) {`
			`snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);`
			`resp->err = strdup(buf);`
			`return;`
			`}`

			`ret = (*h.getComputeCapability)(device, &major, &minor);`
			`if (ret != NVML_SUCCESS) {`
			`snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);`
			`resp->err = strdup(buf);`
			`return;`
			`}`
			`// Report the lowest major.minor we detect as that limits our compatibility`
			`if (resp->major == 0 || resp->major > major ) {`
			`resp->major = major;`
			`resp->minor = minor;`
			`} else if ( resp->major == major && resp->minor > minor ) {`
			`resp->minor = minor;`
			`}`
			`}`
			`}`
Adapted rocm support to cgo based llama.cpp 2023-11-29 19:00:37 +00:00			`#endif // __APPLE__`