ollama/gpu/gpu_info_cudart.c

#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?

#include <string.h>
#include "gpu_info_cudart.h"

// Load the CUDA runtime library at cudart_lib_path, resolve the entry points
// listed below into resp->ch, select device 0 and query the device count.
//
// On success resp->err is NULL, resp->ch.handle refers to the loaded library
// and resp->num_devices holds the value written by cudaGetDeviceCount.
// On any failure the library is unloaded (if it had been loaded),
// resp->ch.handle is set to NULL and resp->err points to a message allocated
// with strdup().
void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
  cudartReturn_t ret;
  resp->err = NULL;
  resp->num_devices = 0;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  // Table of symbol names and the function-pointer slots in resp->ch that
  // receive them; terminated by a NULL name.
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"cudaSetDevice", (void *)&resp->ch.cudaSetDevice},
      {"cudaDeviceSynchronize", (void *)&resp->ch.cudaDeviceSynchronize},
      {"cudaDeviceReset", (void *)&resp->ch.cudaDeviceReset},
      {"cudaMemGetInfo", (void *)&resp->ch.cudaMemGetInfo},
      {"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},
      {"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},
      {"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},
      {"cudaGetDeviceProperties", (void *)&resp->ch.cudaGetDeviceProperties},
      {NULL, NULL},
  };

  resp->ch.handle = LOAD_LIBRARY(cudart_lib_path, RTLD_LAZY);
  if (!resp->ch.handle) {
    char *msg = LOAD_ERR();
    LOG(resp->ch.verbose, "library %s load err: %s\n", cudart_lib_path, msg);
    snprintf(buf, buflen,
            "Unable to load %s library to query for Nvidia GPUs: %s",
            cudart_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; l[i].s != NULL; i++) {
    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    // Test the resolved symbol, not the slot address: l[i].p always points at
    // a member of resp->ch and is therefore never NULL.
    if (!*(l[i].p)) {
      char *msg = LOAD_ERR();
      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
              msg);
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }

  ret = (*resp->ch.cudaSetDevice)(0);
  if (ret != CUDART_SUCCESS) {
    LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    // Give a actionable message for the insufficient-driver case instead of
    // a bare numeric code.
    if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
      resp->err = strdup("your nvidia driver is too old or missing.  If you have a CUDA GPU please upgrade to run ollama");
      return;
    }
    snprintf(buf, buflen, "cudart init failure: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  int version = 0;
  cudartDriverVersion_t driverVersion;
  driverVersion.major = 0;
  driverVersion.minor = 0;

  // Report driver version if we're in verbose mode, ignore errors
  ret = (*resp->ch.cudaDriverGetVersion)(&version);
  if (ret != CUDART_SUCCESS) {
    LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);
  } else {
    // version is split as major = version / 1000, minor = remainder / 10.
    driverVersion.major = version / 1000;
    driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;
    LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);
  }

  ret = (*resp->ch.cudaGetDeviceCount)(&resp->num_devices);
  if (ret != CUDART_SUCCESS) {
    LOG(resp->ch.verbose, "cudaGetDeviceCount err: %d\n", ret);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }
}


// Select device i through the handle h and fill resp with that device's
// identifier, compute capability and memory figures.
//
// resp->gpu_id is the formatted UUID ("GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx")
// when the device properties lookup succeeds and the UUID is not all zero;
// otherwise it is the decimal device index.  resp->major/minor are taken from
// the device properties, or 0 when the lookup fails.
//
// On failure resp->err points to a message allocated with strdup() and the
// memory fields are left unwritten; on success resp->err is NULL.
void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
  resp->err = NULL;
  cudartMemory_t memInfo = {0,0,0};
  cudartReturn_t ret;
  const int buflen = 256;
  char buf[buflen + 1];

  if (h.handle == NULL) {
    resp->err = strdup("cudart handle isn't initialized");
    return;
  }

  ret = (*h.cudaSetDevice)(i);
  if (ret != CUDART_SUCCESS) {
    resp->err = strdup("cudart device failed to initialize");
    return;
  }

  cudaDeviceProp_t props;
  ret = (*h.cudaGetDeviceProperties)(&props, i);
  if (ret != CUDART_SUCCESS) {
    // Not fatal: fall back to the device index as the id and report an
    // unknown (0.0) compute capability.
    LOG(h.verbose, "[%d] device properties lookup failure: %d\n", i, ret);
    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
    resp->major = 0;
    resp->minor = 0;
  } else {
    // An all-zero UUID is treated as "no UUID available".
    int allNull = 1;
    for (int j = 0; j < 16; j++) {
      if (props.uuid.bytes[j] != 0) {
        allNull = 0;
        break;
      }
    }
    if (allNull != 0) {
      snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
    } else {
      // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
      snprintf(&resp->gpu_id[0], GPU_ID_LEN,
          "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
          props.uuid.bytes[0],
          props.uuid.bytes[1],
          props.uuid.bytes[2],
          props.uuid.bytes[3],
          props.uuid.bytes[4],
          props.uuid.bytes[5],
          props.uuid.bytes[6],
          props.uuid.bytes[7],
          props.uuid.bytes[8],
          props.uuid.bytes[9],
          props.uuid.bytes[10],
          props.uuid.bytes[11],
          props.uuid.bytes[12],
          props.uuid.bytes[13],
          props.uuid.bytes[14],
          props.uuid.bytes[15]
        );
    }
    resp->major = props.major;
    resp->minor = props.minor;

    // TODO add other useful properties from props
  }
  ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);
  if (ret != CUDART_SUCCESS) {
    snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);
    resp->err = strdup(buf);
    return;
  }

  // cudaMemGetInfo only fills in free and total, so derive the used amount
  // here; otherwise it would always be reported as 0.  Clamp to 0 rather than
  // wrapping if free is ever reported larger than total.
  memInfo.used = (memInfo.total > memInfo.free) ? (memInfo.total - memInfo.free) : 0;

  resp->total = memInfo.total;
  resp->free = memInfo.free;
  resp->used = memInfo.used;

  // NOTE(review): %lu assumes the mem_info_t memory fields are unsigned long;
  // confirm against the header (matters where long is narrower than the field).
  LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
  LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
  LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used);
  LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
}

// Unload the CUDA runtime library referenced by h.
//
// h is passed by value, so this function cannot clear the caller's copy of
// the handle; the caller should treat its handle as invalid after this call.
// A NULL handle (as left by a failed cudart_init) is logged and otherwise
// ignored rather than passed to UNLOAD_LIBRARY.
void cudart_release(cudart_handle_t h) {
  LOG(h.verbose, "releasing cudart library\n");
  if (h.handle == NULL) {
    return;
  }
  UNLOAD_LIBRARY(h.handle);
}

#endif  // __APPLE__
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`#ifndef __APPLE__ // TODO - maybe consider nvidia support on intel macs?`

			`#include <string.h>`
			`#include "gpu_info_cudart.h"`

			`void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {`
			`cudartReturn_t ret;`
			`resp->err = NULL;`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`resp->num_devices = 0;`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`const int buflen = 256;`
			`char buf[buflen + 1];`
			`int i;`

			`struct lookup {`
			`char *s;`
			`void **p;`
			`} l[] = {`
			`{"cudaSetDevice", (void *)&resp->ch.cudaSetDevice},`
			`{"cudaDeviceSynchronize", (void *)&resp->ch.cudaDeviceSynchronize},`
			`{"cudaDeviceReset", (void *)&resp->ch.cudaDeviceReset},`
			`{"cudaMemGetInfo", (void *)&resp->ch.cudaMemGetInfo},`
			`{"cudaGetDeviceCount", (void *)&resp->ch.cudaGetDeviceCount},`
			`{"cudaDeviceGetAttribute", (void *)&resp->ch.cudaDeviceGetAttribute},`
			`{"cudaDriverGetVersion", (void *)&resp->ch.cudaDriverGetVersion},`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`{"cudaGetDeviceProperties", (void *)&resp->ch.cudaGetDeviceProperties},`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`{NULL, NULL},`
			`};`

			`resp->ch.handle = LOAD_LIBRARY(cudart_lib_path, RTLD_LAZY);`
			`if (!resp->ch.handle) {`
			`char *msg = LOAD_ERR();`
			`LOG(resp->ch.verbose, "library %s load err: %s\n", cudart_lib_path, msg);`
			`snprintf(buf, buflen,`
			`"Unable to load %s library to query for Nvidia GPUs: %s",`
			`cudart_lib_path, msg);`
			`free(msg);`
			`resp->err = strdup(buf);`
			`return;`
			`}`

			`for (i = 0; l[i].s != NULL; i++) {`
			`*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);`
			`if (!l[i].p) {`
			`char *msg = LOAD_ERR();`
			`LOG(resp->ch.verbose, "dlerr: %s\n", msg);`
			`UNLOAD_LIBRARY(resp->ch.handle);`
			`resp->ch.handle = NULL;`
			`snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,`
			`msg);`
			`free(msg);`
			`resp->err = strdup(buf);`
			`return;`
			`}`
			`}`

			`ret = (*resp->ch.cudaSetDevice)(0);`
			`if (ret != CUDART_SUCCESS) {`
			`LOG(resp->ch.verbose, "cudaSetDevice err: %d\n", ret);`
			`UNLOAD_LIBRARY(resp->ch.handle);`
			`resp->ch.handle = NULL;`
Detect too-old cuda driver "cudart init failure: 35" isn't particularly helpful in the logs. 2024-03-28 16:27:17 +00:00			`if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");`
Detect too-old cuda driver "cudart init failure: 35" isn't particularly helpful in the logs. 2024-03-28 16:27:17 +00:00			`return;`
			`}`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`snprintf(buf, buflen, "cudart init failure: %d", ret);`
			`resp->err = strdup(buf);`
			`return;`
			`}`

			`int version = 0;`
			`cudartDriverVersion_t driverVersion;`
			`driverVersion.major = 0;`
			`driverVersion.minor = 0;`

			`// Report driver version if we're in verbose mode, ignore errors`
			`ret = (*resp->ch.cudaDriverGetVersion)(&version);`
			`if (ret != CUDART_SUCCESS) {`
			`LOG(resp->ch.verbose, "cudaDriverGetVersion failed: %d\n", ret);`
			`} else {`
			`driverVersion.major = version / 1000;`
			`driverVersion.minor = (version - (driverVersion.major * 1000)) / 10;`
			`LOG(resp->ch.verbose, "CUDA driver version: %d-%d\n", driverVersion.major, driverVersion.minor);`
			`}`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00
			`ret = (*resp->ch.cudaGetDeviceCount)(&resp->num_devices);`
			`if (ret != CUDART_SUCCESS) {`
			`LOG(resp->ch.verbose, "cudaGetDeviceCount err: %d\n", ret);`
			`UNLOAD_LIBRARY(resp->ch.handle);`
			`resp->ch.handle = NULL;`
			`snprintf(buf, buflen, "unable to get device count: %d", ret);`
			`resp->err = strdup(buf);`
			`return;`
			`}`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`}`


Refine GPU discovery to bootstrap once Now that we call the GPU discovery routines many times to update memory, this splits initial discovery from free memory updating. 2024-05-15 22:13:16 +00:00			`void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`resp->err = NULL;`
			`cudartMemory_t memInfo = {0,0,0};`
			`cudartReturn_t ret;`
			`const int buflen = 256;`
			`char buf[buflen + 1];`

			`if (h.handle == NULL) {`
			`resp->err = strdup("cudart handle isn't initialized");`
			`return;`
			`}`

Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`ret = (*h.cudaSetDevice)(i);`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`if (ret != CUDART_SUCCESS) {`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`snprintf(buf, buflen, "cudart device failed to initialize");`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`resp->err = strdup(buf);`
			`return;`
			`}`

Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`cudaDeviceProp_t props;`
			`ret = (*h.cudaGetDeviceProperties)(&props, i);`
			`if (ret != CUDART_SUCCESS) {`
			`LOG(h.verbose, "[%d] device properties lookup failure: %d\n", i, ret);`
			`snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);`
			`resp->major = 0;`
			`resp->minor = 0;`
			`} else {`
			`int allNull = 1;`
			`for (int j = 0; j < 16; j++) {`
			`if (props.uuid.bytes[j] != 0) {`
			`allNull = 0;`
			`break;`
			`}`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`}`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`if (allNull != 0) {`
			`snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);`
			`} else {`
			`// GPU-d110a105-ac29-1d54-7b49-9c90440f215b`
			`snprintf(&resp->gpu_id[0], GPU_ID_LEN,`
			`"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",`
			`props.uuid.bytes[0],`
			`props.uuid.bytes[1],`
			`props.uuid.bytes[2],`
			`props.uuid.bytes[3],`
			`props.uuid.bytes[4],`
			`props.uuid.bytes[5],`
			`props.uuid.bytes[6],`
			`props.uuid.bytes[7],`
			`props.uuid.bytes[8],`
			`props.uuid.bytes[9],`
			`props.uuid.bytes[10],`
			`props.uuid.bytes[11],`
			`props.uuid.bytes[12],`
			`props.uuid.bytes[13],`
			`props.uuid.bytes[14],`
			`props.uuid.bytes[15]`
			`);`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`}`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`resp->major = props.major;`
			`resp->minor = props.minor;`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`// TODO add other useful properties from props`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`}`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`ret = (*h.cudaMemGetInfo)(&memInfo.free, &memInfo.total);`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`if (ret != CUDART_SUCCESS) {`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`snprintf(buf, buflen, "cudart device memory info lookup failure %d", ret);`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`resp->err = strdup(buf);`
			`return;`
			`}`

Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`resp->total = memInfo.total;`
			`resp->free = memInfo.free;`
Reintroduce nvidia nvml library for windows This library will give us the most reliable free VRAM reporting on windows to enable concurrent model scheduling. 2024-06-03 22:07:50 +00:00			`resp->used = memInfo.used;`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);`
			`LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);`
Reintroduce nvidia nvml library for windows This library will give us the most reliable free VRAM reporting on windows to enable concurrent model scheduling. 2024-06-03 22:07:50 +00:00			`LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used);`
Request and model concurrency This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS. 2024-03-30 16:50:05 +00:00			`LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);`
add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`}`

Release gpu discovery library after use Leaving the cudart library loaded kept ~30m of memory pinned in the GPU in the main process. This change ensures we don't hold GPU resources when idle. 2024-03-30 22:34:21 +00:00			`void cudart_release(cudart_handle_t h) {`
			`LOG(h.verbose, "releasing cudart library\n");`
			`UNLOAD_LIBRARY(h.handle);`
			`h.handle = NULL;`
			`}`

add support for libcudart.so for CUDA devices (adds Jetson support) 2024-03-25 15:07:44 +00:00			`#endif // __APPLE__`