Refine build to support CPU only

If someone checks out the ollama repo and doesn't have the CUDA library installed, this ensures they can still build a CPU-only version.

parent 51082535e1
commit 1b991d0ba9

9 changed files with 152 additions and 98 deletions
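To inspect the full change locally (hashes from the header above), assuming a checkout of the ollama repo:

```
git show --stat 1b991d0ba9
```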
Dockerfile.cpu (new file, 35 additions)

@@ -0,0 +1,35 @@
+# Dockerfile variant to ensure we can build CPU only on linux
+FROM --platform=linux/amd64 ubuntu:20.04 AS base-cpu-amd64
+ENV CMAKE_ARCH "x86_64"
+
+FROM --platform=linux/arm64 ubuntu:20.04 AS base-cpu-arm64
+ENV CMAKE_ARCH "aarch64"
+
+FROM base-cpu-${TARGETARCH} AS cpu-builder
+ARG TARGETARCH
+ARG GOFLAGS
+ARG CGO_CFLAGS
+
+# Common toolchain
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y wget make gcc-10 g++-10 cpp-10 git ocl-icd-opencl-dev && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 --slave /usr/bin/g++ g++ /usr/bin/g++-10 --slave /usr/bin/gcov gcov /usr/bin/gcov-10
+RUN wget "https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-${CMAKE_ARCH}.sh" -O /tmp/cmake-installer.sh && \
+    chmod +x /tmp/cmake-installer.sh && /tmp/cmake-installer.sh --skip-license --prefix=/usr
+
+# install go
+ADD https://dl.google.com/go/go1.21.3.linux-$TARGETARCH.tar.gz /tmp/go1.21.3.tar.gz
+RUN mkdir -p /usr/local && tar xz -C /usr/local </tmp/go1.21.3.tar.gz
+
+# build the final binary
+FROM cpu-builder AS cpu-build
+WORKDIR /go/src/github.com/jmorganca/ollama
+COPY . .
+
+ENV GOOS=linux
+ENV GOARCH=$TARGETARCH
+ENV GOFLAGS=$GOFLAGS
+ENV CGO_CFLAGS=${CGO_CFLAGS}
+
+RUN /usr/local/go/bin/go generate ./... && \
+    /usr/local/go/bin/go build .
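As a quick smoke test for the new stage, one might build the CPU-only image directly; the tag name `ollama-cpu-test` here is arbitrary, and the build args mirror what scripts/build_linux.sh passes later in this commit:

```
docker buildx build --load --progress=plain --platform=linux/amd64 \
  --build-arg=GOFLAGS -f Dockerfile.cpu -t ollama-cpu-test .
```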
README.md (29 changes)

@@ -200,35 +200,8 @@ Then build the binary:
 go build .
 ```
 
-### Linux/Windows CUDA (NVIDIA)
-*Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
-
-Note: at present, Ollama is optimized for GPU usage on linux, and requires the CUDA libraries at a minimum to compile even if you do not have an NVIDIA GPU.
-
-Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) development and runtime packages.
-Then generate dependencies:
-```
-go generate ./...
-```
-Then build the binary:
-```
-go build .
-```
-
-### Linux ROCm (AMD)
-*Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
-
-Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) developement packages first, as well as `cmake` and `golang`.
-Adjust the paths below (correct for Arch) as appropriate for your distributions install locations and generate dependencies:
-```
-CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate ./...
-```
-Then build the binary:
-```
-go build .
-```
-
-ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
-
+More detailed instructions can be found in the [developer guide](https://github.com/jmorganca/ollama/blob/main/docs/development.md)
+
 ### Running local builds
 Next, start the server:
docs/development.md

@@ -34,6 +34,35 @@ Now you can run `ollama`:
 
 ## Building on Linux with GPU support
 
-- Install cmake and nvidia-cuda-toolkit
-- run `go generate ./...`
-- run `go build .`
+### Linux/Windows CUDA (NVIDIA)
+*Your operating system distribution may already have packages for NVIDIA CUDA. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
+
+Install `cmake` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) development and runtime packages.
+Then generate dependencies:
+```
+go generate ./...
+```
+Then build the binary:
+```
+go build .
+```
+
+### Linux ROCm (AMD)
+*Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!*
+
+Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) developement packages first, as well as `cmake` and `golang`.
+Adjust the paths below (correct for Arch) as appropriate for your distributions install locations and generate dependencies:
+```
+CLBlast_DIR=/usr/lib/cmake/CLBlast ROCM_PATH=/opt/rocm go generate ./...
+```
+Then build the binary:
+```
+go build .
+```
+
+ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
+
+## Containerized Build
+
+If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included.
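With the generate-script change later in this commit, the same two commands also work on a machine with no CUDA toolkit installed; gen_linux.sh falls back to a CPU-only CMake configuration when /usr/local/cuda/lib64 is absent. A minimal sketch of that path, assuming only `cmake`, `gcc`, and `golang` are present:

```
go generate ./...
go build .
```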
gpu/gpu.go (29 changes)

@@ -3,6 +3,9 @@
 package gpu
 
 /*
+#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
+#cgo windows LDFLAGS: -lpthread
+
 #include "gpu_info.h"
 
 */
@@ -26,6 +29,7 @@ var gpuHandles *handles = nil
 
 // Note: gpuMutex must already be held
 func initGPUHandles() {
+    // TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
     log.Printf("Detecting GPU type")
     gpuHandles = &handles{nil, nil}
     var resp C.cuda_init_resp_t
@@ -61,20 +65,32 @@ func GetGPUInfo() GpuInfo {
     }
 
     var memInfo C.mem_info_t
-    var resp GpuInfo
+    resp := GpuInfo{"", 0, 0}
     if gpuHandles.cuda != nil {
         C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
+        if memInfo.err != nil {
+            log.Printf("error looking up CUDA GPU memory: %s", C.GoString(memInfo.err))
+            C.free(unsafe.Pointer(memInfo.err))
+        } else {
             resp.Driver = "CUDA"
+        }
     } else if gpuHandles.rocm != nil {
         C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
-        resp.Driver = "ROCM"
+        if memInfo.err != nil {
+            log.Printf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))
+            C.free(unsafe.Pointer(memInfo.err))
         } else {
+            resp.Driver = "ROCM"
+        }
+    }
+    if resp.Driver == "" {
         C.cpu_check_ram(&memInfo)
         resp.Driver = "CPU"
     }
     if memInfo.err != nil {
-        log.Printf("error looking up GPU memory: %s", C.GoString(memInfo.err))
+        log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))
         C.free(unsafe.Pointer(memInfo.err))
+        return resp
     }
     resp.FreeMemory = uint64(memInfo.free)
     resp.TotalMemory = uint64(memInfo.total)
@@ -108,12 +124,7 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
     // 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
     layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4
 
-    // TODO - not sure on this part... if we can't fit all the layers, just fallback to CPU
-    // if int64(layers) < numLayer {
-    // 	log.Printf("%d MB VRAM available, insufficient to load current model (reuires %d MB) - falling back to CPU %d", freeBytes/(1024*1024), fileSizeBytes/(1024*1024))
-    // 	return 0
-    // }
-    log.Printf("%d MB VRAM available, loading up to %d GPU layers out of %d", info.FreeMemory/(1024*1024), layers, numLayer)
+    log.Printf("%d MB VRAM available, loading up to %d %s GPU layers out of %d", info.FreeMemory/(1024*1024), layers, info.Driver, numLayer)
 
     return layers
 }

gpu/gpu_info_cuda.c

@@ -19,6 +19,7 @@ const char *cuda_lib_paths[] = {
 #endif
 
 void cuda_init(cuda_init_resp_t *resp) {
+  nvmlReturn_t ret;
   resp->err = NULL;
   const int buflen = 256;
   char buf[buflen + 1];
@@ -56,6 +57,13 @@ void cuda_init(cuda_init_resp_t *resp) {
       return;
     }
   }
+
+  ret = (*resp->ch.initFn)();
+  if (ret != NVML_SUCCESS) {
+    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
+    resp->err = strdup(buf);
+  }
+
   return;
 }
@@ -73,17 +81,9 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
     return;
   }
 
-  ret = (*h.initFn)();
-  if (ret != NVML_SUCCESS) {
-    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-
   // TODO - handle multiple GPUs
   ret = (*h.getHandle)(0, &device);
   if (ret != NVML_SUCCESS) {
-    (*h.shutdownFn)();
     snprintf(buf, buflen, "unable to get device handle: %d", ret);
     resp->err = strdup(buf);
     return;
@@ -91,20 +91,12 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
 
   ret = (*h.getMemInfo)(device, &memInfo);
   if (ret != NVML_SUCCESS) {
-    (*h.shutdownFn)();
     snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
     resp->err = strdup(buf);
     return;
   }
   resp->total = memInfo.total;
   resp->free = memInfo.free;
-
-  ret = (*h.shutdownFn)();
-  if (ret != NVML_SUCCESS) {
-    snprintf(buf, buflen, "nvml vram shutdown failure: %d", ret);
-    resp->err = strdup(buf);
-  }
-
   return;
 }
 #endif  // __APPLE__

gpu/gpu_info_rocm.c

@@ -20,6 +20,7 @@ const char *rocm_lib_paths[] = {
 #endif
 
 void rocm_init(rocm_init_resp_t *resp) {
+  rsmi_status_t ret;
   resp->err = NULL;
   const int buflen = 256;
   char buf[buflen + 1];
@@ -56,6 +57,13 @@ void rocm_init(rocm_init_resp_t *resp) {
       return;
     }
   }
+
+  ret = (*resp->rh.initFn)(0);
+  if (ret != RSMI_STATUS_SUCCESS) {
+    snprintf(buf, buflen, "rocm vram init failure: %d", ret);
+    resp->err = strdup(buf);
+  }
+
   return;
 }
@@ -70,10 +78,8 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
   char buf[buflen + 1];
   int i;
 
-  ret = (*h.initFn)(0);
-  if (ret != RSMI_STATUS_SUCCESS) {
-    snprintf(buf, buflen, "rocm vram init failure: %d", ret);
-    resp->err = strdup(buf);
+  if (h.handle == NULL) {
+    resp->err = strdup("nvml handle sn't initialized");
     return;
   }
@@ -89,20 +95,17 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
 
   // Get total memory - used memory for available memory
   ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
   if (ret != RSMI_STATUS_SUCCESS) {
-    (*h.shutdownFn)();
     snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
     resp->err = strdup(buf);
     return;
   }
   ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
   if (ret != RSMI_STATUS_SUCCESS) {
-    (*h.shutdownFn)();
     snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
     resp->err = strdup(buf);
     return;
   }
 
-  (*h.shutdownFn)();
   resp->total = totalMem;
   resp->free = totalMem - usedMem;
   return;
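With the new cgo line, the gpu package links only standard system libraries on linux and resolves NVML/ROCm-SMI through the library-path tables above at runtime, so it should now compile on a machine with neither CUDA nor ROCm installed. A quick check, assuming Go 1.21 and gcc are on the path:

```
go build ./gpu/...
```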
@@ -21,17 +21,7 @@ package llm
 #cgo linux CFLAGS: -D_GNU_SOURCE
 #cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
 #cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/examples/server/libext_server.a
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/common/libcommon.a
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libllama.a
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libggml_static.a
-
-// Note: the following requires cuda library presence on linux to build, even if you only have rocm or CPU only
-#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudart_static.a
-#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublas_static.a
-#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcublasLt_static.a
-#cgo linux LDFLAGS: /usr/local/cuda/lib64/libcudadevrt.a
-#cgo linux LDFLAGS: /usr/local/cuda/lib64/libculibos.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libollama.a
 #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
 #cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincuda/dist/bin
 #cgo windows LDFLAGS: -lext_server_shared -lpthread
llm/generate/gen_linux.sh

@@ -13,28 +13,43 @@ source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
 apply_patches
-CMAKE_DEFS="-DLLAMA_CUBLAS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+if [ -d /usr/local/cuda/lib64/ ] ; then
+    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+else
+    CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+fi
 BUILD_DIR="gguf/build/cuda"
 LIB_DIR="${BUILD_DIR}/lib"
 mkdir -p ../../dist/
 build
-# TODO - explore mechanism to soften the hard cuda dependency on linux
-# by conditionally building some archive here that aggregates the cuda libs if present
-# so that the cgo flags link this intermediate archive instead of the underlying cuda libs
-#
-# gcc -fPIC -g -shared -o ${LIB_DIR}/libcuda_server.so \
-#    -Wl,--whole-archive \
-#    ${BUILD_DIR}/examples/server/CMakeFiles/ext_server.dir/server.cpp.o \
-#    ${BUILD_DIR}/common/libcommon.a \
-#    ${BUILD_DIR}/libllama.a \
-#    ${BUILD_DIR}/examples/llava/libllava_static.a \
-#    -Wl,--no-whole-archive \
-#    -lrt -lpthread -ldl -lstdc++ -lm \
-#    /usr/local/cuda/lib64/libcudart_static.a \
-#    /usr/local/cuda/lib64/libcublas_static.a \
-#    /usr/local/cuda/lib64/libcublasLt_static.a \
-#    /usr/local/cuda/lib64/libcudadevrt.a \
-#    /usr/local/cuda/lib64/libculibos.a
+
+if [ -d /usr/local/cuda/lib64/ ] ; then
+    pwd
+    ar -M <<EOF
+create ${BUILD_DIR}/libollama.a
+addlib ${BUILD_DIR}/examples/server/libext_server.a
+addlib ${BUILD_DIR}/common/libcommon.a
+addlib ${BUILD_DIR}/libllama.a
+addlib ${BUILD_DIR}/libggml_static.a
+addlib /usr/local/cuda/lib64/libcudart_static.a
+addlib /usr/local/cuda/lib64/libcublas_static.a
+addlib /usr/local/cuda/lib64/libcublasLt_static.a
+addlib /usr/local/cuda/lib64/libcudadevrt.a
+addlib /usr/local/cuda/lib64/libculibos.a
+save
+end
+EOF
+else
+    ar -M <<EOF
+create ${BUILD_DIR}/libollama.a
+addlib ${BUILD_DIR}/examples/server/libext_server.a
+addlib ${BUILD_DIR}/common/libcommon.a
+addlib ${BUILD_DIR}/libllama.a
+addlib ${BUILD_DIR}/libggml_static.a
+save
+end
+EOF
+fi
 
 if [ -z "${ROCM_PATH}" ] ; then
     # Try the default location in case it exists
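The `ar -M` heredocs above use ar's MRI script mode: `create` opens a new archive, each `addlib` folds in the members of an existing archive, and `save`/`end` write the result. Either branch produces the same `libollama.a` path, so the cgo directives can link one archive whether or not the CUDA pieces were available. To list what ended up inside after a generate run (path relative to the script's working directory, per the diff above):

```
ar t gguf/build/cuda/libollama.a | head
```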
scripts/build_linux.sh

@@ -8,8 +8,14 @@ export GOFLAGS="'-ldflags=-w -s \"-X=github.com/jmorganca/ollama/version.Version
 mkdir -p dist
 
 for TARGETARCH in amd64 arm64; do
-    docker buildx build --load --progress=plain --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t builder:$TARGETARCH .
-    docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
-    docker cp builder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
-    docker rm builder-$TARGETARCH
+    docker buildx build --load --progress=plain --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.build -t gpubuilder:$TARGETARCH .
+    docker create --platform linux/$TARGETARCH --name gpubuilder-$TARGETARCH gpubuilder:$TARGETARCH
+    docker cp gpubuilder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH
+    docker rm gpubuilder-$TARGETARCH
+
+    docker buildx build --load --progress=plain --platform=linux/$TARGETARCH --build-arg=VERSION --build-arg=GOFLAGS -f Dockerfile.cpu -t cpubuilder:$TARGETARCH .
+    docker create --platform linux/$TARGETARCH --name cpubuilder-$TARGETARCH cpubuilder:$TARGETARCH
+    docker cp cpubuilder-$TARGETARCH:/go/src/github.com/jmorganca/ollama/ollama ./dist/ollama-linux-$TARGETARCH-cpu
+    docker rm cpubuilder-$TARGETARCH
 done
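With the loop above, a single containerized run now produces both flavors for each architecture; assuming Docker with buildx is set up:

```
./scripts/build_linux.sh
ls dist/
# expected: ollama-linux-{amd64,arm64} plus ollama-linux-{amd64,arm64}-cpu
```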