a017cf2fea
We're over budget for github's maximum release artifact size with rocm + 2 cuda versions. This splits rocm back out as a discrete artifact, but keeps the layout so it can be extracted into the same location as the main bundle.
281 lines
12 KiB
Bash
Executable file
281 lines
12 KiB
Bash
Executable file
#!/bin/bash
# This script is intended to run inside the go generate
# working directory must be llm/generate/

# First we build one or more CPU based LLM libraries
#
# Then if we detect CUDA, we build a CUDA dynamic library, and carry the required
# library dependencies
#
# Then if we detect oneAPI, we build a dynamic oneAPI (SYCL) library and copy
# the runtime libraries it links against
#
# Then if we detect ROCm, we build a dynamically loaded ROCm lib.  The ROCm
# libraries are quite large, and also dynamically load data files at runtime
# which in turn are large, so we don't attempt to carry them in the main
# payload; they are staged in a separate linux-${GOARCH}-rocm dist tree.

# -e: abort on the first failing command, -x: trace every command,
# pipefail: a pipeline fails when any stage fails.
set -ex
set -o pipefail

# Starts empty; presumably filled in by compress and read by wait_for_compress
# (both defined outside this file) - confirm against gen_common.sh.
compress_pids=""
#######################################
# Print the AMD GPU architectures the ROCm runner should be compiled for.
# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
# Globals:
#   AMDGPU_TARGETS (read) - when non-empty it is printed verbatim and the
#                           built-in list is ignored
# Outputs:
#   stdout: either ${AMDGPU_TARGETS}, or the built-in list joined with ';'
#           and wrapped in literal single quotes
#######################################
amdGPUs() {
    if [[ -n "${AMDGPU_TARGETS:-}" ]]; then
        printf '%s\n' "${AMDGPU_TARGETS}"
        return
    fi

    local -a gpu_list=(
        "gfx900"
        "gfx906:xnack-"
        "gfx908:xnack-"
        "gfx90a:xnack+"
        "gfx90a:xnack-"
        "gfx940"
        "gfx941"
        "gfx942"
        "gfx1010"
        "gfx1012"
        "gfx1030"
        "gfx1100"
        "gfx1101"
        "gfx1102"
    )

    # "${gpu_list[*]}" joins on the first character of IFS; scoping IFS with
    # `local` keeps the change from leaking to the caller.
    local IFS=';'
    # The literal single quotes are part of the output (presumably so the ';'
    # separators survive when the cmake definitions are evaluated later -
    # confirm against the build helper).
    printf "'%s'\n" "${gpu_list[*]}"
}
echo "Starting linux generate script"

# Locate nvcc unless the caller already provided CUDACXX: prefer the default
# toolkit install, otherwise fall back to whatever nvcc is on PATH.
if [[ -z "${CUDACXX}" ]]; then
    if [[ -x /usr/local/cuda/bin/nvcc ]]; then
        export CUDACXX=/usr/local/cuda/bin/nvcc
    else
        # nvcc may be absent; `|| true` keeps `set -e` from aborting on the
        # failed lookup and leaves CUDACXX exported but empty, exactly as the
        # previous `export CUDACXX=$(...)` form did implicitly.
        CUDACXX=$(command -v nvcc || true)
        export CUDACXX
    fi
fi

# cmake definitions shared by the GPU runner builds later in this script
COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"

# Quoted so a checkout path containing spaces still resolves.
# gen_common.sh provides the helper functions called by the rest of this script.
source "$(dirname "$0")/gen_common.sh"
init_vars
git_module_setup
apply_patches
init_vars

# Static build for linking into the Go binary.
# Builds by default, allows skipping via OLLAMA_SKIP_STATIC_GENERATE, and
# forces the build if OLLAMA_CPU_TARGET="static".
# Enables optimized Dockerfile builds using a blanket skip and targeted overrides.
if [[ -z "${OLLAMA_SKIP_STATIC_GENERATE}" || "${OLLAMA_CPU_TARGET}" == "static" ]]; then
    init_vars
    CMAKE_TARGETS="--target llama --target ggml"
    # Lowest-common-denominator flags; whatever init_vars left in CMAKE_DEFS is appended last
    CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DGGML_NATIVE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off ${CMAKE_DEFS}"
    BUILD_DIR="../build/linux/${ARCH}_static"
    echo "Building static library"
    build
fi
init_vars

#######################################
# Configure, build, install and compress one CPU runner variant.
# Globals:
#   CMAKE_DEFS (written) - "<$2> <value left by init_vars>"
#   BUILD_DIR  (written) - ../build/linux/${ARCH}/<$1>
#   ARCH       (read)
# Arguments:
#   $1 - runner directory name (e.g. cpu, cpu_avx, cpu_avx2)
#   $2 - cmake definitions placed ahead of the ones left by init_vars
#   $3 - label printed in the "Building ..." progress line
#######################################
build_cpu_runner() {
    local _cpu_dir=$1 _cpu_defs=$2 _cpu_label=$3
    # init_vars runs first because CMAKE_DEFS is composed from the value it leaves behind
    init_vars
    CMAKE_DEFS="${_cpu_defs} ${CMAKE_DEFS}"
    BUILD_DIR="../build/linux/${ARCH}/${_cpu_dir}"
    echo "Building ${_cpu_label}"
    build
    install
    compress
}

if [[ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]]; then
    # Users building from source can tune the exact flags we pass to cmake for configuring
    # llama.cpp, and we'll build only 1 CPU variant in that case as the default.
    if [[ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]]; then
        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
        build_cpu_runner cpu \
            "${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on" \
            "custom CPU"
    else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
        # -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
        # -DGGML_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
        # -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
        # -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
        # Note: the following seem to yield slower results than AVX2 - ymmv
        # -DGGML_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
        # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
        # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake

        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"

        # An empty OLLAMA_CPU_TARGET builds every variant; otherwise only the named one.
        if [[ -z "${OLLAMA_CPU_TARGET}" || "${OLLAMA_CPU_TARGET}" == "cpu" ]]; then
            #
            # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
            #
            build_cpu_runner cpu \
                "${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" \
                "LCD CPU"
        fi

        if [[ "${ARCH}" == "x86_64" ]]; then
            #
            # ARM chips in M1/M2/M3-based MACs and NVidia Tegra devices do not currently support avx extensions.
            #
            if [[ -z "${OLLAMA_CPU_TARGET}" || "${OLLAMA_CPU_TARGET}" == "cpu_avx" ]]; then
                #
                # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
                # Approximately 400% faster than LCD on same CPU
                #
                build_cpu_runner cpu_avx \
                    "${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" \
                    "AVX CPU"
            fi

            if [[ -z "${OLLAMA_CPU_TARGET}" || "${OLLAMA_CPU_TARGET}" == "cpu_avx2" ]]; then
                #
                # ~2013 CPU Dynamic library
                # Approximately 10% faster than AVX on same CPU
                #
                build_cpu_runner cpu_avx2 \
                    "${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" \
                    "AVX2 CPU"
            fi
        fi
    fi
else
    echo "Skipping CPU generation step as requested"
fi
# If needed, look for the CUDA libraries: first the default CUDA toolkit
# location, then the layout used on Arch Linux. A CUDA_LIB_DIR supplied by the
# caller always wins, and the first existing candidate is kept.
for cuda_candidate_dir in /usr/local/cuda/lib64 /opt/cuda/targets/x86_64-linux/lib; do
    if [[ -z "${CUDA_LIB_DIR}" && -d "${cuda_candidate_dir}" ]]; then
        CUDA_LIB_DIR=${cuda_candidate_dir}
    fi
done

# Allow override in case libcudart is in the wrong place
# NOTE(review): nothing else visible in this script reads CUDART_LIB_DIR;
# presumably it is consumed by the sourced helpers - confirm.
if [[ -z "${CUDART_LIB_DIR}" ]]; then
    CUDART_LIB_DIR="${CUDA_LIB_DIR}"
fi
#######################################
# Print the CUDA major version encoded in the first
# libcudart.so.<major>[.<minor>...] entry of a directory.
# The version is parsed from the file name only, so dots elsewhere in the
# directory path (e.g. /usr/local/cuda-12.4/lib64) cannot shift the result.
# Arguments:
#   $1 - directory to inspect
# Outputs:
#   stdout: the major version, or nothing when no libcudart.so.* exists
# Returns:
#   always 0
#######################################
cuda_major_version() {
    local _cudart_dir=$1 _cudart_lib _cudart_ver
    for _cudart_lib in "${_cudart_dir}"/libcudart.so.*; do
        # An unmatched glob stays literal; skip it
        [[ -e "${_cudart_lib}" || -L "${_cudart_lib}" ]] || continue
        _cudart_ver=${_cudart_lib##*/}              # libcudart.so.12.4.127
        _cudart_ver=${_cudart_ver#libcudart.so.}    # 12.4.127
        printf '%s\n' "${_cudart_ver%%.*}"          # 12
        return 0
    done
    return 0
}

if [[ -z "${OLLAMA_SKIP_CUDA_GENERATE}" && -d "${CUDA_LIB_DIR}" ]]; then
    echo "CUDA libraries detected - building dynamic CUDA library"
    init_vars

    # Derive the runner suffix (_v<major>) unless the caller pinned CUDA_VARIANT
    CUDA_MAJOR=$(cuda_major_version "${CUDA_LIB_DIR}")
    if [[ -n "${CUDA_MAJOR}" && -z "${CUDA_VARIANT}" ]]; then
        CUDA_VARIANT=_v${CUDA_MAJOR}
    fi

    if [[ "${ARCH}" == "arm64" ]]; then
        echo "ARM CPU detected - disabling unsupported AVX instructions"

        # ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
        #
        # CUDA compute < 6.0 lacks proper FP16 support on ARM.
        # Disabling has minimal performance effect while maintaining compatibility.
        ARM64_DEFS="-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_CUDA_F16=off"
    fi

    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
    CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
    if [[ -n "${OLLAMA_CUSTOM_CUDA_DEFS}" ]]; then
        echo "OLLAMA_CUSTOM_CUDA_DEFS=\"${OLLAMA_CUSTOM_CUDA_DEFS}\""
        CMAKE_CUDA_DEFS="${CMAKE_CUDA_DEFS} ${OLLAMA_CUSTOM_CUDA_DEFS}"
        echo "Building custom CUDA GPU"
    fi

    export CUDAFLAGS="-t8"
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
    export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
    CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
    build
    install

    echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
    mkdir -p "${CUDA_DIST_DIR}"
    # The directory is quoted so only the trailing pattern globs. A pattern
    # with no match is passed through literally and makes cp fail, which
    # aborts the script under `set -e`.
    for lib in "${CUDA_LIB_DIR}"/libcudart.so* "${CUDA_LIB_DIR}"/libcublas.so* "${CUDA_LIB_DIR}"/libcublasLt.so*; do
        cp -a "${lib}" "${CUDA_DIST_DIR}"
    done
    compress
fi
if [[ -z "${ONEAPI_ROOT}" ]]; then
    # Try the default location in case it exists
    ONEAPI_ROOT=/opt/intel/oneapi
fi

if [[ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" && -d "${ONEAPI_ROOT}" ]]; then
    echo "OneAPI libraries detected - building dynamic OneAPI library"
    init_vars
    source "${ONEAPI_ROOT}/setvars.sh" --force # set up environment variables for oneAPI
    CC=icx
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
    BUILD_DIR="../build/linux/${ARCH}/oneapi"
    ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
    export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
    DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
    build

    # copy oneAPI dependencies
    mkdir -p "${ONEAPI_DIST_DIR}"
    # Resolved paths of the sycl/mkl/tbb libraries the server links against,
    # read one per line so the paths are never word-split or glob-expanded.
    while IFS= read -r dep; do
        cp -a "${dep}" "${ONEAPI_DIST_DIR}"
    done < <(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb)
    # Additional libraries copied by name from the oneAPI compiler lib directory
    for lib in libOpenCL.so libimf.so libintlc.so.5 libirng.so libpi_level_zero.so libsvml.so libur_loader.so.0; do
        cp "${ONEAPI_ROOT}/compiler/latest/lib/${lib}" "${ONEAPI_DIST_DIR}"
    done
    install
    compress
fi
# Try the default ROCm location in case it exists; a ROCM_PATH supplied by the
# caller is kept as-is.
ROCM_PATH="${ROCM_PATH:-/opt/rocm}"

# Try the default CLBlast cmake location in case it exists; exported only when
# the caller did not set CLBlast_DIR and the directory is present.
if [[ -z "${CLBlast_DIR}" && -d /usr/lib/cmake/CLBlast ]]; then
    export CLBlast_DIR=/usr/lib/cmake/CLBlast
fi
#######################################
# Print the runner suffix (_v<build>) derived from the ROCm rocblas library,
# where <build> is the last dot-separated component of the first
# lib/librocblas.so.*.*.????? file name.
# Only the file name is parsed, so dots in the ROCm path itself
# (e.g. /opt/rocm-6.1.2) cannot shift the result, and several matching files
# no longer break the existence test.
# Arguments:
#   $1 - ROCm installation root
# Outputs:
#   stdout: "_v<build>", or nothing when no matching library exists
# Returns:
#   always 0
#######################################
rocm_variant_suffix() {
    local _rocm_root=$1 _rocblas_lib _rocblas_name
    for _rocblas_lib in "${_rocm_root}"/lib/librocblas.so.*.*.?????; do
        # An unmatched glob stays literal; skip it
        [[ -f "${_rocblas_lib}" ]] || continue
        _rocblas_name=${_rocblas_lib##*/}           # librocblas.so.4.1.60102
        printf '_v%s\n' "${_rocblas_name##*.}"      # _v60102
        return 0
    done
    return 0
}

if [[ -z "${OLLAMA_SKIP_ROCM_GENERATE}" && -d "${ROCM_PATH}" ]]; then
    echo "ROCm libraries detected - building dynamic ROCm library"
    # Keep any pre-existing ROCM_VARIANT when the library cannot be found
    detected_rocm_variant=$(rocm_variant_suffix "${ROCM_PATH}")
    if [[ -n "${detected_rocm_variant}" ]]; then
        ROCM_VARIANT=${detected_rocm_variant}
    fi
    init_vars
    # Same target list feeds both cmake variables; compute it once
    amdgpu_targets=$(amdGPUs)
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DLLAMA_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=${amdgpu_targets} -DGPU_TARGETS=${amdgpu_targets}"
    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
    if [[ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]]; then
        echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
        CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
        echo "Building custom ROCM GPU"
    fi
    BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
    # ROCm dependencies are too large to fit into a unified bundle
    ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
    # TODO figure out how to disable runpath (rpath)
    # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work
    export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build

    # copy the ROCM dependencies
    mkdir -p "${ROCM_DIST_DIR}"
    # Resolved rocm/amdgpu/libtinfo libraries the server links against, minus
    # anything that lives in our own build tree; read one path per line so
    # they are never word-split.
    while IFS= read -r dep; do
        # The trailing glob also picks up every file sharing the resolved
        # path as a prefix.
        cp -a "${dep}"* "${ROCM_DIST_DIR}"
    done < <(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo)
    install
    compress
fi
# Final housekeeping: run the cleanup and compression-wait helpers, then report
# which runner directories sit next to the most recently used BUILD_DIR.
cleanup
wait_for_compress
# `&&` keeps the listing from silently describing the current directory when
# the cd fails; the path is quoted so spaces in it are harmless.
echo "go generate completed. LLM runners: $(cd "${BUILD_DIR}/.." && echo *)"