From 5c44461ccfd0bfca4815e9447b7cf20a74b6494f Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 22 Oct 2024 12:54:15 -0700 Subject: [PATCH] Fix rocm windows build and clean up dependency gathering (#7305) On windows ensure windows version define is properly set for rocm. Remove duplicate rocm arch flags. Resolve wildcards in the targets so parallel builds don't race. Use readlink to resolve rocm dependencies since wildcards omit libelf Keep windows rocm deps aligned with unified packaging model --- llama/Dockerfile | 4 ++-- llama/make/Makefile.rocm | 20 ++++++++++++-------- llama/make/gpu.make | 8 ++++---- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/llama/Dockerfile b/llama/Dockerfile index c636ee5d..0e2386d4 100644 --- a/llama/Dockerfile +++ b/llama/Dockerfile @@ -9,7 +9,7 @@ ARG ROCM_VERSION=6.1.2 ### To create a local image for building linux binaries on mac or windows with efficient incremental builds # -# docker build --platform linux/amd64 -t builder-amd64 -f Dockerfile.new --target unified-builder-amd64 . +# docker build --platform linux/amd64 -t builder-amd64 -f llama/Dockerfile --target unified-builder-amd64 . # docker run --platform linux/amd64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-amd64 # ### Then incremental builds will be much faster in this container @@ -41,7 +41,7 @@ ENTRYPOINT [ "zsh" ] ### To create a local image for building linux binaries on mac or linux/arm64 with efficient incremental builds # Note: this does not contain jetson variants # -# docker build --platform linux/arm64 -t builder-arm64 -f Dockerfile.new --target unified-builder-arm64 . +# docker build --platform linux/arm64 -t builder-arm64 -f llama/Dockerfile --target unified-builder-arm64 . # docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64 # FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64 diff --git a/llama/make/Makefile.rocm b/llama/make/Makefile.rocm index 865714b8..ee3fe2e4 100644 --- a/llama/make/Makefile.rocm +++ b/llama/make/Makefile.rocm @@ -21,7 +21,8 @@ else ifeq ($(OS),linux) GPU_LIB_DIR_LINUX := $(HIP_PATH)/lib GPU_COMPILER_LINUX := $(shell X=$$(which hipcc 2>/dev/null) && echo $$X) GPU_COMPILER:=$(GPU_COMPILER_LINUX) - ROCM_TRANSITIVE_LIBS = $(shell ldd $(ROCM_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf | sort -u ) + ROCM_TRANSITIVE_LIBS_INITIAL = $(sort $(shell ldd $(ROCM_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf)) + ROCM_TRANSITIVE_LIBS = $(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL)) endif # TODO future multi-variant support for ROCm @@ -36,14 +37,18 @@ GPU_RUNNER_DRIVER_LIB_LINK := -lamdhip64 GPU_RUNNER_LIBS_SHORT := hipblas rocblas GPU_PATH_ROOT_WIN=$(dir $(GPU_LIB_DIR_WIN)) GPU_PATH_ROOT_LINUX=$(dir $(GPU_LIB_DIR_LINUX)) -GPU_COMPILER_CFLAGS_WIN = $(CFLAGS) +GPU_COMPILER_CFLAGS_WIN = $(CFLAGS) -D_WIN32_WINNT=0x602 GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -fPIC -D_GNU_SOURCE -GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) +GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602 GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -fPIC -D_GNU_SOURCE ROCM_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))) -ROCM_DIST_DEPS_DIR = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH)-rocm)/lib/ollama -ROCM_DIST_DEPS_LIBS = $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(ROCM_LIBS)) $(notdir $(ROCM_TRANSITIVE_LIBS))) +ifeq ($(OS),windows) + ROCM_DIST_DEPS_DIR = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH))/lib/ollama +else ifeq ($(OS),linux) + ROCM_DIST_DEPS_DIR = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH)-rocm)/lib/ollama +endif +ROCM_DIST_DEPS_LIBS= $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(ROCM_LIBS)) $(notdir $(ROCM_TRANSITIVE_LIBS)))) ROCBLAS_DIST_DEP_MANIFEST = $(ROCM_DIST_DEPS_DIR)/rocblas/library/TensileManifest.txt ifeq ($(OS),linux) @@ -84,8 +89,7 @@ GPU_COMPILER_CUFLAGS = \ -Wno-pass-failed \ -Wno-deprecated-declarations \ -Wno-unused-result \ - -I. \ - $(foreach arch, $(HIP_ARCHS_COMMON), --offload-arch=$(arch)) + -I. include make/gpu.make @@ -98,4 +102,4 @@ $(ROCBLAS_DIST_DEP_MANIFEST): @echo "rocblas library copy complete" $(ROCM_DIST_DEPS_LIBS): @-mkdir -p $(dir $@) - $(CP) $(dir $(filter %$(notdir $@),$(ROCM_LIBS) $(ROCM_TRANSITIVE_LIBS)))/$(notdir $@)* $(dir $@) + $(CP) $(dir $(filter %$(notdir $@),$(ROCM_LIBS) $(ROCM_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@) diff --git a/llama/make/gpu.make b/llama/make/gpu.make index 7143bed6..2d63af56 100644 --- a/llama/make/gpu.make +++ b/llama/make/gpu.make @@ -79,7 +79,7 @@ $(GPU_RUNNER_NAME): $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS) # Build targets $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu @-mkdir -p $(dir $@) - $(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CUFLAGS) $(GPU_RUNNER_ARCH_FLAGS) -o $@ $< + $(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) $(GPU_COMPILER_CUFLAGS) $(GPU_RUNNER_ARCH_FLAGS) -o $@ $< $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c @-mkdir -p $(dir $@) $(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) -o $@ $< @@ -97,14 +97,14 @@ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME). # Distribution targets $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/% @-mkdir -p $(dir $@) - cp $< $@ + $(CP) $< $@ $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) @-mkdir -p $(dir $@) - cp $< $@ + $(CP) $< $@ $(DIST_GPU_RUNNER_LIB_DEPS): @-mkdir -p $(dir $@) - $(CP) $(GPU_LIB_DIR)/$(notdir $@)* $(dir $@) + $(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@) # Payload targets $(RUNNERS_PAYLOAD_DIR)/%/ollama_llama_server.gz: $(RUNNERS_BUILD_DIR)/%/ollama_llama_server