diff --git a/llama/Makefile b/llama/Makefile index c289a41f..47a87a75 100644 --- a/llama/Makefile +++ b/llama/Makefile @@ -9,8 +9,7 @@ ifeq ($(OS),windows) CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null)) CUDA_11:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null) CUDA_12:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null) - HIP_PATH_83 := $(shell cygpath -m -s "$(subst \,/,$(HIP_PATH))" 2>/dev/null) - HIP_LIB_DIR := $(shell ls -d $(HIP_PATH_83)/lib 2>/dev/null) + HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null) else ifeq ($(OS),linux) HIP_PATH?=/opt/rocm HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null) diff --git a/llama/llama.go b/llama/llama.go index 47786a36..ca17e38c 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -44,8 +44,8 @@ package llama #cgo rocm CFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1 #cgo rocm CXXFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1 #cgo rocm LDFLAGS: -L${SRCDIR} -lggml_rocm -lhipblas -lamdhip64 -lrocblas -#cgo windows CFLAGS: -Wno-discarded-qualifiers -#cgo windows CFLAGS: -Wno-discarded-qualifiers +#cgo windows CFLAGS: -Wno-discarded-qualifiers -D_WIN32_WINNT=0x602 +#cgo windows CXXFLAGS: -D_WIN32_WINNT=0x602 #cgo windows LDFLAGS: -lmsvcrt #cgo windows LDFLAGS: -lmsvcrt -static-libstdc++ -static-libgcc -static #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/Windows/amd64 diff --git a/llama/make/Makefile.default b/llama/make/Makefile.default index 03a316b1..95b13a73 100644 --- a/llama/make/Makefile.default +++ b/llama/make/Makefile.default @@ -24,17 +24,17 @@ all: $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS) $(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS=$(CUSTOM_CPU_FLAGS) $(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) @-mkdir -p $(dir $@) - GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS))) -o $@ ./runner + GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS))) -o $@ ./runner $(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx" $(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) @-mkdir -p $(dir $@) - GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner + GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner $(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx avx2" $(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) @-mkdir -p $(dir $@) - GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner + GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/% @-mkdir -p $(dir $@) @@ -49,3 +49,6 @@ clean: .PHONY: clean all +# Handy debugging for make variables +print-%: + @echo '$*=$($*)' diff --git a/llama/make/Makefile.rocm b/llama/make/Makefile.rocm index ee3fe2e4..947c43a6 100644 --- a/llama/make/Makefile.rocm +++ b/llama/make/Makefile.rocm @@ -9,20 +9,16 @@ HIP_ARCHS_COMMON := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 HIP_ARCHS_LINUX := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- ifeq ($(OS),windows) - GPU_LIB_DIR_WIN := $(shell cygpath -m -s "$(HIP_PATH)\bin") - # If HIP_PATH has spaces, hipcc trips over them when subprocessing - HIP_PATH := $(shell cygpath -m -s "$(HIP_PATH)\") - CGO_EXTRA_LDFLAGS_WIN := -L$(shell cygpath -m -s "$(HIP_PATH)\lib") - export HIP_PATH - GPU_COMPILER_WIN := $(HIP_PATH)bin/hipcc.bin.exe + GPU_LIB_DIR_WIN := $(shell cygpath -m -s "$(HIP_PATH)/bin") + CGO_EXTRA_LDFLAGS_WIN := -L$(shell cygpath -m -s "$(HIP_PATH)/lib") + GPU_COMPILER_WIN := $(HIP_PATH)/bin/hipcc.bin.exe GPU_COMPILER:=$(GPU_COMPILER_WIN) else ifeq ($(OS),linux) - HIP_PATH?=/opt/rocm GPU_LIB_DIR_LINUX := $(HIP_PATH)/lib GPU_COMPILER_LINUX := $(shell X=$$(which hipcc 2>/dev/null) && echo $$X) GPU_COMPILER:=$(GPU_COMPILER_LINUX) - ROCM_TRANSITIVE_LIBS_INITIAL = $(sort $(shell ldd $(ROCM_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf)) - ROCM_TRANSITIVE_LIBS = $(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL)) + ROCM_TRANSITIVE_LIBS_INITIAL = $(sort $(shell ldd $(GPU_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf)) + GPU_TRANSITIVE_LIBS = $(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL)) endif # TODO future multi-variant support for ROCm @@ -42,13 +38,13 @@ GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -fPIC -D_GNU_SOURCE GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602 GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -fPIC -D_GNU_SOURCE -ROCM_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))) +GPU_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))) ifeq ($(OS),windows) ROCM_DIST_DEPS_DIR = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH))/lib/ollama else ifeq ($(OS),linux) ROCM_DIST_DEPS_DIR = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH)-rocm)/lib/ollama endif -ROCM_DIST_DEPS_LIBS= $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(ROCM_LIBS)) $(notdir $(ROCM_TRANSITIVE_LIBS)))) +GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(GPU_LIBS)) $(notdir $(GPU_TRANSITIVE_LIBS)))) ROCBLAS_DIST_DEP_MANIFEST = $(ROCM_DIST_DEPS_DIR)/rocblas/library/TensileManifest.txt ifeq ($(OS),linux) @@ -94,12 +90,9 @@ GPU_COMPILER_CUFLAGS = \ include make/gpu.make # Adjust the rules from gpu.make to handle the ROCm dependencies properly -$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST) $(ROCM_DIST_DEPS_LIBS) +$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST) $(ROCBLAS_DIST_DEP_MANIFEST): @-mkdir -p $(dir $@) @echo "Copying rocblas library..." cd $(GPU_LIB_DIR)/rocblas/library/ && tar cf - . | (cd $(dir $@) && tar xf - ) @echo "rocblas library copy complete" -$(ROCM_DIST_DEPS_LIBS): - @-mkdir -p $(dir $@) - $(CP) $(dir $(filter %$(notdir $@),$(ROCM_LIBS) $(ROCM_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@) diff --git a/llama/make/common-defs.make b/llama/make/common-defs.make index 0f34254b..34545069 100644 --- a/llama/make/common-defs.make +++ b/llama/make/common-defs.make @@ -57,12 +57,18 @@ ifeq ($(OS),windows) EXE_EXT := .exe SHARED_PREFIX := CPU_FLAG_PREFIX := /arch: +ifneq ($(HIP_PATH),) + # If HIP_PATH has spaces, hipcc trips over them when subprocessing + HIP_PATH := $(shell cygpath -m -s "$(patsubst %\,%,$(HIP_PATH))") + export HIP_PATH +endif else ifeq ($(OS),linux) CP := cp -af OBJ_EXT := o SHARED_EXT := so SHARED_PREFIX := lib CPU_FLAG_PREFIX := -m + HIP_PATH?=/opt/rocm else OBJ_EXT := o SHARED_EXT := so diff --git a/llama/make/cuda.make b/llama/make/cuda.make index d50940bf..7ff1815f 100644 --- a/llama/make/cuda.make +++ b/llama/make/cuda.make @@ -19,6 +19,9 @@ GPU_COMPILER_CFLAGS_WIN = $(CFLAGS) -D_WIN32_WINNT=0x602 GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602 GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE +GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))) +GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_LIB_DIR)/,$(notdir $(GPU_LIBS)))) + ifeq ($(OS),linux) CUDA_PATH?=/usr/local/cuda GPU_COMPILER_FPIC = -fPIC -Wno-unused-function -std=c++11 diff --git a/llama/make/gpu.make b/llama/make/gpu.make index 2d63af56..1b233e1f 100644 --- a/llama/make/gpu.make +++ b/llama/make/gpu.make @@ -89,7 +89,7 @@ $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/" $(CGO_EXTRA_LDFLAGS) $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) @-mkdir -p $(dir $@) - GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner + GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS) @-mkdir -p $(dir $@) $(CCACHE) $(GPU_COMPILER) --shared $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@ @@ -98,13 +98,16 @@ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME). $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/% @-mkdir -p $(dir $@) $(CP) $< $@ -$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) +$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_DEPS_LIBS) $(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) @-mkdir -p $(dir $@) $(CP) $< $@ $(DIST_GPU_RUNNER_LIB_DEPS): @-mkdir -p $(dir $@) $(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@) +$(GPU_DIST_DEPS_LIBS): + @-mkdir -p $(dir $@) + $(CP) $(dir $(filter %$(notdir $@),$(GPU_LIBS) $(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@) # Payload targets $(RUNNERS_PAYLOAD_DIR)/%/ollama_llama_server.gz: $(RUNNERS_BUILD_DIR)/%/ollama_llama_server