diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 1f9f9bf0..10dbabe6 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -333,4 +333,4 @@ jobs: submodules: recursive - name: Verify patches carry all the changes run: | - cd llama && ./sync.sh && git diff --compact-summary --exit-code . \ No newline at end of file + cd llama && make apply-patches sync && git diff --compact-summary --exit-code . \ No newline at end of file diff --git a/.gitignore b/.gitignore index ac4800be..d4785d9c 100644 --- a/.gitignore +++ b/.gitignore @@ -14,4 +14,5 @@ llm/build build/*/*/* !build/**/placeholder llama/build -__debug_bin* \ No newline at end of file +__debug_bin* +llama/vendor \ No newline at end of file diff --git a/llama/Makefile b/llama/Makefile index f7c3bf08..c289a41f 100644 --- a/llama/Makefile +++ b/llama/Makefile @@ -41,6 +41,9 @@ runners: $(RUNNER_TARGETS) $(RUNNER_TARGETS): $(MAKE) -f make/Makefile.$@ +help-sync apply-patches create-patches sync: + $(MAKE) -f make/Makefile.sync $@ + clean: rm -rf $(BUILD_DIR) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS) go clean -cache diff --git a/llama/README.md b/llama/README.md index 05c95c90..a2091036 100644 --- a/llama/README.md +++ b/llama/README.md @@ -91,10 +91,84 @@ go build -tags avx,rocm . make -j ``` -## Syncing with llama.cpp +## Vendoring -To update this package to the latest llama.cpp code, use the `sync.sh` script: +Ollama currently vendors [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [ggml](https://github.com/ggerganov/ggml) through a vendoring model. While we generally strive to contribute changes back upstream to avoid drift, we carry a small set of patches which are applied to the tracking commit. A set of make targets are available to aid developers in updating to a newer tracking commit, or to work on changes. + +> [!IMPORTANT] +> Prior to merging #7157 we continue to leverage a submodule for llama.cpp which establishes the tracking commit. 
After merging that PR a new manifest file will be utilized + +If you update the vendoring code, start by running the following command to establish the tracking llama.cpp repo in the `./vendor/` directory. ``` -./sync.sh ../../llama.cpp +make -C llama apply-patches ``` + +### Updating Base Commit + +**Pin to new base commit** + +To update to a newer base commit, select the upstream git tag or commit + +> [!IMPORTANT] +> After merging #7157 a manifest will be used instead of the submodule + +``` +cd llm/llama.cpp +git fetch +git checkout NEW_BASE_COMMIT +cd .. +git add llama.cpp +``` + +#### Applying patches + +When updating to a newer base commit, the existing patches may not apply cleanly and require manual merge resolution. + +Start by applying the patches. If any of the patches have conflicts, the `git am` will stop at the first failure. + +``` +make -C llama apply-patches +``` + +If you see an error message about a conflict, go into the `./vendor/` directory, and perform merge resolution using your preferred tool to the patch commit which failed. Save the file(s) and continue the patch series with `git am --continue`. If any additional patches fail, follow the same pattern until the full patch series is applied. Once finished, run a final `create-patches` and `sync` target to ensure everything is updated. + +``` +make -C llama create-patches sync +``` + +Build and test Ollama, and make any necessary changes to the Go code based on the new base commit. Submit your PR to the Ollama repo. + +### Generating Patches + +When working on new fixes or features that impact vendored code, use the following model. First get a clean tracking repo with all current patches applied: + +``` +make -C llama apply-patches +``` + +Now edit the upstream native code in the `./vendor/` directory. You do not need to commit every change in order to build, a dirty working tree in the tracking repo is OK while developing. 
Simply save in your editor, and run the following to refresh the vendored code with your changes, build the backend(s) and build ollama: + +``` +make -C llama sync +make -C llama -j 8 +go build . +``` + +> [!IMPORTANT] +> Do **NOT** run `apply-patches` while you're iterating as that will reset the tracking repo. It will detect a dirty tree and abort, but if your tree is clean and you accidentally ran this target, use `git reflog` to recover your commit(s). + +Iterate until you're ready to submit PRs. Once your code is ready, commit a change in the `./vendor/` directory, then generate the patches for ollama with + +``` +make -C llama create-patches +``` + +> [!IMPORTANT] +> Once you have completed this step, it is safe to run `apply-patches` since your change is preserved in the patches. + +In your `./vendor/` directory, create a branch, and cherry-pick the new commit to that branch, then submit a PR upstream to llama.cpp. + +Commit the changes in the ollama repo and submit a PR to Ollama, which will include the vendored code update with your change, along with the patches. + +After your PR upstream is merged, follow the **Updating Base Commit** instructions above, however first remove your patch before running `apply-patches` since the new base commit contains your change already. 
\ No newline at end of file diff --git a/llama/build-info.cpp b/llama/build-info.cpp index 8dacf166..61bde0e4 100644 --- a/llama/build-info.cpp +++ b/llama/build-info.cpp @@ -1,30 +1,4 @@ -/** - * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file - * - * MIT License - * - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - int LLAMA_BUILD_NUMBER = 0; -char const *LLAMA_COMMIT = ""; +char const *LLAMA_COMMIT = "3f1ae2e32cde00c39b96be6d01c2997c29bae555"; char const *LLAMA_COMPILER = ""; char const *LLAMA_BUILD_TARGET = ""; diff --git a/llama/make/Makefile.sync b/llama/make/Makefile.sync new file mode 100644 index 00000000..58f7ef18 --- /dev/null +++ b/llama/make/Makefile.sync @@ -0,0 +1,190 @@ +# Helpers for managing our vendored llama.cpp repo and patch set + +# TODO - this should include a manifest file at the top of the tree +LLAMACPP_BASE_COMMIT=$(shell cd ../llm/llama.cpp && git rev-parse HEAD) + +LLAMACPP_REPO := $(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))vendor/ + +DST_DIR=$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST)))))) +LLAMACPP_PATCH_DIR := $(DST_DIR)patches/ + + +help-sync: + @echo "The following make targets will help you update llama.cpp to a new base commit, or work on new features/fixes" + @echo "" + @echo "\tmake apply-patches # Establish the tracking repo if not already present, reset to the base commit, and apply our patch set" + @echo "\tmake sync # Vendor llama.cpp and ggml from the tracking repo working tree" + @echo "\tmake create-patches # Generate the patch set based on the current commits in the tracking repo since the base commit" + @echo "" + @echo "For more details on the workflow, see the Vendoring section in ../docs/development.md" + +apply-patches: $(LLAMACPP_REPO) + @if ! git -C $(LLAMACPP_REPO) --no-pager diff --exit-code ; then \ + echo "ERROR: Your llama.cpp repo is dirty. The apply-patches target requires a clean working tree"; \ + echo "To clobber: git -C $(LLAMACPP_REPO) reset --hard HEAD" ; \ + exit 1; \ + fi + @echo "Checking out $(LLAMACPP_BASE_COMMIT)" + @git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT) || \ + git -C $(LLAMACPP_REPO) fetch --all && git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT) + @echo "Applying ollama patches..." 
+ @git -c 'user.name=nobody' -c 'user.email=<>' -C $(LLAMACPP_REPO) am -3 $(LLAMACPP_PATCH_DIR)/*.patch || \ + echo "Please resolve the conflicts in $(LLAMACPP_REPO), and run 'git am --continue' to continue applying subsequent patches" + @echo "" + @echo "The tracking repo $(LLAMACPP_REPO) is now in a detached state with all patches applied." + @echo "Don't forget to commit any changes you make and run 'make create-patches' " + +$(LLAMACPP_REPO): + @echo "Cloning llama.cpp to $(LLAMACPP_REPO)" + git clone https://github.com/ggerganov/llama.cpp.git $@ + +create-patches: $(LLAMACPP_REPO) + @if ! git -C $(LLAMACPP_REPO) --no-pager diff --exit-code ; then \ + echo "ERROR: Your llama.cpp repo is dirty. You must commit any pending changes for format-patch to generate patches"; \ + exit 1; \ + fi + git -C $(LLAMACPP_REPO) format-patch --no-signature --no-numbered --zero-commit -o $(LLAMACPP_PATCH_DIR) $(LLAMACPP_BASE_COMMIT) + +# Vendoring template logic +EXCLUDED_FILES=sgemm.cpp sgemm.h sampling_ext.cpp sampling_ext.h stb_image.h json.hpp llama_darwin.c base64.hpp +OLLAMA_NATIVE_FILES=mllama.cpp mllama.h llama_darwin.c sampling_ext.cpp sampling_ext.h +define vendor_file +$(strip $(addprefix $(2),$(notdir $1))) : $(addprefix $(LLAMACPP_REPO),$(1)) +ifneq ($$(filter-out $(EXCLUDED_FILES),$(notdir $1)),) + @echo "vendoring $1"; \ + mkdir -p $$(dir $$@) && \ + echo "/**" > $$@ && \ + echo " * llama.cpp - commit $$(LLAMACPP_BASE_COMMIT) - do not edit this file" >> $$@ && \ + echo " *" >> $$@ && \ + sed 's/^/ * /' <$(LLAMACPP_REPO)/LICENSE | sed 's/ *$$$$//' >> $$@ && \ + echo " */" >> $$@ && \ + echo "" >> $$@ && \ + cat $$< >> $$@ +else + @echo "vendoring $1"; \ + mkdir -p $$(dir $$@) && \ + cat $$< > $$@ +endif +VENDORED_FILES += $(strip $(addprefix $(2),$(notdir $1))) +endef + +# llama.cpp files -> llama/ +LLAMACPP_FILES=\ + src/unicode.cpp \ + src/unicode.h \ + src/unicode-data.cpp \ + src/unicode-data.h \ + src/llama.cpp \ + src/llama-impl.h \ + src/llama-vocab.cpp \ + 
src/llama-vocab.h \ + src/llama-grammar.cpp \ + src/llama-grammar.h \ + src/llama-sampling.cpp \ + src/llama-sampling.h \ + include/llama.h \ + ggml/src/llamafile/sgemm.cpp \ + ggml/src/llamafile/sgemm.h +$(foreach name,$(LLAMACPP_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR)))) + +# llama.cpp files -> llama/llamafile +LLAMAFILE_FILES= \ + ggml/src/llamafile/sgemm.h +$(foreach name,$(LLAMAFILE_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR)llamafile/))) + +# ggml files -> llama/ +GGML_FILES= \ + ggml/src/ggml.c \ + ggml/include/ggml.h \ + ggml/src/ggml-quants.c \ + ggml/src/ggml-quants.h \ + ggml/src/ggml-metal.metal \ + ggml/include/ggml-metal.h \ + ggml/src/ggml-impl.h \ + ggml/include/ggml-cuda.h \ + ggml/src/ggml-cuda.cu \ + ggml/src/ggml-common.h \ + ggml/include/ggml-backend.h \ + ggml/src/ggml-backend.c \ + ggml/src/ggml-backend-impl.h \ + ggml/include/ggml-alloc.h \ + ggml/src/ggml-alloc.c \ + ggml/src/ggml-aarch64.h \ + ggml/src/ggml-aarch64.c \ + ggml/src/ggml-cpu-impl.h \ + ggml/include/ggml-blas.h \ + ggml/src/ggml-blas.cpp +$(foreach name,$(GGML_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR)))) + +# TODO generalize renaming pattern if we have more of these +$(DST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal.m + @echo "vendoring $(subst $(LLAMACPP_REPO),,$<)"; \ + mkdir -p $(dir $@) && \ + echo "/**" > $@ && \ + echo " * llama.cpp - commit $(LLAMACPP_BASE_COMMIT) - do not edit this file" >> $@ && \ + echo " *" >> $@ && \ + sed 's/^/ * /' <$(LLAMACPP_REPO)/LICENSE | sed 's/ *$$//' >> $@ && \ + echo " */" >> $@ && \ + echo "" >> $@ && \ + cat $< >> $@ +VENDORED_FILES += $(DST_DIR)ggml-metal_darwin_arm64.m + +# ggml-cuda -> llama/ggml-cuda/ +GGML_CUDA_FILES= ggml/src/ggml-cuda/*.cu ggml/src/ggml-cuda/*.cuh +GGML_CUDA_FILES_EXPANDED = $(addprefix ggml/src/ggml-cuda/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_CUDA_FILES))))) +$(foreach name,$(GGML_CUDA_FILES_EXPANDED),$(eval $(call 
vendor_file,$(name),$(DST_DIR)ggml-cuda/))) + +GGML_TEMPLATE_FILES= ggml/src/ggml-cuda/template-instances/*.cu +GGML_TEMPLATE_FILES_EXPANDED = $(addprefix ggml/src/ggml-cuda/template-instances/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_TEMPLATE_FILES))))) +$(foreach name,$(GGML_TEMPLATE_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/template-instances/))) + +GGML_VENDOR_FILES= ggml/src/ggml-cuda/vendors/*.h +GGML_VENDOR_FILES_EXPANDED=$(addprefix ggml/src/ggml-cuda/vendors/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_VENDOR_FILES))))) +$(foreach name,$(GGML_VENDOR_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/vendors/))) + +# llava -> llama/ +LAVA_FILES= \ + examples/llava/clip.cpp \ + examples/llava/clip.h \ + examples/llava/llava.cpp \ + examples/llava/llava.h \ + common/log.h \ + common/log.cpp \ + common/stb_image.h +# These files are mostly used by the llava code +# and shouldn't be necessary once we use clip.cpp directly +LAVA_FILES+= \ + common/common.cpp \ + common/common.h \ + common/sampling.cpp \ + common/sampling.h \ + common/json.hpp \ + common/json-schema-to-grammar.cpp \ + common/json-schema-to-grammar.h \ + common/base64.hpp +$(foreach name,$(LAVA_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR)))) + +$(DST_DIR)build-info.cpp: + @echo "Generating $@" + @echo "int LLAMA_BUILD_NUMBER = 0;" > $@ + @echo "char const *LLAMA_COMMIT = \"$(LLAMACPP_BASE_COMMIT)\";" >> $@ + @echo "char const *LLAMA_COMPILER = \"\";" >> $@ + @echo "char const *LLAMA_BUILD_TARGET = \"\";" >> $@ +VENDORED_FILES += $(DST_DIR)build-info.cpp + + +sync: $(LLAMACPP_REPO) .WAIT $(VENDORED_FILES) .WAIT remove-stale-files + +PATS=*.c *.h *.cpp *.m *.metal *.cu *.cuh +NATIVE_DIRS=$(DST_DIR) $(DST_DIR)llamafile/ $(DST_DIR)ggml-cuda/ $(DST_DIR)ggml-cuda/template-instances/ $(DST_DIR)ggml-cuda/vendors/ +ALL_NATIVE_FILES=$(foreach dir,$(NATIVE_DIRS),$(wildcard $(addprefix $(dir),$(PATS)))) 
+EXTRA_NATIVE_FILES=$(filter-out $(VENDORED_FILES) $(addprefix $(DST_DIR),$(OLLAMA_NATIVE_FILES)), $(ALL_NATIVE_FILES)) +remove-stale-files: + @rm -f $(EXTRA_NATIVE_FILES) + +.PHONY: help-sync apply-patches sync create-patches remove-stale-files .WAIT + + +# Handy debugging for make variables +print-%: + @echo '$*=$($*)' diff --git a/llama/patches/01-cuda.diff b/llama/patches/0001-cuda.patch similarity index 88% rename from llama/patches/01-cuda.diff rename to llama/patches/0001-cuda.patch index 0096d77b..b0bb3b67 100644 --- a/llama/patches/01-cuda.diff +++ b/llama/patches/0001-cuda.patch @@ -1,3 +1,14 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: jmorganca +Date: Thu, 6 Jun 2024 23:55:47 -0700 +Subject: [PATCH] cuda + +--- + ggml/include/ggml-cuda.h | 2 ++ + ggml/src/ggml-backend.c | 5 +++++ + ggml/src/ggml-cuda.cu | 6 ++++-- + 3 files changed, 11 insertions(+), 2 deletions(-) + diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h index 71bb6dcf..08be0895 100644 --- a/ggml/include/ggml-cuda.h diff --git a/llama/patches/02-pretokenizer.diff b/llama/patches/0002-pretokenizer.patch similarity index 88% rename from llama/patches/02-pretokenizer.diff rename to llama/patches/0002-pretokenizer.patch index b50e005d..4f8f93d1 100644 --- a/llama/patches/02-pretokenizer.diff +++ b/llama/patches/0002-pretokenizer.patch @@ -1,3 +1,12 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Michael Yang +Date: Mon, 16 Sep 2024 15:53:13 -0700 +Subject: [PATCH] pretokenizer + +--- + src/llama.cpp | 14 +++----------- + 1 file changed, 3 insertions(+), 11 deletions(-) + diff --git a/src/llama.cpp b/src/llama.cpp index 4c0a1bb6..800dfb95 100644 --- a/src/llama.cpp diff --git a/llama/patches/03-metal.diff b/llama/patches/0003-metal.patch similarity index 91% rename from llama/patches/03-metal.diff rename to llama/patches/0003-metal.patch index 41f358a4..f54e439d 100644 --- a/llama/patches/03-metal.diff +++ 
b/llama/patches/0003-metal.patch @@ -1,3 +1,12 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Michael Yang +Date: Mon, 16 Sep 2024 15:53:12 -0700 +Subject: [PATCH] metal + +--- + ggml/src/ggml-metal.m | 30 +++++++++++++----------------- + 1 file changed, 13 insertions(+), 17 deletions(-) + diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 9da08fe2..3a433703 100644 --- a/ggml/src/ggml-metal.m diff --git a/llama/patches/04-ggml-metal.diff b/llama/patches/0004-ggml-metal.patch similarity index 73% rename from llama/patches/04-ggml-metal.diff rename to llama/patches/0004-ggml-metal.patch index b9013006..36cd220c 100644 --- a/llama/patches/04-ggml-metal.diff +++ b/llama/patches/0004-ggml-metal.patch @@ -1,3 +1,12 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: jmorganca +Date: Wed, 12 Jun 2024 12:18:40 -0700 +Subject: [PATCH] ggml-metal + +--- + ggml/src/ggml-metal.m | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 3a433703..829c5e39 100644 --- a/ggml/src/ggml-metal.m diff --git a/llama/patches/05-embeddings.diff b/llama/patches/0005-embeddings.patch similarity index 77% rename from llama/patches/05-embeddings.diff rename to llama/patches/0005-embeddings.patch index fd05bf9e..daf2501a 100644 --- a/llama/patches/05-embeddings.diff +++ b/llama/patches/0005-embeddings.patch @@ -1,28 +1,36 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Michael Yang +Date: Mon, 16 Sep 2024 15:53:14 -0700 +Subject: [PATCH] embeddings + +--- + src/llama.cpp | 15 +++++++++------ + 1 file changed, 9 insertions(+), 6 deletions(-) + diff --git a/src/llama.cpp b/src/llama.cpp -index 4c0a1bb6..17e5bc2a 100644 +index 800dfb95..a639522d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp -@@ -16928,7 +16928,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { +@@ -16920,7 +16920,7 @@ 
static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { const auto n_embd = hparams.n_embd; - + // TODO: use a per-batch flag for logits presence instead - const bool has_logits = !cparams.embeddings; + const bool has_logits = cparams.causal_attn; const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); - + const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; -@@ -17200,20 +17200,23 @@ static int llama_decode_internal( +@@ -17192,20 +17192,23 @@ static int llama_decode_internal( // no output res = nullptr; embd = nullptr; - } else if (cparams.embeddings) { - res = nullptr; // do not extract logits for embedding case - embd = nullptr; -- for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + } + + if (cparams.embeddings) { -+ for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + embd = ggml_graph_node(gf, i); if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { - embd = ggml_graph_node(gf, i); @@ -39,5 +47,5 @@ index 4c0a1bb6..17e5bc2a 100644 + res = nullptr; // do not extract logits when not needed + } // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); - + ggml_backend_sched_alloc_graph(lctx.sched, gf); diff --git a/llama/patches/07-clip-unicode.diff b/llama/patches/0006-clip-unicode.patch similarity index 87% rename from llama/patches/07-clip-unicode.diff rename to llama/patches/0006-clip-unicode.patch index d33e2284..7789710c 100644 --- a/llama/patches/07-clip-unicode.diff +++ b/llama/patches/0006-clip-unicode.patch @@ -1,3 +1,12 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Michael Yang +Date: Mon, 16 Sep 2024 15:53:15 -0700 +Subject: [PATCH] clip-unicode + +--- + examples/llava/clip.cpp | 40 +++++++++++++++++++++++++++++++++++++++- + 1 file changed, 39 insertions(+), 1 deletion(-) + diff 
--git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 14e02c8d..6e849d8e 100644 --- a/examples/llava/clip.cpp diff --git a/llama/patches/08-solar-pro.diff b/llama/patches/0007-solar-pro.patch similarity index 96% rename from llama/patches/08-solar-pro.diff rename to llama/patches/0007-solar-pro.patch index 515fbb51..e3050ffa 100644 --- a/llama/patches/08-solar-pro.diff +++ b/llama/patches/0007-solar-pro.patch @@ -1,5 +1,21 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Michael Yang +Date: Mon, 16 Sep 2024 15:53:16 -0700 +Subject: [PATCH] solar-pro + +solar-pro introduces block skip connections where blocks are connected +to other, non-sequential blocks with a scale multiple + +this change adds 4 new keys to store the skip connections and one new +tensor to store the scalar. the scalar is implemented a 1-dimensional +tensor with 2 elements dervied from the model's bskcn_tv configuration. +in general, the values are (bskcn_tv, 1 - bskcn_tv) +--- + src/llama.cpp | 269 +++++++++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 255 insertions(+), 14 deletions(-) + diff --git a/src/llama.cpp b/src/llama.cpp -index bdad28b3..1fe6189a 100644 +index a639522d..83b80b59 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -217,6 +217,7 @@ enum llm_arch { diff --git a/llama/patches/10-conditional-fattn.diff b/llama/patches/0008-conditional-fattn.patch similarity index 59% rename from llama/patches/10-conditional-fattn.diff rename to llama/patches/0008-conditional-fattn.patch index 0ec0236c..b61d95b9 100644 --- a/llama/patches/10-conditional-fattn.diff +++ b/llama/patches/0008-conditional-fattn.patch @@ -1,8 +1,17 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Daniel Hiltgen +Date: Wed, 9 Oct 2024 17:26:23 -0700 +Subject: [PATCH] conditional-fattn + +--- + ggml/src/ggml-cuda.cu | 2 ++ + 1 file changed, 2 insertions(+) + diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu -index 
8a844b02..61d61542 100644 +index 809d6ab1..fe77b81c 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu -@@ -2310,9 +2310,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg +@@ -2347,9 +2347,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_ARGSORT: ggml_cuda_op_argsort(ctx, dst); break; diff --git a/llama/patches/11-blas.diff b/llama/patches/0009-blas.patch similarity index 67% rename from llama/patches/11-blas.diff rename to llama/patches/0009-blas.patch index 04e966e9..f0795d1d 100644 --- a/llama/patches/11-blas.diff +++ b/llama/patches/0009-blas.patch @@ -1,3 +1,12 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Jesse Gross +Date: Mon, 30 Sep 2024 16:31:04 -0700 +Subject: [PATCH] blas + +--- + ggml/src/ggml-blas.cpp | 4 ++++ + 1 file changed, 4 insertions(+) + diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp index 6d99c6be..8e1ab99d 100644 --- a/ggml/src/ggml-blas.cpp diff --git a/llama/sync.sh b/llama/sync.sh deleted file mode 100755 index 558cde75..00000000 --- a/llama/sync.sh +++ /dev/null @@ -1,138 +0,0 @@ -#!/bin/bash - -set -e - -# Run in the llama directory - -# Set the source directory -# TODO in the future: src_dir=$1 -src_dir=../llm/llama.cpp - -if [ -z "$src_dir" ]; then - echo "Usage: $0 LLAMA_CPP_DIR" - exit 1 -fi - -# Set the destination directory -dst_dir=$(pwd) - -# TODO remove once we no longer use the submodule -if [ -z "${OLLAMA_SKIP_PATCHING}" ]; then - (cd ../ && git submodule init && git submodule update --force ./llm/llama.cpp) - - # apply patches - for patch in $dst_dir/patches/*.diff; do - echo "Applying $patch" - git -C $src_dir apply "$patch" - done -else - echo "Skipping patching" -fi - -# llama.cpp -cp $src_dir/src/unicode.cpp $dst_dir/unicode.cpp -cp $src_dir/src/unicode.h $dst_dir/unicode.h -cp $src_dir/src/unicode-data.cpp $dst_dir/unicode-data.cpp -cp $src_dir/src/unicode-data.h 
$dst_dir/unicode-data.h -cp $src_dir/src/llama.cpp $dst_dir/llama.cpp -cp $src_dir/src/llama-impl.h $dst_dir/llama-impl.h -cp $src_dir/src/llama-vocab.cpp $dst_dir/llama-vocab.cpp -cp $src_dir/src/llama-vocab.h $dst_dir/llama-vocab.h -cp $src_dir/src/llama-grammar.cpp $dst_dir/llama-grammar.cpp -cp $src_dir/src/llama-grammar.h $dst_dir/llama-grammar.h -cp $src_dir/src/llama-sampling.cpp $dst_dir/llama-sampling.cpp -cp $src_dir/src/llama-sampling.h $dst_dir/llama-sampling.h -cp $src_dir/include/llama.h $dst_dir/llama.h -cp $src_dir/ggml/src/llamafile/sgemm.cpp $dst_dir/sgemm.cpp -cp $src_dir/ggml/src/llamafile/sgemm.h $dst_dir/sgemm.h -mkdir -p $dst_dir/llamafile -cp $src_dir/ggml/src/llamafile/sgemm.h $dst_dir/llamafile/sgemm.h - -# ggml -cp $src_dir/ggml/src/ggml.c $dst_dir/ggml.c -cp $src_dir/ggml/include/ggml.h $dst_dir/ggml.h -cp $src_dir/ggml/src/ggml-quants.c $dst_dir/ggml-quants.c -cp $src_dir/ggml/src/ggml-quants.h $dst_dir/ggml-quants.h -cp $src_dir/ggml/src/ggml-metal.metal $dst_dir/ggml-metal.metal -cp $src_dir/ggml/include/ggml-metal.h $dst_dir/ggml-metal.h -cp $src_dir/ggml/src/ggml-metal.m $dst_dir/ggml-metal_darwin_arm64.m -cp $src_dir/ggml/src/ggml-impl.h $dst_dir/ggml-impl.h -cp $src_dir/ggml/include/ggml-cuda.h $dst_dir/ggml-cuda.h -cp $src_dir/ggml/src/ggml-cuda.cu $dst_dir/ggml-cuda.cu -cp $src_dir/ggml/src/ggml-common.h $dst_dir/ggml-common.h -cp $src_dir/ggml/include/ggml-backend.h $dst_dir/ggml-backend.h -cp $src_dir/ggml/src/ggml-backend.c $dst_dir/ggml-backend.c -cp $src_dir/ggml/src/ggml-backend-impl.h $dst_dir/ggml-backend-impl.h -cp $src_dir/ggml/include/ggml-alloc.h $dst_dir/ggml-alloc.h -cp $src_dir/ggml/src/ggml-alloc.c $dst_dir/ggml-alloc.c -cp $src_dir/ggml/src/ggml-aarch64.h $dst_dir/ggml-aarch64.h -cp $src_dir/ggml/src/ggml-aarch64.c $dst_dir/ggml-aarch64.c -cp $src_dir/ggml/src/ggml-cpu-impl.h $dst_dir/ggml-cpu-impl.h -cp $src_dir/ggml/include/ggml-blas.h $dst_dir/ggml-blas.h -cp $src_dir/ggml/src/ggml-blas.cpp 
$dst_dir/ggml-blas.cpp - -# ggml-cuda -mkdir -p $dst_dir/ggml-cuda/template-instances -mkdir -p $dst_dir/ggml-cuda/vendors -cp $src_dir/ggml/src/ggml-cuda/*.cu $dst_dir/ggml-cuda/ -cp $src_dir/ggml/src/ggml-cuda/*.cuh $dst_dir/ggml-cuda/ -cp $src_dir/ggml/src/ggml-cuda/template-instances/*.cu $dst_dir/ggml-cuda/template-instances/ -cp $src_dir/ggml/src/ggml-cuda/vendors/*.h $dst_dir/ggml-cuda/vendors/ - -# llava -cp $src_dir/examples/llava/clip.cpp $dst_dir/clip.cpp -cp $src_dir/examples/llava/clip.h $dst_dir/clip.h -cp $src_dir/examples/llava/llava.cpp $dst_dir/llava.cpp -cp $src_dir/examples/llava/llava.h $dst_dir/llava.h -cp $src_dir/common/log.h $dst_dir/log.h -cp $src_dir/common/log.cpp $dst_dir/log.cpp -cp $src_dir/common/stb_image.h $dst_dir/stb_image.h - -# These files are mostly used by the llava code -# and shouldn't be necessary once we use clip.cpp directly -cp $src_dir/common/common.cpp $dst_dir/common.cpp -cp $src_dir/common/common.h $dst_dir/common.h -cp $src_dir/common/sampling.cpp $dst_dir/sampling.cpp -cp $src_dir/common/sampling.h $dst_dir/sampling.h -cp $src_dir/common/json.hpp $dst_dir/json.hpp -cp $src_dir/common/json-schema-to-grammar.cpp $dst_dir/json-schema-to-grammar.cpp -cp $src_dir/common/json-schema-to-grammar.h $dst_dir/json-schema-to-grammar.h -cp $src_dir/common/base64.hpp $dst_dir/base64.hpp -cat < $dst_dir/build-info.cpp -int LLAMA_BUILD_NUMBER = 0; -char const *LLAMA_COMMIT = "$sha1"; -char const *LLAMA_COMPILER = ""; -char const *LLAMA_BUILD_TARGET = ""; -EOF - -# add licenses -sha1=$(git -C $src_dir rev-parse @) - -TEMP_LICENSE=$(mktemp) -cleanup() { - rm -f $TEMP_LICENSE -} -trap cleanup 0 - -cat <$TEMP_LICENSE -/** - * llama.cpp - commit $sha1 - do not edit this file - * -$(sed 's/^/ * /' <$src_dir/LICENSE) - */ - -EOF - -LICENSE_FILES=$(find $dst_dir -type f \( -name "*.c" -o -name "*.h" -o -name "*.cpp" -o -name "*.m" -o -name "*.metal" -o -name "*.cu" -o -name "*.cuh" \)) -EXCLUDED_FILES=("sgemm.cpp" "sgemm.h" 
"sampling_ext.cpp" "sampling_ext.h" "stb_image.h" "json.hpp" "llama_darwin.c") - -for IN in $LICENSE_FILES; do - for EXCLUDED in "${EXCLUDED_FILES[@]}"; do - if [[ "$IN" == *"$EXCLUDED" ]]; then - continue 2 - fi - done - TMP=$(mktemp) - cat $TEMP_LICENSE $IN >$TMP - mv $TMP $IN -done