From f1615f05e6032bf79d4dc0c683a518ed39aca55a Mon Sep 17 00:00:00 2001 From: Mug <> Date: Mon, 3 Apr 2023 22:54:46 +0200 Subject: [PATCH 01/58] Chat llama.cpp example implementation --- examples/low_level_api_chatllama_cpp.py | 235 ++++++++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 examples/low_level_api_chatllama_cpp.py diff --git a/examples/low_level_api_chatllama_cpp.py b/examples/low_level_api_chatllama_cpp.py new file mode 100644 index 0000000..a244867 --- /dev/null +++ b/examples/low_level_api_chatllama_cpp.py @@ -0,0 +1,235 @@ +""" +This is an example implementation of main.cpp from llama.cpp +Quirks: + * Its not exactly alike since this port is designed around programmatic I/O + * Input is always echoed if on, so it should be turned off when using "input()" + * The first antiprompt should be the userprompt like "\nUser:", + because its added when n_predict is reached (aka generation ended prematurely) + * n_predict can be set to -1 for unlimited length responses +""" +import llama_cpp + +def toIntArray(lst): + return [int(i) for i in lst] + +# A LLaMA interactive session +class LLaMAInteract: + def __init__(self, + primer: str="", + model: str="./models/30B/ggml-model-q4_0.bin", + n_ctx: int=1024, + seed: int=0, + n_threads: int=8, + antiprompt: list[str]=[], + input_echo: bool=True, + n_predict: int=20, + n_batch: int=8, + repeat_last_n: int=64, + top_k: int=50, + top_p: float=1., + temp: float=1.0, + repeat_penalty: float=1, + ) -> None: + # input args + self.n_threads = n_threads + self.input_echo = input_echo + self.n_predict = n_predict + self.n_batch = n_batch + self.repeat_last_n = repeat_last_n + self.top_k=top_k + self.top_p=top_p + self.temp=temp + self.repeat_penalty=repeat_penalty + self.n_ctx = n_ctx + self.seed = seed + + # runtime args + self.input_consumed = 0 + self.embd = [] + self.embd_inp = [] + self.n_past = 0 + self.first_antiprompt = [] + self.remaining_tokens = self.n_predict + self.output_echo = input_echo + + # model load + self.lparams = llama_cpp.llama_context_default_params() + self.lparams.n_ctx = self.n_ctx + self.lparams.seed = self.seed + self.ctx = llama_cpp.llama_init_from_file(model.encode("utf8"), self.lparams) + + # determine the required inference memory per token: + tmp = [0, 1, 2, 3] + llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads) + + # determine newline token + self.llama_token_newline = (llama_cpp.llama_token * 1)() + llama_cpp.llama_tokenize(self.ctx, b"\n", self.llama_token_newline, len(self.llama_token_newline), False) + self.llama_token_newline = toIntArray(self.llama_token_newline) + + # primer feed + if (len(primer) > 0): + self.input(primer) + self.n_keep = len(self.embd_inp) + + # create internal context + self.n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) + self.last_n_tokens = [0]*self.n_ctx #TODO: deque doesnt support slices + + # determine antiprompt tokens + for i in antiprompt: + d_antiprompt = (llama_cpp.llama_token * (len(i) + 1))() + n_antiprompt = llama_cpp.llama_tokenize(self.ctx, i.encode("utf8"), d_antiprompt, len(d_antiprompt), False) + self.first_antiprompt.append(toIntArray(d_antiprompt[:n_antiprompt])) + + # if an antiprompt is present + def use_antiprompt(self): + return len(self.first_antiprompt) > 0 + + def generate(self): + while self.remaining_tokens > 0 or self.use_antiprompt(): + # predict + if len(self.embd) > 0: + # infinite text generation via context swapping + # if we run out of context: + # - take the n_keep first tokens from 
the original prompt (via n_past) + # - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch + if (self.n_past + len(self.embd) > self.n_ctx): + n_left = self.n_past - self.n_keep + self.n_past = self.n_keep + + # insert n_left/2 tokens at the start of embd from last_n_tokens + _insert = self.last_n_tokens[ + -(int(n_left/2) - len(self.embd)):-len(self.embd) + ] + self.embd[:len(_insert)] = _insert + #TODO: Still untested + + if (llama_cpp.llama_eval( + self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.n_threads + ) != 0): + raise Exception("Failed to llama_eval!") + + self.n_past += len(self.embd) + self.embd = [] + if len(self.embd_inp) <= self.input_consumed: + # out of user input, sample next token + _arr = self.last_n_tokens[-min(self.repeat_last_n, self.n_past):] + id = llama_cpp.llama_sample_top_p_top_k( + self.ctx, + (llama_cpp.llama_token * len(_arr))(*_arr), + len(_arr), + self.top_k, + self.top_p, + self.temp, + self.repeat_penalty, + ) + self.last_n_tokens.pop(0) + self.last_n_tokens.append(int(id)) + + # replace end of text token with newline token when in interactive mode + if (id == llama_cpp.llama_token_eos() and self.use_antiprompt()): + id = self.llama_token_newline[0] + # tokenize and inject first reverse prompt + self.embd_inp += self.first_antiprompt[0] + + # add it to the context + self.embd.append(int(id)) + + # echo this to console + self.output_echo = True + + # decrement remaining sampling budget + self.remaining_tokens -= 1 + else: + # output to console if input echo is on + self.output_echo = self.input_echo + + # some user input remains from prompt or interaction, forward it to processing + while len(self.embd_inp) > self.input_consumed: + self.embd.append(int(self.embd_inp[self.input_consumed])) + self.last_n_tokens.pop(0) + self.last_n_tokens.append(int(self.embd_inp[self.input_consumed])) + self.input_consumed += 1 + if len(self.embd) >= self.n_batch: + break + + # display tokens + if self.output_echo: + for id in self.embd: + yield id + + # if antiprompt is present, stop + if (self.use_antiprompt() and len(self.embd_inp) <= self.input_consumed): + for i in self.first_antiprompt: + if i == self.last_n_tokens[-len(i):]: + return + + # if end of generation + if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(): + break + + # respect n_predict even if antiprompt is present + if (self.use_antiprompt() and self.remaining_tokens <= 0 and self.n_predict != -1): + self.embd_inp += self.first_antiprompt[0] + break + + def past(self): + for id in self.last_n_tokens[-self.n_past:]: + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + + def input(self, prompt: str): + embd_arr = (llama_cpp.llama_token * (len(prompt) + 1))() + n_of_tok = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), embd_arr, len(embd_arr), True) + self.embd_inp += toIntArray(embd_arr[:n_of_tok]) + + def output(self): + self.remaining_tokens = self.n_predict + for id in self.generate(): + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + +if __name__ == "__main__": + from datetime import datetime + + USER_NAME="User" + AI_NAME="ChatLLaMa" + + time_now = datetime.now() + prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. +{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}’s requests immediately and with details and precision. 
+There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other. +The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. +The transcript only includes text, it does not include markup like HTML and Markdown. + +{USER_NAME}: Hello, {AI_NAME}! +{AI_NAME}: Hello {USER_NAME}! How may I help you today? +{USER_NAME}: What time is it? +{AI_NAME}: It is {time_now.strftime("%H:%M")}. +{USER_NAME}: What year is it? +{AI_NAME}: We are in {time_now.strftime("%Y")}. +{USER_NAME}: What is a cat? +{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. +{USER_NAME}: Name a color. +{AI_NAME}: Blue +{USER_NAME}:""" + + print("Loading model...") + ll = LLaMAInteract(prompt, + model="./models/30B/ggml-model-q4_0.bin", + n_ctx=2048, + antiprompt=[f"\n{USER_NAME}:"], + repeat_last_n=256, + n_predict=2048, + temp=0.7, top_p=0.5, top_k=40, repeat_penalty=1.17647 + ) + print("Loaded model!") + + for i in ll.output(): + print(i,end="",flush=True) + ll.input_echo = False + + inp = lambda x: f" {x}\n" + while True: + ll.input(inp(input(' '))) + for i in ll.output(): + print(i,end="",flush=True) \ No newline at end of file From 0b32bb3d43638b8cd606df0c83f89fdcede7ed1c Mon Sep 17 00:00:00 2001 From: Mug <> Date: Tue, 4 Apr 2023 11:48:48 +0200 Subject: [PATCH 02/58] Add instruction mode --- examples/low_level_api_chatllama_cpp.py | 101 +++++++++++++++--------- 1 file changed, 64 insertions(+), 37 deletions(-) diff --git a/examples/low_level_api_chatllama_cpp.py b/examples/low_level_api_chatllama_cpp.py index a244867..6462121 100644 --- a/examples/low_level_api_chatllama_cpp.py +++ b/examples/low_level_api_chatllama_cpp.py @@ -5,24 +5,26 @@ Quirks: * Input is always echoed if on, so it should be turned off when using "input()" * The first antiprompt should be the userprompt like "\nUser:", because its added when n_predict is reached (aka generation ended prematurely) - * n_predict can be set to -1 for unlimited length responses + * n_predict can be set to -1 for unlimited length responses (or just a really high value) + * It's always in interactive mode, generation ends either by reaching an antiprompt + or running out of n_predict. 
+ * Instruction mode adds its own antiprompt """ import llama_cpp -def toIntArray(lst): - return [int(i) for i in lst] - # A LLaMA interactive session class LLaMAInteract: def __init__(self, primer: str="", model: str="./models/30B/ggml-model-q4_0.bin", + instruct: bool=False, n_ctx: int=1024, seed: int=0, n_threads: int=8, antiprompt: list[str]=[], input_echo: bool=True, n_predict: int=20, + n_keep: int=0, n_batch: int=8, repeat_last_n: int=64, top_k: int=50, @@ -31,17 +33,17 @@ class LLaMAInteract: repeat_penalty: float=1, ) -> None: # input args + self.instruct = instruct self.n_threads = n_threads self.input_echo = input_echo self.n_predict = n_predict + self.n_keep = n_keep self.n_batch = n_batch self.repeat_last_n = repeat_last_n self.top_k=top_k self.top_p=top_p self.temp=temp self.repeat_penalty=repeat_penalty - self.n_ctx = n_ctx - self.seed = seed # runtime args self.input_consumed = 0 @@ -54,8 +56,8 @@ class LLaMAInteract: # model load self.lparams = llama_cpp.llama_context_default_params() - self.lparams.n_ctx = self.n_ctx - self.lparams.seed = self.seed + self.lparams.n_ctx = n_ctx + self.lparams.seed = seed self.ctx = llama_cpp.llama_init_from_file(model.encode("utf8"), self.lparams) # determine the required inference memory per token: @@ -63,29 +65,44 @@ class LLaMAInteract: llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads) # determine newline token - self.llama_token_newline = (llama_cpp.llama_token * 1)() - llama_cpp.llama_tokenize(self.ctx, b"\n", self.llama_token_newline, len(self.llama_token_newline), False) - self.llama_token_newline = toIntArray(self.llama_token_newline) + self.llama_token_newline = self._tokenize("\n", False) + self.inp_prefix = self._tokenize("\n\n### Instruction:\n\n") + self.inp_suffix = self._tokenize("\n\n### Response:\n\n", False) + + # add instruction as antiprompt + if (self.instruct): + self.first_antiprompt.append(self.inp_prefix) # primer feed if (len(primer) > 0): - self.input(primer) - self.n_keep = len(self.embd_inp) + self.embd_inp += self._tokenize(primer) + + # break immediately if using instruct + self.init_break = self.instruct + + # number of tokens to keep when resetting context + if (self.n_keep < 0 or self.n_keep > len(self.embd_inp) or self.instruct): + self.n_keep = len(self.embd_inp) # create internal context - self.n_ctx = int(llama_cpp.llama_n_ctx(self.ctx)) + self.n_ctx = llama_cpp.llama_n_ctx(self.ctx) self.last_n_tokens = [0]*self.n_ctx #TODO: deque doesnt support slices # determine antiprompt tokens for i in antiprompt: - d_antiprompt = (llama_cpp.llama_token * (len(i) + 1))() - n_antiprompt = llama_cpp.llama_tokenize(self.ctx, i.encode("utf8"), d_antiprompt, len(d_antiprompt), False) - self.first_antiprompt.append(toIntArray(d_antiprompt[:n_antiprompt])) + self.first_antiprompt.append(self._tokenize(i, False)) + + # tokenize a prompt + def _tokenize(self, prompt, bos=True): + _arr = (llama_cpp.llama_token * (len(prompt) + 1))() + _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), _arr, len(_arr), bos) + return _arr[:_n] # if an antiprompt is present def use_antiprompt(self): return len(self.first_antiprompt) > 0 + # generate tokens def generate(self): while self.remaining_tokens > 0 or self.use_antiprompt(): # predict @@ -125,16 +142,16 @@ class LLaMAInteract: self.repeat_penalty, ) self.last_n_tokens.pop(0) - self.last_n_tokens.append(int(id)) + self.last_n_tokens.append(id) # replace end of text token with newline token when in interactive mode - if (id == 
llama_cpp.llama_token_eos() and self.use_antiprompt()): + if (id == llama_cpp.llama_token_eos() and self.use_antiprompt() and not self.instruct): id = self.llama_token_newline[0] # tokenize and inject first reverse prompt self.embd_inp += self.first_antiprompt[0] # add it to the context - self.embd.append(int(id)) + self.embd.append(id) # echo this to console self.output_echo = True @@ -147,9 +164,9 @@ class LLaMAInteract: # some user input remains from prompt or interaction, forward it to processing while len(self.embd_inp) > self.input_consumed: - self.embd.append(int(self.embd_inp[self.input_consumed])) + self.embd.append(self.embd_inp[self.input_consumed]) self.last_n_tokens.pop(0) - self.last_n_tokens.append(int(self.embd_inp[self.input_consumed])) + self.last_n_tokens.append(self.embd_inp[self.input_consumed]) self.input_consumed += 1 if len(self.embd) >= self.n_batch: break @@ -159,11 +176,17 @@ class LLaMAInteract: for id in self.embd: yield id - # if antiprompt is present, stop - if (self.use_antiprompt() and len(self.embd_inp) <= self.input_consumed): - for i in self.first_antiprompt: - if i == self.last_n_tokens[-len(i):]: - return + if (len(self.embd_inp) <= self.input_consumed): + # if antiprompt is present, stop + if (self.use_antiprompt()): + for i in self.first_antiprompt: + if i == self.last_n_tokens[-len(i):]: + return + + # if we are using instruction mode, and we have processed the initial prompt + if (self.init_break): + self.init_break = False + break # if end of generation if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(): @@ -174,15 +197,20 @@ class LLaMAInteract: self.embd_inp += self.first_antiprompt[0] break + # return past text def past(self): for id in self.last_n_tokens[-self.n_past:]: yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + # write input def input(self, prompt: str): - embd_arr = (llama_cpp.llama_token * (len(prompt) + 1))() - n_of_tok = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), embd_arr, len(embd_arr), True) - self.embd_inp += toIntArray(embd_arr[:n_of_tok]) + if (self.instruct): + self.embd_inp += self.inp_prefix + self.embd_inp += self._tokenize(prompt + "\n") + if (self.instruct): + self.embd_inp += self.inp_suffix + # write output def output(self): self.remaining_tokens = self.n_predict for id in self.generate(): @@ -193,7 +221,7 @@ if __name__ == "__main__": USER_NAME="User" AI_NAME="ChatLLaMa" - + time_now = datetime.now() prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. {AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}’s requests immediately and with details and precision. 
@@ -214,7 +242,7 @@ The transcript only includes text, it does not include markup like HTML and Mark {USER_NAME}:""" print("Loading model...") - ll = LLaMAInteract(prompt, + m = LLaMAInteract(prompt, model="./models/30B/ggml-model-q4_0.bin", n_ctx=2048, antiprompt=[f"\n{USER_NAME}:"], @@ -224,12 +252,11 @@ The transcript only includes text, it does not include markup like HTML and Mark ) print("Loaded model!") - for i in ll.output(): + for i in m.output(): print(i,end="",flush=True) - ll.input_echo = False + m.input_echo = False - inp = lambda x: f" {x}\n" while True: - ll.input(inp(input(' '))) - for i in ll.output(): + m.input(" " + input('\n> ' if m.instruct else " ")) + for i in m.output(): print(i,end="",flush=True) \ No newline at end of file From da5a6a708924eeb48c845c88d98999d6bb5feff3 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Tue, 4 Apr 2023 16:18:26 +0200 Subject: [PATCH 03/58] Added instruction mode, fixed infinite generation, and various other fixes --- examples/low_level_api_chatllama_cpp.py | 62 ++++++++++++++++++------- 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/examples/low_level_api_chatllama_cpp.py b/examples/low_level_api_chatllama_cpp.py index 6462121..357b381 100644 --- a/examples/low_level_api_chatllama_cpp.py +++ b/examples/low_level_api_chatllama_cpp.py @@ -8,7 +8,9 @@ Quirks: * n_predict can be set to -1 for unlimited length responses (or just a really high value) * It's always in interactive mode, generation ends either by reaching an antiprompt or running out of n_predict. - * Instruction mode adds its own antiprompt + * Instruction mode adds its own antiprompt. + You should also still be feeding the model with a "primer" prompt that + shows it the expected format. """ import llama_cpp @@ -31,6 +33,8 @@ class LLaMAInteract: top_p: float=1., temp: float=1.0, repeat_penalty: float=1, + instruct_inp_prefix: str="\n\n### Instruction:\n\n", + instruct_inp_suffix: str="\n\n### Response:\n\n", ) -> None: # input args self.instruct = instruct @@ -66,12 +70,12 @@ class LLaMAInteract: # determine newline token self.llama_token_newline = self._tokenize("\n", False) - self.inp_prefix = self._tokenize("\n\n### Instruction:\n\n") - self.inp_suffix = self._tokenize("\n\n### Response:\n\n", False) + self.inp_prefix = self._tokenize(instruct_inp_prefix) + self.inp_suffix = self._tokenize(instruct_inp_suffix, False) # add instruction as antiprompt if (self.instruct): - self.first_antiprompt.append(self.inp_prefix) + self.first_antiprompt.append(self.inp_prefix.strip()) # primer feed if (len(primer) > 0): @@ -117,10 +121,9 @@ class LLaMAInteract: # insert n_left/2 tokens at the start of embd from last_n_tokens _insert = self.last_n_tokens[ - -(int(n_left/2) - len(self.embd)):-len(self.embd) + self.n_ctx - int(n_left/2) - len(self.embd):-len(self.embd) ] - self.embd[:len(_insert)] = _insert - #TODO: Still untested + self.embd = _insert + self.embd if (llama_cpp.llama_eval( self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.n_threads @@ -197,6 +200,12 @@ class LLaMAInteract: self.embd_inp += self.first_antiprompt[0] break + def __enter__(self): + return self + + def __exit__(self, type, value, tb): + llama_cpp.llama_free(self.ctx) + # return past text def past(self): for id in self.last_n_tokens[-self.n_past:]: @@ -206,7 +215,7 @@ class LLaMAInteract: def input(self, prompt: str): if (self.instruct): self.embd_inp += self.inp_prefix - self.embd_inp += self._tokenize(prompt + "\n") + self.embd_inp += 
self._tokenize(prompt) if (self.instruct): self.embd_inp += self.inp_suffix @@ -242,21 +251,38 @@ The transcript only includes text, it does not include markup like HTML and Mark {USER_NAME}:""" print("Loading model...") - m = LLaMAInteract(prompt, + with LLaMAInteract(prompt, model="./models/30B/ggml-model-q4_0.bin", n_ctx=2048, antiprompt=[f"\n{USER_NAME}:"], repeat_last_n=256, n_predict=2048, temp=0.7, top_p=0.5, top_k=40, repeat_penalty=1.17647 - ) - print("Loaded model!") + ) as m: + print("Loaded model!") - for i in m.output(): - print(i,end="",flush=True) - m.input_echo = False - - while True: - m.input(" " + input('\n> ' if m.instruct else " ")) for i in m.output(): - print(i,end="",flush=True) \ No newline at end of file + print(i,end="",flush=True) + m.input_echo = False + + def inp(): + out = "" + while (t := input()).endswith("\\"): + out += t[:-1] + "\n" + return out + t + "\n" + + while True: + if (m.instruct): + print('\n> ', end="") + m.input(inp()) + else: + print(f" ", end="") + m.input(f" {inp()}{AI_NAME}:") + print(f"{AI_NAME}: ",end="") + + try: + for i in m.output(): + print(i,end="",flush=True) + except KeyboardInterrupt: + print(f"\n{USER_NAME}:",end="") + m.input(f"\n{USER_NAME}:") From 9cde7973ccc2f823fd518b8dadb7c395175c6697 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Tue, 4 Apr 2023 16:20:27 +0200 Subject: [PATCH 04/58] Fix stripping instruction prompt --- examples/low_level_api_chatllama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api_chatllama_cpp.py b/examples/low_level_api_chatllama_cpp.py index 357b381..f7540ee 100644 --- a/examples/low_level_api_chatllama_cpp.py +++ b/examples/low_level_api_chatllama_cpp.py @@ -75,7 +75,7 @@ class LLaMAInteract: # add instruction as antiprompt if (self.instruct): - self.first_antiprompt.append(self.inp_prefix.strip()) + self.first_antiprompt.append(self._tokenize(self.inp_prefix.strip())) # primer feed if (len(primer) > 0): From c862e8bac523e1bcf6b92e058741a81905ebab96 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Tue, 4 Apr 2023 17:54:47 +0200 Subject: [PATCH 05/58] Fix repeating instructions and an antiprompt bug --- examples/low_level_api_chatllama_cpp.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/low_level_api_chatllama_cpp.py b/examples/low_level_api_chatllama_cpp.py index f7540ee..594d15e 100644 --- a/examples/low_level_api_chatllama_cpp.py +++ b/examples/low_level_api_chatllama_cpp.py @@ -75,7 +75,7 @@ class LLaMAInteract: # add instruction as antiprompt if (self.instruct): - self.first_antiprompt.append(self._tokenize(self.inp_prefix.strip())) + self.first_antiprompt.append(self._tokenize(instruct_inp_prefix.strip(), False)) # primer feed if (len(primer) > 0): @@ -197,7 +197,8 @@ class LLaMAInteract: # respect n_predict even if antiprompt is present if (self.use_antiprompt() and self.remaining_tokens <= 0 and self.n_predict != -1): - self.embd_inp += self.first_antiprompt[0] + if not self.instruct: + self.embd_inp += self.first_antiprompt[0] break def __enter__(self): @@ -213,7 +214,7 @@ class LLaMAInteract: # write input def input(self, prompt: str): - if (self.instruct): + if (self.instruct and self.last_n_tokens[-len(self.inp_prefix):] != self.inp_prefix): self.embd_inp += self.inp_prefix self.embd_inp += self._tokenize(prompt) if (self.instruct): @@ -284,5 +285,6 @@ The transcript only includes text, it does not include markup like HTML and Mark for i in m.output(): print(i,end="",flush=True) except KeyboardInterrupt: - 
print(f"\n{USER_NAME}:",end="") - m.input(f"\n{USER_NAME}:") + if not m.instruct: + print(f"\n{USER_NAME}:",end="") + m.input(f"\n{USER_NAME}:") From 99ceecfccd3749291193d880047b238e2a18f2f8 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Wed, 5 Apr 2023 14:28:02 +0200 Subject: [PATCH 06/58] Move to new examples directory --- examples/{ => low_level_api}/low_level_api_chatllama_cpp.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/{ => low_level_api}/low_level_api_chatllama_cpp.py (100%) diff --git a/examples/low_level_api_chatllama_cpp.py b/examples/low_level_api/low_level_api_chatllama_cpp.py similarity index 100% rename from examples/low_level_api_chatllama_cpp.py rename to examples/low_level_api/low_level_api_chatllama_cpp.py From 283e59c5e9d1d44916b2349660b3eee3c34a4bb4 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Wed, 5 Apr 2023 14:47:24 +0200 Subject: [PATCH 07/58] Fix bug in init_break not being set when exited via antiprompt and others. --- .../low_level_api/low_level_api_chatllama_cpp.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/low_level_api/low_level_api_chatllama_cpp.py b/examples/low_level_api/low_level_api_chatllama_cpp.py index 594d15e..02adf3c 100644 --- a/examples/low_level_api/low_level_api_chatllama_cpp.py +++ b/examples/low_level_api/low_level_api_chatllama_cpp.py @@ -33,6 +33,7 @@ class LLaMAInteract: top_p: float=1., temp: float=1.0, repeat_penalty: float=1, + init_break: bool=True, instruct_inp_prefix: str="\n\n### Instruction:\n\n", instruct_inp_suffix: str="\n\n### Response:\n\n", ) -> None: @@ -48,6 +49,7 @@ class LLaMAInteract: self.top_p=top_p self.temp=temp self.repeat_penalty=repeat_penalty + self.init_break = init_break # runtime args self.input_consumed = 0 @@ -81,9 +83,6 @@ class LLaMAInteract: if (len(primer) > 0): self.embd_inp += self._tokenize(primer) - # break immediately if using instruct - self.init_break = self.instruct - # number of tokens to keep when resetting context if (self.n_keep < 0 or self.n_keep > len(self.embd_inp) or self.instruct): self.n_keep = len(self.embd_inp) @@ -182,13 +181,14 @@ class LLaMAInteract: if (len(self.embd_inp) <= self.input_consumed): # if antiprompt is present, stop if (self.use_antiprompt()): - for i in self.first_antiprompt: - if i == self.last_n_tokens[-len(i):]: - return + if True in [ + i == self.last_n_tokens[-len(i):] + for i in self.first_antiprompt + ]: + break # if we are using instruction mode, and we have processed the initial prompt if (self.init_break): - self.init_break = False break # if end of generation @@ -201,6 +201,8 @@ class LLaMAInteract: self.embd_inp += self.first_antiprompt[0] break + self.init_break = False + def __enter__(self): return self From e1b5b9bb0422b4536fa949166265ebdfcff11362 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 5 Apr 2023 14:44:26 -0400 Subject: [PATCH 08/58] Update fastapi server example --- examples/high_level_api/fastapi_server.py | 93 +++++++++++++++++++++-- 1 file changed, 87 insertions(+), 6 deletions(-) diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py index 760a6ca..b7d2565 100644 --- a/examples/high_level_api/fastapi_server.py +++ b/examples/high_level_api/fastapi_server.py @@ -13,7 +13,8 @@ Then visit http://localhost:8000/docs to see the interactive API docs. 
""" import os import json -from typing import List, Optional, Literal, Union, Iterator +from typing import List, Optional, Literal, Union, Iterator, Dict +from typing_extensions import TypedDict import llama_cpp @@ -64,13 +65,24 @@ class CreateCompletionRequest(BaseModel): max_tokens: int = 16 temperature: float = 0.8 top_p: float = 0.95 - logprobs: Optional[int] = Field(None) echo: bool = False stop: List[str] = [] - repeat_penalty: float = 1.1 - top_k: int = 40 stream: bool = False + # ignored or currently unsupported + model: Optional[str] = Field(None) + n: Optional[int] = 1 + logprobs: Optional[int] = Field(None) + presence_penalty: Optional[float] = 0 + frequency_penalty: Optional[float] = 0 + best_of: Optional[int] = 1 + logit_bias: Optional[Dict[str, float]] = Field(None) + user: Optional[str] = Field(None) + + # llama.cpp specific parameters + top_k: int = 40 + repeat_penalty: float = 1.1 + class Config: schema_extra = { "example": { @@ -91,7 +103,20 @@ def create_completion(request: CreateCompletionRequest): if request.stream: chunks: Iterator[llama_cpp.CompletionChunk] = llama(**request.dict()) # type: ignore return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks) - return llama(**request.dict()) + return llama( + **request.dict( + exclude={ + "model", + "n", + "logprobs", + "frequency_penalty", + "presence_penalty", + "best_of", + "logit_bias", + "user", + } + ) + ) class CreateEmbeddingRequest(BaseModel): @@ -132,6 +157,16 @@ class CreateChatCompletionRequest(BaseModel): stream: bool = False stop: List[str] = [] max_tokens: int = 128 + + # ignored or currently unsupported + model: Optional[str] = Field(None) + n: Optional[int] = 1 + presence_penalty: Optional[float] = 0 + frequency_penalty: Optional[float] = 0 + logit_bias: Optional[Dict[str, float]] = Field(None) + user: Optional[str] = Field(None) + + # llama.cpp specific parameters repeat_penalty: float = 1.1 class Config: @@ -160,7 +195,16 @@ async def create_chat_completion( request: CreateChatCompletionRequest, ) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: completion_or_chunks = llama.create_chat_completion( - **request.dict(exclude={"model"}), + **request.dict( + exclude={ + "model", + "n", + "presence_penalty", + "frequency_penalty", + "logit_bias", + "user", + } + ), ) if request.stream: @@ -179,3 +223,40 @@ async def create_chat_completion( ) completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore return completion + + +class ModelData(TypedDict): + id: str + object: Literal["model"] + owned_by: str + permissions: List[str] + + +class ModelList(TypedDict): + object: Literal["list"] + data: List[ModelData] + + +GetModelResponse = create_model_from_typeddict(ModelList) + + +@app.get("/v1/models", response_model=GetModelResponse) +def get_models() -> ModelList: + return { + "object": "list", + "data": [ + { + "id": llama.model_path, + "object": "model", + "owned_by": "me", + "permissions": [], + } + ], + } + + +if __name__ == "__main__": + import os + import uvicorn + + uvicorn.run(app, host=os.getenv("HOST", "localhost"), port=os.getenv("PORT", 8000)) From 44448fb3a8ac48a4a5a2487b3a6f52ca880e341b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 5 Apr 2023 16:23:25 -0400 Subject: [PATCH 09/58] Add server as a subpackage --- llama_cpp/server/__main__.py | 262 +++++++++++++++++++++++++++++++++++ setup.py | 7 +- 2 files changed, 268 insertions(+), 1 deletion(-) create mode 100644 llama_cpp/server/__main__.py diff --git a/llama_cpp/server/__main__.py 
b/llama_cpp/server/__main__.py new file mode 100644 index 0000000..0362cff --- /dev/null +++ b/llama_cpp/server/__main__.py @@ -0,0 +1,262 @@ +"""Example FastAPI server for llama.cpp. + +To run this example: + +```bash +pip install fastapi uvicorn sse-starlette +export MODEL=../models/7B/... +uvicorn fastapi_server_chat:app --reload +``` + +Then visit http://localhost:8000/docs to see the interactive API docs. + +""" +import os +import json +from typing import List, Optional, Literal, Union, Iterator, Dict +from typing_extensions import TypedDict + +import llama_cpp + +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict +from sse_starlette.sse import EventSourceResponse + + +class Settings(BaseSettings): + model: str + n_ctx: int = 2048 + n_batch: int = 2048 + n_threads: int = os.cpu_count() or 1 + f16_kv: bool = True + use_mlock: bool = True + embedding: bool = True + last_n_tokens_size: int = 64 + + +app = FastAPI( + title="🦙 llama.cpp Python API", + version="0.0.1", +) +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) +settings = Settings() +llama = llama_cpp.Llama( + settings.model, + f16_kv=settings.f16_kv, + use_mlock=settings.use_mlock, + embedding=settings.embedding, + n_threads=settings.n_threads, + n_batch=settings.n_batch, + n_ctx=settings.n_ctx, + last_n_tokens_size=settings.last_n_tokens_size, +) + + +class CreateCompletionRequest(BaseModel): + prompt: str + suffix: Optional[str] = Field(None) + max_tokens: int = 16 + temperature: float = 0.8 + top_p: float = 0.95 + echo: bool = False + stop: List[str] = [] + stream: bool = False + + # ignored or currently unsupported + model: Optional[str] = Field(None) + n: Optional[int] = 1 + logprobs: Optional[int] = Field(None) + presence_penalty: Optional[float] = 0 + frequency_penalty: Optional[float] = 0 + best_of: Optional[int] = 1 + logit_bias: Optional[Dict[str, float]] = Field(None) + user: Optional[str] = Field(None) + + # llama.cpp specific parameters + top_k: int = 40 + repeat_penalty: float = 1.1 + + class Config: + schema_extra = { + "example": { + "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", + "stop": ["\n", "###"], + } + } + + +CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) + + +@app.post( + "/v1/completions", + response_model=CreateCompletionResponse, +) +def create_completion(request: CreateCompletionRequest): + if request.stream: + chunks: Iterator[llama_cpp.CompletionChunk] = llama(**request.dict()) # type: ignore + return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks) + return llama( + **request.dict( + exclude={ + "model", + "n", + "logprobs", + "frequency_penalty", + "presence_penalty", + "best_of", + "logit_bias", + "user", + } + ) + ) + + +class CreateEmbeddingRequest(BaseModel): + model: Optional[str] + input: str + user: Optional[str] + + class Config: + schema_extra = { + "example": { + "input": "The food was delicious and the waiter...", + } + } + + +CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) + + +@app.post( + "/v1/embeddings", + response_model=CreateEmbeddingResponse, +) +def create_embedding(request: CreateEmbeddingRequest): + return llama.create_embedding(**request.dict(exclude={"model", "user"})) + + +class ChatCompletionRequestMessage(BaseModel): + role: Union[Literal["system"], 
Literal["user"], Literal["assistant"]] + content: str + user: Optional[str] = None + + +class CreateChatCompletionRequest(BaseModel): + model: Optional[str] + messages: List[ChatCompletionRequestMessage] + temperature: float = 0.8 + top_p: float = 0.95 + stream: bool = False + stop: List[str] = [] + max_tokens: int = 128 + + # ignored or currently unsupported + model: Optional[str] = Field(None) + n: Optional[int] = 1 + presence_penalty: Optional[float] = 0 + frequency_penalty: Optional[float] = 0 + logit_bias: Optional[Dict[str, float]] = Field(None) + user: Optional[str] = Field(None) + + # llama.cpp specific parameters + repeat_penalty: float = 1.1 + + class Config: + schema_extra = { + "example": { + "messages": [ + ChatCompletionRequestMessage( + role="system", content="You are a helpful assistant." + ), + ChatCompletionRequestMessage( + role="user", content="What is the capital of France?" + ), + ] + } + } + + +CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) + + +@app.post( + "/v1/chat/completions", + response_model=CreateChatCompletionResponse, +) +async def create_chat_completion( + request: CreateChatCompletionRequest, +) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: + completion_or_chunks = llama.create_chat_completion( + **request.dict( + exclude={ + "model", + "n", + "presence_penalty", + "frequency_penalty", + "logit_bias", + "user", + } + ), + ) + + if request.stream: + + async def server_sent_events( + chat_chunks: Iterator[llama_cpp.ChatCompletionChunk], + ): + for chat_chunk in chat_chunks: + yield dict(data=json.dumps(chat_chunk)) + yield dict(data="[DONE]") + + chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore + + return EventSourceResponse( + server_sent_events(chunks), + ) + completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore + return completion + + +class ModelData(TypedDict): + id: str + object: Literal["model"] + owned_by: str + permissions: List[str] + + +class ModelList(TypedDict): + object: Literal["list"] + data: List[ModelData] + + +GetModelResponse = create_model_from_typeddict(ModelList) + + +@app.get("/v1/models", response_model=GetModelResponse) +def get_models() -> ModelList: + return { + "object": "list", + "data": [ + { + "id": llama.model_path, + "object": "model", + "owned_by": "me", + "permissions": [], + } + ], + } + + +if __name__ == "__main__": + import os + import uvicorn + + uvicorn.run(app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000))) diff --git a/setup.py b/setup.py index 0349e79..60891bf 100644 --- a/setup.py +++ b/setup.py @@ -14,10 +14,15 @@ setup( author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", - packages=["llama_cpp"], + package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"}, + packages=["llama_cpp", "llama_cpp.server"], + entry_points={"console_scripts": ["llama_cpp.server=llama_cpp.server:main"]}, install_requires=[ "typing-extensions>=4.5.0", ], + extras_require={ + "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], + }, python_requires=">=3.7", classifiers=[ "Programming Language :: Python :: 3", From 74bf043dddfed301307dabaa073e561139395508 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 5 Apr 2023 16:25:54 -0400 Subject: [PATCH 10/58] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 53dbba7..eeaa7b0 160000 --- a/vendor/llama.cpp 
+++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 53dbba769537e894ead5c6913ab2fd3a4658b738 +Subproject commit eeaa7b0492fc79baab8bb1fe195d6c87159f2bd3 From 267d3648fc0b81b6ec482bf227746d2bc5191d49 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 5 Apr 2023 16:26:22 -0400 Subject: [PATCH 11/58] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0b76366..9c989b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp" -version = "0.1.22" +version = "0.1.23" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 60891bf..7e4193a 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.22", + version="0.1.23", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 1e90597983e8273568e1d50c90e471c5290d3822 Mon Sep 17 00:00:00 2001 From: MillionthOdin16 <102247808+MillionthOdin16@users.noreply.github.com> Date: Wed, 5 Apr 2023 17:37:06 -0400 Subject: [PATCH 12/58] Add pydantic dep. Errors if pedantic isn't present. Also throws errors relating to TypeDict or subclass() if the version is too old or new... --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 7e4193a..f50fe8d 100644 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ setup( entry_points={"console_scripts": ["llama_cpp.server=llama_cpp.server:main"]}, install_requires=[ "typing-extensions>=4.5.0", + "pydantic==1.10.7", ], extras_require={ "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], From 38f7dea6ca318e62c9b8aab55435566d8e62616b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 5 Apr 2023 17:44:25 -0400 Subject: [PATCH 13/58] Update README and docs --- README.md | 25 +++++++++++++++++++++++-- docs/index.md | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c69b70c..0c84c1f 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ This package provides: - OpenAI-like API - LangChain compatibility -# Installation +## Installation Install from PyPI: @@ -23,7 +23,7 @@ Install from PyPI: pip install llama-cpp-python ``` -# Usage +## High-level API ```python >>> from llama_cpp import Llama @@ -51,6 +51,27 @@ pip install llama-cpp-python } ``` +## Web Server + +`llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. +This allows you to use llama.cpp compatible models with any OpenAI compatible client (language libraries, services, etc). + +To install the server package and get started: + +```bash +pip install llama-cpp-python[server] +export MODEL=./models/7B +python3 -m llama_cpp.server +``` + +Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. + +## Low-level API + +The low-level API is a direct `ctypes` binding to the C API provided by `llama.cpp`. +The entire API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and should mirror [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). 
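As a rough sketch of what driving those bindings directly looks like, the snippet below strings together the same calls the low-level chat example in this series relies on (`llama_context_default_params`, `llama_init_from_file`, `llama_tokenize`, `llama_eval`, `llama_sample_top_p_top_k`, `llama_token_to_str`, `llama_free`). The model path, thread count (8), sampling values, and 64-token repeat window are placeholder assumptions, not documented defaults.

```python
import llama_cpp

lparams = llama_cpp.llama_context_default_params()
lparams.n_ctx = 512
ctx = llama_cpp.llama_init_from_file(b"./models/7B/ggml-model-q4_0.bin", lparams)

# tokenize the prompt (True = prepend the BOS token)
prompt = b" Q: Name the planets in the solar system. A:"
tokens = (llama_cpp.llama_token * (len(prompt) + 1))()
n_tokens = llama_cpp.llama_tokenize(ctx, prompt, tokens, len(tokens), True)

# evaluate the prompt once, then sample and print a few tokens
n_past = 0
llama_cpp.llama_eval(ctx, tokens, n_tokens, n_past, 8)
n_past += n_tokens
last_n = [tokens[i] for i in range(n_tokens)]

for _ in range(16):
    _arr = last_n[-64:]  # recent tokens fed to the repeat penalty
    token_id = llama_cpp.llama_sample_top_p_top_k(
        ctx, (llama_cpp.llama_token * len(_arr))(*_arr), len(_arr),
        40, 0.95, 0.8, 1.1)  # top_k, top_p, temp, repeat_penalty
    if token_id == llama_cpp.llama_token_eos():
        break
    print(llama_cpp.llama_token_to_str(ctx, token_id).decode("utf-8"), end="", flush=True)
    llama_cpp.llama_eval(ctx, (llama_cpp.llama_token * 1)(token_id), 1, n_past, 8)
    n_past += 1
    last_n.append(token_id)

llama_cpp.llama_free(ctx)
```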
+ + # Documentation Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). diff --git a/docs/index.md b/docs/index.md index 368c429..efe7fcf 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,5 +1,9 @@ -# 🦙 Python Bindings for `llama.cpp` +# Getting Started +## 🦙 Python Bindings for `llama.cpp` + +[![Documentation](https://img.shields.io/badge/docs-passing-green.svg)](https://abetlen.github.io/llama-cpp-python) +[![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml) [![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) @@ -21,7 +25,7 @@ Install from PyPI: pip install llama-cpp-python ``` -## Usage +## High-level API ```python >>> from llama_cpp import Llama @@ -49,8 +53,33 @@ pip install llama-cpp-python } ``` +## Web Server + +`llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. +This allows you to use llama.cpp compatible models with any OpenAI compatible client (language libraries, services, etc). + +To install the server package and get started: + +```bash +pip install llama-cpp-python[server] +export MODEL=./models/7B +python3 -m llama_cpp.server +``` + +Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. + +## Low-level API + +The low-level API is a direct `ctypes` binding to the C API provided by `llama.cpp`. +The entire API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and should mirror [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). + + ## Development +This package is under active development and I welcome any contributions. + +To get started, clone the repository and install the package in development mode: + ```bash git clone git@github.com:abetlen/llama-cpp-python.git git submodule update --init --recursive From 76a82babef9703b814ae4cea28cc63c2340ed743 Mon Sep 17 00:00:00 2001 From: MillionthOdin16 <102247808+MillionthOdin16@users.noreply.github.com> Date: Wed, 5 Apr 2023 17:44:53 -0400 Subject: [PATCH 14/58] Set n_batch to the default value of 8. I think this is leftover from when n_ctx was missing and n_batch was 2048. --- llama_cpp/server/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 0362cff..67ca115 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -27,7 +27,7 @@ from sse_starlette.sse import EventSourceResponse class Settings(BaseSettings): model: str n_ctx: int = 2048 - n_batch: int = 2048 + n_batch: int = 8 n_threads: int = os.cpu_count() or 1 f16_kv: bool = True use_mlock: bool = True From c283edd7f29acef7c24755da638c418cb69a22f1 Mon Sep 17 00:00:00 2001 From: MillionthOdin16 <102247808+MillionthOdin16@users.noreply.github.com> Date: Wed, 5 Apr 2023 18:17:29 -0400 Subject: [PATCH 15/58] Set n_batch to default values and reduce thread count: Change batch size to the llama.cpp default of 8. I've seen issues in llama.cpp where batch size affects quality of generations. 
(It shouldn't) But in case that's still an issue I changed to default. Set auto-determined num of threads to 1/2 system count. ggml will sometimes lock cores at 100% while doing nothing. This is being addressed, but can cause bad experience for user if pegged at 100% --- examples/high_level_api/fastapi_server.py | 6 +++--- llama_cpp/server/__main__.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py index b7d2565..a649692 100644 --- a/examples/high_level_api/fastapi_server.py +++ b/examples/high_level_api/fastapi_server.py @@ -27,10 +27,10 @@ from sse_starlette.sse import EventSourceResponse class Settings(BaseSettings): model: str n_ctx: int = 2048 - n_batch: int = 2048 - n_threads: int = os.cpu_count() or 1 + n_batch: int = 8 + n_threads: int = int(os.cpu_count() / 2) or 1 f16_kv: bool = True - use_mlock: bool = True + use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... embedding: bool = True last_n_tokens_size: int = 64 diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 67ca115..b474f67 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -28,9 +28,9 @@ class Settings(BaseSettings): model: str n_ctx: int = 2048 n_batch: int = 8 - n_threads: int = os.cpu_count() or 1 + n_threads: int = int(os.cpu_count() / 2) or 1 f16_kv: bool = True - use_mlock: bool = True + use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... embedding: bool = True last_n_tokens_size: int = 64 From 2e91affea2640eb6ef51da85dc4b131528e78fe1 Mon Sep 17 00:00:00 2001 From: MillionthOdin16 <102247808+MillionthOdin16@users.noreply.github.com> Date: Wed, 5 Apr 2023 18:23:17 -0400 Subject: [PATCH 16/58] Ignore ./idea folder --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d09b209..fd64c09 100644 --- a/.gitignore +++ b/.gitignore @@ -163,4 +163,4 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-#.idea/ +.idea/ From 085cc92b1f1def4f13c39bdad4d00c87272a99a5 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Thu, 6 Apr 2023 15:30:57 +0200 Subject: [PATCH 17/58] Better llama.cpp interoperability Has some too many newline issues so WIP --- examples/__init__.py | 0 examples/common.py | 135 +++++++ examples/low_level_api/__init__.py | 0 .../low_level_api_chatllama_cpp.py | 342 ++++++++++++------ 4 files changed, 357 insertions(+), 120 deletions(-) create mode 100644 examples/__init__.py create mode 100644 examples/common.py create mode 100644 examples/low_level_api/__init__.py diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/common.py b/examples/common.py new file mode 100644 index 0000000..f80d995 --- /dev/null +++ b/examples/common.py @@ -0,0 +1,135 @@ +import os +import argparse + +from dataclasses import dataclass, field +from typing import List, Optional + +# Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp + + +@dataclass +class GptParams: + seed: int = -1 + n_threads: int = min(4, os.cpu_count() or 1) + n_predict: int = 128 + repeat_last_n: int = 64 + n_parts: int = -1 + n_ctx: int = 512 + n_batch: int = 8 + n_keep: int = 0 + + top_k: int = 40 + top_p: float = 0.95 + temp: float = 0.80 + repeat_penalty: float = 1.10 + + model: str = "./models/llama-7B/ggml-model.bin" + prompt: str = "" + input_prefix: str = " " + fix_prefix: str = "" + output_postfix: str = "" + input_echo: bool = True, + + antiprompt: List[str] = field(default_factory=list) + + memory_f16: bool = True + random_prompt: bool = False + use_color: bool = False + interactive: bool = False + + embedding: bool = False + interactive_start: bool = False + + instruct: bool = False + ignore_eos: bool = False + perplexity: bool = False + use_mlock: bool = False + mem_test: bool = False + verbose_prompt: bool = False + + # Default instructions for Alpaca + # switch to "Human" and "Assistant" for Vicuna. 
+ instruct_inp_prefix: str="\n\n### Instruction:\n\n", + instruct_inp_suffix: str="\n\n### Response:\n\n", + + +def gpt_params_parse(argv = None, params: Optional[GptParams] = None): + if params is None: + params = GptParams() + + parser = argparse.ArgumentParser() + parser.add_argument("-h", "--help", action="store_true", help="show this help message and exit") + parser.add_argument("-s", "--seed", type=int, default=-1, help="",dest="seed") + parser.add_argument("-t", "--threads", type=int, default=1, help="",dest="n_threads") + parser.add_argument("-p", "--prompt", type=str, default="", help="",dest="prompt") + parser.add_argument("-f", "--file", type=str, default=None, help="") + parser.add_argument("-c", "--ctx_size", type=int, default=512, help="",dest="n_ctx") + parser.add_argument("--memory_f32", action="store_false", help="",dest="memory_f16") + parser.add_argument("--top_p", type=float, default=0.9, help="",dest="top_p") + parser.add_argument("--temp", type=float, default=1.0, help="",dest="temp") + parser.add_argument("--repeat_last_n", type=int, default=64, help="",dest="repeat_last_n") + parser.add_argument("--repeat_penalty", type=float, default=1.0, help="",dest="repeat_penalty") + parser.add_argument("-b", "--batch_size", type=int, default=8, help="",dest="n_batch") + parser.add_argument("--keep", type=int, default=0, help="",dest="n_keep") + parser.add_argument("-m", "--model", type=str, help="",dest="model") + parser.add_argument( + "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" + ) + parser.add_argument("--embedding", action="store_true", help="", dest="embedding") + parser.add_argument("--interactive-start", action="store_true", help="", dest="interactive_start") + parser.add_argument( + "--interactive-first", + action="store_true", + help="run in interactive mode and wait for input right away", + dest="interactive" + ) + parser.add_argument( + "-ins", + "--instruct", + action="store_true", + help="run in instruction mode (use with Alpaca or Vicuna models)", + dest="instruct" + ) + parser.add_argument( + "--color", + action="store_true", + help="colorise output to distinguish prompt and user input from generations", + dest="use_color" + ) + parser.add_argument("--mlock", action="store_true",dest="use_mlock") + parser.add_argument("--mtest", action="store_true",dest="mem_test") + parser.add_argument( + "-r", + "--reverse-prompt", + type=str, + action='append', + help="run in interactive mode and poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", + dest="antiprompt" + ) + parser.add_argument("--perplexity", action="store_true", help="", dest="perplexity") + parser.add_argument("--ignore-eos", action="store_true", help="", dest="ignore_eos") + parser.add_argument("--n_parts", type=int, default=-1, help="", dest="n_parts") + parser.add_argument("--random-prompt", action="store_true", help="", dest="random_prompt") + parser.add_argument("--in-prefix", type=str, default=" ", help="", dest="input_prefix") + parser.add_argument("--fix-prefix", type=str, default=" ", help="", dest="fix_prefix") + parser.add_argument("--out-postfix", type=str, default="", help="", dest="output_postfix") + parser.add_argument("--input-noecho", action="store_false", help="", dest="input_echo") + args = parser.parse_args(argv) + return args + +def gpt_random_prompt(rng): + return [ + "So", + "Once upon a time", + "When", + "The", + "After", + "If", + "import", + "He", + "She", + "They", + ][rng % 10] + +if 
__name__ == "__main__": + print(GptParams(gpt_params_parse())) diff --git a/examples/low_level_api/__init__.py b/examples/low_level_api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/low_level_api/low_level_api_chatllama_cpp.py b/examples/low_level_api/low_level_api_chatllama_cpp.py index 02adf3c..5d2eadd 100644 --- a/examples/low_level_api/low_level_api_chatllama_cpp.py +++ b/examples/low_level_api/low_level_api_chatllama_cpp.py @@ -12,102 +12,182 @@ Quirks: You should also still be feeding the model with a "primer" prompt that shows it the expected format. """ +import sys +from time import time +from os import cpu_count + import llama_cpp +from common import GptParams, gpt_params_parse, gpt_random_prompt + +ANSI_COLOR_RESET = "\x1b[0m" +ANSI_COLOR_YELLOW = "\x1b[33m" +ANSI_BOLD = "\x1b[1m" +ANSI_COLOR_GREEN = "\x1b[32m" + +CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET +CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW +CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN # A LLaMA interactive session class LLaMAInteract: - def __init__(self, - primer: str="", - model: str="./models/30B/ggml-model-q4_0.bin", - instruct: bool=False, - n_ctx: int=1024, - seed: int=0, - n_threads: int=8, - antiprompt: list[str]=[], - input_echo: bool=True, - n_predict: int=20, - n_keep: int=0, - n_batch: int=8, - repeat_last_n: int=64, - top_k: int=50, - top_p: float=1., - temp: float=1.0, - repeat_penalty: float=1, - init_break: bool=True, - instruct_inp_prefix: str="\n\n### Instruction:\n\n", - instruct_inp_suffix: str="\n\n### Response:\n\n", - ) -> None: + def __init__(self, params: GptParams) -> None: # input args - self.instruct = instruct - self.n_threads = n_threads - self.input_echo = input_echo - self.n_predict = n_predict - self.n_keep = n_keep - self.n_batch = n_batch - self.repeat_last_n = repeat_last_n - self.top_k=top_k - self.top_p=top_p - self.temp=temp - self.repeat_penalty=repeat_penalty - self.init_break = init_break + self.params = params + + if (self.params.perplexity): + raise NotImplementedError("""************ +please use the 'perplexity' tool for perplexity calculations +************""") + + if (self.params.embedding): + raise NotImplementedError("""************ +please use the 'embedding' tool for embedding calculations +************""") + + if (self.params.n_ctx > 2048): + print(f"""warning: model does not support \ +context sizes greater than 2048 tokens ({self.params.n_ctx} \ +specified) expect poor results""", file=sys.stderr) + + if (self.params.seed <= 0): + self.params.seed = int(time()) + + print(f"seed = {self.params.seed}", file=sys.stderr) + + if (self.params.random_prompt): + self.params.prompt = gpt_random_prompt(self.params.seed) # runtime args self.input_consumed = 0 self.embd = [] - self.embd_inp = [] self.n_past = 0 self.first_antiprompt = [] - self.remaining_tokens = self.n_predict - self.output_echo = input_echo + self.remaining_tokens = self.params.n_predict + self.output_echo = self.params.input_echo # model load self.lparams = llama_cpp.llama_context_default_params() - self.lparams.n_ctx = n_ctx - self.lparams.seed = seed - self.ctx = llama_cpp.llama_init_from_file(model.encode("utf8"), self.lparams) + self.lparams.n_ctx = self.params.n_ctx + self.lparams.n_parts = self.params.n_parts + self.lparams.seed = self.params.seed + self.lparams.memory_f16 = self.params.memory_f16 + self.lparams.use_mlock = self.params.use_mlock + + self.ctx = llama_cpp.llama_init_from_file(self.params.model.encode("utf8"), self.lparams) + if (self.ctx == 0): + 
raise RuntimeError(f"error: failed to load model '{self.params.model}'") + + print(file=sys.stderr) + print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ +| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) # determine the required inference memory per token: - tmp = [0, 1, 2, 3] - llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads) - - # determine newline token - self.llama_token_newline = self._tokenize("\n", False) - self.inp_prefix = self._tokenize(instruct_inp_prefix) - self.inp_suffix = self._tokenize(instruct_inp_suffix, False) - - # add instruction as antiprompt - if (self.instruct): - self.first_antiprompt.append(self._tokenize(instruct_inp_prefix.strip(), False)) - - # primer feed - if (len(primer) > 0): - self.embd_inp += self._tokenize(primer) - - # number of tokens to keep when resetting context - if (self.n_keep < 0 or self.n_keep > len(self.embd_inp) or self.instruct): - self.n_keep = len(self.embd_inp) + if (self.params.mem_test): + tmp = [0, 1, 2, 3] + llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads) + llama_cpp.llama_print_timings(self.ctx) + self.exit() + return # create internal context self.n_ctx = llama_cpp.llama_n_ctx(self.ctx) - self.last_n_tokens = [0]*self.n_ctx #TODO: deque doesnt support slices + + # Add a space in front of the first character to match OG llama tokenizer behavior + self.params.prompt = " " + self.params.prompt + + # tokenize the prompt + self.embd_inp = self._tokenize(self.params.prompt) + + if (len(self.embd_inp) > self.params.n_ctx - 4): + raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})") + + # number of tokens to keep when resetting context + if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct): + self.params.n_keep = len(self.embd_inp) + + self.inp_prefix = self._tokenize(self.params.instruct_inp_prefix) + self.inp_suffix = self._tokenize(self.params.instruct_inp_suffix, False) + + # in instruct mode, we inject a prefix and a suffix to each input by the user + if (self.params.instruct): + self.params.interactive_start = True + self.first_antiprompt.append(self._tokenize(self.params.instruct_inp_prefix.strip(), False)) + + # enable interactive mode if reverse prompt or interactive start is specified + if (len(self.params.antiprompt) != 0 or self.params.interactive_start): + self.params.interactive = True + + # determine newline token + self.llama_token_newline = self._tokenize("\n", False) + + if (self.params.verbose_prompt): + print(f""" +prompt: '{self.params.prompt}' +number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr) + + for i in range(len(self.embd_inp)): + print(f"{self.embd_inp[i]} -> '{llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i])}'", file=sys.stderr) + + if (self.params.n_keep > 0): + print("static prompt based on n_keep: '") + for i in range(self.params.n_keep): + print(llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i]), file=sys.stderr) + print("'", file=sys.stderr) + print(file=sys.stderr) + + if (self.params.interactive): + print("interactive mode on.", file=sys.stderr) + + if (len(self.params.antiprompt) > 0): + for antiprompt in self.params.antiprompt: + print(f"Reverse prompt: '{antiprompt}'", file=sys.stderr) + + if len(self.params.input_prefix) > 0: + print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr) + + print(f"""sampling: temp = 
{self.params.temp},\ +top_k = {self.params.top_k},\ +top_p = {self.params.top_p},\ +repeat_last_n = {self.params.repeat_last_n},\ +repeat_penalty = {self.params.repeat_penalty} + +generate: n_ctx = {self.n_ctx}, \ +n_batch = {self.params.n_batch}, \ +n_predict = {self.params.n_predict}, \ +n_keep = {self.params.n_keep} +""", file=sys.stderr) # determine antiprompt tokens - for i in antiprompt: + for i in self.params.antiprompt: self.first_antiprompt.append(self._tokenize(i, False)) + self.last_n_tokens = [0]*self.n_ctx #TODO: deque doesnt support slices + + if (params.interactive): + print("""== Running in interactive mode. == + - Press Ctrl+C to interject at any time. + - Press Return to return control to LLaMa. + - If you want to submit another line, end your input in '\\'. + +""", file=sys.stderr) + self.set_color(CONSOLE_COLOR_PROMPT) + # tokenize a prompt def _tokenize(self, prompt, bos=True): _arr = (llama_cpp.llama_token * (len(prompt) + 1))() _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8"), _arr, len(_arr), bos) return _arr[:_n] - # if an antiprompt is present def use_antiprompt(self): return len(self.first_antiprompt) > 0 + def set_color(self, c): + if (self.params.use_color): + print(c) + # generate tokens def generate(self): - while self.remaining_tokens > 0 or self.use_antiprompt(): + while self.remaining_tokens > 0 or self.params.interactive: # predict if len(self.embd) > 0: # infinite text generation via context swapping @@ -115,8 +195,8 @@ class LLaMAInteract: # - take the n_keep first tokens from the original prompt (via n_past) # - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch if (self.n_past + len(self.embd) > self.n_ctx): - n_left = self.n_past - self.n_keep - self.n_past = self.n_keep + n_left = self.n_past - self.params.n_keep + self.n_past = self.params.n_keep # insert n_left/2 tokens at the start of embd from last_n_tokens _insert = self.last_n_tokens[ @@ -125,7 +205,7 @@ class LLaMAInteract: self.embd = _insert + self.embd if (llama_cpp.llama_eval( - self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.n_threads + self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads ) != 0): raise Exception("Failed to llama_eval!") @@ -133,24 +213,28 @@ class LLaMAInteract: self.embd = [] if len(self.embd_inp) <= self.input_consumed: # out of user input, sample next token - _arr = self.last_n_tokens[-min(self.repeat_last_n, self.n_past):] + + #TODO: self.params.ignore_eos + + _arr = self.last_n_tokens[-min(self.params.repeat_last_n, self.n_past):] id = llama_cpp.llama_sample_top_p_top_k( self.ctx, (llama_cpp.llama_token * len(_arr))(*_arr), len(_arr), - self.top_k, - self.top_p, - self.temp, - self.repeat_penalty, + self.params.top_k, + self.params.top_p, + self.params.temp, + self.params.repeat_penalty, ) self.last_n_tokens.pop(0) self.last_n_tokens.append(id) # replace end of text token with newline token when in interactive mode - if (id == llama_cpp.llama_token_eos() and self.use_antiprompt() and not self.instruct): + if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct): id = self.llama_token_newline[0] - # tokenize and inject first reverse prompt - self.embd_inp += self.first_antiprompt[0] + if (self.use_antiprompt()): + # tokenize and inject first reverse prompt + self.embd_inp += self.first_antiprompt[0] # add it to the context self.embd.append(id) @@ -162,7 +246,7 @@ class 
LLaMAInteract: self.remaining_tokens -= 1 else: # output to console if input echo is on - self.output_echo = self.input_echo + self.output_echo = self.params.input_echo # some user input remains from prompt or interaction, forward it to processing while len(self.embd_inp) > self.input_consumed: @@ -170,7 +254,7 @@ class LLaMAInteract: self.last_n_tokens.pop(0) self.last_n_tokens.append(self.embd_inp[self.input_consumed]) self.input_consumed += 1 - if len(self.embd) >= self.n_batch: + if len(self.embd) >= self.params.n_batch: break # display tokens @@ -178,7 +262,11 @@ class LLaMAInteract: for id in self.embd: yield id - if (len(self.embd_inp) <= self.input_consumed): + # reset color to default if we there is no pending user input + if (self.params.input_echo and len(self.embd_inp) == self.input_consumed): + self.set_color(CONSOLE_COLOR_DEFAULT) + + if (self.params.interactive and len(self.embd_inp) <= self.input_consumed): # if antiprompt is present, stop if (self.use_antiprompt()): if True in [ @@ -188,26 +276,36 @@ class LLaMAInteract: break # if we are using instruction mode, and we have processed the initial prompt - if (self.init_break): + if (self.n_past > 0 and self.params.interactive_start): break - # if end of generation + # end of text token if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(): + if (not self.params.instruct): + for i in " [end of text]\n": + yield i break # respect n_predict even if antiprompt is present - if (self.use_antiprompt() and self.remaining_tokens <= 0 and self.n_predict != -1): - if not self.instruct: + if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1): + # If we arent in instruction mode, fix the current generation by appending the antiprompt. + # Makes it so if chat ends prematurely you dont append the AI's text etc. 
+ if not self.params.instruct: self.embd_inp += self.first_antiprompt[0] + self.n_remain = self.params.n_predict break - self.init_break = False + self.params.interactive_start = False def __enter__(self): return self def __exit__(self, type, value, tb): + self.exit() + + def exit(self): llama_cpp.llama_free(self.ctx) + self.set_color(CONSOLE_COLOR_DEFAULT) # return past text def past(self): @@ -216,18 +314,51 @@ class LLaMAInteract: # write input def input(self, prompt: str): - if (self.instruct and self.last_n_tokens[-len(self.inp_prefix):] != self.inp_prefix): + if (self.params.instruct and self.last_n_tokens[-len(self.inp_prefix):] != self.inp_prefix): self.embd_inp += self.inp_prefix self.embd_inp += self._tokenize(prompt) - if (self.instruct): + if (self.params.instruct): self.embd_inp += self.inp_suffix # write output def output(self): - self.remaining_tokens = self.n_predict + self.remaining_tokens = self.params.n_predict for id in self.generate(): yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf-8") + # read user input + def read_input(self): + out = "" + while (t := input()).endswith("\\"): + out += t[:-1] + "\n" + return out + t + "\n" + + # interactive mode + def interact(self): + for i in self.output(): + print(i,end="",flush=True) + self.params.input_echo = False + + while self.params.interactive: + self.set_color(CONSOLE_COLOR_USER_INPUT) + if (self.params.instruct): + print('\n> ', end="") + self.input(self.read_input()) + else: + print(self.params.input_prefix, end="") + self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.output_postfix}") + print(self.params.output_postfix,end="") + self.set_color(CONSOLE_COLOR_DEFAULT) + + try: + for i in self.output(): + print(i,end="",flush=True) + except KeyboardInterrupt: + self.set_color(CONSOLE_COLOR_DEFAULT) + if not self.params.instruct: + print(self.params.fix_prefix,end="") + self.input(self.params.fix_prefix) + if __name__ == "__main__": from datetime import datetime @@ -252,41 +383,12 @@ The transcript only includes text, it does not include markup like HTML and Mark {USER_NAME}: Name a color. {AI_NAME}: Blue {USER_NAME}:""" + args = gpt_params_parse() + params = GptParams(args) - print("Loading model...") - with LLaMAInteract(prompt, - model="./models/30B/ggml-model-q4_0.bin", - n_ctx=2048, - antiprompt=[f"\n{USER_NAME}:"], - repeat_last_n=256, - n_predict=2048, - temp=0.7, top_p=0.5, top_k=40, repeat_penalty=1.17647 - ) as m: - print("Loaded model!") + if (args.file): + with open(args.file) as f: + params.prompt = f.read() - for i in m.output(): - print(i,end="",flush=True) - m.input_echo = False - - def inp(): - out = "" - while (t := input()).endswith("\\"): - out += t[:-1] + "\n" - return out + t + "\n" - - while True: - if (m.instruct): - print('\n> ', end="") - m.input(inp()) - else: - print(f" ", end="") - m.input(f" {inp()}{AI_NAME}:") - print(f"{AI_NAME}: ",end="") - - try: - for i in m.output(): - print(i,end="",flush=True) - except KeyboardInterrupt: - if not m.instruct: - print(f"\n{USER_NAME}:",end="") - m.input(f"\n{USER_NAME}:") + with LLaMAInteract() as m: + m.interact() From 10c757111786966285eca6db88e037b7952764db Mon Sep 17 00:00:00 2001 From: Mug <> Date: Thu, 6 Apr 2023 15:33:22 +0200 Subject: [PATCH 18/58] Fixed too many newlines, now onto args. Still needs shipping work so you could do "python -m llama_cpp.examples." etc. 
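For orientation, the pieces introduced above are meant to be wired together roughly like this (an illustrative sketch only, using names from this series; the module import assumes the later rename to low_level_api_chat_cpp.py):

    # examples/low_level_api: drive the chat session from parsed CLI arguments
    from common import GptParams, gpt_params_parse
    from low_level_api_chat_cpp import LLaMAInteract  # module name after the later rename

    args = gpt_params_parse()           # argparse.Namespace; dests mirror GptParams fields
    params = GptParams(**vars(args))    # build the dataclass from the parsed namespace
    with LLaMAInteract(params) as m:    # context manager frees the llama.cpp context on exit
        m.interact()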
--- examples/low_level_api/low_level_api_chatllama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/low_level_api/low_level_api_chatllama_cpp.py b/examples/low_level_api/low_level_api_chatllama_cpp.py index 5d2eadd..6aecb6d 100644 --- a/examples/low_level_api/low_level_api_chatllama_cpp.py +++ b/examples/low_level_api/low_level_api_chatllama_cpp.py @@ -183,7 +183,7 @@ n_keep = {self.params.n_keep} def set_color(self, c): if (self.params.use_color): - print(c) + print(c, end="") # generate tokens def generate(self): From 55279b679df4153759a80945af7017a79a8ac37c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 6 Apr 2023 21:07:35 -0400 Subject: [PATCH 19/58] Handle prompt list --- llama_cpp/server/__main__.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 0362cff..0650bc0 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -60,7 +60,7 @@ llama = llama_cpp.Llama( class CreateCompletionRequest(BaseModel): - prompt: str + prompt: Union[str, List[str]] suffix: Optional[str] = Field(None) max_tokens: int = 16 temperature: float = 0.8 @@ -100,10 +100,10 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) response_model=CreateCompletionResponse, ) def create_completion(request: CreateCompletionRequest): - if request.stream: - chunks: Iterator[llama_cpp.CompletionChunk] = llama(**request.dict()) # type: ignore - return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks) - return llama( + if isinstance(request.prompt, list): + request.prompt = "".join(request.prompt) + + completion_or_chunks = llama( **request.dict( exclude={ "model", @@ -117,6 +117,11 @@ def create_completion(request: CreateCompletionRequest): } ) ) + if request.stream: + chunks: Iterator[llama_cpp.CompletionChunk] = completion_or_chunks # type: ignore + return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks) + completion: llama_cpp.Completion = completion_or_chunks # type: ignore + return completion class CreateEmbeddingRequest(BaseModel): @@ -259,4 +264,6 @@ if __name__ == "__main__": import os import uvicorn - uvicorn.run(app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000))) + uvicorn.run( + app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) + ) From 09707f5b2a4187091242dc57d560782b1fb3d706 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 6 Apr 2023 21:08:32 -0400 Subject: [PATCH 20/58] Remove console script --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index f50fe8d..3bb104f 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,6 @@ setup( license="MIT", package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"}, packages=["llama_cpp", "llama_cpp.server"], - entry_points={"console_scripts": ["llama_cpp.server=llama_cpp.server:main"]}, install_requires=[ "typing-extensions>=4.5.0", "pydantic==1.10.7", From 7851cc1e3c7684113f7d892f64397702a4ae0578 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 6 Apr 2023 21:10:34 -0400 Subject: [PATCH 21/58] Don't install pydantic by default --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 3bb104f..d3256e6 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,6 @@ setup( packages=["llama_cpp", "llama_cpp.server"], install_requires=[ "typing-extensions>=4.5.0", - "pydantic==1.10.7", ], extras_require={ "server": 
["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], From 9b7526895d7a76da318d25057d7a21c86fb280e0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 6 Apr 2023 21:19:08 -0400 Subject: [PATCH 22/58] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9c989b4..bafb743 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp" -version = "0.1.23" +version = "0.1.24" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index d3256e6..88ed500 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.23", + version="0.1.24", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From da539cc2ee8ce0af2c5519c84b836ad1c9a64a9a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 6 Apr 2023 21:22:19 -0400 Subject: [PATCH 23/58] Safer calculation of default n_threads --- llama_cpp/server/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index c3168b4..d19c45a 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -28,7 +28,7 @@ class Settings(BaseSettings): model: str n_ctx: int = 2048 n_batch: int = 8 - n_threads: int = int(os.cpu_count() / 2) or 1 + n_threads: int = ((os.cpu_count() or 2) // 2) or 1 f16_kv: bool = True use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... 
embedding: bool = True From baa825dacb3c5ce0c86650340af372f84be385f3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 6 Apr 2023 21:27:01 -0400 Subject: [PATCH 24/58] Add windows and mac runners --- .github/workflows/test.yaml | 48 ++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 403ff30..4908950 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -6,7 +6,7 @@ on: - main jobs: - build: + build-linux: runs-on: ubuntu-latest strategy: @@ -28,3 +28,49 @@ jobs: - name: Test with pytest run: | pytest + + build-windows: + + runs-on: windows-latest + strategy: + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v3 + with: + submodules: "true" + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip pytest cmake scikit-build + python3 setup.py develop + - name: Test with pytest + run: | + pytest + + build-linux: + + runs-on: macos-latest + strategy: + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v3 + with: + submodules: "true" + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip pytest cmake scikit-build + python3 setup.py develop + - name: Test with pytest + run: | + pytest \ No newline at end of file From dd1c2986204340410d3af6dc38d034498150a1ec Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 6 Apr 2023 21:28:03 -0400 Subject: [PATCH 25/58] Fix typo --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4908950..826355a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -52,7 +52,7 @@ jobs: run: | pytest - build-linux: + build-macos: runs-on: macos-latest strategy: From d75196d7a1fca5dc4aaf52930365296017a30156 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 6 Apr 2023 21:33:54 -0400 Subject: [PATCH 26/58] Install with pip during build step Use setup.py install Upgrade version of setuptools Revert to develop Use setup.py build and pip install Just use pip install Use correct name in pyproject.toml Make pip install verbose --- .github/workflows/test.yaml | 12 ++++++------ pyproject.toml | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 826355a..e8beb9a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -23,8 +23,8 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build - python3 setup.py develop + python -m pip install --upgrade pip pytest cmake scikit-build setuptools + pip install . -v - name: Test with pytest run: | pytest @@ -46,8 +46,8 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build - python3 setup.py develop + python -m pip install --upgrade pip pytest cmake scikit-build setuptools + pip install . 
-v - name: Test with pytest run: | pytest @@ -69,8 +69,8 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build - python3 setup.py develop + python -m pip install --upgrade pip pytest cmake scikit-build setuptools + pip install . -v - name: Test with pytest run: | pytest \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index bafb743..2ef7cd4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.poetry] -name = "llama_cpp" +name = "llama_cpp_python" version = "0.1.24" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] From 241722c9814fba181837a7dcc389190ac2807688 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 6 Apr 2023 22:38:53 -0400 Subject: [PATCH 27/58] Quote destination --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 40d8f28..bcd026a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,4 +6,4 @@ set(BUILD_SHARED_LIBS "On") add_subdirectory(vendor/llama.cpp) -install(TARGETS llama LIBRARY DESTINATION llama_cpp) \ No newline at end of file +install(TARGETS llama LIBRARY DESTINATION "llama_cpp") \ No newline at end of file From 88c23d04a85e03ce7d0fea1bf042d60ddcab7546 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 6 Apr 2023 22:44:31 -0400 Subject: [PATCH 28/58] Fix windows dll location issue --- CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bcd026a..39ec507 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,4 +6,8 @@ set(BUILD_SHARED_LIBS "On") add_subdirectory(vendor/llama.cpp) -install(TARGETS llama LIBRARY DESTINATION "llama_cpp") \ No newline at end of file +install( + TARGETS llama + LIBRARY DESTINATION llama_cpp + RUNTIME DESTINATION llama_cpp +) \ No newline at end of file From 0fd32046cb8291307cd2049b2b2625ddd9a260cd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 6 Apr 2023 22:48:54 -0400 Subject: [PATCH 29/58] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2ef7cd4..f347c95 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.24" +version = "0.1.25" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 88ed500..7cf93fb 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.24", + version="0.1.25", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From d74800da528b46777829acec10b1a0ee3a7846f8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Apr 2023 03:14:38 -0400 Subject: [PATCH 30/58] Build wheels --- .github/workflows/wheels.yaml | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .github/workflows/wheels.yaml diff --git a/.github/workflows/wheels.yaml b/.github/workflows/wheels.yaml new file mode 100644 index 0000000..81d6314 --- /dev/null +++ b/.github/workflows/wheels.yaml @@ -0,0 +1,33 @@ +name: Build + +on: + push: + branches: + - main + +jobs: + build_wheels: + name: Build wheels on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + 
matrix: + os: [ubuntu-20.04, windows-2019, macOS-11] + + steps: + - uses: actions/checkout@v3 + + # Used to host cibuildwheel + - uses: actions/setup-python@v3 + + - name: Install cibuildwheel + run: python -m pip install cibuildwheel==2.12.1 + + - name: Build wheels + run: python -m cibuildwheel --output-dir wheelhouse + # to supply options, put them in 'env', like: + # env: + # CIBW_SOME_OPTION: value + + - uses: actions/upload-artifact@v3 + with: + path: ./wheelhouse/*.whl From d4912a80dac4b8a4c3ee3eae157d46e7f6961273 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Apr 2023 03:18:56 -0400 Subject: [PATCH 31/58] Install build dependencies --- .github/workflows/wheels.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/wheels.yaml b/.github/workflows/wheels.yaml index 81d6314..dcbefc7 100644 --- a/.github/workflows/wheels.yaml +++ b/.github/workflows/wheels.yaml @@ -22,6 +22,10 @@ jobs: - name: Install cibuildwheel run: python -m pip install cibuildwheel==2.12.1 + - name: Install dependencies + run: | + python -m pip install --upgrade pip pytest cmake scikit-build setuptools + - name: Build wheels run: python -m cibuildwheel --output-dir wheelhouse # to supply options, put them in 'env', like: From c3b1aa6ab7eb22ced8cc03733060ff96b533d66e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Apr 2023 03:19:07 -0400 Subject: [PATCH 32/58] Clone submodule --- .github/workflows/wheels.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/wheels.yaml b/.github/workflows/wheels.yaml index dcbefc7..90dd9f5 100644 --- a/.github/workflows/wheels.yaml +++ b/.github/workflows/wheels.yaml @@ -15,6 +15,8 @@ jobs: steps: - uses: actions/checkout@v3 + with: + submodules: "true" # Used to host cibuildwheel - uses: actions/setup-python@v3 From 16fc5b5d2334fe023b36d94a32706878cb9b2fe7 Mon Sep 17 00:00:00 2001 From: Mug <> Date: Fri, 7 Apr 2023 13:32:19 +0200 Subject: [PATCH 33/58] More interoperability to the original llama.cpp, and arguments now work --- examples/__init__.py | 0 examples/low_level_api/__init__.py | 0 examples/{ => low_level_api}/common.py | 79 +++++++++++-------- ...llama_cpp.py => low_level_api_chat_cpp.py} | 19 +++-- 4 files changed, 55 insertions(+), 43 deletions(-) delete mode 100644 examples/__init__.py delete mode 100644 examples/low_level_api/__init__.py rename examples/{ => low_level_api}/common.py (54%) rename examples/low_level_api/{low_level_api_chatllama_cpp.py => low_level_api_chat_cpp.py} (98%) diff --git a/examples/__init__.py b/examples/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/examples/low_level_api/__init__.py b/examples/low_level_api/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/examples/common.py b/examples/low_level_api/common.py similarity index 54% rename from examples/common.py rename to examples/low_level_api/common.py index f80d995..1758a2d 100644 --- a/examples/common.py +++ b/examples/low_level_api/common.py @@ -26,9 +26,6 @@ class GptParams: model: str = "./models/llama-7B/ggml-model.bin" prompt: str = "" input_prefix: str = " " - fix_prefix: str = "" - output_postfix: str = "" - input_echo: bool = True, antiprompt: List[str] = field(default_factory=list) @@ -47,41 +44,57 @@ class GptParams: mem_test: bool = False verbose_prompt: bool = False + file: str = None + + # If chat ended prematurely, append this to the conversation to fix it. + # Set to "\nUser:" etc. 
+ # This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:"" + fix_prefix: str = " " + output_postfix: str = "" + input_echo: bool = True, + # Default instructions for Alpaca # switch to "Human" and "Assistant" for Vicuna. - instruct_inp_prefix: str="\n\n### Instruction:\n\n", - instruct_inp_suffix: str="\n\n### Response:\n\n", + # TODO: TBD how they are gonna handle this upstream + instruct_inp_prefix: str="\n\n### Instruction:\n\n" + instruct_inp_suffix: str="\n\n### Response:\n\n" def gpt_params_parse(argv = None, params: Optional[GptParams] = None): if params is None: params = GptParams() - parser = argparse.ArgumentParser() - parser.add_argument("-h", "--help", action="store_true", help="show this help message and exit") - parser.add_argument("-s", "--seed", type=int, default=-1, help="",dest="seed") - parser.add_argument("-t", "--threads", type=int, default=1, help="",dest="n_threads") - parser.add_argument("-p", "--prompt", type=str, default="", help="",dest="prompt") - parser.add_argument("-f", "--file", type=str, default=None, help="") - parser.add_argument("-c", "--ctx_size", type=int, default=512, help="",dest="n_ctx") - parser.add_argument("--memory_f32", action="store_false", help="",dest="memory_f16") - parser.add_argument("--top_p", type=float, default=0.9, help="",dest="top_p") - parser.add_argument("--temp", type=float, default=1.0, help="",dest="temp") - parser.add_argument("--repeat_last_n", type=int, default=64, help="",dest="repeat_last_n") - parser.add_argument("--repeat_penalty", type=float, default=1.0, help="",dest="repeat_penalty") - parser.add_argument("-b", "--batch_size", type=int, default=8, help="",dest="n_batch") - parser.add_argument("--keep", type=int, default=0, help="",dest="n_keep") - parser.add_argument("-m", "--model", type=str, help="",dest="model") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed") + parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads") + parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") + parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") + parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx") + parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16") + parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") + parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") + parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") + parser.add_argument("--n_predict", type=int, default=128, help="number of model parts",dest="n_predict") + parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") + parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") + parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch") + parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the 
initial prompt",dest="n_keep") + parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model") parser.add_argument( "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" ) parser.add_argument("--embedding", action="store_true", help="", dest="embedding") - parser.add_argument("--interactive-start", action="store_true", help="", dest="interactive_start") + parser.add_argument( + "--interactive-start", + action="store_true", + help="run in interactive mode", + dest="interactive" + ) parser.add_argument( "--interactive-first", action="store_true", help="run in interactive mode and wait for input right away", - dest="interactive" + dest="interactive_start" ) parser.add_argument( "-ins", @@ -96,24 +109,24 @@ def gpt_params_parse(argv = None, params: Optional[GptParams] = None): help="colorise output to distinguish prompt and user input from generations", dest="use_color" ) - parser.add_argument("--mlock", action="store_true",dest="use_mlock") - parser.add_argument("--mtest", action="store_true",dest="mem_test") + parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") + parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") parser.add_argument( "-r", "--reverse-prompt", type=str, action='append', - help="run in interactive mode and poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", + help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", dest="antiprompt" ) - parser.add_argument("--perplexity", action="store_true", help="", dest="perplexity") - parser.add_argument("--ignore-eos", action="store_true", help="", dest="ignore_eos") - parser.add_argument("--n_parts", type=int, default=-1, help="", dest="n_parts") - parser.add_argument("--random-prompt", action="store_true", help="", dest="random_prompt") - parser.add_argument("--in-prefix", type=str, default=" ", help="", dest="input_prefix") - parser.add_argument("--fix-prefix", type=str, default=" ", help="", dest="fix_prefix") - parser.add_argument("--out-postfix", type=str, default="", help="", dest="output_postfix") - parser.add_argument("--input-noecho", action="store_false", help="", dest="input_echo") + parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") + parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") + parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") + parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt") + parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") + parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") + parser.add_argument("--out-postfix", type=str, default="", help="append to input", dest="output_postfix") + parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") args = parser.parse_args(argv) return args diff --git a/examples/low_level_api/low_level_api_chatllama_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py similarity index 
98% rename from examples/low_level_api/low_level_api_chatllama_cpp.py rename to examples/low_level_api/low_level_api_chat_cpp.py index 6aecb6d..f4d77d5 100644 --- a/examples/low_level_api/low_level_api_chatllama_cpp.py +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -6,8 +6,6 @@ Quirks: * The first antiprompt should be the userprompt like "\nUser:", because its added when n_predict is reached (aka generation ended prematurely) * n_predict can be set to -1 for unlimited length responses (or just a really high value) - * It's always in interactive mode, generation ends either by reaching an antiprompt - or running out of n_predict. * Instruction mode adds its own antiprompt. You should also still be feeding the model with a "primer" prompt that shows it the expected format. @@ -59,7 +57,6 @@ specified) expect poor results""", file=sys.stderr) # runtime args self.input_consumed = 0 - self.embd = [] self.n_past = 0 self.first_antiprompt = [] self.remaining_tokens = self.params.n_predict @@ -74,7 +71,7 @@ specified) expect poor results""", file=sys.stderr) self.lparams.use_mlock = self.params.use_mlock self.ctx = llama_cpp.llama_init_from_file(self.params.model.encode("utf8"), self.lparams) - if (self.ctx == 0): + if (not self.ctx): raise RuntimeError(f"error: failed to load model '{self.params.model}'") print(file=sys.stderr) @@ -95,7 +92,13 @@ specified) expect poor results""", file=sys.stderr) # Add a space in front of the first character to match OG llama tokenizer behavior self.params.prompt = " " + self.params.prompt + # Load prompt file + if (self.params.file): + with open(self.params.file) as f: + self.params.prompt = f.read() + # tokenize the prompt + self.embd = [] self.embd_inp = self._tokenize(self.params.prompt) if (len(self.embd_inp) > self.params.n_ctx - 4): @@ -384,11 +387,7 @@ The transcript only includes text, it does not include markup like HTML and Mark {AI_NAME}: Blue {USER_NAME}:""" args = gpt_params_parse() - params = GptParams(args) + params = GptParams(**vars(args)) - if (args.file): - with open(args.file) as f: - params.prompt = f.read() - - with LLaMAInteract() as m: + with LLaMAInteract(params) as m: m.interact() From a3f713039ffb9fc5ab209850d3d84517c8304cd6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Apr 2023 02:38:42 -0400 Subject: [PATCH 34/58] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index eeaa7b0..62cfc54 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit eeaa7b0492fc79baab8bb1fe195d6c87159f2bd3 +Subproject commit 62cfc54f77e519057110265b52b0d614fa363e2a From e611cfc56dcd73b450cc32c140b1b38680b4076f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Apr 2023 02:39:17 -0400 Subject: [PATCH 35/58] Build shared library with make on unix platforms --- CMakeLists.txt | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 39ec507..27e06ac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,12 +2,26 @@ cmake_minimum_required(VERSION 3.4...3.22) project(llama_cpp) -set(BUILD_SHARED_LIBS "On") - -add_subdirectory(vendor/llama.cpp) - -install( - TARGETS llama - LIBRARY DESTINATION llama_cpp - RUNTIME DESTINATION llama_cpp -) \ No newline at end of file +if (UNIX) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so + COMMAND make libllama.so + WORKING_DIRECTORY 
${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp + ) + add_custom_target( + run ALL + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so + ) + install( + FILES ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so + DESTINATION llama_cpp + ) +else() + set(BUILD_SHARED_LIBS "On") + add_subdirectory(vendor/llama.cpp) + install( + TARGETS llama + LIBRARY DESTINATION llama_cpp + RUNTIME DESTINATION llama_cpp + ) +endif(UNIX) From ae3e9c3d6f55591d2f8b3a0f1fb78f028a60f561 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Apr 2023 02:45:21 -0400 Subject: [PATCH 36/58] Update shared library extension for macos --- llama_cpp/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 41055bd..2d4646f 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -10,7 +10,7 @@ def _load_shared_library(lib_base_name): if sys.platform.startswith("linux"): lib_ext = ".so" elif sys.platform == "darwin": - lib_ext = ".dylib" + lib_ext = ".so" elif sys.platform == "win32": lib_ext = ".dll" else: From edaaa1bd6303b6f8608e964ab2cce130177cc96f Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Apr 2023 03:11:25 -0400 Subject: [PATCH 37/58] Only build wheels on workflow dispatch --- .github/workflows/wheels.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/wheels.yaml b/.github/workflows/wheels.yaml index 90dd9f5..e49dad5 100644 --- a/.github/workflows/wheels.yaml +++ b/.github/workflows/wheels.yaml @@ -1,9 +1,9 @@ name: Build -on: - push: - branches: - - main +on: workflow_dispatch + # push: + # branches: + # - main jobs: build_wheels: From c940193e641de640ce77e1a922745d64b6e90c0c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Apr 2023 03:13:39 -0400 Subject: [PATCH 38/58] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f347c95..d6bda0d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.25" +version = "0.1.26" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 7cf93fb..31b4f7d 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.25", + version="0.1.26", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 6d1bda443e2f52b080c4c9156d35ff693857609b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Apr 2023 09:35:32 -0400 Subject: [PATCH 39/58] Add clients example. 
Closes #46 --- examples/notebooks/Clients.ipynb | 104 +++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 examples/notebooks/Clients.ipynb diff --git a/examples/notebooks/Clients.ipynb b/examples/notebooks/Clients.ipynb new file mode 100644 index 0000000..caebbb6 --- /dev/null +++ b/examples/notebooks/Clients.ipynb @@ -0,0 +1,104 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " JSON: {\n", + " \"choices\": [\n", + " {\n", + " \"finish_reason\": \"length\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"text\": \" over the lazy dog.\"\n", + " }\n", + " ],\n", + " \"created\": 1680960690,\n", + " \"id\": \"cmpl-ad3ba53d-407c-466b-bd5f-97cb8987af83\",\n", + " \"model\": \"models/ggml-alpaca.bin\",\n", + " \"object\": \"text_completion\",\n", + " \"usage\": {\n", + " \"completion_tokens\": 5,\n", + " \"prompt_tokens\": 8,\n", + " \"total_tokens\": 13\n", + " }\n", + "}" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import openai\n", + "\n", + "openai.api_key = \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", + "openai.api_base = \"http://100.64.159.73:8000/v1\"\n", + "\n", + "openai.Completion.create(\n", + " model=\"text-davinci-003\", # currently can be anything\n", + " prompt=\"The quick brown fox jumps\",\n", + " max_tokens=5,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "' over the lazy dog'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", + "os.environ[\"OPENAI_API_BASE\"] = \"http://100.64.159.73:8000/v1\"\n", + "\n", + "from langchain.llms import OpenAI\n", + "\n", + "llms = OpenAI()\n", + "llms(\n", + " prompt=\"The quick brown fox jumps\",\n", + " stop=[\".\", \"\\n\"],\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 38f442deb03974261a4495d93ee9d08f08126dac Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Apr 2023 15:05:33 -0400 Subject: [PATCH 40/58] Bugfix: Wrong size of embeddings. 
Closes #47 --- llama_cpp/llama_cpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 2d4646f..069ae10 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -218,7 +218,7 @@ _lib.llama_n_ctx.restype = c_int def llama_n_embd(ctx: llama_context_p) -> c_int: - return _lib.llama_n_ctx(ctx) + return _lib.llama_n_embd(ctx) _lib.llama_n_embd.argtypes = [llama_context_p] From 0a5c551371ad6618821ad1c63e7fc79d5aa48e7e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Apr 2023 15:09:48 -0400 Subject: [PATCH 41/58] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d6bda0d..020bd5d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.26" +version = "0.1.27" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 31b4f7d..c53bb29 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.26", + version="0.1.27", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 0067c1a5889b155e3111b31be5ec4269922b3b9b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Apr 2023 16:01:18 -0400 Subject: [PATCH 42/58] Formatting --- llama_cpp/server/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index d19c45a..44ee1f0 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -30,7 +30,7 @@ class Settings(BaseSettings): n_batch: int = 8 n_threads: int = ((os.cpu_count() or 2) // 2) or 1 f16_kv: bool = True - use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... + use_mlock: bool = False # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out... 
embedding: bool = True last_n_tokens_size: int = 64 From 3fbc06361f3cea76044118cece785c44883780b7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Apr 2023 16:01:45 -0400 Subject: [PATCH 43/58] Formatting --- llama_cpp/llama.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 1049e44..c964938 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -686,8 +686,8 @@ class Llama: last_n_tokens_size=state["last_n_tokens_size"], verbose=state["verbose"], ) - self.last_n_tokens_data=state["last_n_tokens_data"] - self.tokens_consumed=state["tokens_consumed"] + self.last_n_tokens_data = state["last_n_tokens_data"] + self.tokens_consumed = state["tokens_consumed"] @staticmethod From 314ce7d1cc89e52c1f44d46b21b1282574319296 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Apr 2023 19:54:04 -0400 Subject: [PATCH 44/58] Fix cpu count default --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index c964938..bd8f49f 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -74,7 +74,7 @@ class Llama: self.tokens_consumed = 0 self.n_batch = min(n_ctx, n_batch) - self.n_threads = n_threads or multiprocessing.cpu_count() + self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") From e636214b4ebf279cc1beeeb516e6231ee9a4ab39 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Apr 2023 19:57:37 -0400 Subject: [PATCH 45/58] Add test publish workflow --- .github/workflows/publish-to-test-pypi.yaml | 47 +++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 .github/workflows/publish-to-test-pypi.yaml diff --git a/.github/workflows/publish-to-test-pypi.yaml b/.github/workflows/publish-to-test-pypi.yaml new file mode 100644 index 0000000..90fe19f --- /dev/null +++ b/.github/workflows/publish-to-test-pypi.yaml @@ -0,0 +1,47 @@ +# Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ + +name: Publish Python 🐍 distributions 📦 to PyPI and TestPyPI + +on: workflow_dispatch + +jobs: + build-n-publish: + name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.8" + - name: Install dependencies + run: | + python -m pip install --upgrade pip pytest cmake scikit-build setuptools + - name: Build source distribution + run: | + python setup.py sdist + # - name: Install pypa/build + # run: >- + # python -m + # pip install + # build + # --user + # - name: Build a binary wheel and a source tarball + # run: >- + # python -m + # build + # --sdist + # --wheel + # --outdir dist/ + # . 
+ - name: Publish distribution 📦 to Test PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.TEST_PYPI_API_TOKEN }} + repository-url: https://test.pypi.org/legacy/ + # - name: Publish distribution 📦 to PyPI + # if: startsWith(github.ref, 'refs/tags') + # uses: pypa/gh-action-pypi-publish@release/v1 + # with: + # password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file From c3c2623e8b78a215a161b55264a03a7ea213c368 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Apr 2023 22:01:33 -0400 Subject: [PATCH 46/58] Update llama.cpp --- llama_cpp/llama_cpp.py | 12 ++++++++++++ vendor/llama.cpp | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 069ae10..6030888 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -77,6 +77,7 @@ class llama_context_params(Structure): c_bool, ), # the llama_eval() call computes all logits, not just the last one ("vocab_only", c_bool), # only load the vocabulary, no weights + ("use_mmap", c_bool), # use mmap if possible ("use_mlock", c_bool), # force system to keep model in RAM ("embedding", c_bool), # embedding mode only # called with a progress value between 0 and 1, pass NULL to disable @@ -99,6 +100,17 @@ def llama_context_default_params() -> llama_context_params: _lib.llama_context_default_params.argtypes = [] _lib.llama_context_default_params.restype = llama_context_params +def llama_mmap_supported() -> c_bool: + return _lib.llama_mmap_supported() + +_lib.llama_mmap_supported.argtypes = [] +_lib.llama_mmap_supported.restype = c_bool + +def llama_mlock_supported() -> c_bool: + return _lib.llama_mlock_supported() + +_lib.llama_mlock_supported.argtypes = [] +_lib.llama_mlock_supported.restype = c_bool # Various functions for loading a ggml llama model. # Allocate (almost) all memory needed for the model. 
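As a quick illustration of how the capability helpers and the new use_mmap field added above can be combined (a sketch, not part of the patch; the model path is a placeholder):

    # query mmap/mlock support before enabling them in the context params
    import llama_cpp

    params = llama_cpp.llama_context_default_params()
    params.use_mmap = bool(llama_cpp.llama_mmap_supported())
    params.use_mlock = bool(llama_cpp.llama_mlock_supported())

    ctx = llama_cpp.llama_init_from_file(b"./models/7B/ggml-model.bin", params)
    if not ctx:
        raise RuntimeError("failed to load model")
    llama_cpp.llama_free(ctx)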
diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 62cfc54..180b693 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 62cfc54f77e519057110265b52b0d614fa363e2a +Subproject commit 180b693a47b6b825288ef9f2c39d24b6eea4eea6 From 8c2bb3042f203954f6ac032bb91d0429eaa99b4e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Apr 2023 22:12:23 -0400 Subject: [PATCH 47/58] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 020bd5d..6885d7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.27" +version = "0.1.28" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index c53bb29..37f5396 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.27", + version="0.1.28", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From baa394491c8346cefa4e80cd8ecd5587eaafded8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Apr 2023 22:32:30 -0400 Subject: [PATCH 48/58] Add PyPI publish workflow --- .github/workflows/publish.yaml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .github/workflows/publish.yaml diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml new file mode 100644 index 0000000..52e30e1 --- /dev/null +++ b/.github/workflows/publish.yaml @@ -0,0 +1,29 @@ +name: Publish Python to PyPI + +# Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ + +on: workflow_dispatch + +jobs: + build-n-publish: + name: Build and publish + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.8" + - name: Install dependencies + run: | + python -m pip install --upgrade pip pytest cmake scikit-build setuptools + - name: Build source distribution + run: | + python setup.py sdist + - name: Publish distribution to PyPI + # TODO: move to tag based releases + # if: startsWith(github.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file From fda975e5a919538a86f74893cafb73d4cd9c672c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Apr 2023 22:34:17 -0400 Subject: [PATCH 49/58] Rename test publish --- .github/workflows/publish-to-test-pypi.yaml | 47 --------------------- .github/workflows/publish-to-test.yaml | 28 ++++++++++++ 2 files changed, 28 insertions(+), 47 deletions(-) delete mode 100644 .github/workflows/publish-to-test-pypi.yaml create mode 100644 .github/workflows/publish-to-test.yaml diff --git a/.github/workflows/publish-to-test-pypi.yaml b/.github/workflows/publish-to-test-pypi.yaml deleted file mode 100644 index 90fe19f..0000000 --- a/.github/workflows/publish-to-test-pypi.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ - -name: Publish Python 🐍 distributions 📦 to PyPI and TestPyPI - -on: workflow_dispatch - -jobs: - build-n-publish: - name: Build and publish Python 🐍 distributions 📦 to 
PyPI and TestPyPI - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.8" - - name: Install dependencies - run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools - - name: Build source distribution - run: | - python setup.py sdist - # - name: Install pypa/build - # run: >- - # python -m - # pip install - # build - # --user - # - name: Build a binary wheel and a source tarball - # run: >- - # python -m - # build - # --sdist - # --wheel - # --outdir dist/ - # . - - name: Publish distribution 📦 to Test PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.TEST_PYPI_API_TOKEN }} - repository-url: https://test.pypi.org/legacy/ - # - name: Publish distribution 📦 to PyPI - # if: startsWith(github.ref, 'refs/tags') - # uses: pypa/gh-action-pypi-publish@release/v1 - # with: - # password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/publish-to-test.yaml b/.github/workflows/publish-to-test.yaml new file mode 100644 index 0000000..5fdf405 --- /dev/null +++ b/.github/workflows/publish-to-test.yaml @@ -0,0 +1,28 @@ +# Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ + +name: Publish to TestPyPI + +on: workflow_dispatch + +jobs: + build-n-publish: + name: Build and publish + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.8" + - name: Install dependencies + run: | + python -m pip install --upgrade pip pytest cmake scikit-build setuptools + - name: Build source distribution + run: | + python setup.py sdist + - name: Publish to Test PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.TEST_PYPI_API_TOKEN }} + repository-url: https://test.pypi.org/legacy/ \ No newline at end of file From a79d3eb73227f28f52d8b20b9281b56c25ab321e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Apr 2023 22:38:19 -0400 Subject: [PATCH 50/58] Fix workflow name --- .github/workflows/publish.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 52e30e1..2972110 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -1,4 +1,4 @@ -name: Publish Python to PyPI +name: Publish to PyPI # Based on: https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ From 196650ccb2a513165607b45de992aebd701ab850 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Apr 2023 22:45:55 -0400 Subject: [PATCH 51/58] Update model paths to be more clear they should point to file --- README.md | 6 +++--- docs/index.md | 6 +++--- examples/high_level_api/fastapi_server.py | 2 +- examples/high_level_api/high_level_api_embedding.py | 2 +- examples/high_level_api/high_level_api_inference.py | 2 +- examples/high_level_api/high_level_api_streaming.py | 2 +- examples/high_level_api/langchain_custom_llm.py | 2 +- examples/low_level_api/low_level_api_llama_cpp.py | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 0c84c1f..2c8c0a5 100644 --- a/README.md +++ b/README.md @@ -27,14 +27,14 @@ pip install llama-cpp-python ```python >>> from llama_cpp import Llama ->>> llm = Llama(model_path="models/7B/...") +>>> llm = 
Llama(model_path="./models/7B/ggml-model.bin") >>> output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True) >>> print(output) { "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", "object": "text_completion", "created": 1679561337, - "model": "models/7B/...", + "model": "./models/7B/ggml-model.bin", "choices": [ { "text": "Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.", @@ -60,7 +60,7 @@ To install the server package and get started: ```bash pip install llama-cpp-python[server] -export MODEL=./models/7B +export MODEL=./models/7B/ggml-model.bin python3 -m llama_cpp.server ``` diff --git a/docs/index.md b/docs/index.md index efe7fcf..4055155 100644 --- a/docs/index.md +++ b/docs/index.md @@ -29,14 +29,14 @@ pip install llama-cpp-python ```python >>> from llama_cpp import Llama ->>> llm = Llama(model_path="models/7B/...") +>>> llm = Llama(model_path="./models/7B/ggml-model.bin") >>> output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True) >>> print(output) { "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", "object": "text_completion", "created": 1679561337, - "model": "models/7B/...", + "model": "./models/7B/ggml-model.bin", "choices": [ { "text": "Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.", @@ -62,7 +62,7 @@ To install the server package and get started: ```bash pip install llama-cpp-python[server] -export MODEL=./models/7B +export MODEL=./models/7B/ggml-model.bin python3 -m llama_cpp.server ``` diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py index a649692..3ed0eac 100644 --- a/examples/high_level_api/fastapi_server.py +++ b/examples/high_level_api/fastapi_server.py @@ -4,7 +4,7 @@ To run this example: ```bash pip install fastapi uvicorn sse-starlette -export MODEL=../models/7B/... 
+export MODEL=../models/7B/ggml-model.bin uvicorn fastapi_server_chat:app --reload ``` diff --git a/examples/high_level_api/high_level_api_embedding.py b/examples/high_level_api/high_level_api_embedding.py index 8d783f7..feb0ed6 100644 --- a/examples/high_level_api/high_level_api_embedding.py +++ b/examples/high_level_api/high_level_api_embedding.py @@ -3,7 +3,7 @@ import argparse from llama_cpp import Llama parser = argparse.ArgumentParser() -parser.add_argument("-m", "--model", type=str, default=".//models/...") +parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-model.bin") args = parser.parse_args() llm = Llama(model_path=args.model, embedding=True) diff --git a/examples/high_level_api/high_level_api_inference.py b/examples/high_level_api/high_level_api_inference.py index 0fa9cb7..e41f375 100644 --- a/examples/high_level_api/high_level_api_inference.py +++ b/examples/high_level_api/high_level_api_inference.py @@ -4,7 +4,7 @@ import argparse from llama_cpp import Llama parser = argparse.ArgumentParser() -parser.add_argument("-m", "--model", type=str, default="./models/...") +parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") args = parser.parse_args() llm = Llama(model_path=args.model) diff --git a/examples/high_level_api/high_level_api_streaming.py b/examples/high_level_api/high_level_api_streaming.py index 787bc6e..747c613 100644 --- a/examples/high_level_api/high_level_api_streaming.py +++ b/examples/high_level_api/high_level_api_streaming.py @@ -4,7 +4,7 @@ import argparse from llama_cpp import Llama parser = argparse.ArgumentParser() -parser.add_argument("-m", "--model", type=str, default="./models/...") +parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") args = parser.parse_args() llm = Llama(model_path=args.model) diff --git a/examples/high_level_api/langchain_custom_llm.py b/examples/high_level_api/langchain_custom_llm.py index 6ffd78e..b91632f 100644 --- a/examples/high_level_api/langchain_custom_llm.py +++ b/examples/high_level_api/langchain_custom_llm.py @@ -29,7 +29,7 @@ class LlamaLLM(LLM): parser = argparse.ArgumentParser() -parser.add_argument("-m", "--model", type=str, default="./models/...") +parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") args = parser.parse_args() # Load the model diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py index 2a639aa..b048c0a 100644 --- a/examples/low_level_api/low_level_api_llama_cpp.py +++ b/examples/low_level_api/low_level_api_llama_cpp.py @@ -9,7 +9,7 @@ N_THREADS = multiprocessing.cpu_count() prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n" lparams = llama_cpp.llama_context_default_params() -ctx = llama_cpp.llama_init_from_file(b"models/ggml-alpaca-7b-q4.bin", lparams) +ctx = llama_cpp.llama_init_from_file(b"../models/7B/ggml-model.bin", lparams) # determine the required inference memory per token: tmp = [0, 1, 2, 3] From a984f55d796c610e184a8f34aa96a6cc58bf2b2e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 10 Apr 2023 00:51:25 -0400 Subject: [PATCH 52/58] Quickfix: forgot to clone submodules when building and publishing pypi package --- .github/workflows/publish-to-test.yaml | 2 ++ .github/workflows/publish.yaml | 2 ++ pyproject.toml | 2 +- setup.py | 2 +- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/publish-to-test.yaml b/.github/workflows/publish-to-test.yaml 
index 5fdf405..5a9f339 100644 --- a/.github/workflows/publish-to-test.yaml +++ b/.github/workflows/publish-to-test.yaml @@ -11,6 +11,8 @@ jobs: steps: - uses: actions/checkout@v3 + with: + submodules: "true" - name: Set up Python uses: actions/setup-python@v4 with: diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 2972110..92b6e5b 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -11,6 +11,8 @@ jobs: steps: - uses: actions/checkout@v3 + with: + submodules: "true" - name: Set up Python uses: actions/setup-python@v4 with: diff --git a/pyproject.toml b/pyproject.toml index 6885d7f..11d950b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.28" +version = "0.1.29" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 37f5396..0b0add0 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.28", + version="0.1.29", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 8594b8388ee924c115576db5f5be23b2e92f6300 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 10 Apr 2023 01:29:32 -0400 Subject: [PATCH 53/58] Add build and release --- .../{wheels.yaml => build-and-release.yaml} | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) rename .github/workflows/{wheels.yaml => build-and-release.yaml} (54%) diff --git a/.github/workflows/wheels.yaml b/.github/workflows/build-and-release.yaml similarity index 54% rename from .github/workflows/wheels.yaml rename to .github/workflows/build-and-release.yaml index e49dad5..5b3e756 100644 --- a/.github/workflows/wheels.yaml +++ b/.github/workflows/build-and-release.yaml @@ -1,9 +1,6 @@ -name: Build +name: Build Release on: workflow_dispatch - # push: - # branches: - # - main jobs: build_wheels: @@ -11,7 +8,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-20.04, windows-2019, macOS-11] + os: [ubuntu-latest, windows-latest, macOS-latest] steps: - uses: actions/checkout@v3 @@ -30,10 +27,26 @@ jobs: - name: Build wheels run: python -m cibuildwheel --output-dir wheelhouse - # to supply options, put them in 'env', like: - # env: - # CIBW_SOME_OPTION: value - uses: actions/upload-artifact@v3 with: path: ./wheelhouse/*.whl + + build_sdist: + name: Build source distribution + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + with: + submodules: "true" + - uses: actions/setup-python@v3 + - name: Install dependencies + run: | + python -m pip install --upgrade pip pytest cmake scikit-build setuptools + - name: Build source distribution + run: | + python setup.py sdist + - uses: actions/upload-artifact@v3 + with: + path: ./dist/*.tar.gz \ No newline at end of file From d41cb0ecf78371307e5d7e47a2dcc727155bde7d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 10 Apr 2023 01:54:52 -0400 Subject: [PATCH 54/58] Add create release step to workflow --- .github/workflows/build-and-release.yaml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index 5b3e756..e4f46bd 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -49,4 +49,20 @@ jobs: python setup.py 
sdist - uses: actions/upload-artifact@v3 with: - path: ./dist/*.tar.gz \ No newline at end of file + path: ./dist/*.tar.gz + + release: + name: Release + needs: [build_wheels, build_sdist] + runs-on: ubuntu-latest + + steps: + - uses: actions/download-artifact@v3 + with: + name: artifact + path: dist + - uses: softprops/action-gh-release@v1 + with: + files: dist/* + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file From 1f67ad2a0bbb887041247f3330f31ea3e7f8b37b Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 10 Apr 2023 02:11:35 -0400 Subject: [PATCH 55/58] Add use_mmap option --- llama_cpp/llama.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index bd8f49f..2d76ec4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -23,6 +23,7 @@ class Llama: f16_kv: bool = False, logits_all: bool = False, vocab_only: bool = False, + use_mmap: bool = True, use_mlock: bool = False, embedding: bool = False, n_threads: Optional[int] = None, @@ -40,6 +41,7 @@ class Llama: f16_kv: Use half-precision for key/value cache. logits_all: Return logits for all tokens, not just the last token. vocab_only: Only load the vocabulary no weights. + use_mmap: Use mmap if possible. use_mlock: Force the system to keep the model in RAM. embedding: Embedding mode only. n_threads: Number of threads to use. If None, the number of threads is automatically determined. @@ -63,6 +65,7 @@ class Llama: self.params.f16_kv = f16_kv self.params.logits_all = logits_all self.params.vocab_only = vocab_only + self.params.use_mmap = use_mmap self.params.use_mlock = use_mlock self.params.embedding = embedding @@ -661,6 +664,7 @@ class Llama: f16_kv=self.params.f16_kv, logits_all=self.params.logits_all, vocab_only=self.params.vocab_only, + use_mmap=self.params.use_mmap, use_mlock=self.params.use_mlock, embedding=self.params.embedding, last_n_tokens_size=self.last_n_tokens_size, @@ -679,6 +683,7 @@ class Llama: f16_kv=state["f16_kv"], logits_all=state["logits_all"], vocab_only=state["vocab_only"], + use_mmap=state["use_mmap"], use_mlock=state["use_mlock"], embedding=state["embedding"], n_threads=state["n_threads"], From bc02ce353b29cf67ff66a31eee6c14c5ceff708c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 10 Apr 2023 02:12:19 -0400 Subject: [PATCH 56/58] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 11d950b..89c8271 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.29" +version = "0.1.30" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 0b0add0..2706b8d 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.29", + version="0.1.30", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 3d56c3b7063dc618f3856175f03b1806e2b26d09 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 10 Apr 2023 02:19:22 -0400 Subject: [PATCH 57/58] Run tests for pr's to main --- .github/workflows/test.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index e8beb9a..4481085 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -1,6 +1,9 @@ 
name: Tests on: + pull_request: + branches: + - main push: branches: - main From 241d608bbb1a2817b4b14ef5b3cf65e0c681a850 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Mon, 10 Apr 2023 02:35:00 -0400 Subject: [PATCH 58/58] Update workflow permissions --- .github/workflows/build-and-release.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index e4f46bd..2c0ca4a 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -2,6 +2,9 @@ name: Build Release on: workflow_dispatch +permissions: + contents: write + jobs: build_wheels: name: Build wheels on ${{ matrix.os }}
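
For context on the `use_mmap` option introduced in [PATCH 55/58], a minimal usage sketch follows. It is not part of the patch series itself: it assumes the high-level `Llama` API and the call pattern from the README updated in [PATCH 51/58], and the model path is a placeholder for a local ggml model file.

```python
# Illustrative sketch only (not part of the patches above).
# Assumes llama-cpp-python >= 0.1.30 and a local ggml model file at the path below.
from llama_cpp import Llama

# use_mmap defaults to True ("use mmap if possible"); passing False reads the
# weights into memory instead of memory-mapping the model file.
llm = Llama(model_path="./models/7B/ggml-model.bin", use_mmap=False)

output = llm(
    "Q: Name the planets in the solar system? A: ",
    max_tokens=32,
    stop=["Q:", "\n"],
    echo=True,
)
print(output["choices"][0]["text"])
```

Per the docstrings added in [PATCH 55/58], `use_mmap` only controls memory-mapped loading, while the separate `use_mlock` flag forces the system to keep the model in RAM; both are also carried through `__getstate__`/`__setstate__` so pickled `Llama` instances preserve them.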