remove prompt cache

parent 45bf83ff58
commit da74384a3e

4 changed files with 12 additions and 41 deletions
@@ -24,7 +24,7 @@
 #include <windows.h>
 #endif
 
-#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || \
+#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || \
     defined(_WIN32)
 void sigint_handler(int signo) {
     if (signo == SIGINT) {
@@ -573,15 +573,13 @@ void *llama_allocate_params(
     const char **antiprompt, int antiprompt_count, float tfs_z, float typical_p,
     float frequency_penalty, float presence_penalty, int mirostat,
     float mirostat_eta, float mirostat_tau, bool penalize_nl,
-    const char *logit_bias, const char *session_file, bool prompt_cache_all,
-    bool mlock, bool mmap, const char *maingpu, const char *tensorsplit,
-    bool prompt_cache_ro) {
+    const char *logit_bias, bool mlock, bool mmap, const char *maingpu,
+    const char *tensorsplit) {
     gpt_params *params = new gpt_params;
     params->seed = seed;
     params->n_threads = threads;
     params->n_predict = tokens;
     params->repeat_last_n = repeat_last_n;
-    params->prompt_cache_ro = prompt_cache_ro;
     params->top_k = top_k;
     params->top_p = top_p;
     params->memory_f16 = memory_f16;

@@ -612,9 +610,6 @@ void *llama_allocate_params(
         }
     }
 
-    params->prompt_cache_all = prompt_cache_all;
-    params->path_prompt_cache = session_file;
-
     if (ignore_eos) {
         params->logit_bias[llama_token_eos()] = -INFINITY;
     }

@@ -31,9 +31,8 @@ void *llama_allocate_params(
     const char **antiprompt, int antiprompt_count, float tfs_z, float typical_p,
     float frequency_penalty, float presence_penalty, int mirostat,
     float mirostat_eta, float mirostat_tau, bool penalize_nl,
-    const char *logit_bias, const char *session_file, bool prompt_cache_all,
-    bool mlock, bool mmap, const char *maingpu, const char *tensorsplit,
-    bool prompt_cache_ro);
+    const char *logit_bias, bool mlock, bool mmap, const char *maingpu,
+    const char *tensorsplit);
 
 void llama_free_params(void *params_ptr);
 

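Since cgo type-checks Go call sites against this header, the call sites in the Go file below have to change in lockstep with the trimmed declaration. A minimal self-contained sketch of that coupling, using a toy function rather than anything from this repository:

package main

/*
// cgo treats this comment block like a header: the calls below are
// checked against these declarations, just as the Go code in this
// commit is checked against binding/binding.h.
int add(int a, int b) { return a + b; }
*/
import "C"

import "fmt"

func main() {
	// Passing the wrong number or type of arguments here is a
	// compile-time error, which is why the Go call sites must be
	// updated together with the header change above.
	fmt.Println(C.add(C.int(2), C.int(3)))
}
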
@@ -28,6 +28,7 @@ package llama
 // #include "binding/binding.h"
+// #include <stdlib.h>
 import "C"
 
 import (
 	"fmt"
 	"strings"
@@ -69,7 +70,7 @@ func (l *LLama) Eval(text string, opts ...PredictOption) error {
 		po.Tokens = 99999999
 	}
+	defer C.free(unsafe.Pointer(input))
 
-
 	reverseCount := len(po.StopPrompts)
 	reversePrompt := make([]*C.char, reverseCount)
 	var pass **C.char
@@ -86,9 +87,7 @@ func (l *LLama) Eval(text string, opts ...PredictOption) error {
 		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
 		C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
 		C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), C.CString(po.LogitBias),
-		C.CString(po.PathPromptCache), C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
-		C.CString(po.MainGPU), C.CString(po.TensorSplit),
-		C.bool(po.PromptCacheRO),
+		C.bool(po.MLock), C.bool(po.MMap), C.CString(po.MainGPU), C.CString(po.TensorSplit),
 	)
 	defer C.llama_free_params(params)
 

@@ -128,9 +127,6 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) {
 	cLogitBias := C.CString(po.LogitBias)
 	defer C.free(unsafe.Pointer(cLogitBias))
 
-	cPathPromptCache := C.CString(po.PathPromptCache)
-	defer C.free(unsafe.Pointer(cPathPromptCache))
-
 	cMainGPU := C.CString(po.MainGPU)
 	defer C.free(unsafe.Pointer(cMainGPU))
 

@@ -143,9 +139,7 @@ func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) {
 		C.int(po.Batch), C.int(po.NKeep), pass, C.int(reverseCount),
 		C.float(po.TailFreeSamplingZ), C.float(po.TypicalP), C.float(po.FrequencyPenalty), C.float(po.PresencePenalty),
 		C.int(po.Mirostat), C.float(po.MirostatETA), C.float(po.MirostatTAU), C.bool(po.PenalizeNL), cLogitBias,
-		cPathPromptCache, C.bool(po.PromptCacheAll), C.bool(po.MLock), C.bool(po.MMap),
-		cMainGPU, cTensorSplit,
-		C.bool(po.PromptCacheRO),
+		C.bool(po.MLock), C.bool(po.MMap), cMainGPU, cTensorSplit,
 	)
 	defer C.llama_free_params(params)
 

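The new "// #include <stdlib.h>" and "defer C.free(unsafe.Pointer(input))" lines travel together: C.CString copies a Go string into memory allocated with C's malloc, and releasing it needs C's free, which cgo resolves through stdlib.h. A minimal self-contained illustration of that idiom:

package main

/*
#include <stdlib.h>
*/
import "C"

import "unsafe"

func main() {
	// C.CString allocates a NUL-terminated copy with C's malloc;
	// the caller owns the allocation and must release it with C.free.
	s := C.CString("hello")
	defer C.free(unsafe.Pointer(s))
}
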
@@ -57,11 +57,9 @@ type PredictOptions struct {
 	LogitBias     string
 	TokenCallback func(string) bool
 
-	PathPromptCache             string
-	MLock, MMap, PromptCacheAll bool
-	PromptCacheRO               bool
-	MainGPU                     string
-	TensorSplit                 string
+	MLock, MMap bool
+	MainGPU     string
+	TensorSplit string
 }
 
 type PredictOption func(p *PredictOptions)

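For readers unfamiliar with this file's structure: PredictOptions is populated through the functional-options pattern used throughout it, which is why removing the prompt-cache fields also removes the matching option functions below. A self-contained sketch of the pattern, with illustrative names rather than the repository's own:

package main

import "fmt"

// Toy mirror of PredictOptions / PredictOption above.
type Options struct {
	MLock, MMap bool
	MainGPU     string
}

type Option func(*Options)

// Each exported option is just a function that mutates the struct.
func WithMainGPU(gpu string) Option {
	return func(o *Options) { o.MainGPU = gpu }
}

// NewOptions applies every option to a zero-valued struct.
func NewOptions(opts ...Option) Options {
	var o Options
	for _, fn := range opts {
		fn(&o)
	}
	return o
}

func main() {
	fmt.Printf("%+v\n", NewOptions(WithMainGPU("0")))
}

One consequence of this design: deleting an option such as SetPathPromptCache only breaks callers that referenced it by name; every other option composes exactly as before.
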
@@ -182,14 +180,6 @@ var Debug PredictOption = func(p *PredictOptions) {
 	p.DebugMode = true
 }
 
-var EnablePromptCacheAll PredictOption = func(p *PredictOptions) {
-	p.PromptCacheAll = true
-}
-
-var EnablePromptCacheRO PredictOption = func(p *PredictOptions) {
-	p.PromptCacheRO = true
-}
-
 var EnableMLock ModelOption = func(p *ModelOptions) {
 	p.MLock = true
 }

@@ -284,13 +274,6 @@ func SetTemperature(temp float64) PredictOption {
 	}
 }
 
-// SetPathPromptCache sets the session file to store the prompt cache.
-func SetPathPromptCache(f string) PredictOption {
-	return func(p *PredictOptions) {
-		p.PathPromptCache = f
-	}
-}
-
 // SetPenalty sets the repetition penalty for text generation.
 func SetPenalty(penalty float64) PredictOption {
 	return func(p *PredictOptions) {