From ad3a7d0e2cad1d9cdfe9ad72f14d3db1cbc4c720 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Thu, 27 Jul 2023 14:04:30 -0700
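Subject: [PATCH] add NumGQA

Add a NumGQA field (JSON key "num_gqa") to api.Options, default it to 1
in DefaultOptions, and forward it to params.n_gqa in llama.New.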
---
 api/types.go   | 2 ++
 llama/llama.go | 1 +
 2 files changed, 3 insertions(+)

diff --git a/api/types.go b/api/types.go
index 8f12b5f9..9e5991dc 100644
--- a/api/types.go
+++ b/api/types.go
@@ -153,6 +153,7 @@ type Options struct {
 	NumCtx        int  `json:"num_ctx,omitempty"`
 	NumKeep       int  `json:"num_keep,omitempty"`
 	NumBatch      int  `json:"num_batch,omitempty"`
+	NumGQA        int  `json:"num_gqa,omitempty"`
 	NumGPU        int  `json:"num_gpu,omitempty"`
 	MainGPU       int  `json:"main_gpu,omitempty"`
 	LowVRAM       bool `json:"low_vram,omitempty"`
@@ -190,6 +191,7 @@ func DefaultOptions() Options {
 		NumCtx:   2048,
 		NumBatch: 1024,
 		NumGPU:   1,
+		NumGQA:   1,
 		LowVRAM:  false,
 		F16KV:    true,
 		UseMMap:  true,
diff --git a/llama/llama.go b/llama/llama.go
index e2c30f1f..c7bf194a 100644
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -127,6 +127,7 @@ func New(model string, opts api.Options) (*LLM, error) {
 	params.seed = C.uint(llm.Seed)
 	params.n_ctx = C.int(llm.NumCtx)
 	params.n_batch = C.int(llm.NumBatch)
+	params.n_gqa = C.int(llm.NumGQA)
 	params.n_gpu_layers = C.int(llm.NumGPU)
 	params.main_gpu = C.int(llm.MainGPU)
 	params.low_vram = C.bool(llm.LowVRAM)