From 7dee25a07f6057a4afd42097357ffbdae0fdaacc Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Tue, 12 Sep 2023 10:01:20 -0700
Subject: [PATCH] fix falcon decode

get model and file type from bin file
---
 llm/ggml.go      | 105 +++++++++++++++++++++++++++++------------------
 llm/gguf.go      |  36 ++++++++--------
 llm/llama.go     | 100 +++++++++----------------------------------
 llm/llm.go       |  24 +++++++----
 server/images.go |  16 ++++----
 5 files changed, 123 insertions(+), 158 deletions(-)

diff --git a/llm/ggml.go b/llm/ggml.go
index 14675da1..e95f5fc6 100644
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -8,54 +8,77 @@ import (
 	"sync"
 )
 
-type ModelFamily string
-
-const ModelFamilyUnknown ModelFamily = "unknown"
-
-type ModelType uint32
-
-const (
-	ModelType3B  ModelType = 26
-	ModelType7B  ModelType = 32
-	ModelType13B ModelType = 40
-	ModelType34B ModelType = 48
-	ModelType30B ModelType = 60
-	ModelType65B ModelType = 80
-)
-
-func (mt ModelType) String() string {
-	switch mt {
-	case ModelType3B:
-		return "3B"
-	case ModelType7B:
-		return "7B"
-	case ModelType13B:
-		return "13B"
-	case ModelType34B:
-		return "34B"
-	case ModelType30B:
-		return "30B"
-	case ModelType65B:
-		return "65B"
-	default:
-		return "Unknown"
-	}
-}
-
-type FileType interface {
-	String() string
-}
-
 type GGML struct {
 	magic uint32
 	container
 	model
 }
 
+const (
+	fileTypeF32 uint32 = iota
+	fileTypeF16
+	fileTypeQ4_0
+	fileTypeQ4_1
+	fileTypeQ4_1_F16
+	fileTypeQ8_0 uint32 = iota + 2
+	fileTypeQ5_0
+	fileTypeQ5_1
+	fileTypeQ2_K
+	fileTypeQ3_K_S
+	fileTypeQ3_K_M
+	fileTypeQ3_K_L
+	fileTypeQ4_K_S
+	fileTypeQ4_K_M
+	fileTypeQ5_K_S
+	fileTypeQ5_K_M
+	fileTypeQ6_K
+)
+
+func fileType(fileType uint32) string {
+	switch fileType {
+	case fileTypeF32:
+		return "F32"
+	case fileTypeF16:
+		return "F16"
+	case fileTypeQ4_0:
+		return "Q4_0"
+	case fileTypeQ4_1:
+		return "Q4_1"
+	case fileTypeQ4_1_F16:
+		return "Q4_1_F16"
+	case fileTypeQ8_0:
+		return "Q8_0"
+	case fileTypeQ5_0:
+		return "Q5_0"
+	case fileTypeQ5_1:
+		return "Q5_1"
+	case fileTypeQ2_K:
+		return "Q2_K"
+	case fileTypeQ3_K_S:
+		return "Q3_K_S"
+	case fileTypeQ3_K_M:
+		return "Q3_K_M"
+	case fileTypeQ3_K_L:
+		return "Q3_K_L"
+	case fileTypeQ4_K_S:
+		return "Q4_K_S"
+	case fileTypeQ4_K_M:
+		return "Q4_K_M"
+	case fileTypeQ5_K_S:
+		return "Q5_K_S"
+	case fileTypeQ5_K_M:
+		return "Q5_K_M"
+	case fileTypeQ6_K:
+		return "Q6_K"
+	default:
+		return "Unknown"
+	}
+}
+
 type model interface {
-	ModelFamily() ModelFamily
-	ModelType() ModelType
-	FileType() FileType
+	ModelFamily() string
+	ModelType() string
+	FileType() string
 }
 
 type container interface {
diff --git a/llm/gguf.go b/llm/gguf.go
index 804c3650..047d17cf 100644
--- a/llm/gguf.go
+++ b/llm/gguf.go
@@ -6,7 +6,6 @@ import (
 	"errors"
 	"fmt"
 	"io"
-	"log"
 	"path"
 	"sync"
 )
@@ -87,38 +86,37 @@ func (llm *ggufModel) NumKV() uint64 {
 	return llm.V2.NumKV
 }
 
-func (llm *ggufModel) ModelFamily() ModelFamily {
+func (llm *ggufModel) ModelFamily() string {
 	t, ok := llm.kv["general.architecture"].(string)
 	if ok {
-		return ModelFamily(t)
+		return t
 	}
 
-	log.Printf("unknown model family: %T", t)
-	return ModelFamilyUnknown
+	return "unknown"
 }
 
-func (llm *ggufModel) ModelType() ModelType {
+func (llm *ggufModel) ModelType() string {
 	switch llm.ModelFamily() {
-	case ModelFamilyLlama:
-		blocks, ok := llm.kv["llama.block_count"].(uint32)
-		if ok {
-			return ModelType(blocks)
+	case "llama":
+		if blocks, ok := llm.kv["llama.block_count"].(uint32); ok {
+			return llamaModelType(blocks)
+		}
+	case "falcon":
+		if blocks, ok := llm.kv["falcon.block_count"].(uint32); ok {
+			return falconModelType(blocks)
 		}
 	}
 
-	return ModelType7B
+	return "Unknown"
 }
 
-func (llm *ggufModel) FileType() FileType {
-	switch llm.ModelFamily() {
-	case ModelFamilyLlama:
-		t, ok := llm.kv["general.file_type"].(uint32)
-		if ok {
-			return llamaFileType(t)
-		}
+func (llm *ggufModel) FileType() string {
+	t, ok := llm.kv["general.file_type"].(uint32)
+	if ok {
+		return fileType(t)
 	}
 
-	return llamaFileTypeF16
+	return "Unknown"
}
 
 func (llm *ggufModel) Decode(r io.Reader) error {
diff --git a/llm/llama.go b/llm/llama.go
index 3153d091..4b7e6e01 100644
--- a/llm/llama.go
+++ b/llm/llama.go
@@ -95,38 +95,39 @@ func chooseRunner(gpuPath, cpuPath string) string {
 	return runPath
 }
 
-const ModelFamilyLlama ModelFamily = "llama"
-
 type llamaModel struct {
 	hyperparameters llamaHyperparameters
 }
 
-func (llm *llamaModel) ModelFamily() ModelFamily {
-	return ModelFamilyLlama
+func (llm *llamaModel) ModelFamily() string {
+	return "llama"
 }
 
-func (llm *llamaModel) ModelType() ModelType {
-	switch llm.hyperparameters.NumLayer {
+func llamaModelType(numLayer uint32) string {
+	switch numLayer {
 	case 26:
-		return ModelType3B
+		return "3B"
 	case 32:
-		return ModelType7B
+		return "7B"
 	case 40:
-		return ModelType13B
+		return "13B"
 	case 48:
-		return ModelType34B
+		return "34B"
 	case 60:
-		return ModelType30B
+		return "30B"
 	case 80:
-		return ModelType65B
+		return "65B"
+	default:
+		return "Unknown"
 	}
-
-	// TODO: find a better default
-	return ModelType7B
 }
 
-func (llm *llamaModel) FileType() FileType {
-	return llm.hyperparameters.FileType
+func (llm *llamaModel) ModelType() string {
+	return llamaModelType(llm.hyperparameters.NumLayer)
+}
+
+func (llm *llamaModel) FileType() string {
+	return fileType(llm.hyperparameters.FileType)
 }
 
 type llamaHyperparameters struct {
@@ -143,70 +144,7 @@ type llamaHyperparameters struct {
 	NumRot uint32
 
 	// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
-	FileType llamaFileType
-}
-
-type llamaFileType uint32
-
-const (
-	llamaFileTypeF32 llamaFileType = iota
-	llamaFileTypeF16
-	llamaFileTypeQ4_0
-	llamaFileTypeQ4_1
-	llamaFileTypeQ4_1_F16
-	llamaFileTypeQ8_0 llamaFileType = iota + 2
-	llamaFileTypeQ5_0
-	llamaFileTypeQ5_1
-	llamaFileTypeQ2_K
-	llamaFileTypeQ3_K_S
-	llamaFileTypeQ3_K_M
-	llamaFileTypeQ3_K_L
-	llamaFileTypeQ4_K_S
-	llamaFileTypeQ4_K_M
-	llamaFileTypeQ5_K_S
-	llamaFileTypeQ5_K_M
-	llamaFileTypeQ6_K
-)
-
-func (ft llamaFileType) String() string {
-	switch ft {
-	case llamaFileTypeF32:
-		return "F32"
-	case llamaFileTypeF16:
-		return "F16"
-	case llamaFileTypeQ4_0:
-		return "Q4_0"
-	case llamaFileTypeQ4_1:
-		return "Q4_1"
-	case llamaFileTypeQ4_1_F16:
-		return "Q4_1_F16"
-	case llamaFileTypeQ8_0:
-		return "Q8_0"
-	case llamaFileTypeQ5_0:
-		return "Q5_0"
-	case llamaFileTypeQ5_1:
-		return "Q5_1"
-	case llamaFileTypeQ2_K:
-		return "Q2_K"
-	case llamaFileTypeQ3_K_S:
-		return "Q3_K_S"
-	case llamaFileTypeQ3_K_M:
-		return "Q3_K_M"
-	case llamaFileTypeQ3_K_L:
-		return "Q3_K_L"
-	case llamaFileTypeQ4_K_S:
-		return "Q4_K_S"
-	case llamaFileTypeQ4_K_M:
-		return "Q4_K_M"
-	case llamaFileTypeQ5_K_S:
-		return "Q5_K_S"
-	case llamaFileTypeQ5_K_M:
-		return "Q5_K_M"
-	case llamaFileTypeQ6_K:
-		return "Q6_K"
-	default:
-		return "Unknown"
-	}
+	FileType uint32
 }
 
 type Running struct {
diff --git a/llm/llm.go b/llm/llm.go
index de9d89c9..537898f5 100644
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -37,7 +37,7 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
 		return nil, err
 	}
 
-	switch ggml.FileType().String() {
+	switch ggml.FileType() {
 	case "Q8_0":
 		if ggml.Name() != "gguf" && opts.NumGPU != 0 {
 			// GGML Q8_0 do not support Metal API and will
@@ -56,30 +56,36 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
 
 	totalResidentMemory := memory.TotalMemory()
 	switch ggml.ModelType() {
-	case ModelType3B, ModelType7B:
-		if ggml.FileType().String() == "F16" && totalResidentMemory < 16*1024*1024 {
+	case "3B", "7B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 16*1024*1024 {
 			return nil, fmt.Errorf("F16 model requires at least 16GB of memory")
 		} else if totalResidentMemory < 8*1024*1024 {
 			return nil, fmt.Errorf("model requires at least 8GB of memory")
 		}
-	case ModelType13B:
-		if ggml.FileType().String() == "F16" && totalResidentMemory < 32*1024*1024 {
+	case "13B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 32*1024*1024 {
 			return nil, fmt.Errorf("F16 model requires at least 32GB of memory")
 		} else if totalResidentMemory < 16*1024*1024 {
 			return nil, fmt.Errorf("model requires at least 16GB of memory")
 		}
-	case ModelType30B, ModelType34B:
-		if ggml.FileType().String() == "F16" && totalResidentMemory < 64*1024*1024 {
+	case "30B", "34B", "40B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 64*1024*1024 {
 			return nil, fmt.Errorf("F16 model requires at least 64GB of memory")
 		} else if totalResidentMemory < 32*1024*1024 {
 			return nil, fmt.Errorf("model requires at least 32GB of memory")
 		}
-	case ModelType65B:
-		if ggml.FileType().String() == "F16" && totalResidentMemory < 128*1024*1024 {
+	case "65B", "70B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 128*1024*1024 {
 			return nil, fmt.Errorf("F16 model requires at least 128GB of memory")
 		} else if totalResidentMemory < 64*1024*1024 {
 			return nil, fmt.Errorf("model requires at least 64GB of memory")
 		}
+	case "180B":
+		if ggml.FileType() == "F16" && totalResidentMemory < 512*1024*1024 {
+			return nil, fmt.Errorf("F16 model requires at least 512GB of memory")
+		} else if totalResidentMemory < 128*1024*1024 {
+			return nil, fmt.Errorf("model requires at least 128GB of memory")
+		}
 	}
 
 	switch ggml.Name() {
diff --git a/server/images.go b/server/images.go
index f4181f43..01ec4306 100644
--- a/server/images.go
+++ b/server/images.go
@@ -114,11 +114,11 @@ type LayerReader struct {
 }
 
 type ConfigV2 struct {
-	ModelFamily llm.ModelFamily `json:"model_family"`
-	ModelType   string          `json:"model_type"`
-	ModelFormat string          `json:"model_format"`
-	FileType    string          `json:"file_type"`
-	RootFS      RootFS          `json:"rootfs"`
+	ModelFormat string `json:"model_format"`
+	ModelFamily string `json:"model_family"`
+	ModelType   string `json:"model_type"`
+	FileType    string `json:"file_type"`
+	RootFS      RootFS `json:"rootfs"`
 
 	// required by spec
 	Architecture string `json:"architecture"`
@@ -357,10 +357,10 @@ func CreateModel(ctx context.Context, name string, path string, fn func(resp api
 		return err
 	}
 
-	config.ModelFamily = ggml.ModelFamily()
-	config.ModelType = ggml.ModelType().String()
 	config.ModelFormat = ggml.Name()
-	config.FileType = ggml.FileType().String()
+	config.ModelFamily = ggml.ModelFamily()
+	config.ModelType = ggml.ModelType()
+	config.FileType = ggml.FileType()
 
 	// reset the file
 	file.Seek(0, io.SeekStart)
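
Note on falconModelType: the new falcon case in llm/gguf.go calls falconModelType, but no definition appears in this diff, so the patch does not build on its own. A minimal sketch of the missing helper, assuming it mirrors llamaModelType and matches the "40B" and "180B" cases added to llm/llm.go (published Falcon configurations use 32 layers for 7B, 60 for 40B, and 80 for 180B):

// falconModelType maps a falcon model's block (layer) count to a
// parameter-size label, mirroring llamaModelType. This sketch is an
// assumption, not part of the commit shown above.
func falconModelType(numLayer uint32) string {
	switch numLayer {
	case 32:
		return "7B"
	case 60:
		return "40B"
	case 80:
		return "180B"
	default:
		return "Unknown"
	}
}

Keeping the mapping in a plain uint32-to-string helper, as llamaModelType does, lets the GGUF key/value path and the legacy GGML hyperparameter path share one source of truth per architecture. Relatedly, the iota + 2 jump in the fileType constants is deliberate: upstream GGML retired file types 5 and 6 (the old Q4_2 and Q4_3 quantizations), so fileTypeQ8_0 resumes the sequence at 7.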