Merge pull request #4268 from ollama/pdevine/llama3

Convert directly from llama3
This commit is contained in:
Michael Yang 2024-05-21 14:43:37 -07:00 committed by GitHub
commit 96236b7968
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 440 additions and 309 deletions

View file

@ -208,7 +208,7 @@ func tempZipFiles(path string) (string, error) {
// pytorch files might also be unresolved git lfs references; skip if they are
// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
files = append(files, pt...)
} else if pt, _ := glob(filepath.Join(path, "consolidated*.pth"), "application/octet-stream"); len(pt) > 0 {
} else if pt, _ := glob(filepath.Join(path, "consolidated*.pth"), "application/zip"); len(pt) > 0 {
// pytorch files might also be unresolved git lfs references; skip if they are
// covers consolidated.x.pth, consolidated.pth
files = append(files, pt...)

View file

@ -18,6 +18,16 @@ import (
"github.com/ollama/ollama/llm"
)
const (
_ int32 = iota
tokenTypeNormal
tokenTypeUnknown
tokenTypeControl
tokenTypeUserDefined
tokenTypeUnused
tokenTypeByte
)
type Params struct {
Architectures []string `json:"architectures"`
VocabSize int `json:"vocab_size"`
@ -37,6 +47,8 @@ type Params struct {
Experts int `json:"num_local_experts"`
ExpertsUsed int `json:"num_experts_per_tok"`
PreTokenizer string
ByteOrder
}
@ -74,10 +86,9 @@ func GetModelFormat(dirname string) (ModelFormat, error) {
}
for _, fn := range files {
slog.Debug(fmt.Sprintf("file = %s", fn))
if strings.HasSuffix(fn, ".safetensors") {
return &SafetensorFormat{}, nil
} else if strings.HasSuffix(fn, ".bin") {
} else if strings.HasSuffix(fn, ".bin") || strings.HasSuffix(fn, ".pth") {
slog.Debug("model is torch")
return &TorchFormat{}, nil
}
@ -92,6 +103,7 @@ type Vocab struct {
Tokens []string
Scores []float32
Types []int32
Merges []string
}
func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
@ -170,7 +182,7 @@ func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
}
v.Tokens = append(v.Tokens, t.key)
v.Scores = append(v.Scores, -1000.0)
v.Types = append(v.Types, int32(llm.GGUFTokenUserDefined))
v.Types = append(v.Types, tokenTypeUserDefined)
}
slog.Info(fmt.Sprintf("vocab size w/ extra tokens: %d", len(v.Tokens)))
@ -180,7 +192,7 @@ func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
for cnt := 0; cnt < missingTokens; cnt++ {
v.Tokens = append(v.Tokens, fmt.Sprintf("<dummy%05d>", cnt+1))
v.Scores = append(v.Scores, -1)
v.Types = append(v.Types, int32(llm.GGUFTokenUserDefined))
v.Types = append(v.Types, tokenTypeUserDefined)
}
}

103
convert/convert_test.go Normal file
View file

@ -0,0 +1,103 @@
//go:build slow
package convert
import (
"os"
"path/filepath"
"testing"
"github.com/ollama/ollama/llm"
)
func convertFull(t *testing.T, p string) (llm.KV, llm.Tensors) {
t.Helper()
mf, err := GetModelFormat(p)
if err != nil {
t.Fatal(err)
}
params, err := mf.GetParams(p)
if err != nil {
t.Fatal(err)
}
arch, err := mf.GetModelArch("", p, params)
if err != nil {
t.Fatal(err)
}
if err := arch.LoadVocab(); err != nil {
t.Fatal(err)
}
if err := arch.GetTensors(); err != nil {
t.Fatal(err)
}
f, err := os.CreateTemp(t.TempDir(), "f16")
if err != nil {
t.Fatal(err)
}
defer f.Close()
if err := arch.WriteGGUF(f); err != nil {
t.Fatal(err)
}
r, err := os.Open(f.Name())
if err != nil {
t.Fatal(err)
}
defer r.Close()
m, _, err := llm.DecodeGGML(r)
if err != nil {
t.Fatal(err)
}
return m.KV(), m.Tensors()
}
func TestConvertFull(t *testing.T) {
cases := []struct {
path string
arch string
tensors int
layers int
}{
{"Meta-Llama-3-8B-Instruct", "llama", 291, 35},
{"Mistral-7B-Instruct-v0.2", "llama", 291, 35},
{"Mixtral-8x7B-Instruct-v0.1", "llama", 291, 35},
{"gemma-2b-it", "gemma", 164, 20},
}
for _, tt := range cases {
t.Run(tt.path, func(t *testing.T) {
p := filepath.Join("testdata", tt.path)
if _, err := os.Stat(p); err != nil {
t.Skipf("%s not found", p)
}
kv, tensors := convertFull(t, p)
if kv.Architecture() != tt.arch {
t.Fatalf("expected llama, got %s", kv.Architecture())
}
if kv.FileType().String() != "F16" {
t.Fatalf("expected F16, got %s", kv.FileType())
}
if len(tensors) != tt.tensors {
t.Fatalf("expected %d tensors, got %d", tt.tensors, len(tensors))
}
layers := tensors.Layers()
if len(layers) != tt.layers {
t.Fatalf("expected %d layers, got %d", tt.layers, len(layers))
}
})
}
}

View file

@ -1,14 +1,11 @@
package convert
import (
"encoding/binary"
"fmt"
"io"
"log/slog"
"os"
"strings"
"github.com/d4l3k/go-bfloat16"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
@ -19,49 +16,27 @@ type GemmaModel struct {
ModelData
}
func gemmaLayerHandler(w io.Writer, r safetensorWriterTo, f *os.File) error {
slog.Debug(fmt.Sprintf("converting '%s'", r.t.Name))
data := make([]byte, r.end-r.start)
if err := binary.Read(f, r.bo, data); err != nil {
return err
}
tDataF32 := bfloat16.DecodeFloat32(data)
var err error
tDataF32, err = addOnes(tDataF32, int(r.t.Shape[0]))
if err != nil {
return err
}
if err := binary.Write(w, r.bo, tDataF32); err != nil {
return err
}
return nil
}
func addOnes(data []float32, vectorSize int) ([]float32, error) {
n := tensor.New(tensor.WithShape(vectorSize), tensor.WithBacking(data))
ones := tensor.Ones(tensor.Float32, vectorSize)
var err error
n, err = n.Add(ones)
n, err := n.Add(ones)
if err != nil {
return []float32{}, err
return nil, err
}
newN, err := native.SelectF32(n, 0)
ts, err := native.SelectF32(n, 0)
if err != nil {
return []float32{}, err
return nil, err
}
var fullTensor []float32
for _, v := range newN {
fullTensor = append(fullTensor, v...)
var f32s []float32
for _, t := range ts {
f32s = append(f32s, t...)
}
return fullTensor, nil
return f32s, nil
}
func (m *GemmaModel) GetTensors() error {
@ -71,12 +46,10 @@ func (m *GemmaModel) GetTensors() error {
}
slog.Debug(fmt.Sprintf("Total tensors: %d", len(t)))
m.Tensors = []llm.Tensor{}
for _, l := range t {
if strings.HasSuffix(l.Name, "norm.weight") {
wt := l.WriterTo.(safetensorWriterTo)
wt.handler = gemmaLayerHandler
wt.repacker = m.Repack
l.WriterTo = wt
}
m.Tensors = append(m.Tensors, l)
@ -94,6 +67,10 @@ func (m *GemmaModel) LoadVocab() error {
return nil
}
func (m *GemmaModel) Repack(_ string, data []float32, shape []uint64) ([]float32, error) {
return addOnes(data, int(shape[0]))
}
func (m *GemmaModel) WriteGGUF(ws io.WriteSeeker) error {
kv := llm.KV{
"general.architecture": "gemma",

View file

@ -1,17 +1,17 @@
package convert
import (
"encoding/binary"
"cmp"
"errors"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
"regexp"
"strings"
"github.com/nlpodyssey/gopickle/pytorch"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/x448/float16"
"github.com/ollama/ollama/llm"
)
@ -20,81 +20,12 @@ type LlamaModel struct {
ModelData
}
func llamaLayerHandler(w io.Writer, r torchWriterTo) error {
slog.Debug(fmt.Sprintf("repacking layer '%s'", r.t.Name))
data := r.storage.(*pytorch.HalfStorage).Data
tData := make([]uint16, len(data))
for cnt, v := range data {
tData[cnt] = uint16(float16.Fromfloat32(v))
}
var err error
var heads uint32
if strings.Contains(r.t.Name, "attn_q") {
heads = uint32(r.params.AttentionHeads)
} else if strings.Contains(r.t.Name, "attn_k") {
heads = uint32(r.params.KeyValHeads)
if heads == 0 {
heads = uint32(r.params.AttentionHeads)
}
} else {
return fmt.Errorf("unknown layer type")
}
slog.Debug(fmt.Sprintf("heads = %d", heads))
tData, err = llamaRepack(tData, int(heads), r.t.Shape)
if err != nil {
return err
}
if err = binary.Write(w, r.bo, tData); err != nil {
return err
}
return nil
}
func llamaRepack(data []uint16, heads int, shape []uint64) ([]uint16, error) {
n := tensor.New(tensor.WithShape(int(shape[0]), int(shape[1])), tensor.WithBacking(data))
origShape := n.Shape().Clone()
// reshape the tensor and swap axes 1 and 2 to unpack the layer for gguf
if err := n.Reshape(heads, 2, origShape[0]/heads/2, origShape[1]); err != nil {
return nil, err
}
if err := n.T(0, 2, 1, 3); err != nil {
return nil, err
}
if err := n.Reshape(origShape...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
newN, err := native.SelectU16(n, 1)
if err != nil {
return nil, err
}
var fullTensor []uint16
for _, v := range newN {
fullTensor = append(fullTensor, v...)
}
return fullTensor, nil
}
func (m *LlamaModel) GetTensors() error {
t, err := m.Format.GetTensors(m.Path, m.Params)
if err != nil {
return err
}
m.Tensors = []llm.Tensor{}
pattern := `^blk\.[0-9]+\.attn_(?P<layer>q|k)\.weight$`
re, err := regexp.Compile(pattern)
if err != nil {
@ -104,10 +35,16 @@ func (m *LlamaModel) GetTensors() error {
for _, l := range t {
matches := re.FindAllStringSubmatch(l.Name, -1)
if len(matches) > 0 {
slog.Debug(fmt.Sprintf("setting handler for: %s", l.Name))
switch m.Format.(type) {
case *TorchFormat:
wt := l.WriterTo.(torchWriterTo)
wt.handler = llamaLayerHandler
wt.repacker = m.Repack
l.WriterTo = wt
case *SafetensorFormat:
wt := l.WriterTo.(safetensorWriterTo)
wt.repacker = m.Repack
l.WriterTo = wt
}
}
m.Tensors = append(m.Tensors, l)
}
@ -115,19 +52,22 @@ func (m *LlamaModel) GetTensors() error {
return nil
}
func (m *LlamaModel) LoadVocab() error {
var v *Vocab
var err error
slog.Debug("loading vocab")
v, err = LoadSentencePieceTokens(m.Path, m.Params)
if err != nil {
func (m *LlamaModel) LoadVocab() (err error) {
pre, ts, merges, err := parseTokens(filepath.Join(m.Path, "tokenizer.json"))
if errors.Is(err, os.ErrNotExist) {
return nil
} else if err != nil {
return err
}
slog.Debug("vocab loaded")
m.Vocab = &Vocab{}
for _, t := range ts {
m.Vocab.Tokens = append(m.Vocab.Tokens, t.Content)
m.Vocab.Types = append(m.Vocab.Types, t.Type())
}
m.Vocab = v
m.Vocab.Merges = merges
m.Params.PreTokenizer = pre
return nil
}
@ -140,23 +80,79 @@ func (m *LlamaModel) WriteGGUF(ws io.WriteSeeker) error {
"llama.embedding_length": uint32(m.Params.HiddenSize),
"llama.block_count": uint32(m.Params.HiddenLayers),
"llama.feed_forward_length": uint32(m.Params.IntermediateSize),
"llama.rope.freq_base": float32(m.Params.RopeFrequencyBase),
"llama.rope.dimension_count": uint32(m.Params.HiddenSize / m.Params.AttentionHeads),
"llama.attention.head_count": uint32(m.Params.AttentionHeads),
"llama.attention.head_count_kv": uint32(m.Params.KeyValHeads),
"llama.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS),
"general.file_type": uint32(1),
"tokenizer.ggml.model": "llama",
"tokenizer.ggml.model": "gpt2",
"tokenizer.ggml.pre": m.Params.PreTokenizer,
"tokenizer.ggml.tokens": m.Vocab.Tokens,
"tokenizer.ggml.scores": m.Vocab.Scores,
"tokenizer.ggml.token_type": m.Vocab.Types,
"tokenizer.ggml.bos_token_id": uint32(m.Params.BoSTokenID),
"tokenizer.ggml.eos_token_id": uint32(m.Params.EoSTokenID),
"tokenizer.ggml.unknown_token_id": uint32(0),
"tokenizer.ggml.add_bos_token": true,
"tokenizer.ggml.add_eos_token": false,
}
if len(m.Vocab.Merges) > 0 {
kv["tokenizer.ggml.merges"] = m.Vocab.Merges
} else {
kv["tokenizer.ggml.scores"] = m.Vocab.Scores
}
return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
}
func (m *LlamaModel) Repack(name string, data []float32, shape []uint64) ([]float32, error) {
return llamaRepack(name, m.Params, data, shape)
}
func llamaRepack(name string, params *Params, data []float32, shape []uint64) ([]float32, error) {
var dims []int
for _, dim := range shape {
if dim != 0 {
dims = append(dims, int(dim))
}
}
var heads int
if strings.HasSuffix(name, "attn_q.weight") {
heads = params.AttentionHeads
} else if strings.HasSuffix(name, "attn_k.weight") {
heads = cmp.Or(params.KeyValHeads, params.AttentionHeads)
} else {
return nil, fmt.Errorf("unknown tensor name: %s", name)
}
n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))
if err := n.Reshape(append([]int{heads, 2, dims[0] / heads / 2}, dims[1:]...)...); err != nil {
return nil, err
}
if err := n.T(0, 2, 1, 3); err != nil {
return nil, err
}
if err := n.Reshape(dims...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
ts, err := native.SelectF32(n, 1)
if err != nil {
return nil, err
}
var f32s []float32
for _, t := range ts {
f32s = append(f32s, t...)
}
return f32s, nil
}

View file

@ -1,17 +1,8 @@
package convert
import (
"encoding/binary"
"fmt"
"io"
"os"
"regexp"
"strings"
"github.com/d4l3k/go-bfloat16"
"github.com/pdevine/tensor"
"github.com/pdevine/tensor/native"
"github.com/x448/float16"
"github.com/ollama/ollama/llm"
)
@ -20,90 +11,12 @@ type MistralModel struct {
ModelData
}
func mistralLayerHandler(w io.Writer, r safetensorWriterTo, f *os.File) error {
layerSize := r.end - r.start
var err error
tData := make([]uint16, layerSize/2)
if err = binary.Read(f, r.bo, tData); err != nil {
return err
}
var heads uint32
if strings.Contains(r.t.Name, "attn_q") {
heads = uint32(r.params.AttentionHeads)
} else if strings.Contains(r.t.Name, "attn_k") {
heads = uint32(r.params.KeyValHeads)
if heads == 0 {
heads = uint32(r.params.AttentionHeads)
}
} else {
return fmt.Errorf("unknown layer type")
}
tData, err = repack(tData, int(heads), r.t.Shape)
if err != nil {
return err
}
var buf []byte
for _, n := range tData {
buf = r.bo.AppendUint16(buf, n)
}
tempBuf := make([]uint16, len(tData))
tDataF32 := bfloat16.DecodeFloat32(buf)
for cnt, v := range tDataF32 {
tDataF16 := float16.Fromfloat32(v)
tempBuf[cnt] = uint16(tDataF16)
}
if err = binary.Write(w, r.bo, tempBuf); err != nil {
return err
}
return nil
}
func repack(data []uint16, heads int, shape []uint64) ([]uint16, error) {
n := tensor.New(tensor.WithShape(int(shape[0]), int(shape[1])), tensor.WithBacking(data))
origShape := n.Shape().Clone()
// reshape the tensor and swap axes 1 and 2 to unpack the layer for gguf
if err := n.Reshape(heads, 2, origShape[0]/heads/2, origShape[1]); err != nil {
return nil, err
}
if err := n.T(0, 2, 1, 3); err != nil {
return nil, err
}
if err := n.Reshape(origShape...); err != nil {
return nil, err
}
if err := n.Transpose(); err != nil {
return nil, err
}
newN, err := native.SelectU16(n, 1)
if err != nil {
return nil, err
}
var fullTensor []uint16
for _, v := range newN {
fullTensor = append(fullTensor, v...)
}
return fullTensor, nil
}
func (m *MistralModel) GetTensors() error {
t, err := m.Format.GetTensors(m.Path, m.Params)
if err != nil {
return err
}
m.Tensors = []llm.Tensor{}
pattern := `^blk\.[0-9]+\.attn_(?P<layer>q|k)\.weight$`
re, err := regexp.Compile(pattern)
if err != nil {
@ -114,7 +27,7 @@ func (m *MistralModel) GetTensors() error {
matches := re.FindAllStringSubmatch(l.Name, -1)
if len(matches) > 0 {
wt := l.WriterTo.(safetensorWriterTo)
wt.handler = mistralLayerHandler
wt.repacker = m.Repack
l.WriterTo = wt
}
m.Tensors = append(m.Tensors, l)
@ -160,3 +73,7 @@ func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error {
return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
}
func (m *MistralModel) Repack(name string, data []float32, shape []uint64) ([]float32, error) {
return llamaRepack(name, m.Params, data, shape)
}

View file

@ -17,8 +17,6 @@ func (m *MixtralModel) GetTensors() error {
return err
}
m.Tensors = []llm.Tensor{}
pattern := `^blk\.[0-9]+\.attn_(?P<layer>q|k)\.weight$`
re, err := regexp.Compile(pattern)
if err != nil {
@ -29,7 +27,7 @@ func (m *MixtralModel) GetTensors() error {
matches := re.FindAllStringSubmatch(l.Name, -1)
if len(matches) > 0 {
wt := l.WriterTo.(safetensorWriterTo)
wt.handler = mistralLayerHandler
wt.repacker = m.Repack
l.WriterTo = wt
}
m.Tensors = append(m.Tensors, l)
@ -83,3 +81,7 @@ func (m *MixtralModel) WriteGGUF(ws io.WriteSeeker) error {
return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
}
func (m *MixtralModel) Repack(name string, data []float32, shape []uint64) ([]float32, error) {
return llamaRepack(name, m.Params, data, shape)
}

View file

@ -11,6 +11,7 @@ import (
"path/filepath"
"regexp"
"slices"
"strings"
"github.com/d4l3k/go-bfloat16"
"github.com/mitchellh/mapstructure"
@ -26,9 +27,10 @@ type safetensorWriterTo struct {
bo ByteOrder
filename string
dtype string
start, end, padding uint64
handler func(w io.Writer, r safetensorWriterTo, f *os.File) error
repacker func(string, []float32, []uint64) ([]float32, error)
}
type tensorMetaData struct {
@ -97,6 +99,10 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
var tensors []llm.Tensor
for _, k := range keys {
if strings.HasSuffix(k, "self_attn.rotary_emb.inv_freq") {
continue
}
vals := parsed[k].(map[string]interface{})
var data tensorMetaData
if err = mapstructure.Decode(vals, &data); err != nil {
@ -131,6 +137,8 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
shape[i] = uint64(data.Shape[i])
}
slog.Debug(fmt.Sprintf("'%45s': '%30s' %10d [%#v]", k, ggufName, size, data.Shape))
t := llm.Tensor{
Name: ggufName,
Kind: kind,
@ -143,6 +151,7 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
params: params,
bo: params.ByteOrder,
filename: fn,
dtype: data.Type,
start: uint64(data.Offsets[0]),
end: uint64(data.Offsets[1]),
padding: 8 + jsonSize,
@ -228,52 +237,55 @@ func (r safetensorWriterTo) WriteTo(w io.Writer) (n int64, err error) {
return 0, err
}
// use the handler if one is present
if r.handler != nil {
return 0, r.handler(w, r, f)
var f32s []float32
switch r.dtype {
case "F32":
f32s = make([]float32, (r.end-r.start)/4)
if err = binary.Read(f, r.bo, f32s); err != nil {
return 0, err
}
remaining := r.end - r.start
bufSize := uint64(10240)
var finished bool
for {
data := make([]byte, min(bufSize, remaining))
b, err := io.ReadFull(f, data)
remaining -= uint64(b)
if err == io.EOF || remaining <= 0 {
finished = true
} else if err != nil {
case "F16":
bts := make([]uint16, (r.end-r.start)/2)
if err = binary.Read(f, r.bo, bts); err != nil {
return 0, err
}
// convert bfloat16 -> ieee float32
tDataF32 := bfloat16.DecodeFloat32(data)
for _, b := range bts {
f32s = append(f32s, float16.Frombits(b).Float32())
}
case "BF16":
bts := make([]byte, r.end-r.start)
if err = binary.Read(f, r.bo, bts); err != nil {
return 0, err
}
f32s = bfloat16.DecodeFloat32(bts)
default:
return 0, fmt.Errorf("unknown data type: %s", r.dtype)
}
if r.repacker != nil {
f32s, err = r.repacker(r.t.Name, f32s, r.t.Shape)
if err != nil {
return 0, err
}
}
switch r.t.Kind {
case 0:
if err := binary.Write(w, r.bo, tDataF32); err != nil {
return 0, err
}
return 0, binary.Write(w, r.bo, f32s)
case 1:
// convert float32 -> float16
tempBuf := make([]uint16, len(data)/2)
for cnt, v := range tDataF32 {
tDataF16 := float16.Fromfloat32(v)
tempBuf[cnt] = uint16(tDataF16)
f16s := make([]uint16, len(f32s))
for i := range f32s {
f16s[i] = float16.Fromfloat32(f32s[i]).Bits()
}
if err := binary.Write(w, r.bo, tempBuf); err != nil {
return 0, err
return 0, binary.Write(w, r.bo, f16s)
default:
return 0, fmt.Errorf("unknown storage type: %d", r.t.Kind)
}
}
if finished {
break
}
}
return 0, nil
}
func (m *SafetensorFormat) GetModelArch(name, dirPath string, params *Params) (ModelArch, error) {
switch len(params.Architectures) {
@ -281,6 +293,15 @@ func (m *SafetensorFormat) GetModelArch(name, dirPath string, params *Params) (M
return nil, fmt.Errorf("No architecture specified to convert")
case 1:
switch params.Architectures[0] {
case "LlamaForCausalLM":
return &LlamaModel{
ModelData{
Name: name,
Path: dirPath,
Params: params,
Format: m,
},
}, nil
case "MistralForCausalLM":
return &MistralModel{
ModelData{

109
convert/tokenizer.go Normal file
View file

@ -0,0 +1,109 @@
package convert
import (
"cmp"
"crypto/sha256"
"encoding/json"
"fmt"
"log/slog"
"os"
"slices"
"golang.org/x/exp/maps"
)
type Tokenizer struct {
Version string `json:"version"`
AddedTokens []Token `json:"added_tokens"`
Model TokenizerModel `json:"model"`
PreTokenizer struct {
PreTokenziers []struct {
Type string `json:"type"`
Pattern struct {
Regex string `json:"Regex"`
} `json:"pattern"`
} `json:"pretokenizers"`
} `json:"pre_tokenizer"`
}
type TokenizerModel struct {
Type string `json:"type"`
Vocab map[string]int `json:"vocab"`
Merges []string `json:"merges"`
Tokens []Token
}
type Token struct {
ID int `json:"id"`
Content string `json:"content"`
Special bool `json:"special"`
UserDefined bool
}
func (t *Token) Type() int32 {
switch {
case t.Special:
return tokenTypeControl
case t.UserDefined:
return tokenTypeUserDefined
default:
return tokenTypeNormal
}
}
func (t *Tokenizer) maxID() int {
return max(
slices.Max(maps.Values(t.Model.Vocab)),
slices.MaxFunc(t.AddedTokens, func(a, b Token) int {
return cmp.Compare(a.ID, b.ID)
}).ID,
)
}
func parseTokens(dirpath string) (pre string, tokens []Token, merges []string, err error) {
f, err := os.Open(dirpath)
if err != nil {
panic(err)
}
defer f.Close()
var t Tokenizer
if err := json.NewDecoder(f).Decode(&t); err != nil {
return "", nil, nil, err
}
tokens = make([]Token, t.maxID()+1)
for k, v := range t.Model.Vocab {
tokens[v] = Token{ID: v, Content: k, Special: false, UserDefined: false}
}
for _, v := range t.AddedTokens {
v.UserDefined = true
tokens[v.ID] = v
}
sha256sum := sha256.New()
for _, pt := range t.PreTokenizer.PreTokenziers {
switch pt.Type {
case "Split":
if pt.Pattern.Regex != "" {
sha256sum.Write([]byte(pt.Pattern.Regex))
}
}
}
switch digest := fmt.Sprintf("%x", sha256sum.Sum(nil)); digest {
case "d98f9631be1e9607a9848c26c1f9eac1aa9fc21ac6ba82a2fc0741af9780a48f":
pre = "llama-bpe"
case "03df5c5863ad70781dcfdef491ead25140f895fe8010964be0daefe27be32b02":
pre = "deepseek-llm"
case "21cde974d587f0d54dc8d56b183cc1e6239600172035c68fbd6d4b9f8da0576e":
pre = "deepseek-coder"
default:
slog.Warn("unknown pretokenizer, using default", "digest", digest)
pre = "default"
}
return pre, tokens, t.Model.Merges, nil
}

View file

@ -25,7 +25,7 @@ type torchWriterTo struct {
bo ByteOrder
storage pytorch.StorageInterface
handler func(w io.Writer, r torchWriterTo) error
repacker func(string, []float32, []uint64) ([]float32, error)
}
type TorchFormat struct{}
@ -33,14 +33,14 @@ type TorchFormat struct{}
func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor, error) {
slog.Debug("getting torch tensors")
files, err := filepath.Glob(filepath.Join(dirpath, "pytorch_model-*.bin"))
if err != nil {
slog.Error("didn't find any torch files")
return nil, err
var files []string
if pt, _ := filepath.Glob(filepath.Join(dirpath, "consolidated*.pth")); len(pt) > 0 {
files = append(files, pt...)
} else if pt, _ := filepath.Glob(filepath.Join(dirpath, "pytorch_model*.pth")); len(pt) > 0 {
files = append(files, pt...)
}
var offset uint64
var tensors []llm.Tensor
for _, fn := range files {
m, err := pytorch.Load(fn)
@ -77,7 +77,7 @@ func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor,
slog.Error(err.Error())
return nil, err
}
slog.Debug(fmt.Sprintf("finding name for '%s' -> '%s'", k.(string), ggufName))
slog.Debug(fmt.Sprintf("'%35s': '%30s' %10d [%#v]", k.(string), ggufName, size, tshape))
shape := []uint64{0, 0, 0, 0}
for i := range tshape {
@ -120,7 +120,7 @@ func getAltParams(dirpath string) (*Params, error) {
AttentionHeads int `json:"n_heads"`
KeyValHeads int `json:"n_kv_heads"`
HiddenLayers int `json:"n_layers"`
RopeTheta int `json:"rope_theta"`
RopeTheta float64 `json:"rope_theta"`
NormEPS float64 `json:"norm_eps"`
}
@ -133,6 +133,7 @@ func getAltParams(dirpath string) (*Params, error) {
}
params := &Params{
Architectures: []string{"LlamaForCausalLM"},
HiddenSize: tparams.HiddenSize,
AttentionHeads: tparams.AttentionHeads,
KeyValHeads: tparams.KeyValHeads,
@ -229,37 +230,38 @@ func (m *TorchFormat) GetLayerName(n string) (string, error) {
}
func (r torchWriterTo) WriteTo(w io.Writer) (n int64, err error) {
// use the handler if one is present
if r.handler != nil {
return 0, r.handler(w, r)
var f32s []float32
switch s := r.storage.(type) {
case *pytorch.FloatStorage:
f32s = s.Data
case *pytorch.HalfStorage:
f32s = s.Data
case *pytorch.BFloat16Storage:
f32s = s.Data
default:
return 0, fmt.Errorf("unknown data type: %T", s)
}
if r.repacker != nil {
f32s, err = r.repacker(r.t.Name, f32s, r.t.Shape)
if err != nil {
return 0, err
}
}
switch r.storage.(type) {
case *pytorch.FloatStorage:
slog.Warn(fmt.Sprintf("unexpected storage found for layer '%s'; skipping", r.t.Name))
return 0, nil
case *pytorch.HalfStorage:
switch r.t.Kind {
case 0:
data := r.storage.(*pytorch.HalfStorage).Data
slog.Debug(fmt.Sprintf("%35s F32 (%d)", r.t.Name, len(data)))
if err := binary.Write(w, r.bo, data); err != nil {
return 0, err
}
return 0, binary.Write(w, r.bo, f32s)
case 1:
data := r.storage.(*pytorch.HalfStorage).Data
tData := make([]uint16, len(data))
for cnt, v := range data {
tData[cnt] = uint16(float16.Fromfloat32(v))
}
slog.Debug(fmt.Sprintf("%35s F16 (%d)", r.t.Name, len(tData)))
if err := binary.Write(w, r.bo, tData); err != nil {
return 0, err
}
}
f16s := make([]uint16, len(f32s))
for i := range f32s {
f16s[i] = float16.Fromfloat32(f32s[i]).Bits()
}
return 0, nil
return 0, binary.Write(w, r.bo, f16s)
default:
return 0, fmt.Errorf("unknown storage type: %d", r.t.Kind)
}
}
func (m *TorchFormat) GetModelArch(name, dirPath string, params *Params) (ModelArch, error) {

2
go.mod
View file

@ -4,7 +4,6 @@ go 1.22.0
require (
github.com/containerd/console v1.0.3
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
github.com/emirpasic/gods v1.18.1
github.com/gin-gonic/gin v1.10.0
github.com/golang/protobuf v1.5.4 // indirect
@ -18,6 +17,7 @@ require (
)
require (
github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
github.com/mattn/go-runewidth v0.0.14
github.com/nlpodyssey/gopickle v0.3.0
github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c

View file

@ -62,16 +62,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
return model, nil
}
const (
_ uint32 = iota
GGUFTokenNormal
GGUFTokenUnknown
GGUFTokenControl
GGUFTokenUserDefined
GGUFTokenUnused
GGUFTokenByte
)
const (
ggufTypeUint8 uint32 = iota
ggufTypeInt8
@ -480,9 +470,11 @@ var ggufKVOrder = map[string][]string{
"gemma.attention.key_length",
"gemma.attention.value_length",
"general.file_type",
"tokenizer.ggml.pre",
"tokenizer.ggml.model",
"tokenizer.ggml.tokens",
"tokenizer.ggml.scores",
"tokenizer.ggml.merges",
"tokenizer.ggml.token_type",
"tokenizer.ggml.bos_token_id",
"tokenizer.ggml.eos_token_id",