diff --git a/cmd/cmd.go b/cmd/cmd.go
index f79f8b97..5d919d9a 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -208,7 +208,7 @@ func tempZipFiles(path string) (string, error) {
 		// pytorch files might also be unresolved git lfs references; skip if they are
 		// covers pytorch_model-x-of-y.bin, pytorch_model.fp32-x-of-y.bin, pytorch_model.bin
 		files = append(files, pt...)
-	} else if pt, _ := glob(filepath.Join(path, "consolidated*.pth"), "application/octet-stream"); len(pt) > 0 {
+	} else if pt, _ := glob(filepath.Join(path, "consolidated*.pth"), "application/zip"); len(pt) > 0 {
 		// pytorch files might also be unresolved git lfs references; skip if they are
 		// covers consolidated.x.pth, consolidated.pth
 		files = append(files, pt...)
diff --git a/convert/convert.go b/convert/convert.go
index f4210e50..e71a0ff3 100644
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -18,6 +18,16 @@ import (
 	"github.com/ollama/ollama/llm"
 )
 
+const (
+	_ int32 = iota
+	tokenTypeNormal
+	tokenTypeUnknown
+	tokenTypeControl
+	tokenTypeUserDefined
+	tokenTypeUnused
+	tokenTypeByte
+)
+
 type Params struct {
 	Architectures []string `json:"architectures"`
 	VocabSize     int      `json:"vocab_size"`
@@ -37,6 +47,8 @@ type Params struct {
 	Experts     int `json:"num_local_experts"`
 	ExpertsUsed int `json:"num_experts_per_tok"`
 
+	PreTokenizer string
+
 	ByteOrder
 }
 
@@ -74,10 +86,9 @@ func GetModelFormat(dirname string) (ModelFormat, error) {
 	}
 
 	for _, fn := range files {
-		slog.Debug(fmt.Sprintf("file = %s", fn))
 		if strings.HasSuffix(fn, ".safetensors") {
 			return &SafetensorFormat{}, nil
-		} else if strings.HasSuffix(fn, ".bin") {
+		} else if strings.HasSuffix(fn, ".bin") || strings.HasSuffix(fn, ".pth") {
 			slog.Debug("model is torch")
 			return &TorchFormat{}, nil
 		}
@@ -92,6 +103,7 @@ type Vocab struct {
 	Tokens []string
 	Scores []float32
 	Types  []int32
+	Merges []string
 }
 
 func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
@@ -170,7 +182,7 @@ func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
 		}
 		v.Tokens = append(v.Tokens, t.key)
 		v.Scores = append(v.Scores, -1000.0)
-		v.Types = append(v.Types, int32(llm.GGUFTokenUserDefined))
+		v.Types = append(v.Types, tokenTypeUserDefined)
 	}
 
 	slog.Info(fmt.Sprintf("vocab size w/ extra tokens: %d", len(v.Tokens)))
@@ -180,7 +192,7 @@ func LoadSentencePieceTokens(dirpath string, params *Params) (*Vocab, error) {
 		for cnt := 0; cnt < missingTokens; cnt++ {
 			v.Tokens = append(v.Tokens, fmt.Sprintf("<dummy%05d>", cnt+1))
 			v.Scores = append(v.Scores, -1)
-			v.Types = append(v.Types, int32(llm.GGUFTokenUserDefined))
+			v.Types = append(v.Types, tokenTypeUserDefined)
 		}
 	}
 
diff --git a/convert/convert_test.go b/convert/convert_test.go
new file mode 100644
index 00000000..6aa33a49
--- /dev/null
+++ b/convert/convert_test.go
@@ -0,0 +1,103 @@
+//go:build slow
+
+package convert
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/ollama/ollama/llm"
+)
+
+func convertFull(t *testing.T, p string) (llm.KV, llm.Tensors) {
+	t.Helper()
+
+	mf, err := GetModelFormat(p)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	params, err := mf.GetParams(p)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	arch, err := mf.GetModelArch("", p, params)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if err := arch.LoadVocab(); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := arch.GetTensors(); err != nil {
+		t.Fatal(err)
+	}
+
+	f, err := os.CreateTemp(t.TempDir(), "f16")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	if err := arch.WriteGGUF(f); err != nil {
+		t.Fatal(err)
+	}
+
+	r, err := os.Open(f.Name())
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer r.Close()
+
+	m, _, err := llm.DecodeGGML(r)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	return m.KV(), m.Tensors()
+}
+
+func TestConvertFull(t *testing.T) {
+	cases := []struct {
+		path    string
+		arch    string
+		tensors int
+		layers  int
+	}{
+		{"Meta-Llama-3-8B-Instruct", "llama", 291, 35},
+		{"Mistral-7B-Instruct-v0.2", "llama", 291, 35},
+		{"Mixtral-8x7B-Instruct-v0.1", "llama", 291, 35},
+		{"gemma-2b-it", "gemma", 164, 20},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.path, func(t *testing.T) {
+			p := filepath.Join("testdata", tt.path)
+			if _, err := os.Stat(p); err != nil {
+				t.Skipf("%s not found", p)
+			}
+
+			kv, tensors := convertFull(t, p)
+
+			if kv.Architecture() != tt.arch {
+				t.Fatalf("expected %s, got %s", tt.arch, kv.Architecture())
+			}
+
+			if kv.FileType().String() != "F16" {
+				t.Fatalf("expected F16, got %s", kv.FileType())
+			}
+
+			if len(tensors) != tt.tensors {
+				t.Fatalf("expected %d tensors, got %d", tt.tensors, len(tensors))
+			}
+
+			layers := tensors.Layers()
+			if len(layers) != tt.layers {
+				t.Fatalf("expected %d layers, got %d", tt.layers, len(layers))
+			}
+		})
+	}
+}
diff --git a/convert/gemma.go b/convert/gemma.go
index 88abe646..9dc406e0 100644
--- a/convert/gemma.go
+++ b/convert/gemma.go
@@ -1,14 +1,11 @@
 package convert
 
 import (
-	"encoding/binary"
 	"fmt"
 	"io"
 	"log/slog"
-	"os"
 	"strings"
 
-	"github.com/d4l3k/go-bfloat16"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 
@@ -19,49 +16,27 @@ type GemmaModel struct {
 	ModelData
 }
 
-func gemmaLayerHandler(w io.Writer, r safetensorWriterTo, f *os.File) error {
-	slog.Debug(fmt.Sprintf("converting '%s'", r.t.Name))
-
-	data := make([]byte, r.end-r.start)
-	if err := binary.Read(f, r.bo, data); err != nil {
-		return err
-	}
-
-	tDataF32 := bfloat16.DecodeFloat32(data)
-
-	var err error
-	tDataF32, err = addOnes(tDataF32, int(r.t.Shape[0]))
-	if err != nil {
-		return err
-	}
-
-	if err := binary.Write(w, r.bo, tDataF32); err != nil {
-		return err
-	}
-	return nil
-}
-
 func addOnes(data []float32, vectorSize int) ([]float32, error) {
 	n := tensor.New(tensor.WithShape(vectorSize), tensor.WithBacking(data))
 	ones := tensor.Ones(tensor.Float32, vectorSize)
 
-	var err error
-	n, err = n.Add(ones)
+	n, err := n.Add(ones)
 	if err != nil {
-		return []float32{}, err
+		return nil, err
 	}
 
-	newN, err := native.SelectF32(n, 0)
+	ts, err := native.SelectF32(n, 0)
 	if err != nil {
-		return []float32{}, err
+		return nil, err
 	}
 
-	var fullTensor []float32
-	for _, v := range newN {
-		fullTensor = append(fullTensor, v...)
+	var f32s []float32
+	for _, t := range ts {
+		f32s = append(f32s, t...)
} - return fullTensor, nil + + return f32s, nil } func (m *GemmaModel) GetTensors() error { @@ -71,12 +46,10 @@ func (m *GemmaModel) GetTensors() error { } slog.Debug(fmt.Sprintf("Total tensors: %d", len(t))) - - m.Tensors = []llm.Tensor{} for _, l := range t { if strings.HasSuffix(l.Name, "norm.weight") { wt := l.WriterTo.(safetensorWriterTo) - wt.handler = gemmaLayerHandler + wt.repacker = m.Repack l.WriterTo = wt } m.Tensors = append(m.Tensors, l) @@ -94,6 +67,10 @@ func (m *GemmaModel) LoadVocab() error { return nil } +func (m *GemmaModel) Repack(_ string, data []float32, shape []uint64) ([]float32, error) { + return addOnes(data, int(shape[0])) +} + func (m *GemmaModel) WriteGGUF(ws io.WriteSeeker) error { kv := llm.KV{ "general.architecture": "gemma", diff --git a/convert/llama.go b/convert/llama.go index fb576e2e..7853c4cf 100644 --- a/convert/llama.go +++ b/convert/llama.go @@ -1,17 +1,17 @@ package convert import ( - "encoding/binary" + "cmp" + "errors" "fmt" "io" - "log/slog" + "os" + "path/filepath" "regexp" "strings" - "github.com/nlpodyssey/gopickle/pytorch" "github.com/pdevine/tensor" "github.com/pdevine/tensor/native" - "github.com/x448/float16" "github.com/ollama/ollama/llm" ) @@ -20,81 +20,12 @@ type LlamaModel struct { ModelData } -func llamaLayerHandler(w io.Writer, r torchWriterTo) error { - slog.Debug(fmt.Sprintf("repacking layer '%s'", r.t.Name)) - - data := r.storage.(*pytorch.HalfStorage).Data - tData := make([]uint16, len(data)) - for cnt, v := range data { - tData[cnt] = uint16(float16.Fromfloat32(v)) - } - - var err error - var heads uint32 - if strings.Contains(r.t.Name, "attn_q") { - heads = uint32(r.params.AttentionHeads) - } else if strings.Contains(r.t.Name, "attn_k") { - heads = uint32(r.params.KeyValHeads) - if heads == 0 { - heads = uint32(r.params.AttentionHeads) - } - } else { - return fmt.Errorf("unknown layer type") - } - - slog.Debug(fmt.Sprintf("heads = %d", heads)) - - tData, err = llamaRepack(tData, int(heads), r.t.Shape) - if err != nil { - return err - } - - if err = binary.Write(w, r.bo, tData); err != nil { - return err - } - return nil -} - -func llamaRepack(data []uint16, heads int, shape []uint64) ([]uint16, error) { - n := tensor.New(tensor.WithShape(int(shape[0]), int(shape[1])), tensor.WithBacking(data)) - origShape := n.Shape().Clone() - - // reshape the tensor and swap axes 1 and 2 to unpack the layer for gguf - if err := n.Reshape(heads, 2, origShape[0]/heads/2, origShape[1]); err != nil { - return nil, err - } - - if err := n.T(0, 2, 1, 3); err != nil { - return nil, err - } - - if err := n.Reshape(origShape...); err != nil { - return nil, err - } - - if err := n.Transpose(); err != nil { - return nil, err - } - newN, err := native.SelectU16(n, 1) - if err != nil { - return nil, err - } - - var fullTensor []uint16 - for _, v := range newN { - fullTensor = append(fullTensor, v...) 
-	}
-	return fullTensor, nil
-}
-
 func (m *LlamaModel) GetTensors() error {
 	t, err := m.Format.GetTensors(m.Path, m.Params)
 	if err != nil {
 		return err
 	}
 
-	m.Tensors = []llm.Tensor{}
-
 	pattern := `^blk\.[0-9]+\.attn_(?P<var>q|k)\.weight$`
 	re, err := regexp.Compile(pattern)
 	if err != nil {
@@ -104,10 +35,16 @@ func (m *LlamaModel) GetTensors() error {
 	for _, l := range t {
 		matches := re.FindAllStringSubmatch(l.Name, -1)
 		if len(matches) > 0 {
-			slog.Debug(fmt.Sprintf("setting handler for: %s", l.Name))
-			wt := l.WriterTo.(torchWriterTo)
-			wt.handler = llamaLayerHandler
-			l.WriterTo = wt
+			switch m.Format.(type) {
+			case *TorchFormat:
+				wt := l.WriterTo.(torchWriterTo)
+				wt.repacker = m.Repack
+				l.WriterTo = wt
+			case *SafetensorFormat:
+				wt := l.WriterTo.(safetensorWriterTo)
+				wt.repacker = m.Repack
+				l.WriterTo = wt
+			}
 		}
 		m.Tensors = append(m.Tensors, l)
 	}
@@ -115,19 +52,22 @@ func (m *LlamaModel) GetTensors() error {
 	return nil
 }
 
-func (m *LlamaModel) LoadVocab() error {
-	var v *Vocab
-	var err error
-
-	slog.Debug("loading vocab")
-	v, err = LoadSentencePieceTokens(m.Path, m.Params)
-	if err != nil {
+func (m *LlamaModel) LoadVocab() (err error) {
+	pre, ts, merges, err := parseTokens(filepath.Join(m.Path, "tokenizer.json"))
+	if errors.Is(err, os.ErrNotExist) {
+		return nil
+	} else if err != nil {
 		return err
 	}
 
-	slog.Debug("vocab loaded")
+	m.Vocab = &Vocab{}
+	for _, t := range ts {
+		m.Vocab.Tokens = append(m.Vocab.Tokens, t.Content)
+		m.Vocab.Types = append(m.Vocab.Types, t.Type())
+	}
 
-	m.Vocab = v
+	m.Vocab.Merges = merges
+	m.Params.PreTokenizer = pre
 
 	return nil
 }
@@ -140,23 +80,79 @@ func (m *LlamaModel) WriteGGUF(ws io.WriteSeeker) error {
 		"llama.embedding_length":                 uint32(m.Params.HiddenSize),
 		"llama.block_count":                      uint32(m.Params.HiddenLayers),
 		"llama.feed_forward_length":              uint32(m.Params.IntermediateSize),
+		"llama.rope.freq_base":                   float32(m.Params.RopeFrequencyBase),
 		"llama.rope.dimension_count":             uint32(m.Params.HiddenSize / m.Params.AttentionHeads),
 		"llama.attention.head_count":             uint32(m.Params.AttentionHeads),
 		"llama.attention.head_count_kv":          uint32(m.Params.KeyValHeads),
 		"llama.attention.layer_norm_rms_epsilon": float32(m.Params.NormEPS),
 		"general.file_type":                      uint32(1),
-		"tokenizer.ggml.model":                   "llama",
+		"tokenizer.ggml.model":                   "gpt2",
+		"tokenizer.ggml.pre":                     m.Params.PreTokenizer,
 
 		"tokenizer.ggml.tokens":     m.Vocab.Tokens,
-		"tokenizer.ggml.scores":     m.Vocab.Scores,
 		"tokenizer.ggml.token_type": m.Vocab.Types,
 
 		"tokenizer.ggml.bos_token_id":     uint32(m.Params.BoSTokenID),
 		"tokenizer.ggml.eos_token_id":     uint32(m.Params.EoSTokenID),
 		"tokenizer.ggml.unknown_token_id": uint32(0),
-		"tokenizer.ggml.add_bos_token":    true,
-		"tokenizer.ggml.add_eos_token":    false,
+	}
+
+	if len(m.Vocab.Merges) > 0 {
+		kv["tokenizer.ggml.merges"] = m.Vocab.Merges
+	} else {
+		kv["tokenizer.ggml.scores"] = m.Vocab.Scores
 	}
 
 	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
 }
+
+func (m *LlamaModel) Repack(name string, data []float32, shape []uint64) ([]float32, error) {
+	return llamaRepack(name, m.Params, data, shape)
+}
+
+func llamaRepack(name string, params *Params, data []float32, shape []uint64) ([]float32, error) {
+	var dims []int
+	for _, dim := range shape {
+		if dim != 0 {
+			dims = append(dims, int(dim))
+		}
+	}
+
+	var heads int
+	if strings.HasSuffix(name, "attn_q.weight") {
+		heads = params.AttentionHeads
+	} else if strings.HasSuffix(name, "attn_k.weight") {
+		heads = cmp.Or(params.KeyValHeads, params.AttentionHeads)
+	} else {
+		return nil, fmt.Errorf("unknown tensor name: %s",
name) + } + + n := tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data)) + if err := n.Reshape(append([]int{heads, 2, dims[0] / heads / 2}, dims[1:]...)...); err != nil { + return nil, err + } + + if err := n.T(0, 2, 1, 3); err != nil { + return nil, err + } + + if err := n.Reshape(dims...); err != nil { + return nil, err + } + + if err := n.Transpose(); err != nil { + return nil, err + } + + ts, err := native.SelectF32(n, 1) + if err != nil { + return nil, err + } + + var f32s []float32 + for _, t := range ts { + f32s = append(f32s, t...) + } + + return f32s, nil +} diff --git a/convert/mistral.go b/convert/mistral.go index f88de12b..da6874cf 100644 --- a/convert/mistral.go +++ b/convert/mistral.go @@ -1,17 +1,8 @@ package convert import ( - "encoding/binary" - "fmt" "io" - "os" "regexp" - "strings" - - "github.com/d4l3k/go-bfloat16" - "github.com/pdevine/tensor" - "github.com/pdevine/tensor/native" - "github.com/x448/float16" "github.com/ollama/ollama/llm" ) @@ -20,90 +11,12 @@ type MistralModel struct { ModelData } -func mistralLayerHandler(w io.Writer, r safetensorWriterTo, f *os.File) error { - layerSize := r.end - r.start - - var err error - tData := make([]uint16, layerSize/2) - if err = binary.Read(f, r.bo, tData); err != nil { - return err - } - - var heads uint32 - if strings.Contains(r.t.Name, "attn_q") { - heads = uint32(r.params.AttentionHeads) - } else if strings.Contains(r.t.Name, "attn_k") { - heads = uint32(r.params.KeyValHeads) - if heads == 0 { - heads = uint32(r.params.AttentionHeads) - } - } else { - return fmt.Errorf("unknown layer type") - } - - tData, err = repack(tData, int(heads), r.t.Shape) - if err != nil { - return err - } - - var buf []byte - for _, n := range tData { - buf = r.bo.AppendUint16(buf, n) - } - - tempBuf := make([]uint16, len(tData)) - tDataF32 := bfloat16.DecodeFloat32(buf) - for cnt, v := range tDataF32 { - tDataF16 := float16.Fromfloat32(v) - tempBuf[cnt] = uint16(tDataF16) - } - - if err = binary.Write(w, r.bo, tempBuf); err != nil { - return err - } - return nil -} - -func repack(data []uint16, heads int, shape []uint64) ([]uint16, error) { - n := tensor.New(tensor.WithShape(int(shape[0]), int(shape[1])), tensor.WithBacking(data)) - origShape := n.Shape().Clone() - - // reshape the tensor and swap axes 1 and 2 to unpack the layer for gguf - if err := n.Reshape(heads, 2, origShape[0]/heads/2, origShape[1]); err != nil { - return nil, err - } - - if err := n.T(0, 2, 1, 3); err != nil { - return nil, err - } - - if err := n.Reshape(origShape...); err != nil { - return nil, err - } - - if err := n.Transpose(); err != nil { - return nil, err - } - newN, err := native.SelectU16(n, 1) - if err != nil { - return nil, err - } - - var fullTensor []uint16 - for _, v := range newN { - fullTensor = append(fullTensor, v...) 
-	}
-	return fullTensor, nil
-}
-
 func (m *MistralModel) GetTensors() error {
 	t, err := m.Format.GetTensors(m.Path, m.Params)
 	if err != nil {
 		return err
 	}
 
-	m.Tensors = []llm.Tensor{}
-
 	pattern := `^blk\.[0-9]+\.attn_(?P<var>q|k)\.weight$`
 	re, err := regexp.Compile(pattern)
 	if err != nil {
@@ -114,7 +27,7 @@ func (m *MistralModel) GetTensors() error {
 		matches := re.FindAllStringSubmatch(l.Name, -1)
 		if len(matches) > 0 {
 			wt := l.WriterTo.(safetensorWriterTo)
-			wt.handler = mistralLayerHandler
+			wt.repacker = m.Repack
 			l.WriterTo = wt
 		}
 		m.Tensors = append(m.Tensors, l)
@@ -160,3 +73,7 @@ func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error {
 
 	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
 }
+
+func (m *MistralModel) Repack(name string, data []float32, shape []uint64) ([]float32, error) {
+	return llamaRepack(name, m.Params, data, shape)
+}
diff --git a/convert/mixtral.go b/convert/mixtral.go
index 940df55d..baea68cd 100644
--- a/convert/mixtral.go
+++ b/convert/mixtral.go
@@ -17,8 +17,6 @@ func (m *MixtralModel) GetTensors() error {
 		return err
 	}
 
-	m.Tensors = []llm.Tensor{}
-
 	pattern := `^blk\.[0-9]+\.attn_(?P<var>q|k)\.weight$`
 	re, err := regexp.Compile(pattern)
 	if err != nil {
@@ -29,7 +27,7 @@ func (m *MixtralModel) GetTensors() error {
 		matches := re.FindAllStringSubmatch(l.Name, -1)
 		if len(matches) > 0 {
 			wt := l.WriterTo.(safetensorWriterTo)
-			wt.handler = mistralLayerHandler
+			wt.repacker = m.Repack
 			l.WriterTo = wt
 		}
 		m.Tensors = append(m.Tensors, l)
@@ -83,3 +81,7 @@ func (m *MixtralModel) WriteGGUF(ws io.WriteSeeker) error {
 
 	return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
 }
+
+func (m *MixtralModel) Repack(name string, data []float32, shape []uint64) ([]float32, error) {
+	return llamaRepack(name, m.Params, data, shape)
+}
diff --git a/convert/safetensors.go b/convert/safetensors.go
index 69424c4d..9de9a002 100644
--- a/convert/safetensors.go
+++ b/convert/safetensors.go
@@ -11,6 +11,7 @@ import (
 	"path/filepath"
 	"regexp"
 	"slices"
+	"strings"
 
 	"github.com/d4l3k/go-bfloat16"
 	"github.com/mitchellh/mapstructure"
@@ -26,9 +27,10 @@ type safetensorWriterTo struct {
 	bo     ByteOrder
 
 	filename string
+	dtype    string
 
 	start, end, padding uint64
-	handler             func(w io.Writer, r safetensorWriterTo, f *os.File) error
+	repacker            func(string, []float32, []uint64) ([]float32, error)
 }
 
 type tensorMetaData struct {
@@ -97,6 +99,10 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
 	var tensors []llm.Tensor
 	for _, k := range keys {
+		if strings.HasSuffix(k, "self_attn.rotary_emb.inv_freq") {
+			continue
+		}
+
 		vals := parsed[k].(map[string]interface{})
 		var data tensorMetaData
 		if err = mapstructure.Decode(vals, &data); err != nil {
@@ -131,6 +137,8 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
 			shape[i] = uint64(data.Shape[i])
 		}
 
+		slog.Debug(fmt.Sprintf("'%45s': '%30s' %10d [%#v]", k, ggufName, size, data.Shape))
+
 		t := llm.Tensor{
 			Name: ggufName,
 			Kind: kind,
@@ -143,6 +151,7 @@ func (m *SafetensorFormat) readTensors(fn string, offset uint64, params *Params)
 			params:   params,
 			bo:       params.ByteOrder,
 			filename: fn,
+			dtype:    data.Type,
 			start:    uint64(data.Offsets[0]),
 			end:      uint64(data.Offsets[1]),
 			padding:  8 + jsonSize,
@@ -228,51 +237,54 @@ func (r safetensorWriterTo) WriteTo(w io.Writer) (n int64, err error) {
 		return 0, err
 	}
 
-	// use the handler if one is present
-	if r.handler != nil {
-		return 0, r.handler(w, r, f)
-	}
-
-	remaining := r.end - r.start
-
-	bufSize := uint64(10240)
-	var finished bool
-	for {
-		data := make([]byte, min(bufSize, remaining))
-
-		b, err := io.ReadFull(f, data)
-		remaining -= uint64(b)
-
-		if err == io.EOF || remaining <= 0 {
-			finished = true
-		} else if err != nil {
+	var f32s []float32
+	switch r.dtype {
+	case "F32":
+		f32s = make([]float32, (r.end-r.start)/4)
+		if err = binary.Read(f, r.bo, f32s); err != nil {
+			return 0, err
+		}
+	case "F16":
+		bts := make([]uint16, (r.end-r.start)/2)
+		if err = binary.Read(f, r.bo, bts); err != nil {
 			return 0, err
 		}
 
-		// convert bfloat16 -> ieee float32
-		tDataF32 := bfloat16.DecodeFloat32(data)
-
-		switch r.t.Kind {
-		case 0:
-			if err := binary.Write(w, r.bo, tDataF32); err != nil {
-				return 0, err
-			}
-		case 1:
-			// convert float32 -> float16
-			tempBuf := make([]uint16, len(data)/2)
-			for cnt, v := range tDataF32 {
-				tDataF16 := float16.Fromfloat32(v)
-				tempBuf[cnt] = uint16(tDataF16)
-			}
-			if err := binary.Write(w, r.bo, tempBuf); err != nil {
-				return 0, err
-			}
+		for _, b := range bts {
+			f32s = append(f32s, float16.Frombits(b).Float32())
 		}
-		if finished {
-			break
+
+	case "BF16":
+		bts := make([]byte, r.end-r.start)
+		if err = binary.Read(f, r.bo, bts); err != nil {
+			return 0, err
+		}
+
+		f32s = bfloat16.DecodeFloat32(bts)
+	default:
+		return 0, fmt.Errorf("unknown data type: %s", r.dtype)
+	}
+
+	if r.repacker != nil {
+		f32s, err = r.repacker(r.t.Name, f32s, r.t.Shape)
+		if err != nil {
+			return 0, err
 		}
 	}
-	return 0, nil
+
+	switch r.t.Kind {
+	case 0:
+		return 0, binary.Write(w, r.bo, f32s)
+	case 1:
+		f16s := make([]uint16, len(f32s))
+		for i := range f32s {
+			f16s[i] = float16.Fromfloat32(f32s[i]).Bits()
+		}
+
+		return 0, binary.Write(w, r.bo, f16s)
+	default:
+		return 0, fmt.Errorf("unknown storage type: %d", r.t.Kind)
+	}
 }
 
 func (m *SafetensorFormat) GetModelArch(name, dirPath string, params *Params) (ModelArch, error) {
@@ -281,6 +293,15 @@ func (m *SafetensorFormat) GetModelArch(name, dirPath string, params *Params) (M
 		return nil, fmt.Errorf("No architecture specified to convert")
 	case 1:
 		switch params.Architectures[0] {
+		case "LlamaForCausalLM":
+			return &LlamaModel{
+				ModelData{
+					Name:   name,
+					Path:   dirPath,
+					Params: params,
+					Format: m,
+				},
+			}, nil
 		case "MistralForCausalLM":
 			return &MistralModel{
 				ModelData{
diff --git a/convert/tokenizer.go b/convert/tokenizer.go
new file mode 100644
index 00000000..e0fe0bb7
--- /dev/null
+++ b/convert/tokenizer.go
@@ -0,0 +1,109 @@
+package convert
+
+import (
+	"cmp"
+	"crypto/sha256"
+	"encoding/json"
+	"fmt"
+	"log/slog"
+	"os"
+	"slices"
+
+	"golang.org/x/exp/maps"
+)
+
+type Tokenizer struct {
+	Version     string         `json:"version"`
+	AddedTokens []Token        `json:"added_tokens"`
+	Model       TokenizerModel `json:"model"`
+
+	PreTokenizer struct {
+		PreTokenizers []struct {
+			Type    string `json:"type"`
+			Pattern struct {
+				Regex string `json:"Regex"`
+			} `json:"pattern"`
+		} `json:"pretokenizers"`
+	} `json:"pre_tokenizer"`
+}
+
+type TokenizerModel struct {
+	Type   string         `json:"type"`
+	Vocab  map[string]int `json:"vocab"`
+	Merges []string       `json:"merges"`
+	Tokens []Token
+}
+
+type Token struct {
+	ID          int    `json:"id"`
+	Content     string `json:"content"`
+	Special     bool   `json:"special"`
+	UserDefined bool
+}
+
+func (t *Token) Type() int32 {
+	switch {
+	case t.Special:
+		return tokenTypeControl
+	case t.UserDefined:
+		return tokenTypeUserDefined
+	default:
+		return tokenTypeNormal
+	}
+}
+
+func (t *Tokenizer) maxID() int {
+	return max(
+		slices.Max(maps.Values(t.Model.Vocab)),
+		slices.MaxFunc(t.AddedTokens, func(a, b Token) int {
+			return cmp.Compare(a.ID, b.ID)
+		}).ID,
+	)
+}
+
+func parseTokens(dirpath string) (pre string, tokens []Token, merges []string, err error) {
+	f, err := os.Open(dirpath)
+	if err != nil {
+		return "", nil, nil, err
+	}
+	defer f.Close()
+
+	var t Tokenizer
+	if err := json.NewDecoder(f).Decode(&t); err != nil {
+		return "", nil, nil, err
+	}
+
+	tokens = make([]Token, t.maxID()+1)
+	for k, v := range t.Model.Vocab {
+		tokens[v] = Token{ID: v, Content: k, Special: false, UserDefined: false}
+	}
+
+	for _, v := range t.AddedTokens {
+		v.UserDefined = true
+		tokens[v.ID] = v
+	}
+
+	sha256sum := sha256.New()
+	for _, pt := range t.PreTokenizer.PreTokenizers {
+		switch pt.Type {
+		case "Split":
+			if pt.Pattern.Regex != "" {
+				sha256sum.Write([]byte(pt.Pattern.Regex))
+			}
+		}
+	}
+
+	switch digest := fmt.Sprintf("%x", sha256sum.Sum(nil)); digest {
+	case "d98f9631be1e9607a9848c26c1f9eac1aa9fc21ac6ba82a2fc0741af9780a48f":
+		pre = "llama-bpe"
+	case "03df5c5863ad70781dcfdef491ead25140f895fe8010964be0daefe27be32b02":
+		pre = "deepseek-llm"
+	case "21cde974d587f0d54dc8d56b183cc1e6239600172035c68fbd6d4b9f8da0576e":
+		pre = "deepseek-coder"
+	default:
+		slog.Warn("unknown pretokenizer, using default", "digest", digest)
+		pre = "default"
+	}
+
+	return pre, tokens, t.Model.Merges, nil
+}
diff --git a/convert/torch.go b/convert/torch.go
index 92c58872..b7ae0f76 100644
--- a/convert/torch.go
+++ b/convert/torch.go
@@ -24,8 +24,8 @@ type torchWriterTo struct {
 	params *Params
 	bo     ByteOrder
 
-	storage pytorch.StorageInterface
-	handler func(w io.Writer, r torchWriterTo) error
+	storage  pytorch.StorageInterface
+	repacker func(string, []float32, []uint64) ([]float32, error)
 }
 
 type TorchFormat struct{}
@@ -33,14 +33,14 @@ type TorchFormat struct{}
 func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor, error) {
 	slog.Debug("getting torch tensors")
 
-	files, err := filepath.Glob(filepath.Join(dirpath, "pytorch_model-*.bin"))
-	if err != nil {
-		slog.Error("didn't find any torch files")
-		return nil, err
+	var files []string
+	if pt, _ := filepath.Glob(filepath.Join(dirpath, "consolidated*.pth")); len(pt) > 0 {
+		files = append(files, pt...)
+	} else if pt, _ := filepath.Glob(filepath.Join(dirpath, "pytorch_model*.bin")); len(pt) > 0 {
+		files = append(files, pt...)
} var offset uint64 - var tensors []llm.Tensor for _, fn := range files { m, err := pytorch.Load(fn) @@ -77,7 +77,7 @@ func (tf *TorchFormat) GetTensors(dirpath string, params *Params) ([]llm.Tensor, slog.Error(err.Error()) return nil, err } - slog.Debug(fmt.Sprintf("finding name for '%s' -> '%s'", k.(string), ggufName)) + slog.Debug(fmt.Sprintf("'%35s': '%30s' %10d [%#v]", k.(string), ggufName, size, tshape)) shape := []uint64{0, 0, 0, 0} for i := range tshape { @@ -120,7 +120,7 @@ func getAltParams(dirpath string) (*Params, error) { AttentionHeads int `json:"n_heads"` KeyValHeads int `json:"n_kv_heads"` HiddenLayers int `json:"n_layers"` - RopeTheta int `json:"rope_theta"` + RopeTheta float64 `json:"rope_theta"` NormEPS float64 `json:"norm_eps"` } @@ -133,6 +133,7 @@ func getAltParams(dirpath string) (*Params, error) { } params := &Params{ + Architectures: []string{"LlamaForCausalLM"}, HiddenSize: tparams.HiddenSize, AttentionHeads: tparams.AttentionHeads, KeyValHeads: tparams.KeyValHeads, @@ -229,37 +230,38 @@ func (m *TorchFormat) GetLayerName(n string) (string, error) { } func (r torchWriterTo) WriteTo(w io.Writer) (n int64, err error) { - // use the handler if one is present - if r.handler != nil { - return 0, r.handler(w, r) + var f32s []float32 + switch s := r.storage.(type) { + case *pytorch.FloatStorage: + f32s = s.Data + case *pytorch.HalfStorage: + f32s = s.Data + case *pytorch.BFloat16Storage: + f32s = s.Data + default: + return 0, fmt.Errorf("unknown data type: %T", s) } - switch r.storage.(type) { - case *pytorch.FloatStorage: - slog.Warn(fmt.Sprintf("unexpected storage found for layer '%s'; skipping", r.t.Name)) - return 0, nil - case *pytorch.HalfStorage: - switch r.t.Kind { - case 0: - data := r.storage.(*pytorch.HalfStorage).Data - slog.Debug(fmt.Sprintf("%35s F32 (%d)", r.t.Name, len(data))) - if err := binary.Write(w, r.bo, data); err != nil { - return 0, err - } - case 1: - data := r.storage.(*pytorch.HalfStorage).Data - tData := make([]uint16, len(data)) - for cnt, v := range data { - tData[cnt] = uint16(float16.Fromfloat32(v)) - } - slog.Debug(fmt.Sprintf("%35s F16 (%d)", r.t.Name, len(tData))) - if err := binary.Write(w, r.bo, tData); err != nil { - return 0, err - } + if r.repacker != nil { + f32s, err = r.repacker(r.t.Name, f32s, r.t.Shape) + if err != nil { + return 0, err } } - return 0, nil + switch r.t.Kind { + case 0: + return 0, binary.Write(w, r.bo, f32s) + case 1: + f16s := make([]uint16, len(f32s)) + for i := range f32s { + f16s[i] = float16.Fromfloat32(f32s[i]).Bits() + } + + return 0, binary.Write(w, r.bo, f16s) + default: + return 0, fmt.Errorf("unknown storage type: %d", r.t.Kind) + } } func (m *TorchFormat) GetModelArch(name, dirPath string, params *Params) (ModelArch, error) { diff --git a/go.mod b/go.mod index 5d0d3c33..255c8a04 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,6 @@ go 1.22.0 require ( github.com/containerd/console v1.0.3 - github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 github.com/emirpasic/gods v1.18.1 github.com/gin-gonic/gin v1.10.0 github.com/golang/protobuf v1.5.4 // indirect @@ -18,6 +17,7 @@ require ( ) require ( + github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1 github.com/mattn/go-runewidth v0.0.14 github.com/nlpodyssey/gopickle v0.3.0 github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c diff --git a/llm/gguf.go b/llm/gguf.go index 5f6e8004..eb7d7b75 100644 --- a/llm/gguf.go +++ b/llm/gguf.go @@ -62,16 +62,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) { return 
model, nil } -const ( - _ uint32 = iota - GGUFTokenNormal - GGUFTokenUnknown - GGUFTokenControl - GGUFTokenUserDefined - GGUFTokenUnused - GGUFTokenByte -) - const ( ggufTypeUint8 uint32 = iota ggufTypeInt8 @@ -480,9 +470,11 @@ var ggufKVOrder = map[string][]string{ "gemma.attention.key_length", "gemma.attention.value_length", "general.file_type", + "tokenizer.ggml.pre", "tokenizer.ggml.model", "tokenizer.ggml.tokens", "tokenizer.ggml.scores", + "tokenizer.ggml.merges", "tokenizer.ggml.token_type", "tokenizer.ggml.bos_token_id", "tokenizer.ggml.eos_token_id",
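
Note on the new repack path above: regardless of the source dtype (F32, F16, BF16, or the torch storage types), tensor data is first decoded to []float32, optionally passed through the per-tensor repacker, and only then encoded as F32 or F16. For attn_q/attn_k weights, llamaRepack splits each head's rows into two interleaved halves and swaps them (reshape to [heads, 2, rows/heads/2, cols], transpose axes 1 and 2, flatten back). The standalone sketch below replays that permutation with toy dimensions; the sizes, values, and use of a throwaway main are illustrative assumptions only, not taken from a real model:

package main

import (
	"fmt"

	"github.com/pdevine/tensor"
	"github.com/pdevine/tensor/native"
)

func main() {
	// Toy dimensions: 2 heads over an 8x3 weight matrix. As in llamaRepack,
	// the row count must be divisible by heads*2.
	heads, rows, cols := 2, 8, 3
	data := make([]float32, rows*cols)
	for i := range data {
		data[i] = float32(i)
	}

	// Same call sequence as llamaRepack: split the rows into
	// [heads, 2, rows/heads/2], swap the two middle axes, then flatten
	// back to the original shape.
	n := tensor.New(tensor.WithShape(rows, cols), tensor.WithBacking(data))
	if err := n.Reshape(heads, 2, rows/heads/2, cols); err != nil {
		panic(err)
	}
	if err := n.T(0, 2, 1, 3); err != nil {
		panic(err)
	}
	if err := n.Reshape(rows, cols); err != nil {
		panic(err)
	}
	if err := n.Transpose(); err != nil {
		panic(err)
	}

	// Flatten the result the same way the converter does.
	ts, err := native.SelectF32(n, 1)
	if err != nil {
		panic(err)
	}

	var f32s []float32
	for _, t := range ts {
		f32s = append(f32s, t...)
	}

	fmt.Println(f32s)
}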
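
Similarly, the pre-tokenizer detection in convert/tokenizer.go works by hashing the Split regexes from tokenizer.json in order and comparing the SHA-256 digest against a table of known configurations. A minimal sketch follows; the case digests are copied from the diff above, while the input regex is a hypothetical stand-in that falls through to the default branch:

package main

import (
	"crypto/sha256"
	"fmt"
)

func main() {
	// Stand-in for the "Split" pattern regexes collected from tokenizer.json.
	splitRegexes := []string{`\p{L}+|\p{N}+`}

	// Hash the regexes in order, as parseTokens does.
	h := sha256.New()
	for _, re := range splitRegexes {
		h.Write([]byte(re))
	}

	// Map the digest to a known pre-tokenizer, falling back to "default"
	// (the real code also logs a warning in that case).
	pre := "default"
	switch digest := fmt.Sprintf("%x", h.Sum(nil)); digest {
	case "d98f9631be1e9607a9848c26c1f9eac1aa9fc21ac6ba82a2fc0741af9780a48f":
		pre = "llama-bpe"
	case "03df5c5863ad70781dcfdef491ead25140f895fe8010964be0daefe27be32b02":
		pre = "deepseek-llm"
	case "21cde974d587f0d54dc8d56b183cc1e6239600172035c68fbd6d4b9f8da0576e":
		pre = "deepseek-coder"
	}

	fmt.Println(pre)
}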