package convert import ( "cmp" "encoding/json" "errors" "fmt" "io/fs" "os" "slices" "google.golang.org/protobuf/proto" "github.com/ollama/ollama/convert/sentencepiece" ) func parseSentencePiece(fsys fs.FS) (*Vocabulary, error) { ast, err := parseAdditionalSpecialTokens(fsys) if err != nil { return nil, err } bts, err := fs.ReadFile(fsys, "tokenizer.model") if err != nil { return nil, err } var spm sentencepiece.ModelProto if err := proto.Unmarshal(bts, &spm); err != nil { return nil, err } v := Vocabulary{Model: "llama"} for _, piece := range spm.GetPieces() { v.Tokens = append(v.Tokens, piece.GetPiece()) v.Scores = append(v.Scores, piece.GetScore()) switch t := piece.GetType(); t { case sentencepiece.ModelProto_SentencePiece_UNKNOWN, sentencepiece.ModelProto_SentencePiece_CONTROL, sentencepiece.ModelProto_SentencePiece_UNUSED, sentencepiece.ModelProto_SentencePiece_BYTE: v.Types = append(v.Types, int32(t)) default: tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL) if slices.Contains(ast, piece.GetPiece()) { tt = int32(sentencepiece.ModelProto_SentencePiece_CONTROL) } v.Types = append(v.Types, tt) } } f, err := fsys.Open("added_tokens.json") if errors.Is(err, os.ErrNotExist) { return &v, nil } else if err != nil { return nil, err } defer f.Close() var atm map[string]int if err := json.NewDecoder(f).Decode(&atm); err != nil { return nil, err } type t struct { id int content string } var ts []t for content, id := range atm { ts = append(ts, t{id, content}) } slices.SortFunc(ts, func(i, j t) int { return cmp.Compare(i.id, j.id) }) n := len(v.Tokens) for i, t := range ts { if t.id != i+n { return nil, fmt.Errorf("invalid token id: %d", t.id) } v.Tokens = append(v.Tokens, t.content) v.Scores = append(v.Scores, -1000.0) v.Types = append(v.Types, tokenTypeUserDefined) } return &v, nil } func parseAdditionalSpecialTokens(fsys fs.FS) ([]string, error) { f, err := fsys.Open("special_tokens_map.json") if errors.Is(err, os.ErrNotExist) { return nil, nil } else if err != nil { return nil, err } defer f.Close() var m struct { AdditionalSpecialTokens []string `json:"additional_special_tokens"` } if err := json.NewDecoder(f).Decode(&m); err != nil { return nil, err } return m.AdditionalSpecialTokens, nil }