add missing file

This commit is contained in:
Patrick Devine 2024-05-08 16:56:18 -07:00 committed by Michael Yang
parent d355d2020f
commit 2d315ba9a9

72
convert/tokenizer.go Normal file
View file

@ -0,0 +1,72 @@
package convert
import (
"encoding/json"
"io/ioutil"
"os"
)
// Tokenizer is the top-level shape of a tokenizer JSON document as decoded by
// encoding/json.
type Tokenizer struct {
	// Version is read from the "version" key.
	Version string `json:"version"`

	// AddedTokens is read from the "added_tokens" key. These entries are
	// layered on top of the model vocabulary when the token table is built.
	AddedTokens []Token `json:"added_tokens"`

	// Model is read from the "model" key and holds the vocabulary and merges.
	Model TokenizerModel `json:"model"`
}
// TokenizerModel is the "model" section of a tokenizer JSON document.
type TokenizerModel struct {
	// Type is read from the "type" key.
	Type string `json:"type"`

	// Vocab maps each token's text to its integer ID ("vocab" key).
	Vocab map[string]int `json:"vocab"`

	// Merges is read from the "merges" key.
	// NOTE(review): presumably BPE merge rules - confirm against the consumer.
	Merges []string `json:"merges"`

	// Tokens is the ID-indexed token table. It carries no JSON tag;
	// newTokenizer rebuilds it from Vocab and the added tokens after decoding.
	Tokens []Token
}
// Token describes a single vocabulary entry.
type Token struct {
	// ID is the token's integer identifier ("id" key).
	ID int `json:"id"`

	// Content is the token's text ("content" key).
	Content string `json:"content"`

	// Special is read from the "special" key.
	Special bool `json:"special"`

	// UserDefined carries no JSON tag, so it is encoded under its Go field
	// name. newTokenizer sets it to true on the copies of added tokens that it
	// places into the token table.
	UserDefined bool
}
// getMaxID returns the largest token ID found in either the model vocabulary
// or the added tokens. The running maximum starts at zero, so the result is
// never negative and is 0 when both collections are empty.
func (t *Tokenizer) getMaxID() int {
	highest := 0

	// Scan the vocabulary IDs first.
	for _, id := range t.Model.Vocab {
		if id > highest {
			highest = id
		}
	}

	// Added tokens may use IDs beyond the end of the vocabulary.
	for i := range t.AddedTokens {
		if id := t.AddedTokens[i].ID; id > highest {
			highest = id
		}
	}

	return highest
}
// newTokenizer decodes the tokenizer JSON file at dirpath and builds the
// ID-indexed token table in Model.Tokens.
//
// Despite the parameter's name, dirpath is opened and read directly as a file.
//
// Model.Tokens is sized to the largest ID found in the vocabulary or the added
// tokens, plus one. Every vocabulary entry is stored at the index equal to its
// ID with Special and UserDefined both false. Added tokens are then written on
// top with UserDefined set to true, so an added token replaces a vocabulary
// entry that shares its ID. Slots whose ID appears in neither collection keep
// the zero Token. The entries in AddedTokens itself are not modified.
//
// Any failure to open, read, or decode the file is returned as an error.
// IDs are used as slice indices without validation, so a negative ID in the
// file results in an index-out-of-range panic.
func newTokenizer(dirpath string) (*Tokenizer, error) {
	f, err := os.Open(dirpath)
	if err != nil {
		// Hand the failure back to the caller, like every other error path in
		// this function, instead of panicking on a missing or unreadable file.
		return nil, err
	}
	defer f.Close()

	// ioutil.ReadAll is deprecated in favour of io.ReadAll; it is kept here so
	// the file's existing io/ioutil import remains in use.
	data, err := ioutil.ReadAll(f)
	if err != nil {
		return nil, err
	}

	var tdata Tokenizer
	if err := json.Unmarshal(data, &tdata); err != nil {
		return nil, err
	}

	// Allocate one slot per possible ID so tokens can be placed by index.
	maxID := tdata.getMaxID()
	tdata.Model.Tokens = make([]Token, maxID+1)

	for content, id := range tdata.Model.Vocab {
		tdata.Model.Tokens[id] = Token{ID: id, Content: content, Special: false, UserDefined: false}
	}

	// Added tokens go in last so they take precedence over vocabulary entries.
	// added is a per-iteration copy, so flagging it leaves AddedTokens as-is.
	for _, added := range tdata.AddedTokens {
		added.UserDefined = true
		tdata.Model.Tokens[added.ID] = added
	}

	return &tdata, nil
}