Merge pull request #5025 from ollama/mxyng/revert-parser-scan

Revert "proper utf16 support"
This commit is contained in:
Michael Yang 2024-06-13 10:31:25 -07:00 committed by GitHub
commit e87fc7200d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -3,15 +3,12 @@ package parser
import ( import (
"bufio" "bufio"
"bytes" "bytes"
"encoding/binary"
"errors" "errors"
"fmt" "fmt"
"io" "io"
"log/slog"
"strconv" "strconv"
"strings" "strings"
"unicode/utf16" "unicode"
"unicode/utf8"
) )
type File struct { type File struct {
@ -72,29 +69,31 @@ func ParseFile(r io.Reader) (*File, error) {
var b bytes.Buffer var b bytes.Buffer
var role string var role string
var lineCount int
var linePos int
var utf16 bool
var f File var f File
br := bufio.NewReader(r) br := bufio.NewReader(r)
for {
var sc scannerDecoder = utf8ScannerDecoder{} r, _, err := br.ReadRune()
if bom, err := br.Peek(2); err != nil { if errors.Is(err, io.EOF) {
slog.Warn("error reading byte-order mark", "error", err) break
} else if bytes.Equal(bom, []byte{0xFE, 0xFF}) { } else if err != nil {
sc = utf16ScannerDecoder{binary.LittleEndian} return nil, err
//nolint:errcheck
br.Discard(2)
} else if bytes.Equal(bom, []byte{0xFF, 0xFE}) {
sc = utf16ScannerDecoder{binary.BigEndian}
//nolint:errcheck
br.Discard(2)
} }
scanner := bufio.NewScanner(br) // the utf16 byte order mark will be read as "unreadable" by ReadRune()
scanner.Split(sc.ScanBytes) if isUnreadable(r) && lineCount == 0 && linePos == 0 {
for scanner.Scan() { utf16 = true
r, err := sc.DecodeRune(scanner.Bytes()) continue
if err != nil { }
return nil, err
// skip the second byte if we're reading utf16
if utf16 && r == 0 {
continue
} }
next, r, err := parseRuneForState(r, curr) next, r, err := parseRuneForState(r, curr)
@ -104,6 +103,13 @@ func ParseFile(r io.Reader) (*File, error) {
return nil, err return nil, err
} }
if isNewline(r) {
lineCount++
linePos = 0
} else {
linePos++
}
// process the state transition, some transitions need to be intercepted and redirected // process the state transition, some transitions need to be intercepted and redirected
if next != curr { if next != curr {
switch curr { switch curr {
@ -303,6 +309,10 @@ func isNewline(r rune) bool {
return r == '\r' || r == '\n' return r == '\r' || r == '\n'
} }
// isUnreadable reports whether r is the Unicode replacement character
// (U+FFFD), the rune a reader hands back for bytes it could not decode.
func isUnreadable(r rune) bool {
	switch r {
	case unicode.ReplacementChar:
		return true
	default:
		return false
	}
}
func isValidMessageRole(role string) bool { func isValidMessageRole(role string) bool {
return role == "system" || role == "user" || role == "assistant" return role == "system" || role == "user" || role == "assistant"
} }
@ -315,39 +325,3 @@ func isValidCommand(cmd string) bool {
return false return false
} }
} }
// scannerDecoder couples the two encoding-specific steps of reading a file:
// splitting the raw byte stream into tokens and turning each token into a rune.
type scannerDecoder interface {
	// ScanBytes has the shape of a bufio.SplitFunc so it can be installed on a
	// bufio.Scanner; it returns how many bytes to consume and the next token.
	ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error)

	// DecodeRune converts one token produced by ScanBytes into a rune.
	DecodeRune(token []byte) (rune, error)
}
// utf8ScannerDecoder tokenizes a UTF-8 byte stream into one token per encoded
// rune and decodes each token back into that rune.
type utf8ScannerDecoder struct{}

// ScanBytes is a bufio.SplitFunc that yields one complete UTF-8 sequence per
// token. Splitting on single bytes would hand DecodeRune the individual bytes
// of a multi-byte character, each of which decodes to U+FFFD, so the split is
// delegated to bufio.ScanRunes. It asks for more data when the buffer ends in
// the middle of a sequence and reports invalid bytes as the encoded U+FFFD.
func (utf8ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	return bufio.ScanRunes(data, atEOF)
}

// DecodeRune returns the first rune encoded in data. Invalid or empty input
// yields utf8.RuneError; the returned error is always nil.
func (utf8ScannerDecoder) DecodeRune(data []byte) (rune, error) {
	r, _ := utf8.DecodeRune(data)
	return r, nil
}
// utf16ScannerDecoder tokenizes a UTF-16 byte stream into 2-byte code units
// and decodes each unit using the embedded byte order.
type utf16ScannerDecoder struct {
	binary.ByteOrder
}

// ScanBytes is a bufio.SplitFunc that yields one 2-byte code unit per token.
func (utf16ScannerDecoder) ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
	return scanBytesN(data, 2, atEOF)
}

// DecodeRune decodes the single UTF-16 code unit at the start of data.
//
// Each unit is decoded on its own, so one half of a surrogate pair decodes to
// U+FFFD. A token shorter than one code unit is reported as
// io.ErrUnexpectedEOF rather than panicking inside ByteOrder.Uint16.
func (e utf16ScannerDecoder) DecodeRune(data []byte) (rune, error) {
	if len(data) < 2 {
		return 0, io.ErrUnexpectedEOF
	}

	unit := e.ByteOrder.Uint16(data)
	return utf16.Decode([]uint16{unit})[0], nil
}
// scanBytesN is the shared body of the fixed-width split functions: it returns
// the next n bytes of data as one token.
//
// When fewer than n bytes are buffered it returns (0, nil, nil) so the scanner
// reads more input, instead of slicing past the valid data. At EOF an empty
// buffer ends the scan cleanly, while a trailing partial unit is reported as
// io.ErrUnexpectedEOF.
func scanBytesN(data []byte, n int, atEOF bool) (int, []byte, error) {
	switch {
	case len(data) >= n:
		return n, data[:n], nil
	case atEOF && len(data) > 0:
		// Input ended in the middle of a unit.
		return 0, nil, io.ErrUnexpectedEOF
	default:
		// Either more data is still to come or the input is exhausted.
		return 0, nil, nil
	}
}