ollama/parser/parser.go

328 lines
6.2 KiB
Go
Raw Normal View History

package parser
import (
"bufio"
2023-07-25 17:22:23 +00:00
"bytes"
"errors"
2023-07-27 16:55:48 +00:00
"fmt"
"io"
2024-04-22 22:37:14 +00:00
"strconv"
"strings"
"unicode"
)
2024-04-30 17:55:19 +00:00
// File is the parsed representation of a model file: the list of commands
// it contains, in the order ParseFile appended them.
type File struct {
// Commands holds one entry per command that ParseFile completed.
Commands []Command
}
// String renders the file back to text: the String form of every command,
// in order, each followed by a newline.
func (f File) String() string {
	var out strings.Builder
	for i := range f.Commands {
		fmt.Fprintf(&out, "%s\n", f.Commands[i].String())
	}
	return out.String()
}
// Command is a single parsed instruction.
type Command struct {
// Name is "model" for a FROM line, the lower-cased keyword for
// LICENSE/TEMPLATE/SYSTEM/ADAPTER/MESSAGE lines, and the parameter name
// for PARAMETER lines (see ParseFile).
Name string
// Args is the command's value with surrounding quotes removed. For
// "message" commands ParseFile stores it as "<role>: <text>".
Args string
}
2024-04-30 17:55:19 +00:00
func (c Command) String() string {
2024-05-01 17:01:09 +00:00
var sb strings.Builder
2024-04-30 17:55:19 +00:00
switch c.Name {
case "model":
2024-05-01 17:01:09 +00:00
fmt.Fprintf(&sb, "FROM %s", c.Args)
2024-04-30 17:55:19 +00:00
case "license", "template", "system", "adapter":
2024-05-01 17:01:09 +00:00
fmt.Fprintf(&sb, "%s %s", strings.ToUpper(c.Name), quote(c.Args))
2024-04-30 17:55:19 +00:00
case "message":
role, message, _ := strings.Cut(c.Args, ": ")
2024-05-01 17:01:09 +00:00
fmt.Fprintf(&sb, "MESSAGE %s %s", role, quote(message))
2024-04-30 17:55:19 +00:00
default:
2024-05-01 17:01:09 +00:00
fmt.Fprintf(&sb, "PARAMETER %s %s", c.Name, quote(c.Args))
2024-04-30 17:55:19 +00:00
}
2024-05-01 17:01:09 +00:00
return sb.String()
2024-04-30 17:55:19 +00:00
}
2024-04-22 22:37:14 +00:00
// state identifies what the parser is currently reading; it drives the
// transitions in parseRuneForState and ParseFile.
type state int
2024-04-22 22:37:14 +00:00
// Parser states. stateNil is the zero value, so a freshly declared state
// starts there.
const (
// stateNil: between commands; whitespace and newlines are skipped.
stateNil state = iota
// stateName: reading a command keyword.
stateName
// stateValue: reading a command's argument.
stateValue
// stateParameter: reading the parameter name that follows PARAMETER.
stateParameter
// stateMessage: reading the role that follows MESSAGE.
stateMessage
// stateComment: inside a '#' comment, which runs to the next newline.
stateComment
)
2024-04-24 23:12:56 +00:00
// Sentinel errors reported by the parser.
var (
	// errMissingFrom is returned by ParseFile when the input contains no
	// FROM command.
	errMissingFrom = errors.New("no FROM line")
	// errInvalidMessageRole is returned when the role following MESSAGE is
	// not accepted by isValidMessageRole.
	errInvalidMessageRole = errors.New("message role must be one of \"system\", \"user\", or \"assistant\"")
	// errInvalidCommand is returned when a command keyword is not recognised
	// or contains a non-alphabetic character.
	errInvalidCommand = errors.New("command must be one of \"from\", \"license\", \"template\", \"system\", \"adapter\", \"parameter\", or \"message\"")
)
2024-04-22 22:37:14 +00:00
2024-04-30 17:55:19 +00:00
func ParseFile(r io.Reader) (*File, error) {
2024-04-22 22:37:14 +00:00
var cmd Command
var curr state
var b bytes.Buffer
var role string
var lineCount int
var linePos int
var utf16 bool
2024-04-30 17:55:19 +00:00
var f File
2024-04-22 22:37:14 +00:00
br := bufio.NewReader(r)
for {
r, _, err := br.ReadRune()
if errors.Is(err, io.EOF) {
break
} else if err != nil {
return nil, err
}
// the utf16 byte order mark will be read as "unreadable" by ReadRune()
if isUnreadable(r) && lineCount == 0 && linePos == 0 {
utf16 = true
continue
}
// skip the second byte if we're reading utf16
if utf16 && r == 0 {
continue
}
2024-04-22 22:37:14 +00:00
next, r, err := parseRuneForState(r, curr)
if errors.Is(err, io.ErrUnexpectedEOF) {
return nil, fmt.Errorf("%w: %s", err, b.String())
} else if err != nil {
return nil, err
}
if isNewline(r) {
lineCount++
linePos = 0
} else {
linePos++
}
2024-04-26 22:13:27 +00:00
// process the state transition, some transitions need to be intercepted and redirected
2024-04-22 22:37:14 +00:00
if next != curr {
switch curr {
2024-04-27 00:11:47 +00:00
case stateName:
if !isValidCommand(b.String()) {
return nil, errInvalidCommand
}
2024-04-26 22:13:27 +00:00
// next state sometimes depends on the current buffer value
2024-04-22 22:37:14 +00:00
switch s := strings.ToLower(b.String()); s {
case "from":
cmd.Name = "model"
case "parameter":
2024-04-26 22:13:27 +00:00
// transition to stateParameter which sets command name
2024-04-22 22:37:14 +00:00
next = stateParameter
case "message":
2024-04-26 22:13:27 +00:00
// transition to stateMessage which validates the message role
2024-04-22 22:37:14 +00:00
next = stateMessage
fallthrough
default:
cmd.Name = s
}
2024-04-27 00:11:47 +00:00
case stateParameter:
cmd.Name = b.String()
2024-04-22 22:37:14 +00:00
case stateMessage:
2024-04-26 22:13:27 +00:00
if !isValidMessageRole(b.String()) {
2024-04-27 00:11:47 +00:00
return nil, errInvalidMessageRole
2024-04-22 22:37:14 +00:00
}
role = b.String()
case stateComment, stateNil:
// pass
case stateValue:
s, ok := unquote(b.String())
if !ok || isSpace(r) {
if _, err := b.WriteRune(r); err != nil {
return nil, err
}
continue
}
if role != "" {
s = role + ": " + s
role = ""
}
cmd.Args = s
2024-04-30 17:55:19 +00:00
f.Commands = append(f.Commands, cmd)
2023-08-10 23:09:02 +00:00
}
2024-04-22 22:37:14 +00:00
b.Reset()
curr = next
}
if strconv.IsPrint(r) {
if _, err := b.WriteRune(r); err != nil {
return nil, err
2023-08-10 23:22:08 +00:00
}
2024-04-22 22:37:14 +00:00
}
}
// flush the buffer
switch curr {
case stateComment, stateNil:
// pass; nothing to flush
case stateValue:
2024-04-25 02:17:26 +00:00
s, ok := unquote(b.String())
if !ok {
2024-04-22 22:37:14 +00:00
return nil, io.ErrUnexpectedEOF
}
2024-04-25 02:17:26 +00:00
if role != "" {
s = role + ": " + s
}
cmd.Args = s
2024-04-30 17:55:19 +00:00
f.Commands = append(f.Commands, cmd)
2024-04-22 22:37:14 +00:00
default:
return nil, io.ErrUnexpectedEOF
}
2024-04-30 17:55:19 +00:00
for _, cmd := range f.Commands {
2024-04-22 22:37:14 +00:00
if cmd.Name == "model" {
2024-04-30 17:55:19 +00:00
return &f, nil
2024-04-22 22:37:14 +00:00
}
}
2024-04-24 23:12:56 +00:00
return nil, errMissingFrom
}
2024-04-22 22:37:14 +00:00
func parseRuneForState(r rune, cs state) (state, rune, error) {
switch cs {
case stateNil:
switch {
case r == '#':
return stateComment, 0, nil
case isSpace(r), isNewline(r):
return stateNil, 0, nil
default:
return stateName, r, nil
}
case stateName:
switch {
case isAlpha(r):
return stateName, r, nil
case isSpace(r):
return stateValue, 0, nil
default:
2024-04-27 00:11:47 +00:00
return stateNil, 0, errInvalidCommand
2024-04-22 22:37:14 +00:00
}
case stateValue:
switch {
case isNewline(r):
return stateNil, r, nil
case isSpace(r):
return stateNil, r, nil
default:
return stateValue, r, nil
}
case stateParameter:
switch {
case isAlpha(r), isNumber(r), r == '_':
return stateParameter, r, nil
case isSpace(r):
return stateValue, 0, nil
default:
return stateNil, 0, io.ErrUnexpectedEOF
}
case stateMessage:
switch {
case isAlpha(r):
return stateMessage, r, nil
case isSpace(r):
return stateValue, 0, nil
default:
return stateNil, 0, io.ErrUnexpectedEOF
}
case stateComment:
switch {
case isNewline(r):
return stateNil, 0, nil
default:
return stateComment, 0, nil
}
default:
return stateNil, 0, errors.New("")
2023-07-27 16:55:48 +00:00
}
2024-04-22 22:37:14 +00:00
}
2023-07-27 16:55:48 +00:00
2024-04-25 01:49:14 +00:00
// quote wraps s in double quotes when writing it bare would lose
// information: when it contains a newline or starts or ends with a space.
// If such a value itself contains a double quote, triple quotes are used
// instead. Any other value is returned unchanged.
func quote(s string) string {
	needsQuotes := strings.Contains(s, "\n") ||
		strings.HasPrefix(s, " ") ||
		strings.HasSuffix(s, " ")
	if !needsQuotes {
		return s
	}

	if strings.Contains(s, `"`) {
		return `"""` + s + `"""`
	}

	return `"` + s + `"`
}
2024-04-22 22:37:14 +00:00
// unquote strips one layer of surrounding double quotes from s.
//
// A value that starts with `"""` must also end with `"""` (and be at least
// six bytes long); a value that starts with a single `"` must end with `"`.
// In both cases the text between the delimiters is returned. If the opening
// delimiter has no matching close, unquote returns "", false. A value that
// does not start with a double quote is returned unchanged with true.
func unquote(s string) (string, bool) {
	// TODO: single quotes
	if strings.HasPrefix(s, `"""`) {
		// The length check keeps opening and closing delimiters from
		// overlapping, e.g. in `"""` or `""""`.
		if len(s) >= 6 && strings.HasSuffix(s, `"""`) {
			return s[3 : len(s)-3], true
		}

		return "", false
	}

	if strings.HasPrefix(s, `"`) {
		// A lone `"` is both prefix and suffix; require two bytes.
		if len(s) >= 2 && strings.HasSuffix(s, `"`) {
			return s[1 : len(s)-1], true
		}

		return "", false
	}

	return s, true
}
2024-04-22 22:37:14 +00:00
// isAlpha reports whether r is an ASCII letter, lower or upper case.
func isAlpha(r rune) bool {
	switch {
	case 'a' <= r && r <= 'z':
		return true
	case 'A' <= r && r <= 'Z':
		return true
	}
	return false
}
2024-04-22 22:37:14 +00:00
// isNumber reports whether r is an ASCII decimal digit.
func isNumber(r rune) bool {
	if r < '0' {
		return false
	}
	return r <= '9'
}
2024-04-22 22:37:14 +00:00
// isSpace reports whether r is a blank: a space or a horizontal tab.
// Newlines are not blanks; see isNewline.
func isSpace(r rune) bool {
	switch r {
	case ' ', '\t':
		return true
	default:
		return false
	}
}
2024-04-22 22:37:14 +00:00
// isNewline reports whether r is a line terminator: carriage return or
// line feed.
func isNewline(r rune) bool {
	switch r {
	case '\r', '\n':
		return true
	default:
		return false
	}
}
// isUnreadable reports whether r is the Unicode replacement character,
// the rune ParseFile treats as an undecodable byte at the start of input.
func isUnreadable(r rune) bool {
	const replacement = unicode.ReplacementChar
	return replacement == r
}
2024-04-26 22:13:27 +00:00
// isValidMessageRole reports whether role is one of the roles accepted
// after MESSAGE: "system", "user" or "assistant". The comparison is
// case-sensitive.
func isValidMessageRole(role string) bool {
	switch role {
	case "system", "user", "assistant":
		return true
	default:
		return false
	}
}
2024-04-27 00:11:47 +00:00
// isValidCommand reports whether cmd, compared after lower-casing, is one
// of the recognised command keywords.
func isValidCommand(cmd string) bool {
	name := strings.ToLower(cmd)
	for _, known := range [...]string{
		"from", "license", "template", "system", "adapter", "parameter", "message",
	} {
		if name == known {
			return true
		}
	}
	return false
}