// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package precis
import "errors"

// This file contains tables and code related to context rules.

// catBitmap is a bit set of category flags (the b* constants), packed into a
// uint16.
type catBitmap uint16
const (
|
|
// These bits, once set depending on the current value, are never unset.
|
|
bJapanese catBitmap = 1 << iota
|
|
bArabicIndicDigit
|
|
bExtendedArabicIndicDigit
|
|
|
|
// These bits are set on each iteration depending on the current value.
|
|
bJoinStart
|
|
bJoinMid
|
|
bJoinEnd
|
|
bVirama
|
|
bLatinSmallL
|
|
bGreek
|
|
bHebrew
|
|
|
|
// These bits indicated which of the permanent bits need to be set at the
|
|
// end of the checks.
|
|
bMustHaveJapn
|
|
|
|
permanent = bJapanese | bArabicIndicDigit | bExtendedArabicIndicDigit | bMustHaveJapn
|
|
)
// finalShift is the bit position of bMustHaveJapn: shifting a catBitmap right
// by finalShift moves that bit onto bJapanese (bit 0).
// NOTE(review): presumably used for the end-of-input check that a required
// permanent bit was actually seen — confirm at the use site, which is not
// visible here.
const finalShift = 10
// errContext is the error returned by the rule functions in
// categoryTransitions when a contextual rule is not satisfied.
var errContext = errors.New("precis: contextual rule violated")
func init() {
|
|
// Programmatically set these required bits as, manually setting them seems
|
|
// too error prone.
|
|
for i, ct := range categoryTransitions {
|
|
categoryTransitions[i].keep |= permanent
|
|
categoryTransitions[i].accept |= ct.term
|
|
}
|
|
}
var categoryTransitions = []struct {
|
|
keep catBitmap // mask selecting which bits to keep from the previous state
|
|
set catBitmap // mask for which bits to set for this transition
|
|
|
|
// These bitmaps are used for rules that require lookahead.
|
|
// term&accept == term must be true, which is enforced programmatically.
|
|
term catBitmap // bits accepted as termination condition
|
|
accept catBitmap // bits that pass, but not sufficient as termination
|
|
|
|
// The rule function cannot take a *context as an argument, as it would
|
|
// cause the context to escape, adding significant overhead.
|
|
rule func(beforeBits catBitmap) (doLookahead bool, err error)
|
|
}{
|
|
joiningL: {set: bJoinStart},
|
|
joiningD: {set: bJoinStart | bJoinEnd},
|
|
joiningT: {keep: bJoinStart, set: bJoinMid},
|
|
joiningR: {set: bJoinEnd},
|
|
viramaModifier: {set: bVirama},
|
|
viramaJoinT: {set: bVirama | bJoinMid},
|
|
latinSmallL: {set: bLatinSmallL},
|
|
greek: {set: bGreek},
|
|
greekJoinT: {set: bGreek | bJoinMid},
|
|
hebrew: {set: bHebrew},
|
|
hebrewJoinT: {set: bHebrew | bJoinMid},
|
|
japanese: {set: bJapanese},
|
|
katakanaMiddleDot: {set: bMustHaveJapn},
|
|
|
|
zeroWidthNonJoiner: {
|
|
term: bJoinEnd,
|
|
accept: bJoinMid,
|
|
rule: func(before catBitmap) (doLookAhead bool, err error) {
|
|
if before&bVirama != 0 {
|
|
return false, nil
|
|
}
|
|
if before&bJoinStart == 0 {
|
|
return false, errContext
|
|
}
|
|
return true, nil
|
|
},
|
|
},
|
|
zeroWidthJoiner: {
|
|
rule: func(before catBitmap) (doLookAhead bool, err error) {
|
|
if before&bVirama == 0 {
|
|
err = errContext
|
|
}
|
|
return false, err
|
|
},
|
|
},
|
|
middleDot: {
|
|
term: bLatinSmallL,
|
|
rule: func(before catBitmap) (doLookAhead bool, err error) {
|
|
if before&bLatinSmallL == 0 {
|
|
return false, errContext
|
|
}
|
|
return true, nil
|
|
},
|
|
},
|
|
greekLowerNumeralSign: {
|
|
set: bGreek,
|
|
term: bGreek,
|
|
rule: func(before catBitmap) (doLookAhead bool, err error) {
|
|
return true, nil
|
|
},
|
|
},
|
|
hebrewPreceding: {
|
|
set: bHebrew,
|
|
rule: func(before catBitmap) (doLookAhead bool, err error) {
|
|
if before&bHebrew == 0 {
|
|
err = errContext
|
|
}
|
|
return false, err
|
|
},
|
|
},
|
|
arabicIndicDigit: {
|
|
set: bArabicIndicDigit,
|
|
rule: func(before catBitmap) (doLookAhead bool, err error) {
|
|
if before&bExtendedArabicIndicDigit != 0 {
|
|
err = errContext
|
|
}
|
|
return false, err
|
|
},
|
|
},
|
|
extendedArabicIndicDigit: {
|
|
set: bExtendedArabicIndicDigit,
|
|
rule: func(before catBitmap) (doLookAhead bool, err error) {
|
|
if before&bArabicIndicDigit != 0 {
|
|
err = errContext
|
|
}
|
|
return false, err
|
|
},
|
|
},
|
|
}