diff options
Diffstat (limited to 'internal/lexer')
-rw-r--r-- | internal/lexer/lexer.go | 182 | ||||
-rw-r--r-- | internal/lexer/lextype_string.go | 34 | ||||
-rw-r--r-- | internal/lexer/scanners.go | 32 | ||||
-rw-r--r-- | internal/lexer/statefunc.go | 17 | ||||
-rw-r--r-- | internal/lexer/states.go | 110 |
5 files changed, 375 insertions, 0 deletions
// Package lexer (internal/lexer) — a small state-function based JSON lexer.
//
// This listing is the patch content of lexer.go, lextype_string.go,
// scanners.go, statefunc.go and states.go, reformatted as plain Go source.
// It needs the imports "fmt", "strconv", "strings" and "unicode/utf8".

// ---------------------------------------------------------------------------
// internal/lexer/lexer.go
// ---------------------------------------------------------------------------

// eof is the sentinel rune returned by Lexer.Next at the end of input.
const eof rune = -1

// Lexem is a single token produced by the lexer.
type Lexem struct {
	Type  lexType // Type of Lexem.
	Value string  // Value of Lexem (or the error message for LError).
	Start int     // Start byte offset in the input string.
	End   int     // End byte offset (exclusive) in the input string.
}

//go:generate stringer -type=lexType
type lexType int

// Kinds of lexems emitted on Lexer.Output.
const (
	LEOF lexType = iota
	LError
	LObjectStart
	LObjectEnd
	LObjectKey
	LObjectValue
	LArrayStart
	LArrayEnd
	LString
	LNumber
	LBoolean
	LNull
)

// Lexer holds current scanner state.
type Lexer struct {
	Input  string     // Input string.
	Start  int        // Start position of current lexem.
	Pos    int        // Current position in the input string.
	Output chan Lexem // Lexems channel; closed by Run when scanning stops.
	width  int        // Width of the last rune read by Next (0 = nothing to un-read).
	states stateStack // Stack of states to resume when a nested container closes.
}

// NewLexer returns a new scanner for the input string.
func NewLexer(input string) *Lexer {
	return &Lexer{
		Input:  input,
		Output: make(chan Lexem, 2),
	}
}

// Run drives the state machine starting at init until a state returns nil,
// then closes Output. Output only buffers two lexems, so Run has to execute
// concurrently with whatever drains the channel.
func (l *Lexer) Run(init stateFunc) {
	for state := init; state != nil; {
		state = state(l)
	}
	close(l.Output)
}

// PopState returns the most recently pushed state function, or nil when the
// stack is empty.
func (l *Lexer) PopState() stateFunc {
	return l.states.Pop()
}

// PushState remembers the state to resume before descending into a nested one.
func (l *Lexer) PushState(s stateFunc) {
	l.states.Push(s)
}

// Emit sends the text between Start and Pos to Output as a lexem of the given
// type and begins a new lexem at Pos.
func (l *Lexer) Emit(typ lexType) {
	l.Output <- Lexem{
		Type:  typ,
		Value: l.Input[l.Start:l.Pos],
		Start: l.Start,
		End:   l.Pos,
	}
	l.Start = l.Pos
}

// Errorf emits an LError lexem carrying the formatted message and returns nil,
// which stops Run.
func (l *Lexer) Errorf(format string, args ...interface{}) stateFunc {
	l.Output <- Lexem{
		Type:  LError,
		Value: fmt.Sprintf(format, args...),
		Start: l.Start,
		End:   l.Pos,
	}
	return nil
}

// Next consumes and returns the next rune of the input, or eof when the input
// is exhausted.
func (l *Lexer) Next() (r rune) {
	if l.Pos >= len(l.Input) {
		l.width = 0
		return eof
	}
	r, l.width = utf8.DecodeRuneInString(l.Input[l.Pos:])
	l.Pos += l.width
	return r
}

// Back un-reads the rune returned by the last call to Next. Only one rune of
// push-back is available: the remembered width is cleared so that a second
// call is a no-op instead of rewinding by a stale width.
func (l *Lexer) Back() {
	l.Pos -= l.width
	l.width = 0
}

// Ignore drops the text buffered since the last Emit/Ignore.
func (l *Lexer) Ignore() {
	l.Start = l.Pos
	l.width = 0
}

// Peek returns the rune at the current position without consuming it.
func (l *Lexer) Peek() (r rune) {
	r = l.Next()
	l.Back()
	return r
}

// Accept consumes the next rune if it is one of the runes in valid and
// reports whether it did.
func (l *Lexer) Accept(valid string) bool {
	if strings.ContainsRune(valid, l.Next()) {
		return true
	}
	l.Back()
	return false
}

// AcceptString consumes s if the input at the current position starts with
// it, optionally ignoring case, and reports whether it did.
//
// The match is anchored at Pos (the read position), not at Start, and only
// the candidate prefix is compared, so no copy of the remaining input is made.
func (l *Lexer) AcceptString(s string, caseInsentive bool) bool {
	rest := l.Input[l.Pos:]
	if len(rest) < len(s) {
		return false
	}
	head := rest[:len(s)]
	if head != s && !(caseInsentive && strings.EqualFold(head, s)) {
		return false
	}
	l.width = 0 // a multi-rune jump cannot be undone by Back
	l.Pos += len(s)
	return true
}

// AcceptAnyOf consumes the first of the given substrings found at the current
// position and reports whether any of them matched.
func (l *Lexer) AcceptAnyOf(s []string, caseInsentive bool) bool {
	for _, substring := range s {
		if l.AcceptString(substring, caseInsentive) {
			return true
		}
	}
	return false
}

// AcceptWhile consumes runes for as long as they are contained in valid and
// reports whether at least one rune was consumed.
func (l *Lexer) AcceptWhile(valid string) bool {
	accepted := false
	for l.Accept(valid) {
		accepted = true
	}
	return accepted
}

// AcceptWhileNot consumes runes until one contained in invalid (or the end of
// input) is reached and reports whether at least one rune was consumed. The
// terminating rune is left unread.
func (l *Lexer) AcceptWhileNot(invalid string) bool {
	accepted := false
	for {
		r := l.Next()
		if r == eof {
			// eof is never "contained" in invalid, so without this check the
			// loop would spin forever at the end of input.
			return accepted
		}
		if strings.ContainsRune(invalid, r) {
			l.Back()
			return accepted
		}
		accepted = true
	}
}

// AtStart reports whether the current lexem is still empty (Pos == Start).
func (l *Lexer) AtStart() bool {
	return l.Pos == l.Start
}

// ---------------------------------------------------------------------------
// internal/lexer/lextype_string.go
// Output of `stringer -type=lexType`; regenerate it rather than editing by hand.
// ---------------------------------------------------------------------------

func _() {
	// An "invalid array index" compiler error signifies that the constant values have changed.
	// Re-run the stringer command to generate them again.
	var x [1]struct{}
	_ = x[LEOF-0]
	_ = x[LError-1]
	_ = x[LObjectStart-2]
	_ = x[LObjectEnd-3]
	_ = x[LObjectKey-4]
	_ = x[LObjectValue-5]
	_ = x[LArrayStart-6]
	_ = x[LArrayEnd-7]
	_ = x[LString-8]
	_ = x[LNumber-9]
	_ = x[LBoolean-10]
	_ = x[LNull-11]
}

const _lexType_name = "LEOFLErrorLObjectStartLObjectEndLObjectKeyLObjectValueLArrayStartLArrayEndLStringLNumberLBooleanLNull"

var _lexType_index = [...]uint8{0, 4, 10, 22, 32, 42, 54, 65, 74, 81, 88, 96, 101}

func (i lexType) String() string {
	if i < 0 || i >= lexType(len(_lexType_index)-1) {
		return "lexType(" + strconv.FormatInt(int64(i), 10) + ")"
	}
	return _lexType_name[_lexType_index[i]:_lexType_index[i+1]]
}

// ---------------------------------------------------------------------------
// internal/lexer/scanners.go
// ---------------------------------------------------------------------------

// scanNumber consumes a number at the current position: an optional leading
// '-', one or more digits, an optional '.' followed by any digits, and an
// optional exponent ([eE][+-]digits). It reports whether a number was found;
// on failure the position is left where it was.
//
// A trailing '.' without fraction digits ("1.") is still accepted, as before.
func scanNumber(l *Lexer) bool {
	const digits = "0123456789"

	begin := l.Pos
	l.Accept("-")
	if !l.AcceptWhile(digits) {
		// No digit found: undo a possibly consumed sign.
		l.Pos = begin
		l.width = 0
		return false
	}
	l.Accept(".")
	l.AcceptWhile(digits)

	// An exponent only counts when it carries at least one digit; otherwise
	// the 'e' belongs to whatever follows the number.
	beforeExp := l.Pos
	if l.Accept("eE") {
		l.Accept("+-")
		if !l.AcceptWhile(digits) {
			l.Pos = beforeExp
			l.width = 0
		}
	}
	return true
}

// scanQuotedString consumes a string delimited by quote at the current
// position, treating a backslash as an escape for the rune after it. It
// reports whether a complete string was consumed; if the closing quote is
// missing the position is restored.
func scanQuotedString(l *Lexer, quote rune) bool {
	start := l.Pos
	if l.Next() != quote {
		l.Back()
		return false
	}
	for {
		switch l.Next() {
		case eof:
			l.Pos = start // Return position to start.
			return false  // Unclosed quoted string.
		case '\\':
			l.Next() // Skip the escaped rune.
		case quote:
			return true // Closing quote.
		}
	}
}

// ---------------------------------------------------------------------------
// internal/lexer/statefunc.go
// ---------------------------------------------------------------------------

// stateFunc is one state of the lexer; it returns the next state, or nil to
// stop scanning.
type stateFunc func(*Lexer) stateFunc

// stateStack is a LIFO stack of states.
type stateStack []stateFunc

// Push puts s on top of the stack.
func (ss *stateStack) Push(s stateFunc) {
	*ss = append(*ss, s)
}

// Pop removes and returns the top of the stack, or nil if the stack is empty.
func (ss *stateStack) Pop() (s stateFunc) {
	if len(*ss) == 0 {
		return nil
	}
	*ss, s = (*ss)[:len(*ss)-1], (*ss)[len(*ss)-1]
	return s
}

// ---------------------------------------------------------------------------
// internal/lexer/states.go
// ---------------------------------------------------------------------------

// InitJson is the initial state: it expects the start of an object or an
// array, or empty input.
func InitJson(l *Lexer) stateFunc {
	ignoreWhiteSpace(l)
	switch {
	case l.Accept("{"):
		l.Emit(LObjectStart)
		return stateInObject
	case l.Accept("["):
		l.Emit(LArrayStart)
		return stateInArray
	case l.Peek() == eof:
		return nil
	}
	return l.Errorf("Unknown token: %s", string(l.Peek()))
}

// stateInObject scans one `"key": value` member of an object, or the closing
// brace. Separating commas are optional and skipped wherever they appear.
func stateInObject(l *Lexer) stateFunc {
	ignoreWhiteSpace(l)
	if l.Accept("}") {
		l.Emit(LObjectEnd)
		// Closing an object resumes the enclosing state (nil at top level).
		return l.PopState()
	}
	skipSeparator(l)
	if !scanQuotedString(l, '"') {
		return l.Errorf("Unknown token: %s", string(l.Peek()))
	}
	l.Emit(LObjectKey)
	ignoreWhiteSpace(l)
	if !l.Accept(":") {
		return l.Errorf("Expected ':'")
	}
	ignoreWhiteSpace(l)
	l.Emit(LObjectValue) // Empty marker lexem announcing the value.
	if emitScalar(l) {
		skipSeparator(l)
		return stateInObject
	}
	if next := openContainer(l, stateInObject); next != nil {
		return next
	}
	return l.Errorf("Unknown token: %s", string(l.Peek()))
}

// stateInArray scans one array element, or the closing bracket. Anything else
// (including the end of input) is reported as an error so that scanning
// always terminates.
func stateInArray(l *Lexer) stateFunc {
	skipSeparator(l)
	if l.Accept("]") {
		l.Emit(LArrayEnd)
		return l.PopState()
	}
	if emitScalar(l) {
		return stateInArray
	}
	if next := openContainer(l, stateInArray); next != nil {
		return next
	}
	return l.Errorf("Unknown token: %s", string(l.Peek()))
}

// emitScalar scans and emits a string, number, boolean or null at the current
// position and reports whether one was found.
func emitScalar(l *Lexer) bool {
	switch {
	case scanQuotedString(l, '"'):
		l.Emit(LString)
	case scanNumber(l):
		l.Emit(LNumber)
	case l.AcceptAnyOf([]string{"true", "false"}, true):
		l.Emit(LBoolean)
	case l.AcceptString("null", true):
		l.Emit(LNull)
	default:
		return false
	}
	return true
}

// openContainer handles an opening '{' or '[': it emits the start lexem,
// pushes resume so scanning continues there once the container closes, and
// returns the state for the container body. It returns nil if no container
// starts at the current position.
func openContainer(l *Lexer, resume stateFunc) stateFunc {
	switch {
	case l.Accept("{"):
		l.Emit(LObjectStart)
		l.PushState(resume)
		return stateInObject
	case l.Accept("["):
		l.Emit(LArrayStart)
		l.PushState(resume)
		return stateInArray
	}
	return nil
}

// skipSeparator discards an optional ',' together with surrounding whitespace.
func skipSeparator(l *Lexer) {
	ignoreWhiteSpace(l)
	l.Accept(",")
	ignoreWhiteSpace(l)
}

// ignoreWhiteSpace discards spaces, newlines and tabs (plus anything already
// buffered) at the current position.
func ignoreWhiteSpace(l *Lexer) {
	l.AcceptWhile(" \n\t") // ignore whitespaces
	l.Ignore()
}