author    Alexander Kiryukhin <a.kiryukhin@mail.ru>  2021-03-06 22:30:32 +0300
committer Alexander Kiryukhin <a.kiryukhin@mail.ru>  2021-03-06 22:30:32 +0300
commit    93740d2d153b3da3a1be5707db1400106e3f6491 (patch)
tree      ddf705b638434710fa6304201f560f018f4864fe
Initial
-rw-r--r--  example/math_expression/main.go    54
-rw-r--r--  example/math_expression/rpn.go     84
-rw-r--r--  example/math_expression/stack.go   26
-rw-r--r--  go.mod                              3
-rw-r--r--  lexem.go                           20
-rw-r--r--  lexer.go                          141
-rw-r--r--  scanners.go                        25
-rw-r--r--  scanners_test.go                   53
-rw-r--r--  statefunc.go                        4
9 files changed, 410 insertions, 0 deletions
diff --git a/example/math_expression/main.go b/example/math_expression/main.go
new file mode 100644
index 0000000..35f7c3b
--- /dev/null
+++ b/example/math_expression/main.go
@@ -0,0 +1,54 @@
+package main
+
+import (
+ "fmt"
+
+ "github.com/neonxp/unilex"
+)
+
+var output []unilex.Lexem = []unilex.Lexem{}
+var opPriority = map[string]int{
+ "^": 3,
+ "!": 3,
+ "*": 2,
+ "/": 2,
+ "+": 1,
+ "-": 1,
+}
+
+func main() {
+
+ l := unilex.New("10 * (20.0 + 30.0)")
+
+ go l.Run(lexExpression) // Start lexer
+
+ // Read infix expression lexems from the lexer and convert them to RPN (reverse Polish notation)
+ rpn := infixToRPNotation(l)
+ fmt.Println("RPN:", rpn)
+
+ // Calculate RPN
+ result := calculateRPN(rpn)
+ fmt.Println("Result:", result)
+}
+
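+// lexExpression is the single state function of this lexer: it skips
+// whitespace, emits exactly one lexem (or stops at EOF / on an error) and then
+// returns itself, so the Lexer keeps calling it until the input is exhausted.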
+func lexExpression(l *unilex.Lexer) unilex.StateFunc {
+ l.AcceptWhile(" \t")
+ l.Ignore() // Ignore whitespace
+
+ switch {
+ case l.Accept("("):
+ l.Emit("LP")
+ case l.Accept(")"):
+ l.Emit("RP")
+ case unilex.ScanNumber(l):
+ l.Emit("NUMBER")
+ case l.Accept("+-*/^!"):
+ l.Emit("OPERATOR")
+ case l.Peek() == unilex.EOF:
+ return nil
+ default:
+ return l.Errorf("Unexpected symbol")
+ }
+
+ return lexExpression
+}
diff --git a/example/math_expression/rpn.go b/example/math_expression/rpn.go
new file mode 100644
index 0000000..b65983d
--- /dev/null
+++ b/example/math_expression/rpn.go
@@ -0,0 +1,84 @@
+package main
+
+// Helper functions that convert infix notation to RPN and calculate the expression result.
+
+import (
+ "fmt"
+ "log"
+ "strconv"
+
+ "github.com/neonxp/unilex"
+)
+
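+// infixToRPNotation is a basic shunting-yard conversion: numbers (and the
+// postfix "!") go straight to the output, "(" is pushed onto the stack, ")"
+// pops until its matching "(", and any other operator first pops the
+// higher-priority operators waiting on the stack and is then pushed itself.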
+func infixToRPNotation(l *unilex.Lexer) []unilex.Lexem {
+ output := []unilex.Lexem{}
+ stack := lexemStack{}
+parseLoop:
+ for ll := range l.Output { // Read lexems from the Lexer output channel; conversion starts as soon as the first lexems are scanned.
+ fmt.Printf("Lexem: %+v\n", ll)
+
+ switch {
+ case ll.Type == "NUMBER", ll.Type == "OPERATOR" && ll.Value == "!":
+ output = append(output, ll)
+ case ll.Type == "LP":
+ stack.Push(ll)
+ case ll.Type == "RP":
+ for {
+ cl := stack.Pop()
+ if cl.Type == "LP" {
+ break
+ }
+ if cl.Type == unilex.LEOF {
+ log.Fatalf("No pair for parenthesis at %d", ll.Start)
+ }
+ output = append(output, cl)
+ }
+ case ll.Type == "OPERATOR":
+ for {
+ head := stack.Head()
+ // Pop operators of higher priority, and of equal priority for the
+ // left-associative operators (everything except "^"), before pushing.
+ if head.Type == "OPERATOR" && (opPriority[head.Value] > opPriority[ll.Value] || (opPriority[head.Value] == opPriority[ll.Value] && ll.Value != "^")) {
+ output = append(output, stack.Pop())
+ continue
+ }
+ break
+ }
+ stack.Push(ll)
+ case ll.Type == unilex.LEOF:
+ break parseLoop
+ }
+ }
+
+ for stack.Head().Type != unilex.LEOF {
+ output = append(output, stack.Pop())
+ }
+
+ return output
+}
+
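+// calculateRPN evaluates the RPN sequence with a small operand stack.
+// Only "+", "-", "*" and "/" are computed here; "^" and "!" are lexed by the
+// example but not evaluated.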
+func calculateRPN(rpnLexems []unilex.Lexem) string {
+ stack := lexemStack{}
+ for _, op := range rpnLexems {
+ if op.Type == "NUMBER" {
+ stack.Push(op)
+ } else {
+ // Pop both operands once; a2 is the left operand, a1 the right one.
+ a1, _ := strconv.ParseFloat(stack.Pop().Value, 64)
+ a2, _ := strconv.ParseFloat(stack.Pop().Value, 64)
+ var res float64
+ switch op.Value {
+ case "+":
+ res = a2 + a1
+ case "-":
+ res = a2 - a1
+ case "*":
+ res = a2 * a1
+ case "/":
+ res = a2 / a1
+ }
+ stack.Push(unilex.Lexem{Type: "NUMBER", Value: strconv.FormatFloat(res, 'f', -1, 64)})
+ }
+ }
+ return stack.Head().Value
+}
diff --git a/example/math_expression/stack.go b/example/math_expression/stack.go
new file mode 100644
index 0000000..2ca8acc
--- /dev/null
+++ b/example/math_expression/stack.go
@@ -0,0 +1,26 @@
+package main
+
+// Simple lexem stack implementation.
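+// Head and Pop return a lexem of type unilex.LEOF instead of panicking when
+// the stack is empty, so callers can use the stack without explicit emptiness checks.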
+
+import "github.com/neonxp/unilex"
+
+type lexemStack []unilex.Lexem
+
+func (ls *lexemStack) Head() (l unilex.Lexem) {
+ if len(*ls) == 0 {
+ return unilex.Lexem{Type: unilex.LEOF}
+ }
+ return (*ls)[len(*ls)-1]
+}
+
+func (ls *lexemStack) Push(l unilex.Lexem) {
+ *ls = append(*ls, l)
+}
+
+func (ls *lexemStack) Pop() (l unilex.Lexem) {
+ if len(*ls) == 0 {
+ return unilex.Lexem{Type: unilex.LEOF}
+ }
+ *ls, l = (*ls)[:len(*ls)-1], (*ls)[len(*ls)-1]
+ return l
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..59574b8
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/neonxp/unilex
+
+go 1.16
diff --git a/lexem.go b/lexem.go
new file mode 100644
index 0000000..bd24ea9
--- /dev/null
+++ b/lexem.go
@@ -0,0 +1,20 @@
+package unilex
+
+// Lexem represents part of parsed string.
+type Lexem struct {
+ Type LexType // Type of Lexem.
+ Value string // Value of Lexem.
+ Start int // Start position at input string.
+ End int // End position at input string.
+}
+
+// LexType represents type of current lexem.
+type LexType string
+
+// Standard lexem types.
+const (
+ // LError represents lexing error.
+ LError LexType = "ERROR"
+ // LEOF represents end of input.
+ LEOF LexType = "EOF"
+)
diff --git a/lexer.go b/lexer.go
new file mode 100644
index 0000000..10317bd
--- /dev/null
+++ b/lexer.go
@@ -0,0 +1,141 @@
+package unilex
+
+import (
+ "fmt"
+ "strings"
+ "unicode/utf8"
+)
+
+// EOF is returned by Next and Peek when the end of the input is reached.
+const EOF rune = -1
+
+// Lexer holds current scanner state.
+type Lexer struct {
+ Input string // Input string.
+ Start int // Start position of current lexem.
+ Pos int // Pos at input string.
+ Output chan Lexem // Lexems channel.
+ width int // Width of last rune.
+}
+
+// New returns a new scanner for the input string.
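+//
+// Typical usage is to run the lexer in its own goroutine and read lexems from
+// Output (a sketch; myStateFunc stands for any user-defined StateFunc):
+//
+//  l := New("some input")
+//  go l.Run(myStateFunc)
+//  for lx := range l.Output {
+//      // handle lx
+//  }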
+func New(input string) *Lexer {
+ return &Lexer{
+ Input: input,
+ Start: 0,
+ Pos: 0,
+ Output: make(chan Lexem, 2),
+ width: 0,
+ }
+}
+
+// Run lexing.
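+// Run is normally started in its own goroutine (as in the example's main.go)
+// so the caller can read from Output while scanning is still in progress;
+// Output is closed once the final state function returns nil.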
+func (l *Lexer) Run(init StateFunc) {
+ for state := init; state != nil; {
+ state = state(l)
+ }
+ close(l.Output)
+}
+
+// Emit current lexem to output.
+func (l *Lexer) Emit(typ LexType) {
+ l.Output <- Lexem{
+ Type: typ,
+ Value: l.Input[l.Start:l.Pos],
+ Start: l.Start,
+ End: l.Pos,
+ }
+ l.Start = l.Pos
+}
+
+// Errorf produces error lexem and stops scanning.
+func (l *Lexer) Errorf(format string, args ...interface{}) StateFunc {
+ l.Output <- Lexem{
+ Type: LError,
+ Value: fmt.Sprintf(format, args...),
+ Start: l.Start,
+ End: l.Pos,
+ }
+ return nil
+}
+
+// Next rune from input.
+func (l *Lexer) Next() (r rune) {
+ if int(l.Pos) >= len(l.Input) {
+ l.width = 0
+ return EOF
+ }
+ r, l.width = utf8.DecodeRuneInString(l.Input[l.Pos:])
+ l.Pos += l.width
+ return r
+}
+
+// Back moves the position back to the previous rune. It can only undo the most recent call to Next.
+func (l *Lexer) Back() {
+ l.Pos -= l.width
+}
+
+// Ignore drops the previously buffered text.
+func (l *Lexer) Ignore() {
+ l.Start = l.Pos
+ l.width = 0
+}
+
+// Peek rune at current position without moving position.
+func (l *Lexer) Peek() (r rune) {
+ r = l.Next()
+ l.Back()
+ return r
+}
+
+// Accept consumes the next rune if it is in the valid string and reports whether it did so; otherwise the position is left unchanged.
+func (l *Lexer) Accept(valid string) bool {
+ if strings.ContainsRune(valid, l.Next()) {
+ return true
+ }
+ l.Back()
+ return false
+}
+
+// AcceptString returns true if the given string starts at the current position and consumes it.
+func (l *Lexer) AcceptString(s string, caseInsensitive bool) bool {
+ input := l.Input[l.Pos:] // compare against the remaining input, not the whole string
+ if caseInsensitive {
+ input = strings.ToLower(input)
+ s = strings.ToLower(s)
+ }
+ if strings.HasPrefix(input, s) {
+ l.width = 0
+ l.Pos += len(s)
+ return true
+ }
+ return false
+}
+
+// AcceptAnyOf substrings. Returns true if any of the substrings was found at the current position.
+func (l *Lexer) AcceptAnyOf(s []string, caseInsensitive bool) bool {
+ for _, substring := range s {
+ if l.AcceptString(substring, caseInsensitive) {
+ return true
+ }
+ }
+ return false
+}
+
+// AcceptWhile consumes runes from the input while they are in the valid string.
+func (l *Lexer) AcceptWhile(valid string) {
+ for l.Accept(valid) {
+ }
+}
+
+// AcceptWhileNot consumes runes from the input while they are NOT in the invalid string.
+// It stops at EOF so it cannot loop forever on unterminated input.
+func (l *Lexer) AcceptWhileNot(invalid string) {
+ for r := l.Next(); r != EOF && !strings.ContainsRune(invalid, r); r = l.Next() {
+ }
+ l.Back()
+}
+
+// AtStart returns true if the current lexem is empty, i.e. nothing has been consumed since the last Emit or Ignore.
+func (l *Lexer) AtStart() bool {
+ return l.Pos == l.Start
+}
diff --git a/scanners.go b/scanners.go
new file mode 100644
index 0000000..b5811af
--- /dev/null
+++ b/scanners.go
@@ -0,0 +1,25 @@
+package unilex
+
+// ScanNumber is the simplest scanner: it accepts decimal integers and floats.
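+// It does not handle a leading sign or exponent notation.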
+func ScanNumber(l *Lexer) bool {
+ l.AcceptWhile("0123456789")
+ if l.AtStart() {
+ // not found any digit
+ return false
+ }
+ l.Accept(".")
+ l.AcceptWhile("0123456789")
+ return !l.AtStart()
+}
+
+// ScanAlphaNum returns true if the next input token is an alphanumeric sequence
+// that does not start with a digit and contains no spaces or special characters.
+func ScanAlphaNum(l *Lexer) bool {
+ digits := "0123456789"
+ alpha := "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM"
+ if !l.Accept(alpha) {
+ return false
+ }
+ l.AcceptWhile(alpha + digits)
+ return true
+}
diff --git a/scanners_test.go b/scanners_test.go
new file mode 100644
index 0000000..cab697e
--- /dev/null
+++ b/scanners_test.go
@@ -0,0 +1,53 @@
+package unilex
+
+import "testing"
+
+func TestScanNumber(t *testing.T) {
+ testCases := []struct {
+ Input string
+ Expected bool
+ Pos int
+ }{
+ {"asd", false, 0},
+ {"asd123", false, 0},
+ {"123", true, 3},
+ {"123asd", true, 3},
+ {"123.321", true, 7},
+ }
+ for _, tc := range testCases {
+ l := New(tc.Input)
+ actual := ScanNumber(l)
+ if actual != tc.Expected {
+ t.Errorf("Input: %s expected scan result: %v actual: %v", tc.Input, tc.Expected, actual)
+ }
+ if l.Pos != tc.Pos {
+ t.Errorf("Input: %s expected scan position: %d actual: %d", tc.Input, tc.Pos, l.Pos)
+ }
+ }
+}
+
+func TestScanAlphaNum(t *testing.T) {
+ testCases := []struct {
+ Input string
+ Expected bool
+ Pos int
+ }{
+ {"asd", true, 3},
+ {"asd123", true, 6},
+ {"123", false, 0},
+ {"123asd", false, 0},
+ {"123.321", false, 0},
+ {"asd!dsa", true, 3},
+ {"asd dsa", true, 3},
+ }
+ for _, tc := range testCases {
+ l := New(tc.Input)
+ actual := ScanAlphaNum(l)
+ if actual != tc.Expected {
+ t.Errorf("Input: %s expected scan result: %v actual: %v", tc.Input, tc.Expected, actual)
+ }
+ if l.Pos != tc.Pos {
+ t.Errorf("Input: %s expected scan position: %d actual: %d", tc.Input, tc.Pos, l.Pos)
+ }
+ }
+}
diff --git a/statefunc.go b/statefunc.go
new file mode 100644
index 0000000..5980ecc
--- /dev/null
+++ b/statefunc.go
@@ -0,0 +1,4 @@
+package unilex
+
+// StateFunc represents a function that scans lexems and returns the next state function, or nil if lexing is completed.
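+//
+// A StateFunc typically consumes some input, emits lexems via (*Lexer).Emit and
+// returns the next state to run, or nil when lexing is finished (lexExpression
+// in example/math_expression/main.go is a complete example of a state).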
+type StateFunc func(*Lexer) StateFunc