commit ccc1981707f5fecf94a8f0ca982dcef26ed1d075
Author: ChronosX88
Date:   Sun Mar 22 18:24:45 2020 +0400

    first commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7447f89
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/bin
\ No newline at end of file
diff --git a/example/main.go b/example/main.go
new file mode 100644
index 0000000..1cd0cdc
--- /dev/null
+++ b/example/main.go
@@ -0,0 +1,48 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"os"
+
+	"github.com/ChronosX88/vala-parser/scanner"
+)
+
+// check panics when e is non-nil; the example treats every error as fatal.
+func check(e error) {
+	if e != nil {
+		panic(e)
+	}
+}
+
+// main tokenizes the file named by the -path flag and prints every
+// non-whitespace token, one per line, until the scanner reports EOF.
+func main() {
+	var filePath string
+	flag.StringVar(&filePath, "path", "", "Path to the file which need to read")
+	flag.Parse()
+	if filePath == "" {
+		panic(fmt.Errorf("file path isn't specified"))
+	}
+	f, err := os.Open(filePath)
+	check(err)
+	// Release the descriptor on every exit path, including panics from check.
+	defer f.Close()
+	fileInfo, err := f.Stat()
+	check(err)
+	if fileInfo.IsDir() {
+		panic(fmt.Errorf("file is a dir, not a file"))
+	}
+	s := scanner.NewScanner(f)
+	for {
+		tok := s.Scan()
+		switch tok.Kind {
+		case scanner.EOF:
+			// Return instead of calling os.Exit so the deferred Close runs.
+			return
+		case scanner.Whitespace:
+			continue
+		}
+		fmt.Println(tok)
+	}
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..000763e
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/ChronosX88/vala-parser
+
+go 1.14
diff --git a/parser/parser.go b/parser/parser.go
new file mode 100644
index 0000000..0bfe2c2
--- /dev/null
+++ b/parser/parser.go
@@ -0,0 +1 @@
+package parser
diff --git a/scanner/scanner.go b/scanner/scanner.go
new file mode 100644
index 0000000..a9e6845
--- /dev/null
+++ b/scanner/scanner.go
@@ -0,0 +1,264 @@
+// Package scanner splits Vala source text into lexical tokens.
+package scanner
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"os"
+
+	"github.com/ChronosX88/vala-parser/utils"
+)
+
+// eof is the sentinel returned by read once the input is exhausted. It is
+// negative so that it can never be mistaken for a rune decoded from the
+// input; a NUL character in the source is an ordinary illegal character.
+const eof = rune(-1)
+
+// keywords maps the spelling of every reserved word to its token kind.
+var keywords = literalTable(
+	Using, Namespace, PublicModifier, PrivateModifier, ProtectedModifier,
+	Class, Var, Return, Null, If, False, True, New,
+)
+
+// symbols maps the spelling of every known punctuation or operator token to
+// its token kind. No spelling is longer than two characters, which
+// scanSpecSymbol relies on.
+var symbols = literalTable(
+	Add, Minus, Mult, Divide, Percent, Colon, Comma, Semicolon, Dot,
+	OpenBrace, CloseBrace, OpenParens, CloseParens, OpenBracket, CloseBracket,
+	StringLiteral, OpenMultilineComments, CloseMultilineComments,
+	OpenSingleComments, LambdaArrow, Equal, Assign,
+)
+
+// literalTable indexes the given kinds by their source spelling.
+func literalTable(kinds ...TokenKind) map[string]TokenKind {
+	table := make(map[string]TokenKind, len(kinds))
+	for _, kind := range kinds {
+		table[kind.String()] = kind
+	}
+	return table
+}
+
+// Scanner produces tokens from Vala source text held entirely in memory.
+type Scanner struct {
+	buf *bytes.Reader
+}
+
+// NewScanner reads reader to the end and returns a Scanner over that data.
+//
+// The constructor has no error result, so a failed read is not reported: the
+// Scanner then covers only the bytes received before the failure.
+func NewScanner(reader io.Reader) *Scanner {
+	buffer := new(bytes.Buffer)
+	// Deliberately best effort, as described in the doc comment.
+	_, _ = buffer.ReadFrom(reader)
+	return &Scanner{
+		buf: bytes.NewReader(buffer.Bytes()),
+	}
+}
+
+// Scan consumes and returns the next token. Once the input is exhausted it
+// returns a token of kind EOF, on this and on every later call.
+func (s *Scanner) Scan() Token {
+	// Peek at the next rune to pick a sub-scanner, then hand the rune back so
+	// that the sub-scanner sees the token from its first character.
+	ch := s.read()
+	if ch == eof {
+		return Token{Kind: EOF}
+	}
+
+	switch {
+	case isWhitespace(ch):
+		s.unread()
+		return s.scanWhitespace()
+	case isLetter(ch) || ch == '_':
+		s.unread()
+		return s.scanIdent()
+	case isSpecialSymbol(ch):
+		s.unread()
+		return s.scanSpecSymbol()
+	case isDigit(ch):
+		s.unread()
+		return s.scanNumber()
+	}
+
+	// Anything else (control characters, non-ASCII text) is consumed as a
+	// one-rune illegal token, so the caller always makes progress.
+	return Token{Kind: Illegal, Literal: string(ch)}
+}
+
+// scanWhitespace consumes a run of contiguous whitespace.
+func (s *Scanner) scanWhitespace() Token {
+	var buf bytes.Buffer
+	s.readWhile(&buf, isWhitespace)
+	return Token{Kind: Whitespace, Literal: buf.String()}
+}
+
+// scanIdent consumes a run of letters, digits and underscores. The result is
+// a keyword token when the spelling is reserved and an Identifier otherwise.
+func (s *Scanner) scanIdent() Token {
+	var buf bytes.Buffer
+	s.readWhile(&buf, isIdentPart)
+
+	tok := Token{Kind: Identifier, Literal: buf.String()}
+	if kind, ok := keywords[tok.Literal]; ok {
+		tok.Kind = kind
+	}
+	return tok
+}
+
+// scanSpecSymbol consumes one punctuation or operator token. It prefers a
+// two-character symbol ("==", "=>", "//", "/*", "*/") and otherwise falls back
+// to the first character alone, which is Illegal when it has no token kind.
+func (s *Scanner) scanSpecSymbol() Token {
+	var buf bytes.Buffer
+	buf.WriteRune(s.read())
+
+	// Take at most one more symbol: reading the whole run would glue
+	// unrelated neighbours together and hide operators such as the "==" in "==(".
+	if ch := s.read(); ch != eof {
+		if isSpecialSymbol(ch) {
+			buf.WriteRune(ch)
+		} else {
+			s.unread()
+		}
+	}
+
+	tok := Token{Kind: Illegal, Literal: buf.String()}
+	matchSpecSymbol(&tok)
+
+	if tok.Kind == Illegal && len(tok.Literal) > 1 {
+		// The pair is not an operator: give the second symbol back (it was the
+		// last rune read) and classify the first one on its own.
+		s.unread()
+		tok.Literal = string(utils.RuneAt(tok.Literal, 0))
+		matchSpecSymbol(&tok)
+	}
+
+	return tok
+}
+
+// scanNumber consumes an integer or real literal. Accepted forms are
+// hexadecimal integers ("0x1F", either letter case), decimal integers ("42")
+// and decimal reals with digits on both sides of a single dot ("3.14").
+// Exponents and type suffixes are not recognized.
+func (s *Scanner) scanNumber() Token {
+	var buf bytes.Buffer
+	first := s.read()
+	buf.WriteRune(first)
+
+	if first == '0' {
+		// A hex prefix only counts when a hex digit follows; otherwise the
+		// "x" is left for the next token.
+		mark := s.pos()
+		if prefix := s.read(); prefix == 'x' || prefix == 'X' {
+			if digit := s.read(); isXDigit(digit) {
+				buf.WriteRune(prefix)
+				buf.WriteRune(digit)
+				s.readWhile(&buf, isXDigit)
+				return Token{Kind: IntegerLiteral, Literal: buf.String()}
+			}
+		}
+		s.rewind(mark)
+	}
+
+	s.readWhile(&buf, isDigit)
+
+	// A dot belongs to the number only when a digit follows it, so that
+	// member access such as "1.to_string()" still yields a Dot token.
+	mark := s.pos()
+	if dot := s.read(); dot == '.' {
+		if digit := s.read(); isDigit(digit) {
+			buf.WriteRune(dot)
+			buf.WriteRune(digit)
+			s.readWhile(&buf, isDigit)
+			return Token{Kind: RealLiteral, Literal: buf.String()}
+		}
+	}
+	s.rewind(mark)
+
+	return Token{Kind: IntegerLiteral, Literal: buf.String()}
+}
+
+// matchSpecSymbol sets tok.Kind to the kind spelled by tok.Literal. The token
+// is left unchanged when the literal is not a known symbol.
+func matchSpecSymbol(tok *Token) {
+	if kind, ok := symbols[tok.Literal]; ok {
+		tok.Kind = kind
+	}
+}
+
+// read returns the next rune of the input, or eof when none is left.
+func (s *Scanner) read() rune {
+	ch, _, err := s.buf.ReadRune()
+	if err != nil {
+		return eof
+	}
+	return ch
+}
+
+// unread puts the most recently read rune back. It is only valid directly
+// after a read that did not return eof.
+func (s *Scanner) unread() {
+	if err := s.buf.UnreadRune(); err != nil {
+		// Only a scanner bug gets here. Report on stderr so that token output
+		// written to stdout by the caller stays clean.
+		fmt.Fprintln(os.Stderr, "Error when unread: "+err.Error())
+	}
+}
+
+// pos returns the byte offset of the next unread rune.
+func (s *Scanner) pos() int64 {
+	return s.buf.Size() - int64(s.buf.Len())
+}
+
+// rewind moves the read position back to an offset obtained from pos. Unlike
+// unread it can undo several reads, but unread must not follow it directly.
+func (s *Scanner) rewind(offset int64) {
+	// Seek only fails for a negative offset, which pos never returns.
+	_, _ = s.buf.Seek(offset, io.SeekStart)
+}
+
+// readWhile appends runes to buf for as long as accept approves them and
+// leaves the first rejected rune unread.
+func (s *Scanner) readWhile(buf *bytes.Buffer, accept func(rune) bool) {
+	for {
+		ch := s.read()
+		if ch == eof {
+			return
+		}
+		if !accept(ch) {
+			s.unread()
+			return
+		}
+		buf.WriteRune(ch)
+	}
+}
+
+// isWhitespace reports whether ch is a space, tab, line feed or carriage return.
+func isWhitespace(ch rune) bool { return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' }
+
+// isLetter reports whether ch is an ASCII letter.
+func isLetter(ch rune) bool { return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') }
+
+// isDigit reports whether ch is an ASCII decimal digit.
+func isDigit(ch rune) bool { return ch >= '0' && ch <= '9' }
+
+// isIdentPart reports whether ch may appear inside an identifier.
+func isIdentPart(ch rune) bool { return isLetter(ch) || isDigit(ch) || ch == '_' }
+
+// isXDigit reports whether ch is a hexadecimal digit of either letter case.
+func isXDigit(ch rune) bool {
+	return isDigit(ch) || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f')
+}
+
+// isSpecialSymbol reports whether ch is an ASCII punctuation character that
+// can start a symbol token. '_' is excluded because it belongs to identifiers,
+// and '@' lies outside the tested ranges.
+func isSpecialSymbol(ch rune) bool {
+	if ch == '_' {
+		return false
+	}
+	return (ch >= '!' && ch <= '/') || (ch >= ':' && ch <= '?') || (ch >= '[' && ch <= '`') || (ch >= '{' && ch <= '~')
+}
diff --git a/scanner/token.go b/scanner/token.go
new file mode 100644
index 0000000..46d0760
--- /dev/null
+++ b/scanner/token.go
@@ -0,0 +1,164 @@
+package scanner
+
+// TokenKind identifies the lexical class of a Token.
+type TokenKind int
+
+// Token kinds. The trailing comment gives the source spelling or a description.
+const (
+	Using                  TokenKind = iota // using
+	Class                                   // class
+	Identifier                              // any identifier
+	Colon                                   // :
+	Comma                                   // ,
+	Semicolon                               // ;
+	OpenBrace                               // {
+	CloseBrace                              // }
+	StringLiteral                           // "
+	Namespace                               // namespace
+	PublicModifier                          // public
+	PrivateModifier                         // private
+	Add                                     // +
+	Minus                                   // -
+	Mult                                    // *
+	Divide                                  // /
+	Var                                     // var
+	Whitespace                              // run of whitespace
+	OpenParens                              // (
+	CloseParens                             // )
+	Percent                                 // %
+	OpenMultilineComments                   // /*
+	CloseMultilineComments                  // */
+	Dot                                     // .
+	Return                                  // return
+	Null                                    // null
+	LambdaArrow                             // =>
+	If                                      // if
+	Equal                                   // ==
+	OpenSingleComments                      // //
+	Assign                                  // =
+	OpenBracket                             // [
+	CloseBracket                            // ]
+	ProtectedModifier                       // protected
+	False                                   // false
+	True                                    // true
+	New                                     // new
+	IntegerLiteral                          // any int number
+	RealLiteral                             // any real number
+	EOF                                     // end of file
+
+	// Illegal marks input that forms no known token. The type is spelled out
+	// because a bare "= -1" would declare an untyped constant instead.
+	Illegal TokenKind = -1
+)
+
+// kindLiterals holds the source spelling of every kind that has a fixed one.
+var kindLiterals = map[TokenKind]string{
+	Using:                  "using",
+	Class:                  "class",
+	Colon:                  ":",
+	Comma:                  ",",
+	Semicolon:              ";",
+	OpenBrace:              "{",
+	CloseBrace:             "}",
+	StringLiteral:          "\"",
+	Namespace:              "namespace",
+	PublicModifier:         "public",
+	PrivateModifier:        "private",
+	Add:                    "+",
+	Minus:                  "-",
+	Mult:                   "*",
+	Divide:                 "/",
+	Var:                    "var",
+	OpenParens:             "(",
+	CloseParens:            ")",
+	Percent:                "%",
+	OpenMultilineComments:  "/*",
+	CloseMultilineComments: "*/",
+	Dot:                    ".",
+	Return:                 "return",
+	Null:                   "null",
+	LambdaArrow:            "=>",
+	If:                     "if",
+	Equal:                  "==",
+	OpenSingleComments:     "//",
+	Assign:                 "=",
+	OpenBracket:            "[",
+	CloseBracket:           "]",
+	ProtectedModifier:      "protected",
+	False:                  "false",
+	True:                   "true",
+	New:                    "new",
+}
+
+// kindNames holds the upper-case display name of every kind.
+var kindNames = map[TokenKind]string{
+	Using:                  "USING",
+	Class:                  "CLASS",
+	Identifier:             "IDENTIFIER",
+	Colon:                  "COLON",
+	Comma:                  "COMMA",
+	Semicolon:              "SEMICOLON",
+	OpenBrace:              "OPEN_BRACE",
+	CloseBrace:             "CLOSE_BRACE",
+	StringLiteral:          "STRING_LITERAL",
+	Namespace:              "NAMESPACE",
+	PublicModifier:         "PUBLIC_MODIFIER",
+	PrivateModifier:        "PRIVATE_MODIFIER",
+	Add:                    "OP_SUM",
+	Minus:                  "OP_MINUS",
+	Mult:                   "OP_MULT",
+	Divide:                 "OP_DIVIDE",
+	Var:                    "VAR",
+	Whitespace:             "WHITESPACE",
+	OpenParens:             "OPEN_PARENS",
+	CloseParens:            "CLOSE_PARENS",
+	Percent:                "PERCENT",
+	OpenMultilineComments:  "OPEN_MULTILINE_COMMENTS",
+	CloseMultilineComments: "CLOSE_MULTILINE_COMMENTS",
+	Dot:                    "DOT",
+	Return:                 "RETURN",
+	Null:                   "NULL",
+	LambdaArrow:            "LAMBDA_ARROW",
+	If:                     "IF_STMT",
+	Equal:                  "EQUAL",
+	OpenSingleComments:     "OPEN_SINGLE_COMMENTS",
+	Assign:                 "ASSIGN_OP",
+	OpenBracket:            "OPEN_BRACKET",
+	CloseBracket:           "CLOSE_BRACKET",
+	ProtectedModifier:      "PROTECTED_MODIFIER",
+	False:                  "FALSE",
+	True:                   "TRUE",
+	New:                    "NEW",
+	IntegerLiteral:         "INTEGER_LITERAL",
+	RealLiteral:            "REAL_LITERAL",
+	EOF:                    "EOF",
+	Illegal:                "ILLEGAL",
+}
+
+// String returns the source spelling of a keyword, operator or punctuation
+// kind, and "" for kinds without a fixed spelling (Identifier, Whitespace,
+// IntegerLiteral, RealLiteral, EOF and Illegal).
+func (tok TokenKind) String() string {
+	return kindLiterals[tok]
+}
+
+// PrettyString returns an upper-case name for the kind, such as
+// "OPEN_BRACE", or "" for a value that is not a declared kind.
+func (tok TokenKind) PrettyString() string {
+	return kindNames[tok]
+}
+
+// Token is a single lexical element: its kind and the exact text it covers.
+type Token struct {
+	Kind    TokenKind
+	Literal string
+}
+
+// String formats the token for debugging output. The literal is omitted for
+// whitespace, whose text carries no information worth printing.
+func (t Token) String() string {
+	if t.Kind == Whitespace {
+		return "Token{kind: WHITESPACE}"
+	}
+	return "Token{kind: " + t.Kind.PrettyString() + ", literal: " + t.Literal + "}"
+}
diff --git a/utils/utils.go b/utils/utils.go
new file mode 100644
index 0000000..4b33471
--- /dev/null
+++ b/utils/utils.go
@@ -0,0 +1,18 @@
+// Package utils holds small helpers shared by the other packages.
+package utils
+
+// RuneAt returns the rune at rune index idx of s, or 0 when idx is negative or
+// past the last rune.
+func RuneAt(s string, idx int) rune {
+	if idx < 0 {
+		return 0
+	}
+	// Ranging over a string decodes one rune per step without copying it.
+	for _, r := range s {
+		if idx == 0 {
+			return r
+		}
+		idx--
+	}
+	return 0
+}