1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // A scanner for Go source text. Takes a []byte as source which can
6 // then be tokenized through repeated calls to the Scan function.
7 // For a sample use of a scanner, see the implementation of Tokenize.
20 // A Scanner holds the scanner's internal state while processing
21 // a given text. It can be allocated as part of another data
22 // structure but must be initialized via Init before use. For
23 // a sample use, see the implementation of Tokenize.
// NOTE(review): the `type Scanner struct {` declaration and the
// immutable-state fields (e.g. src) are on lines elided from this
// excerpt.
28 err ErrorHandler // error reporting; or nil
29 mode uint // scanning mode (set of flags: ScanComments, AllowIllegalChars, InsertSemis)
32 filename string // current filename; may change via //line filename:line comment
33 line int // current line
34 column int // current column
36 ch int // current character; < 0 means end-of-file
37 offset int // character offset
38 rdOffset int // reading offset (position after current character)
39 insertSemi bool // insert a semicolon before next newline
41 // public state - ok to modify
42 ErrorCount int // number of errors encountered
46 // Read the next Unicode char into S.ch.
47 // S.ch < 0 means end-of-file.
49 func (S *Scanner) next() {
51 if S.rdOffset < len(S.src) {
// Start with the single byte at rdOffset; non-ASCII bytes are
// re-decoded as full UTF-8 runes below.
57 r, w := int(S.src[S.rdOffset]), 1
// A NUL byte is rejected as an illegal source character.
60 S.error("illegal character NUL")
// Non-ASCII: decode a complete rune from the remaining source.
63 r, w = utf8.DecodeRune(S.src[S.rdOffset:])
// RuneError with width 1 signals an invalid UTF-8 encoding
// (a genuine U+FFFD in the source decodes with width 3).
64 if r == utf8.RuneError && w == 1 {
65 S.error("illegal UTF-8 encoding")
80 // The mode parameter to the Init function is a set of flags (or 0).
81 // They control scanner behavior.
// NOTE(review): the `const (` line is elided from this excerpt.
84 ScanComments = 1 << iota // return comments as COMMENT tokens
85 AllowIllegalChars // do not report an error for illegal chars
86 InsertSemis // automatically insert semicolons
90 // Init prepares the scanner S to tokenize the text src. Calls to Scan
91 // will use the error handler err if they encounter a syntax error and
92 // err is not nil. Also, for each error encountered, the Scanner field
93 // ErrorCount is incremented by one. The filename parameter is used as
94 // filename in the token.Position returned by Scan for each token. The
95 // mode parameter determines how comments and illegal characters are
// NOTE(review): the rest of this doc sentence (original lines 96-97)
// is elided from this excerpt.
98 func (S *Scanner) Init(filename string, src []byte, err ErrorHandler, mode uint) {
99 // Explicitly initialize all fields since a scanner may be reused.
104 S.filename = filename
// (remaining field initialization — src, err, mode, position and
// read state — is on lines elided from this excerpt)
// charString returns a printable representation of ch for use in
// error messages: the character quoted, followed by its hex code
// point. Special-case formatting (escapes, etc.) is on elided lines.
118 func charString(ch int) string {
144 return "'" + s + "' (U+" + strconv.Itob(ch, 16) + ")"
// error reports msg at the scanner's current position
// (filename/offset/line/column).
148 func (S *Scanner) error(msg string) {
149 S.errorAt(token.Position{S.filename, S.offset, S.line, S.column}, msg)
// errorAt reports msg at the given position via the installed error
// handler. NOTE(review): S.err may be nil (see the field comment);
// the guard — and presumably the ErrorCount increment — are on
// elided lines. Confirm before relying on this call being safe.
153 func (S *Scanner) errorAt(pos token.Position, msg string) {
155 S.err.Error(pos, msg)
// prefix is the marker that introduces a //line directive comment.
161 var prefix = []byte("//line ")
// interpretLineComment updates S.filename and S.line when text is a
// valid "//line filename:line" comment, so that positions reported
// for subsequent tokens reflect the directive. Invalid directives
// are silently ignored.
163 func (S *Scanner) interpretLineComment(text []byte) {
164 if bytes.HasPrefix(text, prefix) {
165 // get filename and line number, if any
166 if i := bytes.Index(text, []byte{':'}); i > 0 {
167 if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
168 // valid //line filename:line comment;
169 // update scanner position
170 S.filename = string(text[len(prefix):i])
171 S.line = line - 1 // -1 since the '\n' has not been consumed yet
// scanComment scans a //-style or /*-style comment. The initial '/'
// has already been consumed; S.ch is the second '/' or the '*'.
178 func (S *Scanner) scanComment() {
179 // initial '/' already consumed; S.ch == '/' || S.ch == '*'
180 offs := S.offset - 1 // position of initial '/'
182 pos := token.Position{S.filename, S.offset - 1, S.line, S.column - 1}
// //-style comment: consume up to (not including) the newline or EOF.
187 for S.ch != '\n' && S.ch >= 0 {
191 // comment starts at the beginning of the current line
192 S.interpretLineComment(S.src[offs:S.offset])
// /*-style comment: scan until the closing "*/" sequence.
202 if ch == '*' && S.ch == '/' {
// Reached EOF without finding "*/".
208 S.errorAt(pos, "comment not terminated")
// findLineEnd reads ahead to decide whether the comment sequence
// starting at the already-consumed '/' is followed by a newline or
// EOF (in which case the caller should insert a semicolon). The
// deferred function restores the scanner's position state before
// returning, so the look-ahead is invisible to the caller.
212 func (S *Scanner) findLineEnd() bool {
213 // initial '/' already consumed
215 defer func(line, col, offs int) {
216 // reset scanner state to where it was upon calling findLineEnd
217 // (we don't scan //line comments and ignore errors thus
218 // S.filename and S.ErrorCount don't change)
// Re-read from the saved offset: next() below re-consumes the '/'.
223 S.rdOffset = offs + 1
224 S.next() // consume initial '/' again
225 }(S.line, S.column-1, S.offset-1)
227 // read ahead until a newline, EOF, or non-comment token is found
228 for S.ch == '/' || S.ch == '*' {
230 //-style comment always contains a newline
233 /*-style comment: look for newline */
241 if ch == '*' && S.ch == '/' {
// Comment closed without a newline: skip trailing whitespace and
// check whether a newline or EOF follows the comment sequence.
246 S.skipWhitespace() // S.insertSemi is set
247 if S.ch < 0 || S.ch == '\n' {
254 S.next() // consume '/'
// isLetter reports whether ch may start or continue an identifier:
// an ASCII letter, '_', or a non-ASCII Unicode letter.
261 func isLetter(ch int) bool {
262 return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
// isDigit reports whether ch is an ASCII digit or a non-ASCII
// Unicode digit.
266 func isDigit(ch int) bool {
267 return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
// scanIdentifier consumes an identifier and returns its token:
// a keyword token if the text matches one, token.IDENT otherwise
// (via token.Lookup).
271 func (S *Scanner) scanIdentifier() token.Token {
273 for isLetter(S.ch) || isDigit(S.ch) {
276 return token.Lookup(S.src[offs:S.offset])
// digitVal returns the numeric value of the (possibly hex) digit ch,
// or 16 if ch is not a legal digit — 16 compares larger than any
// valid base, so callers can test `digitVal(ch) < base`.
280 func digitVal(ch int) int {
282 case '0' <= ch && ch <= '9':
284 case 'a' <= ch && ch <= 'f':
286 case 'A' <= ch && ch <= 'F':
289 return 16 // larger than any legal digit val
// scanMantissa consumes a maximal run of digits legal in the given
// base (digitVal(ch) < base).
293 func (S *Scanner) scanMantissa(base int) {
294 for digitVal(S.ch) < base {
// scanNumber scans an integer, float, or imaginary literal and
// returns its token type. seenDecimalPoint is true when the caller
// already consumed a leading '.' (S.ch is the first digit after it).
300 func (S *Scanner) scanNumber(seenDecimalPoint bool) token.Token {
301 // digitVal(S.ch) < 10
304 if seenDecimalPoint {
// pos marks the start of the literal for error reporting below.
312 pos := token.Position{S.filename, S.offset, S.line, S.column}
// '0x'/'0X' prefix: hexadecimal int.
314 if S.ch == 'x' || S.ch == 'X' {
319 // octal int or float
320 seenDecimalDigit := false
// 8 and 9 are not octal digits; remember we saw one in case the
// literal turns out to be an int rather than a float.
322 if S.ch == '8' || S.ch == '9' {
323 // illegal octal int or float
324 seenDecimalDigit = true
// A following '.', exponent, or 'i' makes this a float/imaginary
// literal, where 8 and 9 are fine.
327 if S.ch == '.' || S.ch == 'e' || S.ch == 'E' || S.ch == 'i' {
// Octal int containing 8 or 9: report it.
331 if seenDecimalDigit {
332 S.errorAt(pos, "illegal octal number")
338 // decimal int or float
// Optional exponent part with optional sign.
349 if S.ch == 'e' || S.ch == 'E' {
352 if S.ch == '-' || S.ch == '+' {
// scanEscape scans an escape sequence after the initial '\' has been
// consumed. quote is the enclosing quote character ('\'' or '"'),
// which is also a legal simple escape. Errors are reported but
// scanning continues so the caller can resynchronize.
368 func (S *Scanner) scanEscape(quote int) {
369 pos := token.Position{S.filename, S.offset, S.line, S.column}
// i = number of digits expected, in the given base, with max the
// largest legal resulting value.
371 var i, base, max uint32
// simple single-character escapes: \a \b \f \n \r \t \v \\ and the quote
373 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
376 case '0', '1', '2', '3', '4', '5', '6', '7':
377 i, base, max = 3, 8, 255 // \ooo: three octal digits, byte value
// \xhh: two hex digits, byte value
380 i, base, max = 2, 16, 255
// \uhhhh: four hex digits, Unicode code point
383 i, base, max = 4, 16, unicode.MaxRune
// \Uhhhhhhhh: eight hex digits, Unicode code point
386 i, base, max = 8, 16, unicode.MaxRune
388 S.next() // always make progress
389 S.errorAt(pos, "unknown escape sequence")
// Accumulate the digit value; stop early at the closing quote or EOF.
394 for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
395 d := uint32(digitVal(S.ch))
397 S.error("illegal character in escape sequence")
403 // in case of an error, consume remaining chars
404 for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
// Reject values beyond max and surrogate halves (U+D800..U+DFFF),
// which are not valid Unicode code points.
407 if x > max || 0xd800 <= x && x < 0xe000 {
408 S.errorAt(pos, "escape sequence is invalid Unicode code point")
// scanChar scans a character literal; the opening '\'' has already
// been consumed.
413 func (S *Scanner) scanChar() {
414 // '\'' opening already consumed
415 pos := token.Position{S.filename, S.offset - 1, S.line, S.column - 1}
// A raw newline or EOF inside the literal is an error.
422 if ch == '\n' || ch < 0 {
423 S.errorAt(pos, "character literal not terminated")
// NOTE(review): the condition guarding this report is elided;
// presumably the literal did not contain exactly one character —
// confirm against the full source.
435 S.errorAt(pos, "illegal character literal")
// scanString scans an interpreted string literal; the opening '"'
// has already been consumed.
440 func (S *Scanner) scanString() {
441 // '"' opening already consumed
442 pos := token.Position{S.filename, S.offset - 1, S.line, S.column - 1}
// A raw newline or EOF inside the literal is an error.
447 if ch == '\n' || ch < 0 {
448 S.errorAt(pos, "string not terminated")
// scanRawString scans a raw string literal; the opening '`' has
// already been consumed. NOTE(review): unlike scanString, no newline
// check appears here — presumably only EOF triggers the error below
// (raw strings may span lines) — confirm against the full source.
460 func (S *Scanner) scanRawString() {
461 // '`' opening already consumed
462 pos := token.Position{S.filename, S.offset - 1, S.line, S.column - 1}
468 S.errorAt(pos, "string not terminated")
// skipWhitespace consumes spaces, tabs, carriage returns, and —
// only when no semicolon is pending (insertSemi is false) — newlines.
// A pending semicolon makes the '\n' significant, so the loop stops
// there and lets Scan emit the inserted semicolon.
477 func (S *Scanner) skipWhitespace() {
478 for S.ch == ' ' || S.ch == '\t' || S.ch == '\n' && !S.insertSemi || S.ch == '\r' {
484 // Helper functions for scanning multi-byte tokens such as >> += >>= .
485 // Different routines recognize different length tok_i based on matches
486 // of ch_i. If a token ends in '=', the result is tok1 or tok3
487 // respectively. Otherwise, the result is tok0 if there was no other
488 // matching character, or tok2 if the matching character was ch2.
// switch2: tok0, or tok1 if the next character is '='.
490 func (S *Scanner) switch2(tok0, tok1 token.Token) token.Token {
// switch3: tok0; tok1 if followed by '='; tok2 if followed by ch2.
499 func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 int, tok2 token.Token) token.Token {
// switch4: like switch3, but additionally tok3 if followed by ch2
// and then '=' (e.g. >>=).
512 func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Token) token.Token {
// newline is the literal returned for automatically inserted semicolons.
529 var newline = []byte{'\n'}
531 // Scan scans the next token and returns the token position pos,
532 // the token tok, and the literal text lit corresponding to the
533 // token. The source end is indicated by token.EOF.
535 // If the returned token is token.SEMICOLON, the corresponding
536 // literal value is ";" if the semicolon was present in the source,
537 // and "\n" if the semicolon was inserted because of a newline or at EOF.
540 // For more tolerant parsing, Scan will return a valid token if
541 // possible even if a syntax error was encountered. Thus, even
542 // if the resulting token sequence contains no illegal tokens,
543 // a client may not assume that no error occurred. Instead it
544 // must check the scanner's ErrorCount or the number of calls
545 // of the error handler, if there was one installed.
547 func (S *Scanner) Scan() (pos token.Position, tok token.Token, lit []byte) {
551 // current token start
553 pos, tok = token.Position{S.filename, S.offset, S.line, S.column}, token.ILLEGAL
556 // determine token value
// letter: identifier or keyword
559 tok = S.scanIdentifier()
// NOTE(review): these tokens presumably arm semicolon insertion
// (insertSemi) — the case body is elided; confirm.
561 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
// decimal digit: number literal
564 case digitVal(ch) < 10:
566 tok = S.scanNumber(false)
568 S.next() // always make progress
// EOF with a pending semicolon: emit the inserted semicolon first.
572 S.insertSemi = false // EOF consumed
573 return pos, token.SEMICOLON, newline
577 // we only reach here if S.insertSemi was
578 // set in the first place and exited early
579 // from S.skipWhitespace()
580 S.insertSemi = false // newline consumed
581 return pos, token.SEMICOLON, newline
// ':' or ':='
595 tok = S.switch2(token.COLON, token.DEFINE)
// '.' may start a float (".5"), the ellipsis "...", or be PERIOD.
597 if digitVal(S.ch) < 10 {
599 tok = S.scanNumber(true)
600 } else if S.ch == '.' {
612 tok = token.SEMICOLON
629 tok = S.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
630 if tok == token.INC {
634 tok = S.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
635 if tok == token.DEC {
639 tok = S.switch2(token.MUL, token.MUL_ASSIGN)
// '/' may start a comment rather than QUO/QUO_ASSIGN.
641 if S.ch == '/' || S.ch == '*' {
644 col := S.column - 1 // beginning of comment
// With a pending semicolon, a comment that runs to end of line
// forces the semicolon out first; rewind to the comment start.
645 if S.insertSemi && S.findLineEnd() {
646 // reset position to the beginning of the comment
651 S.rdOffset = offs + 1
652 S.insertSemi = false // newline consumed
653 return pos, token.SEMICOLON, newline
// Comments are skipped unless the ScanComments mode flag is set.
656 if S.mode&ScanComments == 0 {
658 S.insertSemi = false // newline consumed
663 tok = S.switch2(token.QUO, token.QUO_ASSIGN)
666 tok = S.switch2(token.REM, token.REM_ASSIGN)
668 tok = S.switch2(token.XOR, token.XOR_ASSIGN)
674 tok = S.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
677 tok = S.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
679 tok = S.switch2(token.ASSIGN, token.EQL)
681 tok = S.switch2(token.NOT, token.NEQ)
685 tok = S.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
687 tok = S.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
690 tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
// Unrecognized character: report unless AllowIllegalChars is set.
692 if S.mode&AllowIllegalChars == 0 {
693 S.errorAt(pos, "illegal character "+charString(ch))
695 insertSemi = S.insertSemi // preserve insertSemi info
// Semicolon insertion state is only tracked when InsertSemis is on.
699 if S.mode&InsertSemis != 0 {
700 S.insertSemi = insertSemi
702 return pos, tok, S.src[offs:S.offset]
706 // Tokenize calls a function f with the token position, token value, and token
707 // text for each token in the source src. The other parameters have the same
708 // meaning as for the Init function. Tokenize keeps scanning until f returns
709 // false (usually when the token value is token.EOF). The result is the number
710 // of errors encountered.
712 func Tokenize(filename string, src []byte, err ErrorHandler, mode uint, f func(pos token.Position, tok token.Token, lit []byte) bool) int {
// NOTE(review): the declaration of the local scanner s (presumably
// `var s Scanner`) is on an elided line.
714 s.Init(filename, src, err, mode)
716 // action happens in f
// (scan loop and the return of s.ErrorCount are on elided lines)