1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // A scanner for Go source text. Takes a []byte as source which can
6 // then be tokenized through repeated calls to the Scan function.
7 // For a sample use of a scanner, see the implementation of Tokenize.
20 // A Scanner holds the scanner's internal state while processing
21 // a given text. It can be allocated as part of another data
22 // structure but must be initialized via Init before use. For
23 // a sample use, see the implementation of Tokenize.
// NOTE(review): the `type Scanner struct {` declaration and the
// immutable-state fields (e.g. src) are on lines elided from this
// excerpt.
28 err ErrorHandler // error reporting; or nil
29 mode uint // scanning mode (set of flags: ScanComments, AllowIllegalChars, InsertSemis)
32 filename string // current filename; may change via //line filename:line comment
33 line int // current line
34 column int // current column
36 ch int // current character; < 0 means end-of-file
37 offset int // character offset
38 rdOffset int // reading offset (position after current character)
39 insertSemi bool // insert a semicolon before next newline
41 // public state - ok to modify
42 ErrorCount int // number of errors encountered
46 // Read the next Unicode char into S.ch.
47 // S.ch < 0 means end-of-file.
49 func (S *Scanner) next() {
51 if S.rdOffset < len(S.src) {
// Start with the single byte at rdOffset; non-ASCII bytes are
// re-decoded as full UTF-8 runes below.
57 r, w := int(S.src[S.rdOffset]), 1
// A NUL byte is rejected as an illegal source character.
60 S.error("illegal character NUL")
// Non-ASCII: decode a complete rune from the remaining source.
63 r, w = utf8.DecodeRune(S.src[S.rdOffset:])
// RuneError with width 1 signals an invalid UTF-8 encoding
// (a genuine U+FFFD in the source decodes with width 3).
64 if r == utf8.RuneError && w == 1 {
65 S.error("illegal UTF-8 encoding")
80 // The mode parameter to the Init function is a set of flags (or 0).
81 // They control scanner behavior.
// NOTE(review): the `const (` line is elided from this excerpt.
84 ScanComments = 1 << iota // return comments as COMMENT tokens
85 AllowIllegalChars // do not report an error for illegal chars
86 InsertSemis // automatically insert semicolons
90 // Init prepares the scanner S to tokenize the text src. Calls to Scan
91 // will use the error handler err if they encounter a syntax error and
92 // err is not nil. Also, for each error encountered, the Scanner field
93 // ErrorCount is incremented by one. The filename parameter is used as
94 // filename in the token.Position returned by Scan for each token. The
95 // mode parameter determines how comments and illegal characters are
// NOTE(review): the rest of this doc sentence (original lines 96-97)
// is elided from this excerpt.
98 func (S *Scanner) Init(filename string, src []byte, err ErrorHandler, mode uint) {
99 // Explicitly initialize all fields since a scanner may be reused.
104 S.filename = filename
// (remaining field initialization — src, err, mode, position and
// read state — is on lines elided from this excerpt)
// charString returns a printable representation of ch for use in
// error messages: the character quoted, followed by its hex code
// point. Special-case formatting (escapes, etc.) is on elided lines.
118 func charString(ch int) string {
144 return "'" + s + "' (U+" + strconv.Itob(ch, 16) + ")"
// error reports msg at the scanner's current position
// (filename/offset/line/column).
148 func (S *Scanner) error(msg string) {
149 S.errorAt(token.Position{S.filename, S.offset, S.line, S.column}, msg)
// errorAt reports msg at the given position via the installed error
// handler. NOTE(review): S.err may be nil (see the field comment);
// the guard — and presumably the ErrorCount increment — are on
// elided lines. Confirm before relying on this call being safe.
153 func (S *Scanner) errorAt(pos token.Position, msg string) {
155 S.err.Error(pos, msg)
// prefix is the marker that introduces a //line directive comment.
161 var prefix = []byte("//line ")
// interpretLineComment updates S.filename and S.line when text is a
// valid "//line filename:line" comment, so that positions reported
// for subsequent tokens reflect the directive. Invalid directives
// are silently ignored.
163 func (S *Scanner) interpretLineComment(text []byte) {
164 if bytes.HasPrefix(text, prefix) {
165 // get filename and line number, if any
166 if i := bytes.Index(text, []byte{':'}); i > 0 {
167 if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
168 // valid //line filename:line comment;
169 // update scanner position
170 S.filename = string(text[len(prefix):i])
171 S.line = line - 1 // -1 since the '\n' has not been consumed yet
// scanComment scans a //-style or /*-style comment. The initial '/'
// has already been consumed; S.ch is the second '/' or the '*'.
178 func (S *Scanner) scanComment() {
179 // initial '/' already consumed; S.ch == '/' || S.ch == '*'
180 offs := S.offset - 1 // position of initial '/'
182 pos := token.Position{S.filename, S.offset - 1, S.line, S.column - 1}
// //-style comment: consume up to (not including) the newline or EOF.
187 for S.ch != '\n' && S.ch >= 0 {
191 // comment starts at the beginning of the current line
192 S.interpretLineComment(S.src[offs:S.offset])
// /*-style comment: scan until the closing "*/" sequence.
202 if ch == '*' && S.ch == '/' {
// Reached EOF without finding "*/".
208 S.errorAt(pos, "comment not terminated")
// findLineEnd reads ahead to decide whether the comment sequence
// starting at the already-consumed '/' is followed by a newline or
// EOF (in which case the caller should insert a semicolon). The
// deferred function restores the scanner's position state before
// returning, so the look-ahead is invisible to the caller.
212 func (S *Scanner) findLineEnd() bool {
213 // initial '/' already consumed
215 defer func(line, col, offs int) {
216 // reset scanner state to where it was upon calling findLineEnd
217 // (we don't scan //line comments and ignore errors thus
218 // S.filename and S.ErrorCount don't change)
// Re-read from the saved offset: next() below re-consumes the '/'.
223 S.rdOffset = offs + 1
224 S.next() // consume initial '/' again
225 }(S.line, S.column-1, S.offset-1)
227 // read ahead until a newline, EOF, or non-comment token is found
228 for S.ch == '/' || S.ch == '*' {
230 //-style comment always contains a newline
233 /*-style comment: look for newline */
241 if ch == '*' && S.ch == '/' {
// Comment closed without a newline: skip trailing whitespace and
// check whether a newline or EOF follows the comment sequence.
246 S.skipWhitespace() // S.insertSemi is set
247 if S.ch < 0 || S.ch == '\n' {
254 S.next() // consume '/'
// isLetter reports whether ch may start or continue an identifier:
// an ASCII letter, '_', or a non-ASCII Unicode letter.
261 func isLetter(ch int) bool {
262 return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
// isDigit reports whether ch is an ASCII digit or a non-ASCII
// Unicode digit.
266 func isDigit(ch int) bool {
267 return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
// scanIdentifier consumes an identifier and returns its token:
// a keyword token if the text matches one, token.IDENT otherwise
// (via token.Lookup).
271 func (S *Scanner) scanIdentifier() token.Token {
273 for isLetter(S.ch) || isDigit(S.ch) {
276 return token.Lookup(S.src[offs:S.offset])
// digitVal returns the numeric value of the (possibly hex) digit ch,
// or 16 if ch is not a legal digit — 16 compares larger than any
// valid base, so callers can test `digitVal(ch) < base`.
280 func digitVal(ch int) int {
282 case '0' <= ch && ch <= '9':
284 case 'a' <= ch && ch <= 'f':
286 case 'A' <= ch && ch <= 'F':
289 return 16 // larger than any legal digit val
// scanMantissa consumes a maximal run of digits legal in the given
// base (digitVal(ch) < base).
293 func (S *Scanner) scanMantissa(base int) {
294 for digitVal(S.ch) < base {
// scanNumber scans an integer, float, or imaginary literal and
// returns its token type. seenDecimalPoint is true when the caller
// already consumed a leading '.' (S.ch is the first digit after it).
300 func (S *Scanner) scanNumber(seenDecimalPoint bool) token.Token {
301 // digitVal(S.ch) < 10
304 if seenDecimalPoint {
// pos marks the start of the literal for error reporting below.
312 pos := token.Position{S.filename, S.offset, S.line, S.column}
// '0x'/'0X' prefix: hexadecimal int.
314 if S.ch == 'x' || S.ch == 'X' {
319 // octal int or float
320 seenDecimalDigit := false
// 8 and 9 are not octal digits; remember we saw one in case the
// literal turns out to be an int rather than a float.
322 if S.ch == '8' || S.ch == '9' {
323 // illegal octal int or float
324 seenDecimalDigit = true
// A following '.', exponent, or 'i' makes this a float/imaginary
// literal, where 8 and 9 are fine.
327 if S.ch == '.' || S.ch == 'e' || S.ch == 'E' || S.ch == 'i' {
// Octal int containing 8 or 9: report it.
331 if seenDecimalDigit {
332 S.errorAt(pos, "illegal octal number")
338 // decimal int or float
// Optional exponent part with optional sign.
349 if S.ch == 'e' || S.ch == 'E' {
352 if S.ch == '-' || S.ch == '+' {
// scanEscape scans an escape sequence after the initial '\' has been
// consumed. quote is the enclosing quote character ('\'' or '"'),
// which is also a legal simple escape. Errors are reported but
// scanning continues so the caller can resynchronize.
368 func (S *Scanner) scanEscape(quote int) {
369 pos := token.Position{S.filename, S.offset, S.line, S.column}
// i = number of digits expected, in the given base, with max the
// largest legal resulting value.
371 var i, base, max uint32
// simple single-character escapes: \a \b \f \n \r \t \v \\ and the quote
373 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
376 case '0', '1', '2', '3', '4', '5', '6', '7':
377 i, base, max = 3, 8, 255 // \ooo: three octal digits, byte value
// \xhh: two hex digits, byte value
380 i, base, max = 2, 16, 255
// \uhhhh: four hex digits, Unicode code point
383 i, base, max = 4, 16, unicode.MaxRune
// \Uhhhhhhhh: eight hex digits, Unicode code point
386 i, base, max = 8, 16, unicode.MaxRune
388 S.next() // always make progress
389 S.errorAt(pos, "unknown escape sequence")
// Accumulate the digit value; stop early at the closing quote or EOF.
394 for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
395 d := uint32(digitVal(S.ch))
397 S.error("illegal character in escape sequence")
403 // in case of an error, consume remaining chars
404 for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
// Reject values beyond max and surrogate halves (U+D800..U+DFFF),
// which are not valid Unicode code points.
407 if x > max || 0xd800 <= x && x < 0xe000 {
408 S.errorAt(pos, "escape sequence is invalid Unicode code point")
// scanChar scans a character literal; the opening '\'' has already
// been consumed.
413 func (S *Scanner) scanChar() {
414 // '\'' opening already consumed
415 pos := token.Position{S.filename, S.offset - 1, S.line, S.column - 1}
// A raw newline or EOF inside the literal is an error.
422 if ch == '\n' || ch < 0 {
423 S.errorAt(pos, "character literal not terminated")
// NOTE(review): the condition guarding this report is elided;
// presumably the literal did not contain exactly one character —
// confirm against the full source.
435 S.errorAt(pos, "illegal character literal")
// scanString scans an interpreted string literal; the opening '"'
// has already been consumed.
440 func (S *Scanner) scanString() {
441 // '"' opening already consumed
442 pos := token.Position{S.filename, S.offset - 1, S.line, S.column - 1}
// A raw newline or EOF inside the literal is an error.
447 if ch == '\n' || ch < 0 {
448 S.errorAt(pos, "string not terminated")
// scanRawString scans a raw string literal; the opening '`' has
// already been consumed. NOTE(review): unlike scanString, no newline
// check appears here — presumably only EOF triggers the error below
// (raw strings may span lines) — confirm against the full source.
460 func (S *Scanner) scanRawString() {
461 // '`' opening already consumed
462 pos := token.Position{S.filename, S.offset - 1, S.line, S.column - 1}
468 S.errorAt(pos, "string not terminated")
// skipWhitespace consumes spaces, tabs, carriage returns, and —
// only when no semicolon is pending (insertSemi is false) — newlines.
// A pending semicolon makes the '\n' significant, so the loop stops
// there and lets Scan emit the inserted semicolon.
477 func (S *Scanner) skipWhitespace() {
478 for S.ch == ' ' || S.ch == '\t' || S.ch == '\n' && !S.insertSemi || S.ch == '\r' {
484 // Helper functions for scanning multi-byte tokens such as >> += >>= .
485 // Different routines recognize different length tok_i based on matches
486 // of ch_i. If a token ends in '=', the result is tok1 or tok3
487 // respectively. Otherwise, the result is tok0 if there was no other
488 // matching character, or tok2 if the matching character was ch2.
// switch2: tok0, or tok1 if the next character is '='.
490 func (S *Scanner) switch2(tok0, tok1 token.Token) token.Token {
// switch3: tok0; tok1 if followed by '='; tok2 if followed by ch2.
499 func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 int, tok2 token.Token) token.Token {
// switch4: like switch3, but additionally tok3 if followed by ch2
// and then '=' (e.g. >>=).
512 func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Token) token.Token {
// newline is the literal returned for automatically inserted semicolons.
529 var newline = []byte{'\n'}
531 // Scan scans the next token and returns the token position pos,
532 // the token tok, and the literal text lit corresponding to the
533 // token. The source end is indicated by token.EOF.
535 // If the returned token is token.SEMICOLON, the corresponding
536 // literal value is ";" if the semicolon was present in the source,
537 // and "\n" if the semicolon was inserted because of a newline or at EOF.
540 // For more tolerant parsing, Scan will return a valid token if
541 // possible even if a syntax error was encountered. Thus, even
542 // if the resulting token sequence contains no illegal tokens,
543 // a client may not assume that no error occurred. Instead it
544 // must check the scanner's ErrorCount or the number of calls
545 // of the error handler, if there was one installed.
547 func (S *Scanner) Scan() (pos token.Position, tok token.Token, lit []byte) {
551 // current token start
553 pos, tok = token.Position{S.filename, S.offset, S.line, S.column}, token.ILLEGAL
556 // determine token value
// letter: identifier or keyword
559 tok = S.scanIdentifier()
// NOTE(review): these tokens presumably arm semicolon insertion
// (insertSemi) — the case body is elided; confirm.
561 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
// decimal digit: number literal
564 case digitVal(ch) < 10:
566 tok = S.scanNumber(false)
568 S.next() // always make progress
// EOF with a pending semicolon: emit the inserted semicolon first.
572 S.insertSemi = false // EOF consumed
573 return pos, token.SEMICOLON, newline
577 // we only reach here if S.insertSemi was
578 // set in the first place and exited early
579 // from S.skipWhitespace()
580 S.insertSemi = false // newline consumed
581 return pos, token.SEMICOLON, newline
// ':' or ':='
595 tok = S.switch2(token.COLON, token.DEFINE)
// '.' may start a float (".5"), the ellipsis "...", or be PERIOD.
597 if digitVal(S.ch) < 10 {
599 tok = S.scanNumber(true)
600 } else if S.ch == '.' {
612 tok = token.SEMICOLON
629 tok = S.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
630 if tok == token.INC {
634 tok = S.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
635 if tok == token.DEC {
639 tok = S.switch2(token.MUL, token.MUL_ASSIGN)
// '/' may start a comment rather than QUO/QUO_ASSIGN.
641 if S.ch == '/' || S.ch == '*' {
644 col := S.column - 1 // beginning of comment
// With a pending semicolon, a comment that runs to end of line
// forces the semicolon out first; rewind to the comment start.
645 if S.insertSemi && S.findLineEnd() {
646 // reset position to the beginning of the comment
651 S.rdOffset = offs + 1
652 S.insertSemi = false // newline consumed
653 return pos, token.SEMICOLON, newline
// Comments are skipped unless the ScanComments mode flag is set.
656 if S.mode&ScanComments == 0 {
658 S.insertSemi = false // newline consumed
663 tok = S.switch2(token.QUO, token.QUO_ASSIGN)
666 tok = S.switch2(token.REM, token.REM_ASSIGN)
668 tok = S.switch2(token.XOR, token.XOR_ASSIGN)
674 tok = S.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
677 tok = S.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
679 tok = S.switch2(token.ASSIGN, token.EQL)
681 tok = S.switch2(token.NOT, token.NEQ)
685 tok = S.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
687 tok = S.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
690 tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
// Unrecognized character: report unless AllowIllegalChars is set.
692 if S.mode&AllowIllegalChars == 0 {
693 S.errorAt(pos, "illegal character "+charString(ch))
695 insertSemi = S.insertSemi // preserve insertSemi info
// Semicolon insertion state is only tracked when InsertSemis is on.
699 if S.mode&InsertSemis != 0 {
700 S.insertSemi = insertSemi
702 return pos, tok, S.src[offs:S.offset]
706 // Tokenize calls a function f with the token position, token value, and token
707 // text for each token in the source src. The other parameters have the same
708 // meaning as for the Init function. Tokenize keeps scanning until f returns
709 // false (usually when the token value is token.EOF). The result is the number
710 // of errors encountered.
712 func Tokenize(filename string, src []byte, err ErrorHandler, mode uint, f func(pos token.Position, tok token.Token, lit []byte) bool) int {
// NOTE(review): the declaration of the local scanner s (presumably
// `var s Scanner`) is on an elided line.
714 s.Init(filename, src, err, mode)
716 // action happens in f
// (scan loop and the return of s.ErrorCount are on elided lines)