]> git.ipfire.org Git - thirdparty/gcc.git/blob - libgo/go/go/scanner/scanner.go
Add Go frontend, libgo library, and Go testsuite.
[thirdparty/gcc.git] / libgo / go / go / scanner / scanner.go
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // A scanner for Go source text. Takes a []byte as source which can
6 // then be tokenized through repeated calls to the Scan function.
7 // For a sample use of a scanner, see the implementation of Tokenize.
8 //
9 package scanner
10
11 import (
12 "bytes"
13 "go/token"
14 "strconv"
15 "unicode"
16 "utf8"
17 )
18
19
// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use. For
// a sample use, see the implementation of Tokenize.
//
// All fields except ErrorCount are private to the scanner; they are
// (re)set by Init and advanced by next.
//
type Scanner struct {
	// immutable state (assigned once by Init)
	src  []byte       // source text being scanned
	err  ErrorHandler // error reporting; or nil if errors are only counted
	mode uint         // scanning mode: bit set of ScanComments, AllowIllegalChars, InsertSemis

	// scanning state
	filename string // current filename; may change via //line filename:line comment
	line     int    // current line (Init starts it at 1)
	column   int    // current column (reset to 1 after a newline)

	ch         int  // current character; < 0 means end-of-file
	offset     int  // byte offset of ch within src
	rdOffset   int  // reading offset (position after current character)
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
44
45
46 // Read the next Unicode char into S.ch.
47 // S.ch < 0 means end-of-file.
48 //
49 func (S *Scanner) next() {
50 S.column++
51 if S.rdOffset < len(S.src) {
52 S.offset = S.rdOffset
53 if S.ch == '\n' {
54 S.line++
55 S.column = 1
56 }
57 r, w := int(S.src[S.rdOffset]), 1
58 switch {
59 case r == 0:
60 S.error("illegal character NUL")
61 case r >= 0x80:
62 // not ASCII
63 r, w = utf8.DecodeRune(S.src[S.rdOffset:])
64 if r == utf8.RuneError && w == 1 {
65 S.error("illegal UTF-8 encoding")
66 }
67 }
68 S.rdOffset += w
69 S.ch = r
70 } else {
71 S.offset = len(S.src)
72 if S.ch == '\n' {
73 S.column = 1
74 }
75 S.ch = -1 // eof
76 }
77 }
78
79
// Mode flags accepted by Init. The mode parameter is the bitwise OR
// of any of these values (or 0 for none); they control scanner behavior.
//
const (
	ScanComments      = 1 << 0 // return comments as COMMENT tokens
	AllowIllegalChars = 1 << 1 // do not report an error for illegal chars
	InsertSemis       = 1 << 2 // automatically insert semicolons
)
88
89
90 // Init prepares the scanner S to tokenize the text src. Calls to Scan
91 // will use the error handler err if they encounter a syntax error and
92 // err is not nil. Also, for each error encountered, the Scanner field
93 // ErrorCount is incremented by one. The filename parameter is used as
94 // filename in the token.Position returned by Scan for each token. The
95 // mode parameter determines how comments and illegal characters are
96 // handled.
97 //
98 func (S *Scanner) Init(filename string, src []byte, err ErrorHandler, mode uint) {
99 // Explicitly initialize all fields since a scanner may be reused.
100 S.src = src
101 S.err = err
102 S.mode = mode
103
104 S.filename = filename
105 S.line = 1
106 S.column = 0
107
108 S.ch = ' '
109 S.offset = 0
110 S.rdOffset = 0
111 S.insertSemi = false
112 S.ErrorCount = 0
113
114 S.next()
115 }
116
117
118 func charString(ch int) string {
119 var s string
120 switch ch {
121 case -1:
122 return `EOF`
123 case '\a':
124 s = `\a`
125 case '\b':
126 s = `\b`
127 case '\f':
128 s = `\f`
129 case '\n':
130 s = `\n`
131 case '\r':
132 s = `\r`
133 case '\t':
134 s = `\t`
135 case '\v':
136 s = `\v`
137 case '\\':
138 s = `\\`
139 case '\'':
140 s = `\'`
141 default:
142 s = string(ch)
143 }
144 return "'" + s + "' (U+" + strconv.Itob(ch, 16) + ")"
145 }
146
147
148 func (S *Scanner) error(msg string) {
149 S.errorAt(token.Position{S.filename, S.offset, S.line, S.column}, msg)
150 }
151
152
153 func (S *Scanner) errorAt(pos token.Position, msg string) {
154 if S.err != nil {
155 S.err.Error(pos, msg)
156 }
157 S.ErrorCount++
158 }
159
160
161 var prefix = []byte("//line ")
162
163 func (S *Scanner) interpretLineComment(text []byte) {
164 if bytes.HasPrefix(text, prefix) {
165 // get filename and line number, if any
166 if i := bytes.Index(text, []byte{':'}); i > 0 {
167 if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
168 // valid //line filename:line comment;
169 // update scanner position
170 S.filename = string(text[len(prefix):i])
171 S.line = line - 1 // -1 since the '\n' has not been consumed yet
172 }
173 }
174 }
175 }
176
177
178 func (S *Scanner) scanComment() {
179 // initial '/' already consumed; S.ch == '/' || S.ch == '*'
180 offs := S.offset - 1 // position of initial '/'
181 col := S.column - 1
182 pos := token.Position{S.filename, S.offset - 1, S.line, S.column - 1}
183
184 if S.ch == '/' {
185 //-style comment
186 S.next()
187 for S.ch != '\n' && S.ch >= 0 {
188 S.next()
189 }
190 if col == 1 {
191 // comment starts at the beginning of the current line
192 S.interpretLineComment(S.src[offs:S.offset])
193 }
194 return
195 }
196
197 /*-style comment */
198 S.next()
199 for S.ch >= 0 {
200 ch := S.ch
201 S.next()
202 if ch == '*' && S.ch == '/' {
203 S.next()
204 return
205 }
206 }
207
208 S.errorAt(pos, "comment not terminated")
209 }
210
211
// findLineEnd looks ahead over the comment that starts at the current
// position (and any comments directly following it) and reports whether
// a newline or EOF is reached before the next non-comment token.
// It is a pure look-ahead: the deferred function restores line, column,
// ch, offset and rdOffset to the values they had on entry.
func (S *Scanner) findLineEnd() bool {
	// initial '/' already consumed

	// The arguments are evaluated now, i.e. they capture the position of
	// the initial '/' before any look-ahead happens.
	defer func(line, col, offs int) {
		// reset scanner state to where it was upon calling findLineEnd
		// (we don't scan //line comments and ignore errors thus
		// S.filename and S.ErrorCount don't change)
		// NOTE(review): S.next() itself reports NUL / invalid UTF-8
		// characters, so ErrorCount may still change while looking
		// ahead - confirm whether that matters.
		S.line = line
		S.column = col
		S.ch = '/'
		S.offset = offs
		S.rdOffset = offs + 1
		S.next() // consume initial '/' again
	}(S.line, S.column-1, S.offset-1)

	// read ahead until a newline, EOF, or non-comment token is found
	for S.ch == '/' || S.ch == '*' {
		if S.ch == '/' {
			//-style comment always contains a newline
			return true
		}
		/*-style comment: look for newline */
		S.next()
		for S.ch >= 0 {
			ch := S.ch
			if ch == '\n' {
				return true
			}
			S.next()
			if ch == '*' && S.ch == '/' {
				// end of this comment
				S.next()
				break
			}
		}
		S.skipWhitespace() // S.insertSemi is set
		if S.ch < 0 || S.ch == '\n' {
			return true
		}
		if S.ch != '/' {
			// non-comment token
			return false
		}
		S.next() // consume '/'
	}

	// the '/' was not followed by '/' or '*': not a comment
	return false
}
259
260
261 func isLetter(ch int) bool {
262 return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
263 }
264
265
266 func isDigit(ch int) bool {
267 return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
268 }
269
270
271 func (S *Scanner) scanIdentifier() token.Token {
272 offs := S.offset
273 for isLetter(S.ch) || isDigit(S.ch) {
274 S.next()
275 }
276 return token.Lookup(S.src[offs:S.offset])
277 }
278
279
// digitVal returns the numeric value of the hexadecimal digit ch
// (0-9, a-f, A-F). For any other character it returns 16, which is
// larger than any legal digit value in bases up to 16.
func digitVal(ch int) int {
	if '0' <= ch && ch <= '9' {
		return ch - '0'
	}
	if 'a' <= ch && ch <= 'f' {
		return ch - 'a' + 10
	}
	if 'A' <= ch && ch <= 'F' {
		return ch - 'A' + 10
	}
	return 16 // larger than any legal digit val
}
291
292
293 func (S *Scanner) scanMantissa(base int) {
294 for digitVal(S.ch) < base {
295 S.next()
296 }
297 }
298
299
// scanNumber consumes a numeric literal and returns its token kind:
// token.INT, token.FLOAT, or token.IMAG. If seenDecimalPoint is set,
// the caller has already consumed a leading '.' and the current
// character is the first digit of the fraction; otherwise the current
// character is the first digit of the literal.
func (S *Scanner) scanNumber(seenDecimalPoint bool) token.Token {
	// digitVal(S.ch) < 10
	tok := token.INT

	if seenDecimalPoint {
		// ".digits": only the fraction digits and an optional
		// exponent / imaginary suffix remain
		tok = token.FLOAT
		S.scanMantissa(10)
		goto exponent
	}

	if S.ch == '0' {
		// int or float
		// remember where the literal starts for error reporting
		pos := token.Position{S.filename, S.offset, S.line, S.column}
		S.next()
		if S.ch == 'x' || S.ch == 'X' {
			// hexadecimal int
			S.next()
			S.scanMantissa(16)
		} else {
			// octal int or float
			seenDecimalDigit := false
			S.scanMantissa(8)
			if S.ch == '8' || S.ch == '9' {
				// illegal octal int or float
				seenDecimalDigit = true
				S.scanMantissa(10)
			}
			if S.ch == '.' || S.ch == 'e' || S.ch == 'E' || S.ch == 'i' {
				// a leading 0 followed by decimal digits is fine
				// for floats and imaginary literals
				goto fraction
			}
			// octal int
			if seenDecimalDigit {
				S.errorAt(pos, "illegal octal number")
			}
		}
		goto exit
	}

	// decimal int or float
	S.scanMantissa(10)

fraction:
	if S.ch == '.' {
		tok = token.FLOAT
		S.next()
		S.scanMantissa(10)
	}

exponent:
	if S.ch == 'e' || S.ch == 'E' {
		tok = token.FLOAT
		S.next()
		if S.ch == '-' || S.ch == '+' {
			// optional exponent sign
			S.next()
		}
		S.scanMantissa(10)
	}

	if S.ch == 'i' {
		// imaginary literal suffix
		tok = token.IMAG
		S.next()
	}

exit:
	return tok
}
366
367
368 func (S *Scanner) scanEscape(quote int) {
369 pos := token.Position{S.filename, S.offset, S.line, S.column}
370
371 var i, base, max uint32
372 switch S.ch {
373 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
374 S.next()
375 return
376 case '0', '1', '2', '3', '4', '5', '6', '7':
377 i, base, max = 3, 8, 255
378 case 'x':
379 S.next()
380 i, base, max = 2, 16, 255
381 case 'u':
382 S.next()
383 i, base, max = 4, 16, unicode.MaxRune
384 case 'U':
385 S.next()
386 i, base, max = 8, 16, unicode.MaxRune
387 default:
388 S.next() // always make progress
389 S.errorAt(pos, "unknown escape sequence")
390 return
391 }
392
393 var x uint32
394 for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
395 d := uint32(digitVal(S.ch))
396 if d >= base {
397 S.error("illegal character in escape sequence")
398 break
399 }
400 x = x*base + d
401 S.next()
402 }
403 // in case of an error, consume remaining chars
404 for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
405 S.next()
406 }
407 if x > max || 0xd800 <= x && x < 0xe000 {
408 S.errorAt(pos, "escape sequence is invalid Unicode code point")
409 }
410 }
411
412
413 func (S *Scanner) scanChar() {
414 // '\'' opening already consumed
415 pos := token.Position{S.filename, S.offset - 1, S.line, S.column - 1}
416
417 n := 0
418 for S.ch != '\'' {
419 ch := S.ch
420 n++
421 S.next()
422 if ch == '\n' || ch < 0 {
423 S.errorAt(pos, "character literal not terminated")
424 n = 1
425 break
426 }
427 if ch == '\\' {
428 S.scanEscape('\'')
429 }
430 }
431
432 S.next()
433
434 if n != 1 {
435 S.errorAt(pos, "illegal character literal")
436 }
437 }
438
439
440 func (S *Scanner) scanString() {
441 // '"' opening already consumed
442 pos := token.Position{S.filename, S.offset - 1, S.line, S.column - 1}
443
444 for S.ch != '"' {
445 ch := S.ch
446 S.next()
447 if ch == '\n' || ch < 0 {
448 S.errorAt(pos, "string not terminated")
449 break
450 }
451 if ch == '\\' {
452 S.scanEscape('"')
453 }
454 }
455
456 S.next()
457 }
458
459
460 func (S *Scanner) scanRawString() {
461 // '`' opening already consumed
462 pos := token.Position{S.filename, S.offset - 1, S.line, S.column - 1}
463
464 for S.ch != '`' {
465 ch := S.ch
466 S.next()
467 if ch < 0 {
468 S.errorAt(pos, "string not terminated")
469 break
470 }
471 }
472
473 S.next()
474 }
475
476
477 func (S *Scanner) skipWhitespace() {
478 for S.ch == ' ' || S.ch == '\t' || S.ch == '\n' && !S.insertSemi || S.ch == '\r' {
479 S.next()
480 }
481 }
482
483
484 // Helper functions for scanning multi-byte tokens such as >> += >>= .
485 // Different routines recognize different length tok_i based on matches
486 // of ch_i. If a token ends in '=', the result is tok1 or tok3
487 // respectively. Otherwise, the result is tok0 if there was no other
488 // matching character, or tok2 if the matching character was ch2.
489
490 func (S *Scanner) switch2(tok0, tok1 token.Token) token.Token {
491 if S.ch == '=' {
492 S.next()
493 return tok1
494 }
495 return tok0
496 }
497
498
499 func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 int, tok2 token.Token) token.Token {
500 if S.ch == '=' {
501 S.next()
502 return tok1
503 }
504 if S.ch == ch2 {
505 S.next()
506 return tok2
507 }
508 return tok0
509 }
510
511
512 func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Token) token.Token {
513 if S.ch == '=' {
514 S.next()
515 return tok1
516 }
517 if S.ch == ch2 {
518 S.next()
519 if S.ch == '=' {
520 S.next()
521 return tok3
522 }
523 return tok2
524 }
525 return tok0
526 }
527
528
// newline is the literal returned for automatically inserted semicolons.
var newline = []byte{'\n'}

// Scan scans the next token and returns the token position pos,
// the token tok, and the literal text lit corresponding to the
// token. The source end is indicated by token.EOF.
//
// If the returned token is token.SEMICOLON, the corresponding
// literal value is ";" if the semicolon was present in the source,
// and "\n" if the semicolon was inserted because of a newline or
// at EOF.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
func (S *Scanner) Scan() (pos token.Position, tok token.Token, lit []byte) {
scanAgain:
	S.skipWhitespace()

	// current token start
	// insertSemi records whether the token scanned by this call may be
	// followed by an automatically inserted semicolon; it is copied to
	// S.insertSemi at the end if the InsertSemis mode is set.
	insertSemi := false
	pos, tok = token.Position{S.filename, S.offset, S.line, S.column}, token.ILLEGAL
	offs := S.offset

	// determine token value
	switch ch := S.ch; {
	case isLetter(ch):
		tok = S.scanIdentifier()
		switch tok {
		case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
			insertSemi = true
		}
	case digitVal(ch) < 10:
		insertSemi = true
		tok = S.scanNumber(false)
	default:
		// ch holds the first character of the token; S.ch is now
		// the character following it
		S.next() // always make progress
		switch ch {
		case -1:
			if S.insertSemi {
				S.insertSemi = false // EOF consumed
				return pos, token.SEMICOLON, newline
			}
			tok = token.EOF
		case '\n':
			// we only reach here if S.insertSemi was
			// set in the first place and exited early
			// from S.skipWhitespace()
			S.insertSemi = false // newline consumed
			return pos, token.SEMICOLON, newline
		case '"':
			insertSemi = true
			tok = token.STRING
			S.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			S.scanChar()
		case '`':
			insertSemi = true
			tok = token.STRING
			S.scanRawString()
		case ':':
			tok = S.switch2(token.COLON, token.DEFINE)
		case '.':
			if digitVal(S.ch) < 10 {
				// number starting with a decimal point
				insertSemi = true
				tok = S.scanNumber(true)
			} else if S.ch == '.' {
				S.next()
				if S.ch == '.' {
					S.next()
					tok = token.ELLIPSIS
				}
				// otherwise ".." was consumed and tok stays token.ILLEGAL
			} else {
				tok = token.PERIOD
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = S.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = S.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = S.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if S.ch == '/' || S.ch == '*' {
				// comment
				line := S.line
				col := S.column - 1 // beginning of comment
				if S.insertSemi && S.findLineEnd() {
					// A semicolon is pending and the comment reaches a
					// newline or EOF: emit the semicolon first and rewind
					// so that the comment is scanned by the next call.
					// reset position to the beginning of the comment
					S.line = line
					S.column = col
					S.ch = '/'
					S.offset = offs
					S.rdOffset = offs + 1
					S.insertSemi = false // newline consumed
					return pos, token.SEMICOLON, newline
				}
				S.scanComment()
				if S.mode&ScanComments == 0 {
					// skip comment
					S.insertSemi = false // newline consumed
					goto scanAgain
				}
				tok = token.COMMENT
			} else {
				tok = S.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = S.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = S.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if S.ch == '-' {
				S.next()
				tok = token.ARROW
			} else {
				tok = S.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = S.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = S.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = S.switch2(token.NOT, token.NEQ)
		case '&':
			if S.ch == '^' {
				S.next()
				tok = S.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = S.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		default:
			// unknown character: tok stays token.ILLEGAL
			if S.mode&AllowIllegalChars == 0 {
				S.errorAt(pos, "illegal character "+charString(ch))
			}
			insertSemi = S.insertSemi // preserve insertSemi info
		}
	}

	if S.mode&InsertSemis != 0 {
		S.insertSemi = insertSemi
	}
	// the literal is the source text from the token start up to
	// (but not including) the current character
	return pos, tok, S.src[offs:S.offset]
}
704
705
706 // Tokenize calls a function f with the token position, token value, and token
707 // text for each token in the source src. The other parameters have the same
708 // meaning as for the Init function. Tokenize keeps scanning until f returns
709 // false (usually when the token value is token.EOF). The result is the number
710 // of errors encountered.
711 //
712 func Tokenize(filename string, src []byte, err ErrorHandler, mode uint, f func(pos token.Position, tok token.Token, lit []byte) bool) int {
713 var s Scanner
714 s.Init(filename, src, err, mode)
715 for f(s.Scan()) {
716 // action happens in f
717 }
718 return s.ErrorCount
719 }