]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/rust/lex/rust-lex.cc
Update copyright years.
[thirdparty/gcc.git] / gcc / rust / lex / rust-lex.cc
CommitLineData
83ffe9cd 1// Copyright (C) 2020-2023 Free Software Foundation, Inc.
18f6990f
JP
2
3// This file is part of GCC.
4
5// GCC is free software; you can redistribute it and/or modify it under
6// the terms of the GNU General Public License as published by the Free
7// Software Foundation; either version 3, or (at your option) any later
8// version.
9
10// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11// WARRANTY; without even the implied warranty of MERCHANTABILITY or
12// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13// for more details.
14
15// You should have received a copy of the GNU General Public License
16// along with GCC; see the file COPYING3. If not see
17// <http://www.gnu.org/licenses/>.
18
19#include "rust-system.h"
20#include "rust-lex.h"
21#include "rust-diagnostics.h"
22#include "rust-linemap.h"
23#include "rust-session-manager.h"
24#include "safe-ctype.h"
25
26namespace Rust {
27// TODO: move to separate compilation unit?
28// overload += for uint32_t to allow 32-bit encoded utf-8 to be added
29std::string &
30operator+= (std::string &str, Codepoint char32)
31{
32 if (char32.value < 0x80)
33 {
34 str += static_cast<char> (char32.value);
35 }
36 else if (char32.value < (0x1F + 1) << (1 * 6))
37 {
38 str += static_cast<char> (0xC0 | ((char32.value >> 6) & 0x1F));
39 str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
40 }
41 else if (char32.value < (0x0F + 1) << (2 * 6))
42 {
43 str += static_cast<char> (0xE0 | ((char32.value >> 12) & 0x0F));
44 str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
45 str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
46 }
47 else if (char32.value < (0x07 + 1) << (3 * 6))
48 {
49 str += static_cast<char> (0xF0 | ((char32.value >> 18) & 0x07));
50 str += static_cast<char> (0x80 | ((char32.value >> 12) & 0x3F));
51 str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
52 str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
53 }
54 else
55 {
56 rust_debug ("Invalid unicode codepoint found: '%u' ", char32.value);
57 }
58 return str;
59}
60
61std::string
62Codepoint::as_string ()
63{
64 std::string str;
65
66 // str += Codepoint (value);
67 str += *this;
68
69 return str;
70}
71
72/* Includes all allowable float digits EXCEPT _ and . as that needs lookahead
73 * for handling. */
74bool
75is_float_digit (char number)
76{
77 return ISDIGIT (number) || number == 'E' || number == 'e';
78}
79
80/* Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or
81 * whatever is different */
82bool
83is_x_digit (char number)
84{
85 return ISXDIGIT (number);
86}
87
/* True when NUMBER is one of the octal digits '0' through '7'.  */
bool
is_octal_digit (char number)
{
  return '0' <= number && number <= '7';
}
93
/* True when NUMBER is a binary digit, i.e. '0' or '1'.  */
bool
is_bin_digit (char number)
{
  switch (number)
    {
    case '0':
    case '1':
      return true;
    default:
      return false;
    }
}
99
100bool
101check_valid_float_dot_end (char character)
102{
103 return character != '.' && character != '_' && !ISALPHA (character);
104}
105
106// ISSPACE from safe-ctype but may change in future
107bool
108is_whitespace (char character)
109{
110 return ISSPACE (character);
111}
112
/* True when CHARACTER is one of the radix markers that may follow a leading
 * '0' in an integer literal: 'x' (hex), 'o' (octal) or 'b' (binary).  */
bool
is_non_decimal_int_literal_separator (char character)
{
  switch (character)
    {
    case 'x':
    case 'o':
    case 'b':
      return true;
    default:
      return false;
    }
}
118
/* Constructs a lexer that reads from the in-memory string INPUT instead of a
 * file.  The file-handle member is set to RAIIFile::create_error () and no
 * line map is attached, so get_current_location () yields default-constructed
 * Locations.  Inside the initializer list the bare name `input` passed to
 * BufferInputSource is the constructor parameter, not the member of the same
 * name.  NOTE(review): the meaning of the second BufferInputSource argument
 * (0) is not visible here - presumably a start offset; confirm.  */
119Lexer::Lexer (const std::string &input)
120 : input (RAIIFile::create_error ()), current_line (1), current_column (1),
121 line_map (nullptr), raw_input_source (new BufferInputSource (input, 0)),
122 input_queue{*raw_input_source}, token_queue (TokenSource (this))
123{}
124
125Lexer::Lexer (const char *filename, RAIIFile file_input, Linemap *linemap)
126 : input (std::move (file_input)), current_line (1), current_column (1),
127 line_map (linemap),
128 raw_input_source (new FileInputSource (input.get_raw ())),
129 input_queue{*raw_input_source}, token_queue (TokenSource (this))
130{
131 // inform line_table that file is being entered and is in line 1
132 if (linemap)
133 line_map->start_file (filename, current_line);
134}
135
/* Destructor.  Currently performs no work of its own: the only statement in
 * the body, the line_map->stop () call, is commented out (see the FIXME
 * below).  */
136Lexer::~Lexer ()
137{
138 /* ok apparently stop (which is equivalent of original code in destructor) is
139 * meant to be called after all files have finished parsing, for cleanup. On
140 * the other hand, actual code that it calls to leave a certain line map is
141 * mentioned in GCC docs as being useful for "just leaving an included header"
142 * and stuff like that, so this line mapping functionality may need fixing.
143 * FIXME: find out whether this occurs. */
144
145 // line_map->stop();
146}
147
148/* TODO: need to optimise somehow to avoid the virtual function call in the
149 * tight loop. Best idea at the moment is CRTP, but that might make lexer
150 * implementation annoying when storing the "base class" (i.e. would need
151 * template parameter everywhere), although in practice it would mostly just
152 * look ugly and make enclosing classes like Parser also require a type
153 * parameter. At this point a macro might be better. OK I guess macros can be
154 * replaced by constexpr if or something if possible. */
155Location
156Lexer::get_current_location ()
157{
158 if (line_map)
159 return line_map->get_location (current_column);
160 else
161 // If we have no linemap, we're lexing something without proper locations
162 return Location ();
163}
164
165int
166Lexer::peek_input (int n)
167{
168 return input_queue.peek (n);
169}
170
171int
172Lexer::peek_input ()
173{
174 return peek_input (0);
175}
176
177void
178Lexer::skip_input (int n)
179{
180 input_queue.skip (n);
181}
182
183void
184Lexer::skip_input ()
185{
186 skip_input (0);
187}
188
189void
190Lexer::replace_current_token (TokenPtr replacement)
191{
192 token_queue.replace_current_value (replacement);
193
194 rust_debug ("called 'replace_current_token' - this is deprecated");
195}
196
197/* shitty anonymous namespace that can only be accessed inside the compilation
198 * unit - used for classify_keyword binary search in sorted array of keywords
199 * created with x-macros. */
200namespace {
201// TODO: make constexpr when update to c++20
202const std::string keyword_index[] = {
203#define RS_TOKEN(x, y)
204#define RS_TOKEN_KEYWORD(name, keyword) keyword,
205 RS_TOKEN_LIST
206#undef RS_TOKEN_KEYWORD
207#undef RS_TOKEN
208};
209
210constexpr TokenId keyword_keys[] = {
211#define RS_TOKEN(x, y)
212#define RS_TOKEN_KEYWORD(name, keyword) name,
213 RS_TOKEN_LIST
214#undef RS_TOKEN_KEYWORD
215#undef RS_TOKEN
216};
217
218constexpr int num_keywords = sizeof (keyword_index) / sizeof (*keyword_index);
219} // namespace
220
221/* Determines whether the string passed in is a keyword or not. If it is, it
222 * returns the keyword name. */
223TokenId
224Lexer::classify_keyword (const std::string &str)
225{
226 const std::string *last = keyword_index + num_keywords;
227 const std::string *idx = std::lower_bound (keyword_index, last, str);
228
229 if (idx == last || str != *idx)
230 return IDENTIFIER;
231
232 // TODO: possibly replace this x-macro system with something like hash map?
233
234 // We now have the expected token ID of the reserved keyword. However, some
235 // keywords are reserved starting in certain editions. For example, `try` is
236 // only a reserved keyword in editions >=2018. The language might gain new
237 // reserved keywords in the future.
238 //
239 // https://doc.rust-lang.org/reference/keywords.html#reserved-keywords
240 auto id = keyword_keys[idx - keyword_index];
241
242 // `try` is not a reserved keyword before 2018
243 if (Session::get_instance ().options.get_edition ()
244 == CompileOptions::Edition::E2015
245 && id == TRY)
246 return IDENTIFIER;
247
248 return id;
249}
250
/* Lexes and returns the next token from the input.
 *
 * Each iteration of the outer loop records the current location, consumes one
 * character into current_char and tries to recognise a token starting there.
 * Whitespace and non-doc comments do not produce a token: they update the
 * line/column bookkeeping and `continue` the loop.  Punctuation is handled by
 * the switch; literals, identifiers, keywords and lifetimes are delegated to
 * the parse_* helpers.  Doc comments are returned as inner/outer doc comment
 * tokens.
 *
 * At the very start of the input (line 1, column 1) a UTF-8 byte order mark
 * and a shebang line are skipped.  An END_OF_FILE token is returned when EOF
 * is read.  A character that matches nothing is reported with rust_error_at
 * and lexing carries on with the next character.  */
251TokenPtr
252Lexer::build_token ()
253{
254 // loop to go through multiple characters to build a single token
255 while (true)
256 {
257 Location loc = get_current_location ();
258 current_char = peek_input ();
259 skip_input ();
260
261 // detect UTF8 bom
262 //
263 // Must be the first thing on the first line.
264 // There might be an optional BOM (Byte Order Mark), which for UTF-8 is
265 // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
266 if (current_line == 1 && current_column == 1 && current_char == 0xef
267 && peek_input () == 0xbb && peek_input (1) == 0xbf)
268 {
269 skip_input (1);
270 current_char = peek_input ();
271 skip_input ();
272 }
273
274 // detect shebang
275 // Must be the first thing on the first line, starting with #!
276 // But since an attribute can also start with an #! we don't count it as a
277 // shebang line when after any whitespace or comments there is a [. If it
278 // is a shebang line we simple drop the line. Otherwise we don't consume
279 // any characters and fall through to the real tokenizer.
280 if (current_line == 1 && current_column == 1 && current_char == '#'
281 && peek_input () == '!')
282 {
// n is a pure lookahead offset: nothing below consumes input until the
// line is known to be a shebang.
283 int n = 1;
284 while (true)
285 {
286 int next_char = peek_input (n);
287 if (is_whitespace (next_char))
288 n++;
289 else if ((next_char == '/' && peek_input (n + 1) == '/'
290 && peek_input (n + 2) != '!'
291 && peek_input (n + 2) != '/')
292 || (next_char == '/' && peek_input (n + 1) == '/'
293 && peek_input (n + 2) == '/'
294 && peek_input (n + 3) == '/'))
295 {
296 // two // or four ////
297 // A single line comment
298 // (but not an inner or outer doc comment)
299 n += 2;
300 next_char = peek_input (n);
301 while (next_char != '\n' && next_char != EOF)
302 {
303 n++;
304 next_char = peek_input (n);
305 }
306 if (next_char == '\n')
307 n++;
308 }
309 else if (next_char == '/' && peek_input (n + 1) == '*'
310 && peek_input (n + 2) == '*'
311 && peek_input (n + 3) == '/')
312 {
313 /**/
314 n += 4;
315 }
316 else if (next_char == '/' && peek_input (n + 1) == '*'
317 && peek_input (n + 2) == '*' && peek_input (n + 3) == '*'
318 && peek_input (n + 4) == '/')
319 {
320 /***/
321 n += 5;
322 }
323 else if ((next_char == '/' && peek_input (n + 1) == '*'
324 && peek_input (n + 2) != '*'
325 && peek_input (n + 2) != '!')
326 || (next_char == '/' && peek_input (n + 1) == '*'
327 && peek_input (n + 2) == '*'
328 && peek_input (n + 3) == '*'))
329 {
330 // one /* or three /***
331 // Start of a block comment
332 // (but not an inner or outer doc comment)
333 n += 2;
334 int level = 1;
335 while (level > 0)
336 {
337 if (peek_input (n) == EOF)
338 break;
339 else if (peek_input (n) == '/'
340 && peek_input (n + 1) == '*')
341 {
342 n += 2;
343 level += 1;
344 }
345 else if (peek_input (n) == '*'
346 && peek_input (n + 1) == '/')
347 {
348 n += 2;
349 level -= 1;
350 }
351 else
352 n++;
353 }
354 }
355 else if (next_char != '[')
356 {
357 // definitely shebang, ignore the first line
358 while (current_char != '\n' && current_char != EOF)
359 {
360 current_char = peek_input ();
361 skip_input ();
362 }
363
// NOTE(review): when the loop above stops on '\n', current_char is
// still '\n' after the break, so the `case '\n'` in the switch below
// runs as well and bumps current_line / calls start_line a second
// time.  Looks like the line after a shebang is counted twice -
// confirm whether that is intended.
364 // newline
365 current_line++;
366 current_column = 1;
367 // tell line_table that new line starts
368 start_line (current_line, max_column_hint);
369 break;
370 }
371 else
372 break; /* Definitely not a shebang line. */
373 }
374 }
375
376 // return end of file token if end of file
377 if (current_char == EOF)
378 return Token::make (END_OF_FILE, loc);
379
380 // if not end of file, start tokenising
381 switch (current_char)
382 {
383 /* ignore whitespace characters for tokens but continue updating
384 * location */
385 case '\n': // newline
386 current_line++;
387 current_column = 1;
388 // tell line_table that new line starts
389 start_line (current_line, max_column_hint);
390 continue;
391 case '\r': // cr
392 // Ignore, we expect a newline (lf) soon.
393 continue;
394 case ' ': // space
395 current_column++;
396 continue;
397 case '\t': // tab
398 // width of a tab is not well-defined, assume 8 spaces
399 current_column += 8;
400 continue;
401
402 // punctuation - actual tokens
403 case '=':
404 if (peek_input () == '>')
405 {
406 // match arm arrow
407 skip_input ();
408 current_column += 2;
409
410 return Token::make (MATCH_ARROW, loc);
411 }
412 else if (peek_input () == '=')
413 {
414 // equality operator
415 skip_input ();
416 current_column += 2;
417
418 return Token::make (EQUAL_EQUAL, loc);
419 }
420 else
421 {
422 // assignment operator
423 current_column++;
424 return Token::make (EQUAL, loc);
425 }
426 case '(':
427 current_column++;
428 return Token::make (LEFT_PAREN, loc);
429 case '-':
430 if (peek_input () == '>')
431 {
432 // return type specifier
433 skip_input ();
434 current_column += 2;
435
436 return Token::make (RETURN_TYPE, loc);
437 }
438 else if (peek_input () == '=')
439 {
440 // minus-assign
441 skip_input ();
442 current_column += 2;
443
444 return Token::make (MINUS_EQ, loc);
445 }
446 else
447 {
448 // minus
449 current_column++;
450 return Token::make (MINUS, loc);
451 }
452 case '+':
453 if (peek_input () == '=')
454 {
455 // add-assign
456 skip_input ();
457 current_column += 2;
458
459 return Token::make (PLUS_EQ, loc);
460 }
461 else
462 {
463 // add
464 current_column++;
465 return Token::make (PLUS, loc);
466 }
467 case ')':
468 current_column++;
469 return Token::make (RIGHT_PAREN, loc);
470 case ';':
471 current_column++;
472 return Token::make (SEMICOLON, loc);
473 case '*':
474 if (peek_input () == '=')
475 {
476 // multiplication-assign
477 skip_input ();
478 current_column += 2;
479
480 return Token::make (ASTERISK_EQ, loc);
481 }
482 else
483 {
484 // multiplication
485 current_column++;
486 return Token::make (ASTERISK, loc);
487 }
488 case ',':
489 current_column++;
490 return Token::make (COMMA, loc);
491 case '/':
492 if (peek_input () == '=')
493 {
494 // division-assign
495 skip_input ();
496 current_column += 2;
497
498 return Token::make (DIV_EQ, loc);
499 }
500 else if ((peek_input () == '/' && peek_input (1) != '!'
501 && peek_input (1) != '/')
502 || (peek_input () == '/' && peek_input (1) == '/'
503 && peek_input (2) == '/'))
504 {
505 // two // or four ////
506 // single line comment
507 // (but not an inner or outer doc comment)
508 skip_input ();
509 current_column += 2;
510 current_char = peek_input ();
511
512 // basically ignore until line finishes
513 while (current_char != '\n' && current_char != EOF)
514 {
515 skip_input ();
516 current_column++; // not used
517 current_char = peek_input ();
518 }
519 continue;
520 }
521 else if (peek_input () == '/'
522 && (peek_input (1) == '!' || peek_input (1) == '/'))
523 {
524 /* single line doc comment, inner or outer. */
525 bool is_inner = peek_input (1) == '!';
526 skip_input (1);
527 current_column += 3;
528
529 std::string str;
530 str.reserve (32);
531 current_char = peek_input ();
532 while (current_char != '\n')
533 {
534 skip_input ();
535 if (current_char == '\r')
536 {
537 char next_char = peek_input ();
538 if (next_char == '\n')
539 {
540 current_char = '\n';
541 break;
542 }
543 rust_error_at (
544 loc, "Isolated CR %<\\r%> not allowed in doc comment");
545 current_char = next_char;
546 continue;
547 }
548 if (current_char == EOF)
549 {
550 rust_error_at (
551 loc, "unexpected EOF while looking for end of comment");
552 break;
553 }
554 str += current_char;
555 current_char = peek_input ();
556 }
557 skip_input ();
558 current_line++;
559 current_column = 1;
560 // tell line_table that new line starts
561 start_line (current_line, max_column_hint);
562
563 str.shrink_to_fit ();
564 if (is_inner)
565 return Token::make_inner_doc_comment (loc, std::move (str));
566 else
567 return Token::make_outer_doc_comment (loc, std::move (str));
568 }
569 else if (peek_input () == '*' && peek_input (1) == '*'
570 && peek_input (2) == '/')
571 {
572 /**/
573 skip_input (2);
574 current_column += 4;
575 continue;
576 }
577 else if (peek_input () == '*' && peek_input (1) == '*'
578 && peek_input (2) == '*' && peek_input (3) == '/')
579 {
580 /***/
581 skip_input (3);
582 current_column += 5;
583 continue;
584 }
585 else if ((peek_input () == '*' && peek_input (1) != '!'
586 && peek_input (1) != '*')
587 || (peek_input () == '*' && peek_input (1) == '*'
588 && peek_input (2) == '*'))
589 {
590 // one /* or three /***
591 // block comment
592 // (but not an inner or outer doc comment)
593 skip_input ();
594 current_column += 2;
595
// level tracks nesting depth: each inner /* raises it, each */ lowers it
596 int level = 1;
597 while (level > 0)
598 {
599 current_char = peek_input ();
600
601 if (current_char == EOF)
602 {
603 rust_error_at (
604 loc, "unexpected EOF while looking for end of comment");
605 break;
606 }
607
608 // if /* found
609 if (current_char == '/' && peek_input (1) == '*')
610 {
611 // skip /* characters
612 skip_input (1);
613
614 current_column += 2;
615
616 level += 1;
617 continue;
618 }
619
620 // ignore until */ is found
621 if (current_char == '*' && peek_input (1) == '/')
622 {
623 // skip */ characters
624 skip_input (1);
625
626 current_column += 2;
627
628 level -= 1;
629 continue;
630 }
631
632 if (current_char == '\n')
633 {
634 skip_input ();
635 current_line++;
636 current_column = 1;
637 // tell line_table that new line starts
638 start_line (current_line, max_column_hint);
639 continue;
640 }
641
642 skip_input ();
643 current_column++;
644 }
645
646 // refresh new token
647 continue;
648 }
649 else if (peek_input () == '*'
650 && (peek_input (1) == '!' || peek_input (1) == '*'))
651 {
652 // block doc comment, inner /*! or outer /**
653 bool is_inner = peek_input (1) == '!';
654 skip_input (1);
655 current_column += 3;
656
657 std::string str;
658 str.reserve (96);
659
660 int level = 1;
661 while (level > 0)
662 {
663 current_char = peek_input ();
664
665 if (current_char == EOF)
666 {
667 rust_error_at (
668 loc, "unexpected EOF while looking for end of comment");
669 break;
670 }
671
672 // if /* found
673 if (current_char == '/' && peek_input (1) == '*')
674 {
675 // skip /* characters
676 skip_input (1);
677 current_column += 2;
678
679 level += 1;
680 str += "/*";
681 continue;
682 }
683
684 // ignore until */ is found
685 if (current_char == '*' && peek_input (1) == '/')
686 {
687 // skip */ characters
688 skip_input (1);
689 current_column += 2;
690
691 level -= 1;
// the outermost closing */ is not part of the comment text
692 if (level > 0)
693 str += "*/";
694 continue;
695 }
696
697 if (current_char == '\r' && peek_input (1) != '\n')
698 rust_error_at (
699 loc, "Isolated CR %<\\r%> not allowed in doc comment");
700
701 if (current_char == '\n')
702 {
703 skip_input ();
704 current_line++;
705 current_column = 1;
706 // tell line_table that new line starts
707 start_line (current_line, max_column_hint);
708 str += '\n';
709 continue;
710 }
711
712 str += current_char;
713 skip_input ();
714 current_column++;
715 }
716
717 str.shrink_to_fit ();
718 if (is_inner)
719 return Token::make_inner_doc_comment (loc, std::move (str));
720 else
721 return Token::make_outer_doc_comment (loc, std::move (str));
722 }
723 else
724 {
725 // division
726 current_column++;
727 return Token::make (DIV, loc);
728 }
729 case '%':
730 if (peek_input () == '=')
731 {
732 // modulo-assign
733 skip_input ();
734 current_column += 2;
735
736 return Token::make (PERCENT_EQ, loc);
737 }
738 else
739 {
740 // modulo
741 current_column++;
742 return Token::make (PERCENT, loc);
743 }
744 case '^':
745 if (peek_input () == '=')
746 {
747 // xor-assign?
748 skip_input ();
749 current_column += 2;
750
751 return Token::make (CARET_EQ, loc);
752 }
753 else
754 {
755 // xor?
756 current_column++;
757 return Token::make (CARET, loc);
758 }
759 case '<':
760 if (peek_input () == '<')
761 {
762 if (peek_input (1) == '=')
763 {
764 // left-shift assign
765 skip_input (1);
766 current_column += 3;
767
768 return Token::make (LEFT_SHIFT_EQ, loc);
769 }
770 else
771 {
772 // left-shift
773 skip_input ();
774 current_column += 2;
775
776 return Token::make (LEFT_SHIFT, loc);
777 }
778 }
779 else if (peek_input () == '=')
780 {
781 // smaller than or equal to
782 skip_input ();
783 current_column += 2;
784
785 return Token::make (LESS_OR_EQUAL, loc);
786 }
787 else
788 {
789 // smaller than
790 current_column++;
791 return Token::make (LEFT_ANGLE, loc);
792 }
793 break;
794 case '>':
795 if (peek_input () == '>')
796 {
797 if (peek_input (1) == '=')
798 {
799 // right-shift-assign
800 skip_input (1);
801 current_column += 3;
802
803 return Token::make (RIGHT_SHIFT_EQ, loc);
804 }
805 else
806 {
807 // right-shift
808 skip_input ();
809 current_column += 2;
810
811 return Token::make (RIGHT_SHIFT, loc);
812 }
813 }
814 else if (peek_input () == '=')
815 {
816 // larger than or equal to
817 skip_input ();
818 current_column += 2;
819
820 return Token::make (GREATER_OR_EQUAL, loc);
821 }
822 else
823 {
824 // larger than
825 current_column++;
826 return Token::make (RIGHT_ANGLE, loc);
827 }
828 case ':':
829 if (peek_input () == ':')
830 {
831 // scope resolution ::
832 skip_input ();
833 current_column += 2;
834
835 return Token::make (SCOPE_RESOLUTION, loc);
836 }
837 else
838 {
839 // single colon :
840 current_column++;
841 return Token::make (COLON, loc);
842 }
843 case '!':
844 // no special handling for macros in lexer?
845 if (peek_input () == '=')
846 {
847 // not equal boolean operator
848 skip_input ();
849 current_column += 2;
850
851 return Token::make (NOT_EQUAL, loc);
852 }
853 else
854 {
855 // not equal unary operator
856 current_column++;
857
858 return Token::make (EXCLAM, loc);
859 }
860 case '?':
861 current_column++;
862 return Token::make (QUESTION_MARK, loc);
863 case '#':
864 current_column++;
865 return Token::make (HASH, loc);
866 case '[':
867 current_column++;
868 return Token::make (LEFT_SQUARE, loc);
869 case ']':
870 current_column++;
871 return Token::make (RIGHT_SQUARE, loc);
872 case '{':
873 current_column++;
874 return Token::make (LEFT_CURLY, loc);
875 case '}':
876 current_column++;
877 return Token::make (RIGHT_CURLY, loc);
878 case '@':
879 current_column++;
880 return Token::make (PATTERN_BIND, loc);
881 case '$':
882 current_column++;
883 return Token::make (DOLLAR_SIGN, loc);
884 case '~':
885 current_column++;
886 return Token::make (TILDE, loc);
887 case '\\':
888 current_column++;
889 return Token::make (BACKSLASH, loc);
890 case '`':
891 current_column++;
892 return Token::make (BACKTICK, loc);
893 case '|':
894 if (peek_input () == '=')
895 {
896 // bitwise or-assign?
897 skip_input ();
898 current_column += 2;
899
900 return Token::make (PIPE_EQ, loc);
901 }
902 else if (peek_input () == '|')
903 {
904 // logical or
905 skip_input ();
906 current_column += 2;
907
908 return Token::make (OR, loc);
909 }
910 else
911 {
912 // bitwise or
913 current_column++;
914
915 return Token::make (PIPE, loc);
916 }
917 case '&':
918 if (peek_input () == '=')
919 {
920 // bitwise and-assign?
921 skip_input ();
922 current_column += 2;
923
924 return Token::make (AMP_EQ, loc);
925 }
926 else if (peek_input () == '&')
927 {
928 // logical and
929 skip_input ();
930 current_column += 2;
931
932 return Token::make (LOGICAL_AND, loc);
933 }
934 else
935 {
936 // bitwise and/reference
937 current_column++;
938
939 return Token::make (AMP, loc);
940 }
941 case '.':
942 if (peek_input () == '.')
943 {
944 if (peek_input (1) == '.')
945 {
946 // ellipsis
947 skip_input (1);
948 current_column += 3;
949
950 return Token::make (ELLIPSIS, loc);
951 }
952 else if (peek_input (1) == '=')
953 {
954 // ..=
955 skip_input (1);
956 current_column += 3;
957
958 return Token::make (DOT_DOT_EQ, loc);
959 }
960 else
961 {
962 // ..
963 skip_input ();
964 current_column += 2;
965
966 return Token::make (DOT_DOT, loc);
967 }
968 }
969 else /*if (!ISDIGIT (peek_input ()))*/
970 {
971 // single dot .
972 // Only if followed by a non-number - otherwise is float
973 // nope, float cannot start with '.'.
974 current_column++;
975 return Token::make (DOT, loc);
976 }
977 }
978 // TODO: special handling of _ in the lexer? instead of being identifier
979
980 // byte character, byte string and raw byte string literals
981 if (current_char == 'b')
982 {
983 if (peek_input () == '\'')
984 return parse_byte_char (loc);
985 else if (peek_input () == '"')
986 return parse_byte_string (loc);
987 else if (peek_input () == 'r'
988 && (peek_input (1) == '#' || peek_input (1) == '"'))
989 return parse_raw_byte_string (loc);
990 }
991
992 // raw identifiers and raw strings
993 if (current_char == 'r')
994 {
995 int peek = peek_input ();
996 int peek1 = peek_input (1);
997
998 if (peek == '#' && (ISALPHA (peek1) || peek1 == '_'))
999 {
1000 TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
1001 if (raw_ident_ptr != nullptr)
1002 return raw_ident_ptr;
1003 else
1004 continue; /* input got parsed, it just wasn't valid. An error
1005 was produced. */
1006 }
1007 else
1008 {
1009 TokenPtr maybe_raw_string_ptr = maybe_parse_raw_string (loc);
1010 if (maybe_raw_string_ptr != nullptr)
1011 return maybe_raw_string_ptr;
1012 }
1013 }
1014
1015 // find identifiers and keywords
1016 if (ISALPHA (current_char) || current_char == '_')
1017 return parse_identifier_or_keyword (loc);
1018
1019 // int and float literals
1020 if (ISDIGIT (current_char))
1021 { // _ not allowed as first char
1022 if (current_char == '0'
1023 && is_non_decimal_int_literal_separator (peek_input ()))
1024 {
1025 // handle binary, octal, hex literals
1026 TokenPtr non_dec_int_lit_ptr
1027 = parse_non_decimal_int_literals (loc);
1028 if (non_dec_int_lit_ptr != nullptr)
1029 return non_dec_int_lit_ptr;
1030 }
1031 else
1032 {
1033 // handle decimals (integer or float)
1034 TokenPtr decimal_or_float_ptr = parse_decimal_int_or_float (loc);
1035 if (decimal_or_float_ptr != nullptr)
1036 return decimal_or_float_ptr;
1037 }
1038 }
1039
1040 // string literals
1041 if (current_char == '"')
1042 return parse_string (loc);
1043
1044 // char literals and lifetime names
1045 if (current_char == '\'')
1046 {
1047 TokenPtr char_or_lifetime_ptr = parse_char_or_lifetime (loc);
1048 if (char_or_lifetime_ptr != nullptr)
1049 return char_or_lifetime_ptr;
1050 }
1051
// NOTE(review): 0x5d is the ASCII code of ']', so the third branch below
// can never be taken once the ']' comparison before it has failed.
1052 // DEBUG: check for specific character problems:
1053 if (current_char == '0')
1054 rust_debug ("'0' uncaught before unexpected character");
1055 else if (current_char == ']')
1056 rust_debug ("']' uncaught before unexpected character");
1057 else if (current_char == 0x5d)
1058 rust_debug ("whatever 0x5d is (not '0' or ']') uncaught before "
1059 "unexpected character");
1060
1061 // didn't match anything so error
1062 rust_error_at (loc, "unexpected character %<%x%>", current_char);
1063 current_column++;
1064 }
1065}
1066
1067// Parses in a type suffix.
1068std::pair<PrimitiveCoreType, int>
1069Lexer::parse_in_type_suffix ()
1070{
1071 std::string suffix;
1072 suffix.reserve (5);
1073
1074 int additional_length_offset = 0;
1075
1076 // get suffix
1077 while (ISALPHA (current_char) || ISDIGIT (current_char)
1078 || current_char == '_')
1079 {
1080 if (current_char == '_')
1081 {
1082 // don't add _ to suffix
1083 skip_input ();
1084 current_char = peek_input ();
1085
1086 additional_length_offset++;
1087
1088 continue;
1089 }
1090
1091 additional_length_offset++;
1092
1093 suffix += current_char;
1094 skip_input ();
1095 current_char = peek_input ();
1096 }
1097
1098 if (suffix.empty ())
1099 {
1100 // no type suffix: do nothing but also no error
1101 return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset);
1102 }
1103 else if (suffix == "f32")
1104 {
1105 return std::make_pair (CORETYPE_F32, additional_length_offset);
1106 }
1107 else if (suffix == "f64")
1108 {
1109 return std::make_pair (CORETYPE_F64, additional_length_offset);
1110 }
1111 else if (suffix == "i8")
1112 {
1113 return std::make_pair (CORETYPE_I8, additional_length_offset);
1114 }
1115 else if (suffix == "i16")
1116 {
1117 return std::make_pair (CORETYPE_I16, additional_length_offset);
1118 }
1119 else if (suffix == "i32")
1120 {
1121 return std::make_pair (CORETYPE_I32, additional_length_offset);
1122 }
1123 else if (suffix == "i64")
1124 {
1125 return std::make_pair (CORETYPE_I64, additional_length_offset);
1126 }
1127 else if (suffix == "i128")
1128 {
1129 return std::make_pair (CORETYPE_I128, additional_length_offset);
1130 }
1131 else if (suffix == "isize")
1132 {
1133 return std::make_pair (CORETYPE_ISIZE, additional_length_offset);
1134 }
1135 else if (suffix == "u8")
1136 {
1137 return std::make_pair (CORETYPE_U8, additional_length_offset);
1138 }
1139 else if (suffix == "u16")
1140 {
1141 return std::make_pair (CORETYPE_U16, additional_length_offset);
1142 }
1143 else if (suffix == "u32")
1144 {
1145 return std::make_pair (CORETYPE_U32, additional_length_offset);
1146 }
1147 else if (suffix == "u64")
1148 {
1149 return std::make_pair (CORETYPE_U64, additional_length_offset);
1150 }
1151 else if (suffix == "u128")
1152 {
1153 return std::make_pair (CORETYPE_U128, additional_length_offset);
1154 }
1155 else if (suffix == "usize")
1156 {
1157 return std::make_pair (CORETYPE_USIZE, additional_length_offset);
1158 }
1159 else
1160 {
1161 rust_error_at (get_current_location (), "unknown number suffix %qs",
1162 suffix.c_str ());
1163
1164 return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset);
1165 }
1166}
1167
1168// Parses in the exponent part (if any) of a float literal.
1169std::pair<std::string, int>
1170Lexer::parse_in_exponent_part ()
1171{
1172 int additional_length_offset = 0;
1173 std::string str;
1174 if (current_char == 'E' || current_char == 'e')
1175 {
1176 // add exponent to string as strtod works with it
1177 str += current_char;
1178 skip_input ();
1179 current_char = peek_input ();
1180
1181 additional_length_offset++;
1182
1183 // special - and + handling
1184 if (current_char == '-')
1185 {
1186 str += '-';
1187
1188 skip_input ();
1189 current_char = peek_input ();
1190
1191 additional_length_offset++;
1192 }
1193 else if (current_char == '+')
1194 {
1195 // don't add + but still skip input
1196 skip_input ();
1197 current_char = peek_input ();
1198
1199 additional_length_offset++;
1200 }
1201
1202 // parse another decimal number for exponent
1203 auto str_length = parse_in_decimal ();
1204 str += std::get<0> (str_length);
1205 additional_length_offset += std::get<1> (str_length);
1206 }
1207 return std::make_pair (str, additional_length_offset);
1208}
1209
1210// Parses a decimal integer.
1211std::tuple<std::string, int, bool>
1212Lexer::parse_in_decimal ()
1213{
1214 /* A pure decimal contains only digits. */
1215 bool pure_decimal = true;
1216 int additional_length_offset = 0;
1217 std::string str;
1218 while (ISDIGIT (current_char) || current_char == '_')
1219 {
1220 if (current_char == '_')
1221 {
1222 pure_decimal = false;
1223 // don't add _ to number
1224 skip_input ();
1225 current_char = peek_input ();
1226
1227 additional_length_offset++;
1228
1229 continue;
1230 }
1231
1232 additional_length_offset++;
1233
1234 str += current_char;
1235 skip_input ();
1236 current_char = peek_input ();
1237 }
1238 return std::make_tuple (str, additional_length_offset, pure_decimal);
1239}
1240
1241/* Parses escapes (and string continues) in "byte" strings and characters. Does
1242 * not support unicode. */
1243std::tuple<char, int, bool>
1244Lexer::parse_escape (char opening_char)
1245{
1246 int additional_length_offset = 0;
1247 char output_char = 0;
1248
1249 // skip to actual letter
1250 skip_input ();
1251 current_char = peek_input ();
1252 additional_length_offset++;
1253
1254 switch (current_char)
1255 {
1256 case 'x': {
1257 auto hex_escape_pair = parse_partial_hex_escape ();
1258 long hexLong = hex_escape_pair.first;
1259 additional_length_offset += hex_escape_pair.second;
1260
1261 if (hexLong > 255 || hexLong < 0)
1262 rust_error_at (
1263 get_current_location (),
1264 "byte \\x escape %<\\x%x%> out of range - allows up to %<\\xFF%>",
1265 static_cast<unsigned int> (hexLong));
1266 /* TODO: restore capital for escape output - gcc pretty-printer doesn't
1267 * support %X directly */
1268 char hexChar = static_cast<char> (hexLong);
1269
1270 output_char = hexChar;
1271 }
1272 break;
1273 case 'n':
1274 output_char = '\n';
1275 break;
1276 case 'r':
1277 output_char = '\r';
1278 break;
1279 case 't':
1280 output_char = '\t';
1281 break;
1282 case '\\':
1283 output_char = '\\';
1284 break;
1285 case '0':
1286 output_char = '\0';
1287 break;
1288 case '\'':
1289 output_char = '\'';
1290 break;
1291 case '"':
1292 output_char = '"';
1293 break;
1294 case 'u':
1295 rust_error_at (get_current_location (),
1296 "cannot have a unicode escape \\u in a byte %s",
1297 opening_char == '\'' ? "character" : "string");
1298 // Try to parse it anyway, just to skip it
1299 parse_partial_unicode_escape ();
1300 return std::make_tuple (output_char, additional_length_offset, false);
1301 case '\r':
1302 case '\n':
1303 // string continue
1304 return std::make_tuple (0, parse_partial_string_continue (), true);
1305 default:
1306 rust_error_at (get_current_location (),
1307 "unknown escape sequence %<\\%c%>", current_char);
1308 // returns false if no parsing could be done
1309 // return false;
1310 return std::make_tuple (output_char, additional_length_offset, false);
1311 break;
1312 }
1313 // all non-special cases (string continue) should skip their used char
1314 skip_input ();
1315 current_char = peek_input ();
1316 additional_length_offset++;
1317
1318 // returns true if parsing was successful
1319 // return true;
1320 return std::make_tuple (output_char, additional_length_offset, false);
1321}
1322
1323/* Parses an escape (or string continue) in a string or character. Supports
1324 * unicode escapes. */
1325std::tuple<Codepoint, int, bool>
1326Lexer::parse_utf8_escape (char opening_char)
1327{
1328 Codepoint output_char;
1329 int additional_length_offset = 0;
1330
1331 // skip to actual letter
1332 skip_input ();
1333 current_char = peek_input ();
1334 additional_length_offset++;
1335
1336 switch (current_char)
1337 {
1338 case 'x': {
1339 auto hex_escape_pair = parse_partial_hex_escape ();
1340 long hexLong = hex_escape_pair.first;
1341 additional_length_offset += hex_escape_pair.second;
1342
1343 if (hexLong > 127 || hexLong < 0)
1344 rust_error_at (
1345 get_current_location (),
1346 "ascii \\x escape %<\\x%x%> out of range - allows up to %<\\x7F%>",
1347 static_cast<unsigned int> (hexLong));
1348 /* TODO: restore capital for escape output - gcc pretty-printer doesn't
1349 * support %X directly */
1350 char hexChar = static_cast<char> (hexLong);
1351
1352 output_char = hexChar;
1353 }
1354 break;
1355 case 'n':
1356 output_char = '\n';
1357 break;
1358 case 'r':
1359 output_char = '\r';
1360 break;
1361 case 't':
1362 output_char = '\t';
1363 break;
1364 case '\\':
1365 output_char = '\\';
1366 break;
1367 case '0':
1368 output_char = '\0';
1369 break;
1370 case '\'':
1371 output_char = '\'';
1372 break;
1373 case '"':
1374 output_char = '"';
1375 break;
1376 case 'u': {
1377 auto unicode_escape_pair = parse_partial_unicode_escape ();
1378 output_char = unicode_escape_pair.first;
1379 additional_length_offset += unicode_escape_pair.second;
1380
1381 return std::make_tuple (output_char, additional_length_offset, false);
1382 }
1383 break;
1384 case '\r':
1385 case '\n':
1386 // string continue
1387 return std::make_tuple (0, parse_partial_string_continue (), true);
1388 default:
1389 rust_error_at (get_current_location (),
1390 "unknown escape sequence %<\\%c%>", current_char);
1391 // returns false if no parsing could be done
1392 // return false;
1393 return std::make_tuple (output_char, additional_length_offset, false);
1394 break;
1395 }
1396 /* all non-special cases (unicode, string continue) should skip their used
1397 * char */
1398 skip_input ();
1399 current_char = peek_input ();
1400 additional_length_offset++;
1401
1402 // returns true if parsing was successful
1403 // return true;
1404 return std::make_tuple (output_char, additional_length_offset, false);
1405}
1406
1407// Parses the body of a string continue that has been found in an escape.
1408int
1409Lexer::parse_partial_string_continue ()
1410{
1411 int additional_length_offset = 1;
1412
1413 // string continue
1414 while (is_whitespace (current_char))
1415 {
1416 if (current_char == '\n')
1417 {
1418 current_line++;
1419 current_column = 1;
1420 // tell line_table that new line starts
1421 start_line (current_line, max_column_hint);
1422
1423 // reset "length"
1424 additional_length_offset = 1;
1425
1426 // get next char
1427 skip_input ();
1428 current_char = peek_input ();
1429
1430 continue;
1431 }
1432
1433 skip_input ();
1434 current_char = peek_input ();
1435 additional_length_offset++;
1436 }
1437
1438 return additional_length_offset;
1439}
1440
1441/* Parses the body of a '\x' escape. Note that it does not check that the number
1442 * is valid and smaller than 255. */
1443std::pair<long, int>
1444Lexer::parse_partial_hex_escape ()
1445{
1446 // hex char string (null-terminated)
1447 char hexNum[3] = {0, 0, 0};
1448
1449 // first hex char
1450 current_char = peek_input (1);
1451 int additional_length_offset = 1;
1452
1453 if (!is_x_digit (current_char))
1454 {
1455 rust_error_at (get_current_location (),
1456 "invalid character %<\\x%c%> in \\x sequence",
1457 current_char);
1458 return std::make_pair (0, 0);
1459 }
1460 hexNum[0] = current_char;
1461
1462 // second hex char
1463 skip_input ();
1464 current_char = peek_input (1);
1465 additional_length_offset++;
1466
1467 if (!is_x_digit (current_char))
1468 {
1469 rust_error_at (get_current_location (),
1470 "invalid character %<\\x%c%c%> in \\x sequence", hexNum[0],
1471 current_char);
1472 return std::make_pair (0, 1);
1473 }
1474 skip_input ();
1475 hexNum[1] = current_char;
1476
1477 long hexLong = std::strtol (hexNum, nullptr, 16);
1478
1479 return std::make_pair (hexLong, additional_length_offset);
1480}
1481
1482// Parses the body of a unicode escape.
1483std::pair<Codepoint, int>
1484Lexer::parse_partial_unicode_escape ()
1485{
1486 skip_input ();
1487 current_char = peek_input ();
1488 int additional_length_offset = 0;
1489
1490 if (current_char != '{')
1491 {
1492 rust_error_at (get_current_location (),
1493 "unicode escape should start with %<{%>");
1494 /* Skip what should probaby have been between brackets. */
1495 while (is_x_digit (current_char) || current_char == '_')
1496 {
1497 skip_input ();
1498 current_char = peek_input ();
1499 additional_length_offset++;
1500 }
1501 return std::make_pair (Codepoint (0), additional_length_offset);
1502 }
1503
1504 skip_input ();
1505 current_char = peek_input ();
1506 additional_length_offset++;
1507
1508 if (current_char == '_')
1509 {
1510 rust_error_at (get_current_location (),
1511 "unicode escape cannot start with %<_%>");
1512 skip_input ();
1513 current_char = peek_input ();
1514 additional_length_offset++;
1515 // fallthrough and try to parse the rest anyway
1516 }
1517
1518 // parse unicode escape - 1-6 hex digits
1519 std::string num_str;
1520 num_str.reserve (6);
1521
1522 // loop through to add entire hex number to string
1523 while (is_x_digit (current_char) || current_char == '_')
1524 {
1525 if (current_char == '_')
1526 {
1527 // don't add _ to number
1528 skip_input ();
1529 current_char = peek_input ();
1530
1531 additional_length_offset++;
1532
1533 continue;
1534 }
1535
1536 additional_length_offset++;
1537
1538 // add raw hex numbers
1539 num_str += current_char;
1540
1541 skip_input ();
1542 current_char = peek_input ();
1543 }
1544
1545 if (current_char == '}')
1546 {
1547 skip_input ();
1548 current_char = peek_input ();
1549 additional_length_offset++;
1550 }
1551 else
1552 {
1553 // actually an error, but allow propagation anyway Assume that
1554 // wrong bracketm whitespace or single/double quotes are wrong
1555 // termination, otherwise it is a wrong character, then skip to the actual
1556 // terminator.
1557 if (current_char == '{' || is_whitespace (current_char)
1558 || current_char == '\'' || current_char == '"')
1559 {
1560 rust_error_at (get_current_location (),
1561 "expected terminating %<}%> in unicode escape");
1562 return std::make_pair (Codepoint (0), additional_length_offset);
1563 }
1564 else
1565 {
1566 rust_error_at (get_current_location (),
1567 "invalid character %<%c%> in unicode escape",
1568 current_char);
1569 while (current_char != '}' && current_char != '{'
1570 && !is_whitespace (current_char) && current_char != '\''
1571 && current_char != '"')
1572 {
1573 skip_input ();
1574 current_char = peek_input ();
1575 additional_length_offset++;
1576 }
1577 // Consume the actual closing bracket if found
1578 if (current_char == '}')
1579 {
1580 skip_input ();
1581 current_char = peek_input ();
1582 additional_length_offset++;
1583 }
1584 return std::make_pair (Codepoint (0), additional_length_offset);
1585 }
1586 }
1587
1588 // ensure 1-6 hex characters
1589 if (num_str.length () > 6 || num_str.length () < 1)
1590 {
1591 rust_error_at (get_current_location (),
1592 "unicode escape should be between 1 and 6 hex "
1593 "characters; it is %lu",
1594 (unsigned long) num_str.length ());
1595 // return false;
1596 return std::make_pair (Codepoint (0), additional_length_offset);
1597 }
1598
1599 unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16);
1600
1601 if (hex_num > 0xd7ff && hex_num < 0xe000)
1602 {
1603 rust_error_at (
1604 get_current_location (),
1605 "unicode escape cannot be a surrogate value (D800 to DFFF)");
1606 return std::make_pair (Codepoint (0), additional_length_offset);
1607 }
1608
1609 if (hex_num > 0x10ffff)
1610 {
1611 rust_error_at (get_current_location (),
1612 "unicode escape cannot be larger than 10FFFF");
1613 return std::make_pair (Codepoint (0), additional_length_offset);
1614 }
1615
1616 // return true;
1617 return std::make_pair (Codepoint (static_cast<uint32_t> (hex_num)),
1618 additional_length_offset);
1619}
1620
1621// Parses a byte character.
1622TokenPtr
1623Lexer::parse_byte_char (Location loc)
1624{
1625 skip_input ();
1626 current_column++;
1627 // make current char the next character
1628 current_char = peek_input ();
1629
1630 int length = 1;
1631
1632 // char to save
1633 char byte_char = 0;
1634
1635 // detect escapes
1636 if (current_char == '\\')
1637 {
1638 auto escape_length_pair = parse_escape ('\'');
1639 byte_char = std::get<0> (escape_length_pair);
1640 length += std::get<1> (escape_length_pair);
1641
1642 current_char = peek_input ();
1643
1644 if (current_char != '\'')
1645 {
1646 rust_error_at (get_current_location (), "unclosed %<byte char%>");
1647 }
1648
1649 skip_input ();
1650 current_char = peek_input ();
1651 length++; // go to next char
1652 }
1653 else if (current_char != '\'')
1654 {
1655 // otherwise, get character from direct input character
1656 byte_char = current_char;
1657
1658 skip_input ();
1659 current_char = peek_input ();
1660 length++;
1661
1662 if (current_char != '\'')
1663 {
1664 rust_error_at (get_current_location (), "unclosed %<byte char%>");
1665 }
1666
1667 skip_input ();
1668 current_char = peek_input ();
1669 length++; // go to next char
1670 }
1671 else
1672 {
1673 rust_error_at (get_current_location (),
1674 "no character inside %<%> for %<byte char%>");
1675 }
1676
1677 current_column += length;
1678
1679 return Token::make_byte_char (loc, byte_char);
1680}
1681
1682// Parses a byte string.
1683TokenPtr
1684Lexer::parse_byte_string (Location loc)
1685{
1686 // byte string
1687
1688 // skip quote character
1689 skip_input ();
1690 current_column++;
1691
1692 std::string str;
1693 str.reserve (16); // some sensible default
1694
1695 int length = 1;
1696 current_char = peek_input ();
1697
1698 while (current_char != '"' && current_char != EOF)
1699 {
1700 if (current_char == '\\')
1701 {
1702 auto escape_length_pair = parse_escape ('"');
1703 char output_char = std::get<0> (escape_length_pair);
1704
1705 if (output_char == 0 && std::get<2> (escape_length_pair))
1706 length = std::get<1> (escape_length_pair) - 1;
1707 else
1708 length += std::get<1> (escape_length_pair);
1709
1710 if (output_char != 0 || !std::get<2> (escape_length_pair))
1711 str += output_char;
1712
1713 continue;
1714 }
1715
1716 length++;
1717
1718 str += current_char;
1719 skip_input ();
1720 current_char = peek_input ();
1721 }
1722
1723 current_column += length;
1724
1725 if (current_char == '"')
1726 {
1727 current_column++;
1728
1729 skip_input ();
1730 current_char = peek_input ();
1731 }
1732 else if (current_char == EOF)
1733 {
1734 rust_error_at (get_current_location (), "unended byte string literal");
1735 return Token::make (END_OF_FILE, get_current_location ());
1736 }
1737 else
1738 {
1739 gcc_unreachable ();
1740 }
1741
1742 str.shrink_to_fit ();
1743
1744 return Token::make_byte_string (loc, std::move (str));
1745}
1746
1747// Parses a raw byte string.
1748TokenPtr
1749Lexer::parse_raw_byte_string (Location loc)
1750{
1751 // raw byte string literals
1752 std::string str;
1753 str.reserve (16); // some sensible default
1754
1755 int length = 1;
1756 int hash_count = 0;
1757
1758 // get hash count at beginnning
1759 skip_input ();
1760 current_char = peek_input ();
1761 length++;
1762 while (current_char == '#')
1763 {
1764 hash_count++;
1765 length++;
1766
1767 skip_input ();
1768 current_char = peek_input ();
1769 }
1770
1771 if (current_char != '"')
1772 {
1773 rust_error_at (get_current_location (),
1774 "raw byte string has no opening %<\"%>");
1775 }
1776
1777 skip_input ();
1778 current_char = peek_input ();
1779 length++;
1780
1781 while (true)
1782 {
1783 if (current_char == '"')
1784 {
1785 bool enough_hashes = true;
1786
1787 for (int i = 0; i < hash_count; i++)
1788 {
1789 if (peek_input (i + 1) != '#')
1790 {
1791 enough_hashes = false;
1792 break;
1793 }
1794 }
1795
1796 if (enough_hashes)
1797 {
1798 // skip enough input and peek enough input
1799 skip_input (hash_count);
1800 current_char = peek_input ();
1801 length += hash_count + 1;
1802 break;
1803 }
1804 }
1805
1806 if ((unsigned char) current_char > 127)
1807 {
1808 rust_error_at (get_current_location (),
1809 "character %<%c%> in raw byte string out of range",
1810 current_char);
1811 current_char = 0;
1812 }
1813
1814 length++;
1815
1816 str += current_char;
1817 skip_input ();
1818 current_char = peek_input ();
1819 }
1820
1821 current_column += length;
1822
1823 str.shrink_to_fit ();
1824
1825 return Token::make_byte_string (loc, std::move (str));
1826}
1827
1828// Parses a raw identifier.
1829TokenPtr
1830Lexer::parse_raw_identifier (Location loc)
1831{
1832 // raw identifier
1833 std::string str;
1834 str.reserve (16); // default
1835
1836 skip_input ();
1837 current_char = peek_input ();
1838
1839 current_column += 2;
1840
1841 bool first_is_underscore = current_char == '_';
1842
1843 int length = 0;
1844 current_char = peek_input ();
1845 // loop through entire name
1846 while (ISALPHA (current_char) || ISDIGIT (current_char)
1847 || current_char == '_')
1848 {
1849 length++;
1850
1851 str += current_char;
1852 skip_input ();
1853 current_char = peek_input ();
1854 }
1855
1856 current_column += length;
1857
1858 // if just a single underscore, not an identifier
1859 if (first_is_underscore && length == 1)
1860 rust_error_at (get_current_location (),
1861 "%<_%> is not a valid raw identifier");
1862
1863 if (str == "crate" || str == "extern" || str == "self" || str == "super"
1864 || str == "Self")
1865 {
1866 rust_error_at (get_current_location (),
1867 "%qs is a forbidden raw identifier", str.c_str ());
1868
1869 return nullptr;
1870 }
1871 else
1872 {
1873 str.shrink_to_fit ();
1874
1875 return Token::make_identifier (loc, std::move (str));
1876 }
1877}
1878
1879// skip broken string input (unterminated strings)
1880void
1881Lexer::skip_broken_string_input (int current_char)
1882{
1883 while (current_char != '"' && current_char != EOF)
1884 {
1885 if (current_char == '\n')
1886 {
1887 current_line++;
1888 current_column = 1;
1889 }
1890 else
1891 {
1892 current_column++;
1893 }
1894 skip_input ();
1895 current_char = peek_input ();
1896 }
1897 if (current_char == '"')
1898 {
1899 current_column++;
1900
1901 skip_input ();
1902 current_char = peek_input ();
1903 }
1904 rust_debug ("skipped to %d:%d due to bad quotes", current_line,
1905 current_column);
1906}
1907
1908// Parses a unicode string.
1909TokenPtr
1910Lexer::parse_string (Location loc)
1911{
1912 Codepoint current_char32;
1913
1914 std::string str;
1915 str.reserve (16); // some sensible default
1916
1917 int length = 1;
1918 current_char32 = peek_codepoint_input ();
1919
1920 // FIXME: This fails if the input ends. How do we check for EOF?
1921 while (current_char32.value != '"' && !current_char32.is_eof ())
1922 {
1923 if (current_char32.value == '\\')
1924 {
1925 // parse escape
1926 auto utf8_escape_pair = parse_utf8_escape ('\'');
1927 current_char32 = std::get<0> (utf8_escape_pair);
1928
1929 if (current_char32 == Codepoint (0) && std::get<2> (utf8_escape_pair))
1930 length = std::get<1> (utf8_escape_pair) - 1;
1931 else
1932 length += std::get<1> (utf8_escape_pair);
1933
1934 if (current_char32 != Codepoint (0)
1935 || !std::get<2> (utf8_escape_pair))
1936 str += current_char32;
1937
1938 // required as parsing utf8 escape only changes current_char
1939 current_char32 = peek_codepoint_input ();
1940
1941 continue;
1942 }
1943
1944 length += get_input_codepoint_length ();
1945
1946 str += current_char32;
1947 skip_codepoint_input ();
1948 current_char32 = peek_codepoint_input ();
1949 }
1950
1951 current_column += length;
1952
1953 if (current_char32.value == '"')
1954 {
1955 current_column++;
1956
1957 skip_input ();
1958 current_char = peek_input ();
1959 }
1960 else if (current_char32.is_eof ())
1961 {
1962 rust_error_at (get_current_location (), "unended string literal");
1963 return Token::make (END_OF_FILE, get_current_location ());
1964 }
1965 else
1966 {
1967 gcc_unreachable ();
1968 }
1969
1970 str.shrink_to_fit ();
1971 return Token::make_string (loc, std::move (str));
1972}
1973
1974// Parses an identifier or keyword.
1975TokenPtr
1976Lexer::parse_identifier_or_keyword (Location loc)
1977{
1978 std::string str;
1979 str.reserve (16); // default
1980 str += current_char;
1981
1982 bool first_is_underscore = current_char == '_';
1983
1984 int length = 1;
1985 current_char = peek_input ();
1986 // loop through entire name
1987 while (ISALPHA (current_char) || ISDIGIT (current_char)
1988 || current_char == '_')
1989 {
1990 length++;
1991
1992 str += current_char;
1993 skip_input ();
1994 current_char = peek_input ();
1995 }
1996
1997 current_column += length;
1998
1999 // if just a single underscore, not an identifier
2000 if (first_is_underscore && length == 1)
2001 return Token::make (UNDERSCORE, loc);
2002
2003 str.shrink_to_fit ();
2004
2005 TokenId keyword = classify_keyword (str);
2006 if (keyword == IDENTIFIER)
2007 return Token::make_identifier (loc, std::move (str));
2008 else
2009 return Token::make (keyword, loc);
2010}
2011
2012// Possibly returns a raw string token if it exists - otherwise returns null.
2013TokenPtr
2014Lexer::maybe_parse_raw_string (Location loc)
2015{
2016 int peek_index = 0;
2017 while (peek_input (peek_index) == '#')
2018 peek_index++;
2019
2020 if (peek_input (peek_index) == '"')
2021 return parse_raw_string (loc, peek_index);
2022 else
2023 return nullptr;
2024}
2025
2026// Returns a raw string token.
2027TokenPtr
2028Lexer::parse_raw_string (Location loc, int initial_hash_count)
2029{
2030 // raw string literals
2031 std::string str;
2032 str.reserve (16); // some sensible default
2033
2034 int length = 1 + initial_hash_count;
2035
2036 if (initial_hash_count > 0)
2037 skip_input (initial_hash_count - 1);
2038
2039 current_char = peek_input ();
2040
2041 if (current_char != '"')
2042 rust_error_at (get_current_location (), "raw string has no opening %<\"%>");
2043
2044 length++;
2045 skip_input ();
2046 Codepoint current_char32 = peek_codepoint_input ();
2047
2048 while (!current_char32.is_eof ())
2049 {
2050 if (current_char32.value == '"')
2051 {
2052 bool enough_hashes = true;
2053
2054 for (int i = 0; i < initial_hash_count; i++)
2055 {
2056 if (peek_input (i + 1) != '#')
2057 {
2058 enough_hashes = false;
2059 break;
2060 }
2061 }
2062
2063 if (enough_hashes)
2064 {
2065 // skip enough input and peek enough input
2066 skip_input (initial_hash_count);
2067 current_char = peek_input ();
2068 length += initial_hash_count + 1;
2069 break;
2070 }
2071 }
2072
2073 length++;
2074
2075 str += current_char32;
2076 skip_codepoint_input ();
2077 current_char32 = peek_codepoint_input ();
2078 }
2079
2080 current_column += length;
2081
2082 str.shrink_to_fit ();
2083
2084 return Token::make_string (loc, std::move (str));
2085}
2086
2087template <typename IsDigitFunc>
2088TokenPtr
2089Lexer::parse_non_decimal_int_literal (Location loc, IsDigitFunc is_digit_func,
2090 std::string existent_str, int base)
2091{
2092 int length = 1;
2093
2094 skip_input ();
2095 current_char = peek_input ();
2096
2097 length++;
2098
2099 // loop through to add entire number to string
2100 while (is_digit_func (current_char) || current_char == '_')
2101 {
2102 if (current_char == '_')
2103 {
2104 // don't add _ to number
2105 skip_input ();
2106 current_char = peek_input ();
2107
2108 length++;
2109
2110 continue;
2111 }
2112
2113 length++;
2114
2115 // add raw numbers
2116 existent_str += current_char;
2117 skip_input ();
2118 current_char = peek_input ();
2119 }
2120
2121 // convert value to decimal representation
2122 long dec_num = std::strtol (existent_str.c_str (), nullptr, base);
2123
2124 existent_str = std::to_string (dec_num);
2125
2126 // parse in type suffix if it exists
2127 auto type_suffix_pair = parse_in_type_suffix ();
2128 PrimitiveCoreType type_hint = type_suffix_pair.first;
2129 length += type_suffix_pair.second;
2130
2131 current_column += length;
2132
2133 if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64)
2134 {
2135 rust_error_at (get_current_location (),
2136 "invalid type suffix %qs for integer (%s) literal",
2137 get_type_hint_string (type_hint),
2138 base == 16
2139 ? "hex"
2140 : (base == 8 ? "octal"
2141 : (base == 2 ? "binary"
2142 : "<insert unknown base>")));
2143 return nullptr;
2144 }
2145 return Token::make_int (loc, std::move (existent_str), type_hint);
2146}
2147
2148// Parses a hex, binary or octal int literal.
2149TokenPtr
2150Lexer::parse_non_decimal_int_literals (Location loc)
2151{
2152 std::string str;
2153 str.reserve (16); // some sensible default
2154 str += current_char;
2155
2156 current_char = peek_input ();
2157
2158 if (current_char == 'x')
2159 {
2160 // hex (integer only)
2161 return parse_non_decimal_int_literal (loc, is_x_digit, str + "x", 16);
2162 }
2163 else if (current_char == 'o')
2164 {
2165 // octal (integer only)
2166 return parse_non_decimal_int_literal (loc, is_octal_digit,
2167 std::move (str), 8);
2168 }
2169 else if (current_char == 'b')
2170 {
2171 // binary (integer only)
2172 return parse_non_decimal_int_literal (loc, is_bin_digit, std::move (str),
2173 2);
2174 }
2175 else
2176 {
2177 return nullptr;
2178 }
2179}
2180
2181// Parses a decimal-based int literal or float literal.
2182TokenPtr
2183Lexer::parse_decimal_int_or_float (Location loc)
2184{
2185 std::string str;
2186 str.reserve (16); // some sensible default
2187 str += current_char;
2188
2189 int length = 1;
2190 bool first_zero = current_char == '0';
2191
2192 current_char = peek_input ();
2193
2194 // parse initial decimal integer (or first integer part of float) literal
2195 auto initial_decimal = parse_in_decimal ();
2196 str += std::get<0> (initial_decimal);
2197 length += std::get<1> (initial_decimal);
2198
2199 // detect float literal
2200 if (current_char == '.' && is_float_digit (peek_input (1)))
2201 {
2202 // float with a '.', parse another decimal into it
2203
2204 // add . to str
2205 str += current_char;
2206 skip_input ();
2207 current_char = peek_input ();
2208 length++;
2209
2210 // parse another decimal number for float
2211 auto second_decimal = parse_in_decimal ();
2212 str += std::get<0> (second_decimal);
2213 length += std::get<1> (second_decimal);
2214
2215 // parse in exponent part if it exists
2216 auto exponent_pair = parse_in_exponent_part ();
2217 str += exponent_pair.first;
2218 length += exponent_pair.second;
2219
2220 // parse in type suffix if it exists
2221 auto type_suffix_pair = parse_in_type_suffix ();
2222 PrimitiveCoreType type_hint = type_suffix_pair.first;
2223 length += type_suffix_pair.second;
2224
2225 if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
2226 && type_hint != CORETYPE_UNKNOWN)
2227 {
2228 rust_error_at (get_current_location (),
2229 "invalid type suffix %qs for floating-point literal",
2230 get_type_hint_string (type_hint));
2231 // ignore invalid type suffix as everything else seems fine
2232 type_hint = CORETYPE_UNKNOWN;
2233 }
2234
2235 current_column += length;
2236
2237 str.shrink_to_fit ();
2238 return Token::make_float (loc, std::move (str), type_hint);
2239 }
2240 else if (current_char == '.' && check_valid_float_dot_end (peek_input (1)))
2241 {
2242 // float that is just an integer with a terminating '.' character
2243
2244 // add . to str
2245 str += current_char;
2246 skip_input ();
2247 current_char = peek_input ();
2248 length++;
2249
2250 // add a '0' after the . to prevent ambiguity
2251 str += '0';
2252
2253 // type hint not allowed
2254
2255 current_column += length;
2256
2257 str.shrink_to_fit ();
2258 return Token::make_float (loc, std::move (str), CORETYPE_UNKNOWN);
2259 }
2260 else if (current_char == 'E' || current_char == 'e')
2261 {
2262 // exponent float with no '.' character
2263
2264 // parse exponent part
2265 auto exponent_pair = parse_in_exponent_part ();
2266 str += exponent_pair.first;
2267 length += exponent_pair.second;
2268
2269 // parse in type suffix if it exists
2270 auto type_suffix_pair = parse_in_type_suffix ();
2271 PrimitiveCoreType type_hint = type_suffix_pair.first;
2272 length += type_suffix_pair.second;
2273
2274 if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
2275 && type_hint != CORETYPE_UNKNOWN)
2276 {
2277 rust_error_at (get_current_location (),
2278 "invalid type suffix %qs for floating-point literal",
2279 get_type_hint_string (type_hint));
2280 // ignore invalid type suffix as everything else seems fine
2281 type_hint = CORETYPE_UNKNOWN;
2282 }
2283
2284 current_column += length;
2285
2286 str.shrink_to_fit ();
2287 return Token::make_float (loc, std::move (str), type_hint);
2288 }
2289 else
2290 {
2291 // is an integer
2292
2293 // parse in type suffix if it exists
2294 auto type_suffix_pair = parse_in_type_suffix ();
2295 PrimitiveCoreType type_hint = type_suffix_pair.first;
2296 /* A "real" pure decimal doesn't have a suffix and no zero prefix. */
2297 if (type_hint == CORETYPE_UNKNOWN)
2298 {
2299 bool pure_decimal = std::get<2> (initial_decimal);
2300 if (pure_decimal && (!first_zero || str.size () == 1))
2301 type_hint = CORETYPE_PURE_DECIMAL;
2302 }
2303 length += type_suffix_pair.second;
2304
2305 current_column += length;
2306
2307 str.shrink_to_fit ();
2308 return Token::make_int (loc, std::move (str), type_hint);
2309 }
2310}
2311
2312TokenPtr
2313Lexer::parse_char_or_lifetime (Location loc)
2314{
2315 Codepoint current_char32;
2316
2317 int length = 1;
2318
2319 current_char32 = peek_codepoint_input ();
2320 if (current_char32.is_eof ())
2321 return nullptr;
2322
2323 // parse escaped char literal
2324 if (current_char32.value == '\\')
2325 {
2326 // parse escape
2327 auto utf8_escape_pair = parse_utf8_escape ('\'');
2328 current_char32 = std::get<0> (utf8_escape_pair);
2329 length += std::get<1> (utf8_escape_pair);
2330
2331 if (peek_codepoint_input ().value != '\'')
2332 {
2333 rust_error_at (get_current_location (), "unended character literal");
2334 }
2335 else
2336 {
2337 skip_codepoint_input ();
2338 current_char = peek_input ();
2339 length++;
2340 }
2341
2342 current_column += length;
2343
2344 return Token::make_char (loc, current_char32);
2345 }
2346 else
2347 {
2348 skip_codepoint_input ();
2349
2350 if (peek_codepoint_input ().value == '\'')
2351 {
2352 // parse non-escaped char literal
2353
2354 // skip the ' character
2355 skip_input ();
2356 current_char = peek_input ();
2357
2358 // TODO fix due to different widths of utf-8 chars?
2359 current_column += 3;
2360
2361 return Token::make_char (loc, current_char32);
2362 }
2363 else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value)
2364 || current_char32.value == '_')
2365 {
2366 // parse lifetime name
2367 std::string str;
2368 str += current_char32;
2369 length++;
2370
2371 current_char = peek_input ();
2372 while (ISDIGIT (current_char) || ISALPHA (current_char)
2373 || current_char == '_')
2374 {
2375 str += current_char;
2376 skip_input ();
2377 current_char = peek_input ();
2378 length++;
2379 }
2380
2381 current_column += length;
2382
2383 str.shrink_to_fit ();
2384 return Token::make_lifetime (loc, std::move (str));
2385 }
2386 else
2387 {
2388 rust_error_at (
2389 get_current_location (),
2390 "expected %' after character constant in character literal");
2391 return nullptr;
2392 }
2393 }
2394}
2395
2396// Returns the length of the codepoint at the current position.
2397int
2398Lexer::get_input_codepoint_length ()
2399{
2400 uint8_t input = peek_input ();
2401
2402 if ((int8_t) input == EOF)
2403 return 0;
2404
2405 if (input < 128)
2406 {
2407 // ascii -- 1 byte
2408 // return input;
2409
2410 return 1;
2411 }
2412 else if ((input & 0xC0) == 0x80)
2413 {
2414 // invalid (continuation; can't be first char)
2415 // return 0xFFFE;
2416
2417 return 0;
2418 }
2419 else if ((input & 0xE0) == 0xC0)
2420 {
2421 // 2 bytes
2422 uint8_t input2 = peek_input (1);
2423 if ((input2 & 0xC0) != 0x80)
2424 return 0;
2425 // return 0xFFFE;
2426
2427 // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
2428 // return output;
2429 return 2;
2430 }
2431 else if ((input & 0xF0) == 0xE0)
2432 {
2433 // 3 bytes
2434 uint8_t input2 = peek_input (1);
2435 if ((input2 & 0xC0) != 0x80)
2436 return 0;
2437 // return 0xFFFE;
2438
2439 uint8_t input3 = peek_input (2);
2440 if ((input3 & 0xC0) != 0x80)
2441 return 0;
2442 // return 0xFFFE;
2443
2444 /*uint32_t output
2445 = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
2446 0); return output;*/
2447 return 3;
2448 }
2449 else if ((input & 0xF8) == 0xF0)
2450 {
2451 // 4 bytes
2452 uint8_t input2 = peek_input (1);
2453 if ((input2 & 0xC0) != 0x80)
2454 return 0;
2455 // return 0xFFFE;
2456
2457 uint8_t input3 = peek_input (2);
2458 if ((input3 & 0xC0) != 0x80)
2459 return 0;
2460 // return 0xFFFE;
2461
2462 uint8_t input4 = peek_input (3);
2463 if ((input4 & 0xC0) != 0x80)
2464 return 0;
2465 // return 0xFFFE;
2466
2467 /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
2468 | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
2469 return output;*/
2470 return 4;
2471 }
2472 else
2473 {
2474 rust_error_at (get_current_location (),
2475 "invalid UTF-8 [FIRST] (too long)");
2476 return 0;
2477 }
2478}
2479
2480// Returns the codepoint at the current position.
2481Codepoint
2482Lexer::peek_codepoint_input ()
2483{
2484 uint8_t input = peek_input ();
2485
2486 if ((int8_t) input == EOF)
2487 return Codepoint::eof ();
2488
2489 if (input < 128)
2490 {
2491 // ascii -- 1 byte
2492 return {input};
2493 }
2494 else if ((input & 0xC0) == 0x80)
2495 {
2496 // invalid (continuation; can't be first char)
2497 return {0xFFFE};
2498 }
2499 else if ((input & 0xE0) == 0xC0)
2500 {
2501 // 2 bytes
2502 uint8_t input2 = peek_input (1);
2503 if ((input2 & 0xC0) != 0x80)
2504 return {0xFFFE};
2505
2506 uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
2507 return {output};
2508 }
2509 else if ((input & 0xF0) == 0xE0)
2510 {
2511 // 3 bytes
2512 uint8_t input2 = peek_input (1);
2513 if ((input2 & 0xC0) != 0x80)
2514 return {0xFFFE};
2515
2516 uint8_t input3 = peek_input (2);
2517 if ((input3 & 0xC0) != 0x80)
2518 return {0xFFFE};
2519
2520 uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
2521 | ((input3 & 0x3F) << 0);
2522 return {output};
2523 }
2524 else if ((input & 0xF8) == 0xF0)
2525 {
2526 // 4 bytes
2527 uint8_t input2 = peek_input (1);
2528 if ((input2 & 0xC0) != 0x80)
2529 return {0xFFFE};
2530
2531 uint8_t input3 = peek_input (2);
2532 if ((input3 & 0xC0) != 0x80)
2533 return {0xFFFE};
2534
2535 uint8_t input4 = peek_input (3);
2536 if ((input4 & 0xC0) != 0x80)
2537 return {0xFFFE};
2538
2539 uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
2540 | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
2541 return {output};
2542 }
2543 else
2544 {
2545 rust_error_at (get_current_location (),
2546 "invalid UTF-8 [SECND] (too long)");
2547 return {0xFFFE};
2548 }
2549}
2550
2551void
2552Lexer::skip_codepoint_input ()
2553{
2554 int toSkip = get_input_codepoint_length ();
2555 gcc_assert (toSkip >= 1);
2556
2557 skip_input (toSkip - 1);
2558}
2559
2560int
2561Lexer::test_get_input_codepoint_n_length (int n_start_offset)
2562{
2563 uint8_t input = peek_input (n_start_offset);
2564
2565 if (input < 128)
2566 {
2567 // ascii -- 1 byte
2568 // return input;
2569 return 1;
2570 }
2571 else if ((input & 0xC0) == 0x80)
2572 {
2573 // invalid (continuation; can't be first char)
2574 // return 0xFFFE;
2575 return 0;
2576 }
2577 else if ((input & 0xE0) == 0xC0)
2578 {
2579 // 2 bytes
2580 uint8_t input2 = peek_input (n_start_offset + 1);
2581 if ((input2 & 0xC0) != 0x80)
2582 // return 0xFFFE;
2583 return 0;
2584
2585 // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
2586 // return output;
2587 return 2;
2588 }
2589 else if ((input & 0xF0) == 0xE0)
2590 {
2591 // 3 bytes
2592 uint8_t input2 = peek_input (n_start_offset + 1);
2593 if ((input2 & 0xC0) != 0x80)
2594 // return 0xFFFE;
2595 return 0;
2596
2597 uint8_t input3 = peek_input (n_start_offset + 2);
2598 if ((input3 & 0xC0) != 0x80)
2599 // return 0xFFFE;
2600 return 0;
2601
2602 /*uint32_t output
2603 = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
2604 0); return output;*/
2605 return 3;
2606 }
2607 else if ((input & 0xF8) == 0xF0)
2608 {
2609 // 4 bytes
2610 uint8_t input2 = peek_input (n_start_offset + 1);
2611 if ((input2 & 0xC0) != 0x80)
2612 // return 0xFFFE;
2613 return 0;
2614
2615 uint8_t input3 = peek_input (n_start_offset + 2);
2616 if ((input3 & 0xC0) != 0x80)
2617 // return 0xFFFE;
2618 return 0;
2619
2620 uint8_t input4 = peek_input (n_start_offset + 3);
2621 if ((input4 & 0xC0) != 0x80)
2622 // return 0xFFFE;
2623 return 0;
2624
2625 /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
2626 | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
2627 return output;*/
2628 return 4;
2629 }
2630 else
2631 {
2632 rust_error_at (get_current_location (),
2633 "invalid UTF-8 [THIRD] (too long)");
2634 return 0;
2635 }
2636}
2637
2638// peeks the codepoint input at n codepoints ahead of current codepoint - try
2639// not to use
2640Codepoint
2641Lexer::test_peek_codepoint_input (int n)
2642{
2643 int totalOffset = 0;
2644
2645 // add up all offsets into total offset? does this do what I want?
2646 for (int i = 0; i < n; i++)
2647 {
2648 totalOffset += test_get_input_codepoint_n_length (totalOffset);
2649 }
2650 // issues: this would have (at least) O(n) lookup time, not O(1) like the
2651 // rest?
2652
2653 // TODO: implement if still needed
2654
2655 // error out of function as it is not implemented
2656 gcc_assert (1 == 0);
2657 return {0};
2658 /*
2659 uint8_t input = peek_input();
2660
2661 if (input < 128) {
2662 // ascii -- 1 byte
2663 return input;
2664 } else if ((input & 0xC0) == 0x80) {
2665 // invalid (continuation; can't be first char)
2666 return 0xFFFE;
2667 } else if ((input & 0xE0) == 0xC0) {
2668 // 2 bytes
2669 uint8_t input2 = peek_input(1);
2670 if ((input2 & 0xC0) != 0x80)
2671 return 0xFFFE;
2672
2673 uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
2674 return output;
2675 } else if ((input & 0xF0) == 0xE0) {
2676 // 3 bytes
2677 uint8_t input2 = peek_input(1);
2678 if ((input2 & 0xC0) != 0x80)
2679 return 0xFFFE;
2680
2681 uint8_t input3 = peek_input(2);
2682 if ((input3 & 0xC0) != 0x80)
2683 return 0xFFFE;
2684
2685 uint32_t output
2686 = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 &
2687 0x3F) << 0); return output; } else if ((input & 0xF8) == 0xF0) {
2688 // 4 bytes
2689 uint8_t input2 = peek_input(1);
2690 if ((input2 & 0xC0) != 0x80)
2691 return 0xFFFE;
2692
2693 uint8_t input3 = peek_input(2);
2694 if ((input3 & 0xC0) != 0x80)
2695 return 0xFFFE;
2696
2697 uint8_t input4 = peek_input(3);
2698 if ((input4 & 0xC0) != 0x80)
2699 return 0xFFFE;
2700
2701 uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
2702 | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) <<
2703 0); return output; } else { rust_error_at(get_current_location(), "invalid
2704 UTF-8 (too long)"); return 0xFFFE;
2705 }*/
2706}
2707
2708void
2709Lexer::split_current_token (TokenId new_left, TokenId new_right)
2710{
2711 /* TODO: assert that this TokenId is a "simple token" like punctuation and not
2712 * like "IDENTIFIER"? */
2713 Location current_loc = peek_token ()->get_locus ();
2714 TokenPtr new_left_tok = Token::make (new_left, current_loc);
2715 TokenPtr new_right_tok = Token::make (new_right, current_loc + 1);
2716
2717 token_queue.replace_current_value (std::move (new_left_tok));
2718 token_queue.insert (1, std::move (new_right_tok));
2719}
2720
2721void
2722Lexer::start_line (int current_line, int current_column)
2723{
2724 if (line_map)
2725 line_map->start_line (current_line, current_column);
2726}
2727
2728} // namespace Rust