]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/rust/lex/rust-lex.cc
Update copyright years.
[thirdparty/gcc.git] / gcc / rust / lex / rust-lex.cc
CommitLineData
a945c346 1// Copyright (C) 2020-2024 Free Software Foundation, Inc.
18f6990f
JP
2
3// This file is part of GCC.
4
5// GCC is free software; you can redistribute it and/or modify it under
6// the terms of the GNU General Public License as published by the Free
7// Software Foundation; either version 3, or (at your option) any later
8// version.
9
10// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11// WARRANTY; without even the implied warranty of MERCHANTABILITY or
12// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13// for more details.
14
15// You should have received a copy of the GNU General Public License
16// along with GCC; see the file COPYING3. If not see
17// <http://www.gnu.org/licenses/>.
18
19#include "rust-system.h"
20#include "rust-lex.h"
21#include "rust-diagnostics.h"
22#include "rust-linemap.h"
23#include "rust-session-manager.h"
24#include "safe-ctype.h"
25
26namespace Rust {
27// TODO: move to separate compilation unit?
28// overload += for uint32_t to allow 32-bit encoded utf-8 to be added
29std::string &
30operator+= (std::string &str, Codepoint char32)
31{
32 if (char32.value < 0x80)
33 {
34 str += static_cast<char> (char32.value);
35 }
36 else if (char32.value < (0x1F + 1) << (1 * 6))
37 {
38 str += static_cast<char> (0xC0 | ((char32.value >> 6) & 0x1F));
39 str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
40 }
41 else if (char32.value < (0x0F + 1) << (2 * 6))
42 {
43 str += static_cast<char> (0xE0 | ((char32.value >> 12) & 0x0F));
44 str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
45 str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
46 }
47 else if (char32.value < (0x07 + 1) << (3 * 6))
48 {
49 str += static_cast<char> (0xF0 | ((char32.value >> 18) & 0x07));
50 str += static_cast<char> (0x80 | ((char32.value >> 12) & 0x3F));
51 str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
52 str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
53 }
54 else
55 {
56 rust_debug ("Invalid unicode codepoint found: '%u' ", char32.value);
57 }
58 return str;
59}
60
61std::string
62Codepoint::as_string ()
63{
64 std::string str;
65
66 // str += Codepoint (value);
67 str += *this;
68
69 return str;
70}
71
72/* Includes all allowable float digits EXCEPT _ and . as that needs lookahead
73 * for handling. */
74bool
75is_float_digit (char number)
76{
77 return ISDIGIT (number) || number == 'E' || number == 'e';
78}
79
80/* Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or
81 * whatever is different */
82bool
83is_x_digit (char number)
84{
85 return ISXDIGIT (number);
86}
87
/* Returns true if NUMBER is one of the octal digits '0' through '7'.  */
bool
is_octal_digit (char number)
{
  const bool outside_range = number < '0' || number > '7';
  return !outside_range;
}
93
/* Returns true if NUMBER is a binary digit, i.e. '0' or '1'.  */
bool
is_bin_digit (char number)
{
  switch (number)
    {
    case '0':
    case '1':
      return true;
    default:
      return false;
    }
}
99
100bool
101check_valid_float_dot_end (char character)
102{
103 return character != '.' && character != '_' && !ISALPHA (character);
104}
105
106// ISSPACE from safe-ctype but may change in future
107bool
108is_whitespace (char character)
109{
110 return ISSPACE (character);
111}
112
/* Returns true if CHARACTER is one of the radix markers 'x', 'o' or 'b'
 * (the character following the leading '0' of a non-decimal integer
 * literal).  */
bool
is_non_decimal_int_literal_separator (char character)
{
  switch (character)
    {
    case 'x':
    case 'o':
    case 'b':
      return true;
    default:
      return false;
    }
}
118
119Lexer::Lexer (const std::string &input)
120 : input (RAIIFile::create_error ()), current_line (1), current_column (1),
0ef795c3
RT
121 line_map (nullptr), dump_lex_out (Optional<std::ofstream &>::none ()),
122 raw_input_source (new BufferInputSource (input, 0)),
18f6990f
JP
123 input_queue{*raw_input_source}, token_queue (TokenSource (this))
124{}
125
0ef795c3
RT
126Lexer::Lexer (const char *filename, RAIIFile file_input, Linemap *linemap,
127 Optional<std::ofstream &> dump_lex_opt)
18f6990f 128 : input (std::move (file_input)), current_line (1), current_column (1),
0ef795c3 129 line_map (linemap), dump_lex_out (dump_lex_opt),
18f6990f
JP
130 raw_input_source (new FileInputSource (input.get_raw ())),
131 input_queue{*raw_input_source}, token_queue (TokenSource (this))
132{
133 // inform line_table that file is being entered and is in line 1
134 if (linemap)
135 line_map->start_file (filename, current_line);
136}
137
138Lexer::~Lexer ()
139{
140 /* ok apparently stop (which is equivalent of original code in destructor) is
141 * meant to be called after all files have finished parsing, for cleanup. On
142 * the other hand, actual code that it calls to leave a certain line map is
143 * mentioned in GCC docs as being useful for "just leaving an included header"
144 * and stuff like that, so this line mapping functionality may need fixing.
145 * FIXME: find out whether this occurs. */
146
147 // line_map->stop();
148}
149
150/* TODO: need to optimise somehow to avoid the virtual function call in the
151 * tight loop. Best idea at the moment is CRTP, but that might make lexer
152 * implementation annoying when storing the "base class" (i.e. would need
153 * template parameter everywhere), although in practice it would mostly just
154 * look ugly and make enclosing classes like Parser also require a type
155 * parameter. At this point a macro might be better. OK I guess macros can be
156 * replaced by constexpr if or something if possible. */
157Location
158Lexer::get_current_location ()
159{
160 if (line_map)
161 return line_map->get_location (current_column);
162 else
163 // If we have no linemap, we're lexing something without proper locations
164 return Location ();
165}
166
167int
168Lexer::peek_input (int n)
169{
170 return input_queue.peek (n);
171}
172
173int
174Lexer::peek_input ()
175{
176 return peek_input (0);
177}
178
179void
180Lexer::skip_input (int n)
181{
182 input_queue.skip (n);
183}
184
185void
186Lexer::skip_input ()
187{
188 skip_input (0);
189}
190
0ef795c3
RT
191void
192Lexer::skip_token (int n)
193{
194 // dump tokens if dump-lex option is enabled
195 if (dump_lex_out.is_some ())
196 dump_and_skip (n);
197 else
198 token_queue.skip (n);
199}
200
201void
202Lexer::dump_and_skip (int n)
203{
204 std::ofstream &out = dump_lex_out.get ();
205 bool found_eof = false;
206 const_TokenPtr tok;
207 for (int i = 0; i < n + 1; i++)
208 {
209 if (!found_eof)
210 {
211 tok = peek_token ();
212 found_eof |= tok->get_id () == Rust::END_OF_FILE;
213
214 Location loc = tok->get_locus ();
215
216 out << "<id=";
217 out << tok->token_id_to_str ();
218 out << (tok->has_str () ? (std::string (", text=") + tok->get_str ()
219 + std::string (", typehint=")
220 + std::string (tok->get_type_hint_str ()))
221 : "")
222 << " ";
223 out << get_line_map ()->to_string (loc) << " ";
224 }
225
226 token_queue.skip (0);
227 }
228}
229
18f6990f
JP
230void
231Lexer::replace_current_token (TokenPtr replacement)
232{
233 token_queue.replace_current_value (replacement);
234
235 rust_debug ("called 'replace_current_token' - this is deprecated");
236}
237
238/* shitty anonymous namespace that can only be accessed inside the compilation
239 * unit - used for classify_keyword binary search in sorted array of keywords
240 * created with x-macros. */
241namespace {
242// TODO: make constexpr when update to c++20
243const std::string keyword_index[] = {
244#define RS_TOKEN(x, y)
245#define RS_TOKEN_KEYWORD(name, keyword) keyword,
246 RS_TOKEN_LIST
247#undef RS_TOKEN_KEYWORD
248#undef RS_TOKEN
249};
250
251constexpr TokenId keyword_keys[] = {
252#define RS_TOKEN(x, y)
253#define RS_TOKEN_KEYWORD(name, keyword) name,
254 RS_TOKEN_LIST
255#undef RS_TOKEN_KEYWORD
256#undef RS_TOKEN
257};
258
259constexpr int num_keywords = sizeof (keyword_index) / sizeof (*keyword_index);
260} // namespace
261
262/* Determines whether the string passed in is a keyword or not. If it is, it
263 * returns the keyword name. */
264TokenId
265Lexer::classify_keyword (const std::string &str)
266{
267 const std::string *last = keyword_index + num_keywords;
268 const std::string *idx = std::lower_bound (keyword_index, last, str);
269
270 if (idx == last || str != *idx)
271 return IDENTIFIER;
272
273 // TODO: possibly replace this x-macro system with something like hash map?
274
275 // We now have the expected token ID of the reserved keyword. However, some
276 // keywords are reserved starting in certain editions. For example, `try` is
277 // only a reserved keyword in editions >=2018. The language might gain new
278 // reserved keywords in the future.
279 //
280 // https://doc.rust-lang.org/reference/keywords.html#reserved-keywords
281 auto id = keyword_keys[idx - keyword_index];
282
283 // `try` is not a reserved keyword before 2018
284 if (Session::get_instance ().options.get_edition ()
285 == CompileOptions::Edition::E2015
286 && id == TRY)
287 return IDENTIFIER;
288
289 return id;
290}
291
292TokenPtr
293Lexer::build_token ()
294{
295 // loop to go through multiple characters to build a single token
296 while (true)
297 {
298 Location loc = get_current_location ();
299 current_char = peek_input ();
300 skip_input ();
301
302 // detect UTF8 bom
303 //
304 // Must be the first thing on the first line.
305 // There might be an optional BOM (Byte Order Mark), which for UTF-8 is
306 // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
307 if (current_line == 1 && current_column == 1 && current_char == 0xef
308 && peek_input () == 0xbb && peek_input (1) == 0xbf)
309 {
310 skip_input (1);
311 current_char = peek_input ();
312 skip_input ();
313 }
314
315 // detect shebang
316 // Must be the first thing on the first line, starting with #!
317 // But since an attribute can also start with an #! we don't count it as a
318 // shebang line when after any whitespace or comments there is a [. If it
319 // is a shebang line we simple drop the line. Otherwise we don't consume
320 // any characters and fall through to the real tokenizer.
321 if (current_line == 1 && current_column == 1 && current_char == '#'
322 && peek_input () == '!')
323 {
324 int n = 1;
325 while (true)
326 {
327 int next_char = peek_input (n);
328 if (is_whitespace (next_char))
329 n++;
330 else if ((next_char == '/' && peek_input (n + 1) == '/'
331 && peek_input (n + 2) != '!'
332 && peek_input (n + 2) != '/')
333 || (next_char == '/' && peek_input (n + 1) == '/'
334 && peek_input (n + 2) == '/'
335 && peek_input (n + 3) == '/'))
336 {
337 // two // or four ////
338 // A single line comment
339 // (but not an inner or outer doc comment)
340 n += 2;
341 next_char = peek_input (n);
342 while (next_char != '\n' && next_char != EOF)
343 {
344 n++;
345 next_char = peek_input (n);
346 }
347 if (next_char == '\n')
348 n++;
349 }
350 else if (next_char == '/' && peek_input (n + 1) == '*'
351 && peek_input (n + 2) == '*'
352 && peek_input (n + 3) == '/')
353 {
354 /**/
355 n += 4;
356 }
357 else if (next_char == '/' && peek_input (n + 1) == '*'
358 && peek_input (n + 2) == '*' && peek_input (n + 3) == '*'
359 && peek_input (n + 4) == '/')
360 {
361 /***/
362 n += 5;
363 }
364 else if ((next_char == '/' && peek_input (n + 1) == '*'
365 && peek_input (n + 2) != '*'
366 && peek_input (n + 2) != '!')
367 || (next_char == '/' && peek_input (n + 1) == '*'
368 && peek_input (n + 2) == '*'
369 && peek_input (n + 3) == '*'))
370 {
371 // one /* or three /***
372 // Start of a block comment
373 // (but not an inner or outer doc comment)
374 n += 2;
375 int level = 1;
376 while (level > 0)
377 {
378 if (peek_input (n) == EOF)
379 break;
380 else if (peek_input (n) == '/'
381 && peek_input (n + 1) == '*')
382 {
383 n += 2;
384 level += 1;
385 }
386 else if (peek_input (n) == '*'
387 && peek_input (n + 1) == '/')
388 {
389 n += 2;
390 level -= 1;
391 }
392 else
393 n++;
394 }
395 }
396 else if (next_char != '[')
397 {
398 // definitely shebang, ignore the first line
399 while (current_char != '\n' && current_char != EOF)
400 {
401 current_char = peek_input ();
402 skip_input ();
403 }
404
405 // newline
406 current_line++;
407 current_column = 1;
408 // tell line_table that new line starts
409 start_line (current_line, max_column_hint);
410 break;
411 }
412 else
413 break; /* Definitely not a shebang line. */
414 }
415 }
416
417 // return end of file token if end of file
418 if (current_char == EOF)
419 return Token::make (END_OF_FILE, loc);
420
421 // if not end of file, start tokenising
422 switch (current_char)
423 {
424 /* ignore whitespace characters for tokens but continue updating
425 * location */
426 case '\n': // newline
427 current_line++;
428 current_column = 1;
429 // tell line_table that new line starts
430 start_line (current_line, max_column_hint);
431 continue;
432 case '\r': // cr
433 // Ignore, we expect a newline (lf) soon.
434 continue;
435 case ' ': // space
436 current_column++;
437 continue;
438 case '\t': // tab
439 // width of a tab is not well-defined, assume 8 spaces
440 current_column += 8;
441 continue;
442
443 // punctuation - actual tokens
444 case '=':
445 if (peek_input () == '>')
446 {
447 // match arm arrow
448 skip_input ();
449 current_column += 2;
86f8e47f 450 loc += 1;
18f6990f
JP
451
452 return Token::make (MATCH_ARROW, loc);
453 }
454 else if (peek_input () == '=')
455 {
456 // equality operator
457 skip_input ();
458 current_column += 2;
86f8e47f 459 loc += 1;
18f6990f
JP
460
461 return Token::make (EQUAL_EQUAL, loc);
462 }
463 else
464 {
465 // assignment operator
466 current_column++;
467 return Token::make (EQUAL, loc);
468 }
469 case '(':
470 current_column++;
471 return Token::make (LEFT_PAREN, loc);
472 case '-':
473 if (peek_input () == '>')
474 {
475 // return type specifier
476 skip_input ();
477 current_column += 2;
86f8e47f 478 loc += 1;
18f6990f
JP
479
480 return Token::make (RETURN_TYPE, loc);
481 }
482 else if (peek_input () == '=')
483 {
484 // minus-assign
485 skip_input ();
486 current_column += 2;
86f8e47f 487 loc += 1;
18f6990f
JP
488
489 return Token::make (MINUS_EQ, loc);
490 }
491 else
492 {
493 // minus
494 current_column++;
495 return Token::make (MINUS, loc);
496 }
497 case '+':
498 if (peek_input () == '=')
499 {
500 // add-assign
501 skip_input ();
502 current_column += 2;
86f8e47f 503 loc += 1;
18f6990f
JP
504
505 return Token::make (PLUS_EQ, loc);
506 }
507 else
508 {
509 // add
510 current_column++;
511 return Token::make (PLUS, loc);
512 }
513 case ')':
514 current_column++;
515 return Token::make (RIGHT_PAREN, loc);
516 case ';':
517 current_column++;
518 return Token::make (SEMICOLON, loc);
519 case '*':
520 if (peek_input () == '=')
521 {
522 // multiplication-assign
523 skip_input ();
524 current_column += 2;
86f8e47f 525 loc += 1;
18f6990f
JP
526
527 return Token::make (ASTERISK_EQ, loc);
528 }
529 else
530 {
531 // multiplication
532 current_column++;
533 return Token::make (ASTERISK, loc);
534 }
535 case ',':
536 current_column++;
537 return Token::make (COMMA, loc);
538 case '/':
539 if (peek_input () == '=')
540 {
541 // division-assign
542 skip_input ();
543 current_column += 2;
86f8e47f 544 loc += 1;
18f6990f
JP
545
546 return Token::make (DIV_EQ, loc);
547 }
548 else if ((peek_input () == '/' && peek_input (1) != '!'
549 && peek_input (1) != '/')
550 || (peek_input () == '/' && peek_input (1) == '/'
551 && peek_input (2) == '/'))
552 {
553 // two // or four ////
554 // single line comment
555 // (but not an inner or outer doc comment)
556 skip_input ();
557 current_column += 2;
558 current_char = peek_input ();
559
560 // basically ignore until line finishes
561 while (current_char != '\n' && current_char != EOF)
562 {
563 skip_input ();
564 current_column++; // not used
565 current_char = peek_input ();
566 }
567 continue;
568 }
569 else if (peek_input () == '/'
570 && (peek_input (1) == '!' || peek_input (1) == '/'))
571 {
572 /* single line doc comment, inner or outer. */
573 bool is_inner = peek_input (1) == '!';
574 skip_input (1);
575 current_column += 3;
576
577 std::string str;
578 str.reserve (32);
579 current_char = peek_input ();
580 while (current_char != '\n')
581 {
582 skip_input ();
583 if (current_char == '\r')
584 {
585 char next_char = peek_input ();
586 if (next_char == '\n')
587 {
588 current_char = '\n';
589 break;
590 }
591 rust_error_at (
592 loc, "Isolated CR %<\\r%> not allowed in doc comment");
593 current_char = next_char;
594 continue;
595 }
596 if (current_char == EOF)
597 {
598 rust_error_at (
599 loc, "unexpected EOF while looking for end of comment");
600 break;
601 }
602 str += current_char;
603 current_char = peek_input ();
604 }
605 skip_input ();
606 current_line++;
607 current_column = 1;
608 // tell line_table that new line starts
609 start_line (current_line, max_column_hint);
610
611 str.shrink_to_fit ();
86f8e47f 612
613 loc += str.size () - 1;
18f6990f
JP
614 if (is_inner)
615 return Token::make_inner_doc_comment (loc, std::move (str));
616 else
617 return Token::make_outer_doc_comment (loc, std::move (str));
618 }
619 else if (peek_input () == '*' && peek_input (1) == '*'
620 && peek_input (2) == '/')
621 {
622 /**/
623 skip_input (2);
624 current_column += 4;
625 continue;
626 }
627 else if (peek_input () == '*' && peek_input (1) == '*'
628 && peek_input (2) == '*' && peek_input (3) == '/')
629 {
630 /***/
631 skip_input (3);
632 current_column += 5;
633 continue;
634 }
635 else if ((peek_input () == '*' && peek_input (1) != '!'
636 && peek_input (1) != '*')
637 || (peek_input () == '*' && peek_input (1) == '*'
638 && peek_input (2) == '*'))
639 {
640 // one /* or three /***
641 // block comment
642 // (but not an inner or outer doc comment)
643 skip_input ();
644 current_column += 2;
645
646 int level = 1;
647 while (level > 0)
648 {
649 current_char = peek_input ();
650
651 if (current_char == EOF)
652 {
653 rust_error_at (
654 loc, "unexpected EOF while looking for end of comment");
655 break;
656 }
657
658 // if /* found
659 if (current_char == '/' && peek_input (1) == '*')
660 {
661 // skip /* characters
662 skip_input (1);
663
664 current_column += 2;
665
666 level += 1;
667 continue;
668 }
669
670 // ignore until */ is found
671 if (current_char == '*' && peek_input (1) == '/')
672 {
673 // skip */ characters
674 skip_input (1);
675
676 current_column += 2;
677
678 level -= 1;
679 continue;
680 }
681
682 if (current_char == '\n')
683 {
684 skip_input ();
685 current_line++;
686 current_column = 1;
687 // tell line_table that new line starts
688 start_line (current_line, max_column_hint);
689 continue;
690 }
691
692 skip_input ();
693 current_column++;
694 }
695
696 // refresh new token
697 continue;
698 }
699 else if (peek_input () == '*'
700 && (peek_input (1) == '!' || peek_input (1) == '*'))
701 {
702 // block doc comment, inner /*! or outer /**
703 bool is_inner = peek_input (1) == '!';
704 skip_input (1);
705 current_column += 3;
706
707 std::string str;
708 str.reserve (96);
709
710 int level = 1;
711 while (level > 0)
712 {
713 current_char = peek_input ();
714
715 if (current_char == EOF)
716 {
717 rust_error_at (
718 loc, "unexpected EOF while looking for end of comment");
719 break;
720 }
721
722 // if /* found
723 if (current_char == '/' && peek_input (1) == '*')
724 {
725 // skip /* characters
726 skip_input (1);
727 current_column += 2;
728
729 level += 1;
730 str += "/*";
731 continue;
732 }
733
734 // ignore until */ is found
735 if (current_char == '*' && peek_input (1) == '/')
736 {
737 // skip */ characters
738 skip_input (1);
739 current_column += 2;
740
741 level -= 1;
742 if (level > 0)
743 str += "*/";
744 continue;
745 }
746
747 if (current_char == '\r' && peek_input (1) != '\n')
748 rust_error_at (
749 loc, "Isolated CR %<\\r%> not allowed in doc comment");
750
751 if (current_char == '\n')
752 {
753 skip_input ();
754 current_line++;
755 current_column = 1;
756 // tell line_table that new line starts
757 start_line (current_line, max_column_hint);
758 str += '\n';
759 continue;
760 }
761
762 str += current_char;
763 skip_input ();
764 current_column++;
765 }
766
767 str.shrink_to_fit ();
86f8e47f 768
769 loc += str.size () - 1;
18f6990f
JP
770 if (is_inner)
771 return Token::make_inner_doc_comment (loc, std::move (str));
772 else
773 return Token::make_outer_doc_comment (loc, std::move (str));
774 }
775 else
776 {
777 // division
778 current_column++;
779 return Token::make (DIV, loc);
780 }
781 case '%':
782 if (peek_input () == '=')
783 {
784 // modulo-assign
785 skip_input ();
786 current_column += 2;
86f8e47f 787 loc += 1;
18f6990f
JP
788
789 return Token::make (PERCENT_EQ, loc);
790 }
791 else
792 {
793 // modulo
794 current_column++;
795 return Token::make (PERCENT, loc);
796 }
797 case '^':
798 if (peek_input () == '=')
799 {
800 // xor-assign?
801 skip_input ();
802 current_column += 2;
86f8e47f 803 loc += 1;
18f6990f
JP
804
805 return Token::make (CARET_EQ, loc);
806 }
807 else
808 {
809 // xor?
810 current_column++;
811 return Token::make (CARET, loc);
812 }
813 case '<':
814 if (peek_input () == '<')
815 {
816 if (peek_input (1) == '=')
817 {
818 // left-shift assign
819 skip_input (1);
820 current_column += 3;
86f8e47f 821 loc += 2;
18f6990f
JP
822
823 return Token::make (LEFT_SHIFT_EQ, loc);
824 }
825 else
826 {
827 // left-shift
828 skip_input ();
829 current_column += 2;
86f8e47f 830 loc += 1;
18f6990f
JP
831
832 return Token::make (LEFT_SHIFT, loc);
833 }
834 }
835 else if (peek_input () == '=')
836 {
837 // smaller than or equal to
838 skip_input ();
839 current_column += 2;
86f8e47f 840 loc += 1;
18f6990f
JP
841
842 return Token::make (LESS_OR_EQUAL, loc);
843 }
844 else
845 {
846 // smaller than
847 current_column++;
848 return Token::make (LEFT_ANGLE, loc);
849 }
850 break;
851 case '>':
852 if (peek_input () == '>')
853 {
854 if (peek_input (1) == '=')
855 {
856 // right-shift-assign
857 skip_input (1);
858 current_column += 3;
86f8e47f 859 loc += 2;
18f6990f
JP
860
861 return Token::make (RIGHT_SHIFT_EQ, loc);
862 }
863 else
864 {
865 // right-shift
866 skip_input ();
867 current_column += 2;
86f8e47f 868 loc += 1;
18f6990f
JP
869
870 return Token::make (RIGHT_SHIFT, loc);
871 }
872 }
873 else if (peek_input () == '=')
874 {
875 // larger than or equal to
876 skip_input ();
877 current_column += 2;
86f8e47f 878 loc += 1;
18f6990f
JP
879
880 return Token::make (GREATER_OR_EQUAL, loc);
881 }
882 else
883 {
884 // larger than
885 current_column++;
886 return Token::make (RIGHT_ANGLE, loc);
887 }
888 case ':':
889 if (peek_input () == ':')
890 {
891 // scope resolution ::
892 skip_input ();
893 current_column += 2;
86f8e47f 894 loc += 1;
18f6990f
JP
895
896 return Token::make (SCOPE_RESOLUTION, loc);
897 }
898 else
899 {
900 // single colon :
901 current_column++;
902 return Token::make (COLON, loc);
903 }
904 case '!':
905 // no special handling for macros in lexer?
906 if (peek_input () == '=')
907 {
908 // not equal boolean operator
909 skip_input ();
910 current_column += 2;
86f8e47f 911 loc += 1;
18f6990f
JP
912
913 return Token::make (NOT_EQUAL, loc);
914 }
915 else
916 {
917 // not equal unary operator
918 current_column++;
919
920 return Token::make (EXCLAM, loc);
921 }
922 case '?':
923 current_column++;
924 return Token::make (QUESTION_MARK, loc);
925 case '#':
926 current_column++;
927 return Token::make (HASH, loc);
928 case '[':
929 current_column++;
930 return Token::make (LEFT_SQUARE, loc);
931 case ']':
932 current_column++;
933 return Token::make (RIGHT_SQUARE, loc);
934 case '{':
935 current_column++;
936 return Token::make (LEFT_CURLY, loc);
937 case '}':
938 current_column++;
939 return Token::make (RIGHT_CURLY, loc);
940 case '@':
941 current_column++;
942 return Token::make (PATTERN_BIND, loc);
943 case '$':
944 current_column++;
945 return Token::make (DOLLAR_SIGN, loc);
946 case '~':
947 current_column++;
948 return Token::make (TILDE, loc);
949 case '\\':
950 current_column++;
951 return Token::make (BACKSLASH, loc);
952 case '`':
953 current_column++;
954 return Token::make (BACKTICK, loc);
955 case '|':
956 if (peek_input () == '=')
957 {
958 // bitwise or-assign?
959 skip_input ();
960 current_column += 2;
86f8e47f 961 loc += 1;
18f6990f
JP
962
963 return Token::make (PIPE_EQ, loc);
964 }
965 else if (peek_input () == '|')
966 {
967 // logical or
968 skip_input ();
969 current_column += 2;
86f8e47f 970 loc += 1;
18f6990f
JP
971
972 return Token::make (OR, loc);
973 }
974 else
975 {
976 // bitwise or
977 current_column++;
978
979 return Token::make (PIPE, loc);
980 }
981 case '&':
982 if (peek_input () == '=')
983 {
984 // bitwise and-assign?
985 skip_input ();
986 current_column += 2;
86f8e47f 987 loc += 1;
18f6990f
JP
988
989 return Token::make (AMP_EQ, loc);
990 }
991 else if (peek_input () == '&')
992 {
993 // logical and
994 skip_input ();
995 current_column += 2;
86f8e47f 996 loc += 1;
18f6990f
JP
997
998 return Token::make (LOGICAL_AND, loc);
999 }
1000 else
1001 {
1002 // bitwise and/reference
1003 current_column++;
1004
1005 return Token::make (AMP, loc);
1006 }
1007 case '.':
1008 if (peek_input () == '.')
1009 {
1010 if (peek_input (1) == '.')
1011 {
1012 // ellipsis
1013 skip_input (1);
1014 current_column += 3;
86f8e47f 1015 loc += 2;
18f6990f
JP
1016
1017 return Token::make (ELLIPSIS, loc);
1018 }
1019 else if (peek_input (1) == '=')
1020 {
1021 // ..=
1022 skip_input (1);
1023 current_column += 3;
86f8e47f 1024 loc += 2;
18f6990f
JP
1025
1026 return Token::make (DOT_DOT_EQ, loc);
1027 }
1028 else
1029 {
1030 // ..
1031 skip_input ();
1032 current_column += 2;
86f8e47f 1033 loc += 1;
18f6990f
JP
1034
1035 return Token::make (DOT_DOT, loc);
1036 }
1037 }
1038 else /*if (!ISDIGIT (peek_input ()))*/
1039 {
1040 // single dot .
1041 // Only if followed by a non-number - otherwise is float
1042 // nope, float cannot start with '.'.
1043 current_column++;
1044 return Token::make (DOT, loc);
1045 }
1046 }
1047 // TODO: special handling of _ in the lexer? instead of being identifier
1048
1049 // byte character, byte string and raw byte string literals
1050 if (current_char == 'b')
1051 {
1052 if (peek_input () == '\'')
1053 return parse_byte_char (loc);
1054 else if (peek_input () == '"')
1055 return parse_byte_string (loc);
1056 else if (peek_input () == 'r'
1057 && (peek_input (1) == '#' || peek_input (1) == '"'))
1058 return parse_raw_byte_string (loc);
1059 }
1060
1061 // raw identifiers and raw strings
1062 if (current_char == 'r')
1063 {
1064 int peek = peek_input ();
1065 int peek1 = peek_input (1);
1066
1067 if (peek == '#' && (ISALPHA (peek1) || peek1 == '_'))
1068 {
1069 TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
1070 if (raw_ident_ptr != nullptr)
1071 return raw_ident_ptr;
1072 else
1073 continue; /* input got parsed, it just wasn't valid. An error
1074 was produced. */
1075 }
1076 else
1077 {
1078 TokenPtr maybe_raw_string_ptr = maybe_parse_raw_string (loc);
1079 if (maybe_raw_string_ptr != nullptr)
1080 return maybe_raw_string_ptr;
1081 }
1082 }
1083
1084 // find identifiers and keywords
1085 if (ISALPHA (current_char) || current_char == '_')
1086 return parse_identifier_or_keyword (loc);
1087
1088 // int and float literals
1089 if (ISDIGIT (current_char))
1090 { // _ not allowed as first char
1091 if (current_char == '0'
1092 && is_non_decimal_int_literal_separator (peek_input ()))
1093 {
1094 // handle binary, octal, hex literals
1095 TokenPtr non_dec_int_lit_ptr
1096 = parse_non_decimal_int_literals (loc);
1097 if (non_dec_int_lit_ptr != nullptr)
1098 return non_dec_int_lit_ptr;
1099 }
1100 else
1101 {
1102 // handle decimals (integer or float)
1103 TokenPtr decimal_or_float_ptr = parse_decimal_int_or_float (loc);
1104 if (decimal_or_float_ptr != nullptr)
1105 return decimal_or_float_ptr;
1106 }
1107 }
1108
1109 // string literals
1110 if (current_char == '"')
1111 return parse_string (loc);
1112
1113 // char literals and lifetime names
1114 if (current_char == '\'')
1115 {
1116 TokenPtr char_or_lifetime_ptr = parse_char_or_lifetime (loc);
1117 if (char_or_lifetime_ptr != nullptr)
1118 return char_or_lifetime_ptr;
1119 }
1120
1121 // DEBUG: check for specific character problems:
1122 if (current_char == '0')
1123 rust_debug ("'0' uncaught before unexpected character");
1124 else if (current_char == ']')
1125 rust_debug ("']' uncaught before unexpected character");
1126 else if (current_char == 0x5d)
1127 rust_debug ("whatever 0x5d is (not '0' or ']') uncaught before "
1128 "unexpected character");
1129
1130 // didn't match anything so error
1131 rust_error_at (loc, "unexpected character %<%x%>", current_char);
1132 current_column++;
1133 }
1134}
1135
1136// Parses in a type suffix.
1137std::pair<PrimitiveCoreType, int>
1138Lexer::parse_in_type_suffix ()
1139{
1140 std::string suffix;
1141 suffix.reserve (5);
1142
1143 int additional_length_offset = 0;
1144
1145 // get suffix
1146 while (ISALPHA (current_char) || ISDIGIT (current_char)
1147 || current_char == '_')
1148 {
1149 if (current_char == '_')
1150 {
1151 // don't add _ to suffix
1152 skip_input ();
1153 current_char = peek_input ();
1154
1155 additional_length_offset++;
1156
1157 continue;
1158 }
1159
1160 additional_length_offset++;
1161
1162 suffix += current_char;
1163 skip_input ();
1164 current_char = peek_input ();
1165 }
1166
1167 if (suffix.empty ())
1168 {
1169 // no type suffix: do nothing but also no error
1170 return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset);
1171 }
1172 else if (suffix == "f32")
1173 {
1174 return std::make_pair (CORETYPE_F32, additional_length_offset);
1175 }
1176 else if (suffix == "f64")
1177 {
1178 return std::make_pair (CORETYPE_F64, additional_length_offset);
1179 }
1180 else if (suffix == "i8")
1181 {
1182 return std::make_pair (CORETYPE_I8, additional_length_offset);
1183 }
1184 else if (suffix == "i16")
1185 {
1186 return std::make_pair (CORETYPE_I16, additional_length_offset);
1187 }
1188 else if (suffix == "i32")
1189 {
1190 return std::make_pair (CORETYPE_I32, additional_length_offset);
1191 }
1192 else if (suffix == "i64")
1193 {
1194 return std::make_pair (CORETYPE_I64, additional_length_offset);
1195 }
1196 else if (suffix == "i128")
1197 {
1198 return std::make_pair (CORETYPE_I128, additional_length_offset);
1199 }
1200 else if (suffix == "isize")
1201 {
1202 return std::make_pair (CORETYPE_ISIZE, additional_length_offset);
1203 }
1204 else if (suffix == "u8")
1205 {
1206 return std::make_pair (CORETYPE_U8, additional_length_offset);
1207 }
1208 else if (suffix == "u16")
1209 {
1210 return std::make_pair (CORETYPE_U16, additional_length_offset);
1211 }
1212 else if (suffix == "u32")
1213 {
1214 return std::make_pair (CORETYPE_U32, additional_length_offset);
1215 }
1216 else if (suffix == "u64")
1217 {
1218 return std::make_pair (CORETYPE_U64, additional_length_offset);
1219 }
1220 else if (suffix == "u128")
1221 {
1222 return std::make_pair (CORETYPE_U128, additional_length_offset);
1223 }
1224 else if (suffix == "usize")
1225 {
1226 return std::make_pair (CORETYPE_USIZE, additional_length_offset);
1227 }
1228 else
1229 {
1230 rust_error_at (get_current_location (), "unknown number suffix %qs",
1231 suffix.c_str ());
1232
1233 return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset);
1234 }
1235}
1236
1237// Parses in the exponent part (if any) of a float literal.
1238std::pair<std::string, int>
1239Lexer::parse_in_exponent_part ()
1240{
1241 int additional_length_offset = 0;
1242 std::string str;
1243 if (current_char == 'E' || current_char == 'e')
1244 {
1245 // add exponent to string as strtod works with it
1246 str += current_char;
1247 skip_input ();
1248 current_char = peek_input ();
1249
1250 additional_length_offset++;
1251
1252 // special - and + handling
1253 if (current_char == '-')
1254 {
1255 str += '-';
1256
1257 skip_input ();
1258 current_char = peek_input ();
1259
1260 additional_length_offset++;
1261 }
1262 else if (current_char == '+')
1263 {
1264 // don't add + but still skip input
1265 skip_input ();
1266 current_char = peek_input ();
1267
1268 additional_length_offset++;
1269 }
1270
1271 // parse another decimal number for exponent
1272 auto str_length = parse_in_decimal ();
1273 str += std::get<0> (str_length);
1274 additional_length_offset += std::get<1> (str_length);
1275 }
1276 return std::make_pair (str, additional_length_offset);
1277}
1278
1279// Parses a decimal integer.
1280std::tuple<std::string, int, bool>
1281Lexer::parse_in_decimal ()
1282{
1283 /* A pure decimal contains only digits. */
1284 bool pure_decimal = true;
1285 int additional_length_offset = 0;
1286 std::string str;
1287 while (ISDIGIT (current_char) || current_char == '_')
1288 {
1289 if (current_char == '_')
1290 {
1291 pure_decimal = false;
1292 // don't add _ to number
1293 skip_input ();
1294 current_char = peek_input ();
1295
1296 additional_length_offset++;
1297
1298 continue;
1299 }
1300
1301 additional_length_offset++;
1302
1303 str += current_char;
1304 skip_input ();
1305 current_char = peek_input ();
1306 }
1307 return std::make_tuple (str, additional_length_offset, pure_decimal);
1308}
1309
1310/* Parses escapes (and string continues) in "byte" strings and characters. Does
1311 * not support unicode. */
1312std::tuple<char, int, bool>
1313Lexer::parse_escape (char opening_char)
1314{
1315 int additional_length_offset = 0;
1316 char output_char = 0;
1317
1318 // skip to actual letter
1319 skip_input ();
1320 current_char = peek_input ();
1321 additional_length_offset++;
1322
1323 switch (current_char)
1324 {
1325 case 'x': {
1326 auto hex_escape_pair = parse_partial_hex_escape ();
1327 long hexLong = hex_escape_pair.first;
1328 additional_length_offset += hex_escape_pair.second;
1329
1330 if (hexLong > 255 || hexLong < 0)
1331 rust_error_at (
1332 get_current_location (),
1333 "byte \\x escape %<\\x%x%> out of range - allows up to %<\\xFF%>",
1334 static_cast<unsigned int> (hexLong));
1335 /* TODO: restore capital for escape output - gcc pretty-printer doesn't
1336 * support %X directly */
1337 char hexChar = static_cast<char> (hexLong);
1338
1339 output_char = hexChar;
1340 }
1341 break;
1342 case 'n':
1343 output_char = '\n';
1344 break;
1345 case 'r':
1346 output_char = '\r';
1347 break;
1348 case 't':
1349 output_char = '\t';
1350 break;
1351 case '\\':
1352 output_char = '\\';
1353 break;
1354 case '0':
1355 output_char = '\0';
1356 break;
1357 case '\'':
1358 output_char = '\'';
1359 break;
1360 case '"':
1361 output_char = '"';
1362 break;
1363 case 'u':
1364 rust_error_at (get_current_location (),
1365 "cannot have a unicode escape \\u in a byte %s",
1366 opening_char == '\'' ? "character" : "string");
1367 // Try to parse it anyway, just to skip it
1368 parse_partial_unicode_escape ();
1369 return std::make_tuple (output_char, additional_length_offset, false);
1370 case '\r':
1371 case '\n':
1372 // string continue
1373 return std::make_tuple (0, parse_partial_string_continue (), true);
1374 default:
1375 rust_error_at (get_current_location (),
1376 "unknown escape sequence %<\\%c%>", current_char);
1377 // returns false if no parsing could be done
1378 // return false;
1379 return std::make_tuple (output_char, additional_length_offset, false);
1380 break;
1381 }
1382 // all non-special cases (string continue) should skip their used char
1383 skip_input ();
1384 current_char = peek_input ();
1385 additional_length_offset++;
1386
1387 // returns true if parsing was successful
1388 // return true;
1389 return std::make_tuple (output_char, additional_length_offset, false);
1390}
1391
1392/* Parses an escape (or string continue) in a string or character. Supports
1393 * unicode escapes. */
1394std::tuple<Codepoint, int, bool>
9f455ed8 1395Lexer::parse_utf8_escape ()
18f6990f
JP
1396{
1397 Codepoint output_char;
1398 int additional_length_offset = 0;
1399
1400 // skip to actual letter
1401 skip_input ();
1402 current_char = peek_input ();
1403 additional_length_offset++;
1404
1405 switch (current_char)
1406 {
1407 case 'x': {
1408 auto hex_escape_pair = parse_partial_hex_escape ();
1409 long hexLong = hex_escape_pair.first;
1410 additional_length_offset += hex_escape_pair.second;
1411
1412 if (hexLong > 127 || hexLong < 0)
1413 rust_error_at (
1414 get_current_location (),
1415 "ascii \\x escape %<\\x%x%> out of range - allows up to %<\\x7F%>",
1416 static_cast<unsigned int> (hexLong));
1417 /* TODO: restore capital for escape output - gcc pretty-printer doesn't
1418 * support %X directly */
1419 char hexChar = static_cast<char> (hexLong);
1420
1421 output_char = hexChar;
1422 }
1423 break;
1424 case 'n':
1425 output_char = '\n';
1426 break;
1427 case 'r':
1428 output_char = '\r';
1429 break;
1430 case 't':
1431 output_char = '\t';
1432 break;
1433 case '\\':
1434 output_char = '\\';
1435 break;
1436 case '0':
1437 output_char = '\0';
1438 break;
1439 case '\'':
1440 output_char = '\'';
1441 break;
1442 case '"':
1443 output_char = '"';
1444 break;
1445 case 'u': {
1446 auto unicode_escape_pair = parse_partial_unicode_escape ();
1447 output_char = unicode_escape_pair.first;
1448 additional_length_offset += unicode_escape_pair.second;
1449
1450 return std::make_tuple (output_char, additional_length_offset, false);
1451 }
1452 break;
1453 case '\r':
1454 case '\n':
1455 // string continue
1456 return std::make_tuple (0, parse_partial_string_continue (), true);
1457 default:
1458 rust_error_at (get_current_location (),
1459 "unknown escape sequence %<\\%c%>", current_char);
1460 // returns false if no parsing could be done
1461 // return false;
1462 return std::make_tuple (output_char, additional_length_offset, false);
1463 break;
1464 }
1465 /* all non-special cases (unicode, string continue) should skip their used
1466 * char */
1467 skip_input ();
1468 current_char = peek_input ();
1469 additional_length_offset++;
1470
1471 // returns true if parsing was successful
1472 // return true;
1473 return std::make_tuple (output_char, additional_length_offset, false);
1474}
1475
1476// Parses the body of a string continue that has been found in an escape.
1477int
1478Lexer::parse_partial_string_continue ()
1479{
1480 int additional_length_offset = 1;
1481
1482 // string continue
1483 while (is_whitespace (current_char))
1484 {
1485 if (current_char == '\n')
1486 {
1487 current_line++;
1488 current_column = 1;
1489 // tell line_table that new line starts
1490 start_line (current_line, max_column_hint);
1491
1492 // reset "length"
1493 additional_length_offset = 1;
1494
1495 // get next char
1496 skip_input ();
1497 current_char = peek_input ();
1498
1499 continue;
1500 }
1501
1502 skip_input ();
1503 current_char = peek_input ();
1504 additional_length_offset++;
1505 }
1506
1507 return additional_length_offset;
1508}
1509
1510/* Parses the body of a '\x' escape. Note that it does not check that the number
1511 * is valid and smaller than 255. */
1512std::pair<long, int>
1513Lexer::parse_partial_hex_escape ()
1514{
1515 // hex char string (null-terminated)
1516 char hexNum[3] = {0, 0, 0};
1517
1518 // first hex char
1519 current_char = peek_input (1);
1520 int additional_length_offset = 1;
1521
1522 if (!is_x_digit (current_char))
1523 {
1524 rust_error_at (get_current_location (),
1525 "invalid character %<\\x%c%> in \\x sequence",
1526 current_char);
1527 return std::make_pair (0, 0);
1528 }
1529 hexNum[0] = current_char;
1530
1531 // second hex char
1532 skip_input ();
1533 current_char = peek_input (1);
1534 additional_length_offset++;
1535
1536 if (!is_x_digit (current_char))
1537 {
1538 rust_error_at (get_current_location (),
1539 "invalid character %<\\x%c%c%> in \\x sequence", hexNum[0],
1540 current_char);
1541 return std::make_pair (0, 1);
1542 }
1543 skip_input ();
1544 hexNum[1] = current_char;
1545
1546 long hexLong = std::strtol (hexNum, nullptr, 16);
1547
1548 return std::make_pair (hexLong, additional_length_offset);
1549}
1550
1551// Parses the body of a unicode escape.
1552std::pair<Codepoint, int>
1553Lexer::parse_partial_unicode_escape ()
1554{
1555 skip_input ();
1556 current_char = peek_input ();
1557 int additional_length_offset = 0;
1558
1559 if (current_char != '{')
1560 {
1561 rust_error_at (get_current_location (),
1562 "unicode escape should start with %<{%>");
1563 /* Skip what should probaby have been between brackets. */
1564 while (is_x_digit (current_char) || current_char == '_')
1565 {
1566 skip_input ();
1567 current_char = peek_input ();
1568 additional_length_offset++;
1569 }
1570 return std::make_pair (Codepoint (0), additional_length_offset);
1571 }
1572
1573 skip_input ();
1574 current_char = peek_input ();
1575 additional_length_offset++;
1576
1577 if (current_char == '_')
1578 {
1579 rust_error_at (get_current_location (),
1580 "unicode escape cannot start with %<_%>");
1581 skip_input ();
1582 current_char = peek_input ();
1583 additional_length_offset++;
1584 // fallthrough and try to parse the rest anyway
1585 }
1586
1587 // parse unicode escape - 1-6 hex digits
1588 std::string num_str;
1589 num_str.reserve (6);
1590
1591 // loop through to add entire hex number to string
1592 while (is_x_digit (current_char) || current_char == '_')
1593 {
1594 if (current_char == '_')
1595 {
1596 // don't add _ to number
1597 skip_input ();
1598 current_char = peek_input ();
1599
1600 additional_length_offset++;
1601
1602 continue;
1603 }
1604
1605 additional_length_offset++;
1606
1607 // add raw hex numbers
1608 num_str += current_char;
1609
1610 skip_input ();
1611 current_char = peek_input ();
1612 }
1613
1614 if (current_char == '}')
1615 {
1616 skip_input ();
1617 current_char = peek_input ();
1618 additional_length_offset++;
1619 }
1620 else
1621 {
1622 // actually an error, but allow propagation anyway Assume that
1623 // wrong bracketm whitespace or single/double quotes are wrong
1624 // termination, otherwise it is a wrong character, then skip to the actual
1625 // terminator.
1626 if (current_char == '{' || is_whitespace (current_char)
1627 || current_char == '\'' || current_char == '"')
1628 {
1629 rust_error_at (get_current_location (),
1630 "expected terminating %<}%> in unicode escape");
1631 return std::make_pair (Codepoint (0), additional_length_offset);
1632 }
1633 else
1634 {
1635 rust_error_at (get_current_location (),
1636 "invalid character %<%c%> in unicode escape",
1637 current_char);
1638 while (current_char != '}' && current_char != '{'
1639 && !is_whitespace (current_char) && current_char != '\''
1640 && current_char != '"')
1641 {
1642 skip_input ();
1643 current_char = peek_input ();
1644 additional_length_offset++;
1645 }
1646 // Consume the actual closing bracket if found
1647 if (current_char == '}')
1648 {
1649 skip_input ();
1650 current_char = peek_input ();
1651 additional_length_offset++;
1652 }
1653 return std::make_pair (Codepoint (0), additional_length_offset);
1654 }
1655 }
1656
1657 // ensure 1-6 hex characters
1658 if (num_str.length () > 6 || num_str.length () < 1)
1659 {
1660 rust_error_at (get_current_location (),
1661 "unicode escape should be between 1 and 6 hex "
1662 "characters; it is %lu",
1663 (unsigned long) num_str.length ());
1664 // return false;
1665 return std::make_pair (Codepoint (0), additional_length_offset);
1666 }
1667
1668 unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16);
1669
1670 if (hex_num > 0xd7ff && hex_num < 0xe000)
1671 {
1672 rust_error_at (
1673 get_current_location (),
1674 "unicode escape cannot be a surrogate value (D800 to DFFF)");
1675 return std::make_pair (Codepoint (0), additional_length_offset);
1676 }
1677
1678 if (hex_num > 0x10ffff)
1679 {
1680 rust_error_at (get_current_location (),
1681 "unicode escape cannot be larger than 10FFFF");
1682 return std::make_pair (Codepoint (0), additional_length_offset);
1683 }
1684
1685 // return true;
1686 return std::make_pair (Codepoint (static_cast<uint32_t> (hex_num)),
1687 additional_length_offset);
1688}
1689
1690// Parses a byte character.
1691TokenPtr
1692Lexer::parse_byte_char (Location loc)
1693{
1694 skip_input ();
1695 current_column++;
1696 // make current char the next character
1697 current_char = peek_input ();
1698
1699 int length = 1;
1700
1701 // char to save
1702 char byte_char = 0;
1703
1704 // detect escapes
1705 if (current_char == '\\')
1706 {
1707 auto escape_length_pair = parse_escape ('\'');
1708 byte_char = std::get<0> (escape_length_pair);
1709 length += std::get<1> (escape_length_pair);
1710
1711 current_char = peek_input ();
1712
1713 if (current_char != '\'')
1714 {
1715 rust_error_at (get_current_location (), "unclosed %<byte char%>");
1716 }
1717
1718 skip_input ();
1719 current_char = peek_input ();
1720 length++; // go to next char
1721 }
1722 else if (current_char != '\'')
1723 {
1724 // otherwise, get character from direct input character
1725 byte_char = current_char;
1726
1727 skip_input ();
1728 current_char = peek_input ();
1729 length++;
1730
1731 if (current_char != '\'')
1732 {
1733 rust_error_at (get_current_location (), "unclosed %<byte char%>");
1734 }
1735
1736 skip_input ();
1737 current_char = peek_input ();
1738 length++; // go to next char
1739 }
1740 else
1741 {
1742 rust_error_at (get_current_location (),
1743 "no character inside %<%> for %<byte char%>");
1744 }
1745
1746 current_column += length;
1747
86f8e47f 1748 loc += length - 1;
1749
18f6990f
JP
1750 return Token::make_byte_char (loc, byte_char);
1751}
1752
1753// Parses a byte string.
1754TokenPtr
1755Lexer::parse_byte_string (Location loc)
1756{
1757 // byte string
1758
1759 // skip quote character
1760 skip_input ();
1761 current_column++;
1762
1763 std::string str;
1764 str.reserve (16); // some sensible default
1765
1766 int length = 1;
1767 current_char = peek_input ();
1768
1769 while (current_char != '"' && current_char != EOF)
1770 {
1771 if (current_char == '\\')
1772 {
1773 auto escape_length_pair = parse_escape ('"');
1774 char output_char = std::get<0> (escape_length_pair);
1775
1776 if (output_char == 0 && std::get<2> (escape_length_pair))
1777 length = std::get<1> (escape_length_pair) - 1;
1778 else
1779 length += std::get<1> (escape_length_pair);
1780
1781 if (output_char != 0 || !std::get<2> (escape_length_pair))
1782 str += output_char;
1783
1784 continue;
1785 }
1786
1787 length++;
1788
1789 str += current_char;
1790 skip_input ();
1791 current_char = peek_input ();
1792 }
1793
1794 current_column += length;
1795
1796 if (current_char == '"')
1797 {
1798 current_column++;
1799
1800 skip_input ();
1801 current_char = peek_input ();
1802 }
1803 else if (current_char == EOF)
1804 {
1805 rust_error_at (get_current_location (), "unended byte string literal");
1806 return Token::make (END_OF_FILE, get_current_location ());
1807 }
1808 else
1809 {
1810 gcc_unreachable ();
1811 }
1812
1813 str.shrink_to_fit ();
86f8e47f 1814 loc += str.size () - 1;
18f6990f
JP
1815
1816 return Token::make_byte_string (loc, std::move (str));
1817}
1818
1819// Parses a raw byte string.
1820TokenPtr
1821Lexer::parse_raw_byte_string (Location loc)
1822{
1823 // raw byte string literals
1824 std::string str;
1825 str.reserve (16); // some sensible default
1826
1827 int length = 1;
1828 int hash_count = 0;
1829
1830 // get hash count at beginnning
1831 skip_input ();
1832 current_char = peek_input ();
1833 length++;
1834 while (current_char == '#')
1835 {
1836 hash_count++;
1837 length++;
1838
1839 skip_input ();
1840 current_char = peek_input ();
1841 }
1842
1843 if (current_char != '"')
1844 {
1845 rust_error_at (get_current_location (),
1846 "raw byte string has no opening %<\"%>");
1847 }
1848
1849 skip_input ();
1850 current_char = peek_input ();
1851 length++;
1852
1853 while (true)
1854 {
1855 if (current_char == '"')
1856 {
1857 bool enough_hashes = true;
1858
1859 for (int i = 0; i < hash_count; i++)
1860 {
1861 if (peek_input (i + 1) != '#')
1862 {
1863 enough_hashes = false;
1864 break;
1865 }
1866 }
1867
1868 if (enough_hashes)
1869 {
1870 // skip enough input and peek enough input
1871 skip_input (hash_count);
1872 current_char = peek_input ();
1873 length += hash_count + 1;
1874 break;
1875 }
1876 }
1877
1878 if ((unsigned char) current_char > 127)
1879 {
1880 rust_error_at (get_current_location (),
1881 "character %<%c%> in raw byte string out of range",
1882 current_char);
1883 current_char = 0;
1884 }
1885
1886 length++;
1887
1888 str += current_char;
1889 skip_input ();
1890 current_char = peek_input ();
1891 }
1892
1893 current_column += length;
1894
86f8e47f 1895 loc += length - 1;
1896
18f6990f
JP
1897 str.shrink_to_fit ();
1898
1899 return Token::make_byte_string (loc, std::move (str));
1900}
1901
1902// Parses a raw identifier.
1903TokenPtr
1904Lexer::parse_raw_identifier (Location loc)
1905{
1906 // raw identifier
1907 std::string str;
1908 str.reserve (16); // default
1909
1910 skip_input ();
1911 current_char = peek_input ();
1912
1913 current_column += 2;
1914
1915 bool first_is_underscore = current_char == '_';
1916
1917 int length = 0;
1918 current_char = peek_input ();
1919 // loop through entire name
1920 while (ISALPHA (current_char) || ISDIGIT (current_char)
1921 || current_char == '_')
1922 {
1923 length++;
1924
1925 str += current_char;
1926 skip_input ();
1927 current_char = peek_input ();
1928 }
1929
1930 current_column += length;
1931
1932 // if just a single underscore, not an identifier
1933 if (first_is_underscore && length == 1)
1934 rust_error_at (get_current_location (),
1935 "%<_%> is not a valid raw identifier");
1936
1937 if (str == "crate" || str == "extern" || str == "self" || str == "super"
1938 || str == "Self")
1939 {
1940 rust_error_at (get_current_location (),
1941 "%qs is a forbidden raw identifier", str.c_str ());
1942
1943 return nullptr;
1944 }
1945 else
1946 {
1947 str.shrink_to_fit ();
86f8e47f 1948 loc += length - 1;
18f6990f
JP
1949
1950 return Token::make_identifier (loc, std::move (str));
1951 }
1952}
1953
1954// skip broken string input (unterminated strings)
1955void
1956Lexer::skip_broken_string_input (int current_char)
1957{
1958 while (current_char != '"' && current_char != EOF)
1959 {
1960 if (current_char == '\n')
1961 {
1962 current_line++;
1963 current_column = 1;
1964 }
1965 else
1966 {
1967 current_column++;
1968 }
1969 skip_input ();
1970 current_char = peek_input ();
1971 }
1972 if (current_char == '"')
1973 {
1974 current_column++;
1975
1976 skip_input ();
1977 current_char = peek_input ();
1978 }
1979 rust_debug ("skipped to %d:%d due to bad quotes", current_line,
1980 current_column);
1981}
1982
1983// Parses a unicode string.
1984TokenPtr
1985Lexer::parse_string (Location loc)
1986{
1987 Codepoint current_char32;
1988
1989 std::string str;
1990 str.reserve (16); // some sensible default
1991
1992 int length = 1;
1993 current_char32 = peek_codepoint_input ();
1994
1995 // FIXME: This fails if the input ends. How do we check for EOF?
1996 while (current_char32.value != '"' && !current_char32.is_eof ())
1997 {
1998 if (current_char32.value == '\\')
1999 {
2000 // parse escape
9f455ed8 2001 auto utf8_escape_pair = parse_utf8_escape ();
18f6990f
JP
2002 current_char32 = std::get<0> (utf8_escape_pair);
2003
2004 if (current_char32 == Codepoint (0) && std::get<2> (utf8_escape_pair))
2005 length = std::get<1> (utf8_escape_pair) - 1;
2006 else
2007 length += std::get<1> (utf8_escape_pair);
2008
2009 if (current_char32 != Codepoint (0)
2010 || !std::get<2> (utf8_escape_pair))
2011 str += current_char32;
2012
2013 // required as parsing utf8 escape only changes current_char
2014 current_char32 = peek_codepoint_input ();
2015
2016 continue;
2017 }
2018
2019 length += get_input_codepoint_length ();
2020
2021 str += current_char32;
2022 skip_codepoint_input ();
2023 current_char32 = peek_codepoint_input ();
2024 }
2025
2026 current_column += length;
2027
2028 if (current_char32.value == '"')
2029 {
2030 current_column++;
2031
2032 skip_input ();
2033 current_char = peek_input ();
2034 }
2035 else if (current_char32.is_eof ())
2036 {
2037 rust_error_at (get_current_location (), "unended string literal");
2038 return Token::make (END_OF_FILE, get_current_location ());
2039 }
2040 else
2041 {
2042 gcc_unreachable ();
2043 }
2044
2045 str.shrink_to_fit ();
86f8e47f 2046 loc += length - 1;
2047
18f6990f
JP
2048 return Token::make_string (loc, std::move (str));
2049}
2050
2051// Parses an identifier or keyword.
2052TokenPtr
2053Lexer::parse_identifier_or_keyword (Location loc)
2054{
2055 std::string str;
2056 str.reserve (16); // default
2057 str += current_char;
2058
2059 bool first_is_underscore = current_char == '_';
2060
2061 int length = 1;
2062 current_char = peek_input ();
2063 // loop through entire name
2064 while (ISALPHA (current_char) || ISDIGIT (current_char)
2065 || current_char == '_')
2066 {
2067 length++;
2068
2069 str += current_char;
2070 skip_input ();
2071 current_char = peek_input ();
2072 }
2073
2074 current_column += length;
2075
2076 // if just a single underscore, not an identifier
2077 if (first_is_underscore && length == 1)
2078 return Token::make (UNDERSCORE, loc);
2079
2080 str.shrink_to_fit ();
2081
86f8e47f 2082 loc += length - 1;
2083
18f6990f
JP
2084 TokenId keyword = classify_keyword (str);
2085 if (keyword == IDENTIFIER)
2086 return Token::make_identifier (loc, std::move (str));
2087 else
2088 return Token::make (keyword, loc);
2089}
2090
2091// Possibly returns a raw string token if it exists - otherwise returns null.
2092TokenPtr
2093Lexer::maybe_parse_raw_string (Location loc)
2094{
2095 int peek_index = 0;
2096 while (peek_input (peek_index) == '#')
2097 peek_index++;
2098
2099 if (peek_input (peek_index) == '"')
2100 return parse_raw_string (loc, peek_index);
2101 else
2102 return nullptr;
2103}
2104
2105// Returns a raw string token.
2106TokenPtr
2107Lexer::parse_raw_string (Location loc, int initial_hash_count)
2108{
2109 // raw string literals
2110 std::string str;
2111 str.reserve (16); // some sensible default
2112
2113 int length = 1 + initial_hash_count;
2114
2115 if (initial_hash_count > 0)
2116 skip_input (initial_hash_count - 1);
2117
2118 current_char = peek_input ();
2119
2120 if (current_char != '"')
2121 rust_error_at (get_current_location (), "raw string has no opening %<\"%>");
2122
2123 length++;
2124 skip_input ();
2125 Codepoint current_char32 = peek_codepoint_input ();
2126
2127 while (!current_char32.is_eof ())
2128 {
2129 if (current_char32.value == '"')
2130 {
2131 bool enough_hashes = true;
2132
2133 for (int i = 0; i < initial_hash_count; i++)
2134 {
2135 if (peek_input (i + 1) != '#')
2136 {
2137 enough_hashes = false;
2138 break;
2139 }
2140 }
2141
2142 if (enough_hashes)
2143 {
2144 // skip enough input and peek enough input
2145 skip_input (initial_hash_count);
2146 current_char = peek_input ();
2147 length += initial_hash_count + 1;
2148 break;
2149 }
2150 }
2151
2152 length++;
2153
2154 str += current_char32;
2155 skip_codepoint_input ();
2156 current_char32 = peek_codepoint_input ();
2157 }
2158
2159 current_column += length;
2160
86f8e47f 2161 loc += length - 1;
2162
18f6990f
JP
2163 str.shrink_to_fit ();
2164
2165 return Token::make_string (loc, std::move (str));
2166}
2167
2168template <typename IsDigitFunc>
2169TokenPtr
2170Lexer::parse_non_decimal_int_literal (Location loc, IsDigitFunc is_digit_func,
2171 std::string existent_str, int base)
2172{
2173 int length = 1;
2174
2175 skip_input ();
2176 current_char = peek_input ();
2177
2178 length++;
2179
2180 // loop through to add entire number to string
2181 while (is_digit_func (current_char) || current_char == '_')
2182 {
2183 if (current_char == '_')
2184 {
2185 // don't add _ to number
2186 skip_input ();
2187 current_char = peek_input ();
2188
2189 length++;
2190
2191 continue;
2192 }
2193
2194 length++;
2195
2196 // add raw numbers
2197 existent_str += current_char;
2198 skip_input ();
2199 current_char = peek_input ();
2200 }
2201
2202 // convert value to decimal representation
2203 long dec_num = std::strtol (existent_str.c_str (), nullptr, base);
2204
2205 existent_str = std::to_string (dec_num);
2206
2207 // parse in type suffix if it exists
2208 auto type_suffix_pair = parse_in_type_suffix ();
2209 PrimitiveCoreType type_hint = type_suffix_pair.first;
2210 length += type_suffix_pair.second;
2211
2212 current_column += length;
2213
2214 if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64)
2215 {
2216 rust_error_at (get_current_location (),
2217 "invalid type suffix %qs for integer (%s) literal",
2218 get_type_hint_string (type_hint),
2219 base == 16
2220 ? "hex"
2221 : (base == 8 ? "octal"
2222 : (base == 2 ? "binary"
2223 : "<insert unknown base>")));
2224 return nullptr;
2225 }
86f8e47f 2226
2227 loc += length - 1;
2228
18f6990f
JP
2229 return Token::make_int (loc, std::move (existent_str), type_hint);
2230}
2231
2232// Parses a hex, binary or octal int literal.
2233TokenPtr
2234Lexer::parse_non_decimal_int_literals (Location loc)
2235{
2236 std::string str;
2237 str.reserve (16); // some sensible default
2238 str += current_char;
2239
2240 current_char = peek_input ();
2241
2242 if (current_char == 'x')
2243 {
2244 // hex (integer only)
2245 return parse_non_decimal_int_literal (loc, is_x_digit, str + "x", 16);
2246 }
2247 else if (current_char == 'o')
2248 {
2249 // octal (integer only)
2250 return parse_non_decimal_int_literal (loc, is_octal_digit,
2251 std::move (str), 8);
2252 }
2253 else if (current_char == 'b')
2254 {
2255 // binary (integer only)
2256 return parse_non_decimal_int_literal (loc, is_bin_digit, std::move (str),
2257 2);
2258 }
2259 else
2260 {
2261 return nullptr;
2262 }
2263}
2264
2265// Parses a decimal-based int literal or float literal.
2266TokenPtr
2267Lexer::parse_decimal_int_or_float (Location loc)
2268{
2269 std::string str;
2270 str.reserve (16); // some sensible default
2271 str += current_char;
2272
2273 int length = 1;
2274 bool first_zero = current_char == '0';
2275
2276 current_char = peek_input ();
2277
2278 // parse initial decimal integer (or first integer part of float) literal
2279 auto initial_decimal = parse_in_decimal ();
2280 str += std::get<0> (initial_decimal);
2281 length += std::get<1> (initial_decimal);
2282
2283 // detect float literal
2284 if (current_char == '.' && is_float_digit (peek_input (1)))
2285 {
2286 // float with a '.', parse another decimal into it
2287
2288 // add . to str
2289 str += current_char;
2290 skip_input ();
2291 current_char = peek_input ();
2292 length++;
2293
2294 // parse another decimal number for float
2295 auto second_decimal = parse_in_decimal ();
2296 str += std::get<0> (second_decimal);
2297 length += std::get<1> (second_decimal);
2298
2299 // parse in exponent part if it exists
2300 auto exponent_pair = parse_in_exponent_part ();
2301 str += exponent_pair.first;
2302 length += exponent_pair.second;
2303
2304 // parse in type suffix if it exists
2305 auto type_suffix_pair = parse_in_type_suffix ();
2306 PrimitiveCoreType type_hint = type_suffix_pair.first;
2307 length += type_suffix_pair.second;
2308
2309 if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
2310 && type_hint != CORETYPE_UNKNOWN)
2311 {
2312 rust_error_at (get_current_location (),
2313 "invalid type suffix %qs for floating-point literal",
2314 get_type_hint_string (type_hint));
2315 // ignore invalid type suffix as everything else seems fine
2316 type_hint = CORETYPE_UNKNOWN;
2317 }
2318
2319 current_column += length;
2320
86f8e47f 2321 loc += length - 1;
2322
18f6990f
JP
2323 str.shrink_to_fit ();
2324 return Token::make_float (loc, std::move (str), type_hint);
2325 }
2326 else if (current_char == '.' && check_valid_float_dot_end (peek_input (1)))
2327 {
2328 // float that is just an integer with a terminating '.' character
2329
2330 // add . to str
2331 str += current_char;
2332 skip_input ();
2333 current_char = peek_input ();
2334 length++;
2335
2336 // add a '0' after the . to prevent ambiguity
2337 str += '0';
2338
2339 // type hint not allowed
2340
2341 current_column += length;
2342
86f8e47f 2343 loc += length - 1;
2344
18f6990f
JP
2345 str.shrink_to_fit ();
2346 return Token::make_float (loc, std::move (str), CORETYPE_UNKNOWN);
2347 }
2348 else if (current_char == 'E' || current_char == 'e')
2349 {
2350 // exponent float with no '.' character
2351
2352 // parse exponent part
2353 auto exponent_pair = parse_in_exponent_part ();
2354 str += exponent_pair.first;
2355 length += exponent_pair.second;
2356
2357 // parse in type suffix if it exists
2358 auto type_suffix_pair = parse_in_type_suffix ();
2359 PrimitiveCoreType type_hint = type_suffix_pair.first;
2360 length += type_suffix_pair.second;
2361
2362 if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
2363 && type_hint != CORETYPE_UNKNOWN)
2364 {
2365 rust_error_at (get_current_location (),
2366 "invalid type suffix %qs for floating-point literal",
2367 get_type_hint_string (type_hint));
2368 // ignore invalid type suffix as everything else seems fine
2369 type_hint = CORETYPE_UNKNOWN;
2370 }
2371
2372 current_column += length;
2373
86f8e47f 2374 loc += length - 1;
2375
18f6990f
JP
2376 str.shrink_to_fit ();
2377 return Token::make_float (loc, std::move (str), type_hint);
2378 }
2379 else
2380 {
2381 // is an integer
2382
2383 // parse in type suffix if it exists
2384 auto type_suffix_pair = parse_in_type_suffix ();
2385 PrimitiveCoreType type_hint = type_suffix_pair.first;
2386 /* A "real" pure decimal doesn't have a suffix and no zero prefix. */
2387 if (type_hint == CORETYPE_UNKNOWN)
2388 {
2389 bool pure_decimal = std::get<2> (initial_decimal);
2390 if (pure_decimal && (!first_zero || str.size () == 1))
2391 type_hint = CORETYPE_PURE_DECIMAL;
2392 }
2393 length += type_suffix_pair.second;
2394
2395 current_column += length;
2396
86f8e47f 2397 loc += length - 1;
2398
18f6990f
JP
2399 str.shrink_to_fit ();
2400 return Token::make_int (loc, std::move (str), type_hint);
2401 }
2402}
2403
2404TokenPtr
2405Lexer::parse_char_or_lifetime (Location loc)
2406{
2407 Codepoint current_char32;
2408
2409 int length = 1;
2410
2411 current_char32 = peek_codepoint_input ();
2412 if (current_char32.is_eof ())
2413 return nullptr;
2414
2415 // parse escaped char literal
2416 if (current_char32.value == '\\')
2417 {
2418 // parse escape
9f455ed8 2419 auto utf8_escape_pair = parse_utf8_escape ();
18f6990f
JP
2420 current_char32 = std::get<0> (utf8_escape_pair);
2421 length += std::get<1> (utf8_escape_pair);
2422
2423 if (peek_codepoint_input ().value != '\'')
2424 {
2425 rust_error_at (get_current_location (), "unended character literal");
2426 }
2427 else
2428 {
2429 skip_codepoint_input ();
2430 current_char = peek_input ();
2431 length++;
2432 }
2433
2434 current_column += length;
2435
86f8e47f 2436 loc += length - 1;
2437
18f6990f
JP
2438 return Token::make_char (loc, current_char32);
2439 }
2440 else
2441 {
2442 skip_codepoint_input ();
2443
2444 if (peek_codepoint_input ().value == '\'')
2445 {
2446 // parse non-escaped char literal
2447
2448 // skip the ' character
2449 skip_input ();
2450 current_char = peek_input ();
2451
2452 // TODO fix due to different widths of utf-8 chars?
2453 current_column += 3;
2454
86f8e47f 2455 loc += 2;
2456
18f6990f
JP
2457 return Token::make_char (loc, current_char32);
2458 }
2459 else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value)
2460 || current_char32.value == '_')
2461 {
2462 // parse lifetime name
2463 std::string str;
2464 str += current_char32;
2465 length++;
2466
2467 current_char = peek_input ();
2468 while (ISDIGIT (current_char) || ISALPHA (current_char)
2469 || current_char == '_')
2470 {
2471 str += current_char;
2472 skip_input ();
2473 current_char = peek_input ();
2474 length++;
2475 }
2476
2477 current_column += length;
2478
86f8e47f 2479 loc += length - 1;
2480
18f6990f
JP
2481 str.shrink_to_fit ();
2482 return Token::make_lifetime (loc, std::move (str));
2483 }
2484 else
2485 {
2486 rust_error_at (
2487 get_current_location (),
2488 "expected %' after character constant in character literal");
2489 return nullptr;
2490 }
2491 }
2492}
2493
2494// Returns the length of the codepoint at the current position.
2495int
2496Lexer::get_input_codepoint_length ()
2497{
2498 uint8_t input = peek_input ();
2499
2500 if ((int8_t) input == EOF)
2501 return 0;
2502
2503 if (input < 128)
2504 {
2505 // ascii -- 1 byte
2506 // return input;
2507
2508 return 1;
2509 }
2510 else if ((input & 0xC0) == 0x80)
2511 {
2512 // invalid (continuation; can't be first char)
2513 // return 0xFFFE;
2514
2515 return 0;
2516 }
2517 else if ((input & 0xE0) == 0xC0)
2518 {
2519 // 2 bytes
2520 uint8_t input2 = peek_input (1);
2521 if ((input2 & 0xC0) != 0x80)
2522 return 0;
2523 // return 0xFFFE;
2524
2525 // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
2526 // return output;
2527 return 2;
2528 }
2529 else if ((input & 0xF0) == 0xE0)
2530 {
2531 // 3 bytes
2532 uint8_t input2 = peek_input (1);
2533 if ((input2 & 0xC0) != 0x80)
2534 return 0;
2535 // return 0xFFFE;
2536
2537 uint8_t input3 = peek_input (2);
2538 if ((input3 & 0xC0) != 0x80)
2539 return 0;
2540 // return 0xFFFE;
2541
2542 /*uint32_t output
2543 = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
2544 0); return output;*/
2545 return 3;
2546 }
2547 else if ((input & 0xF8) == 0xF0)
2548 {
2549 // 4 bytes
2550 uint8_t input2 = peek_input (1);
2551 if ((input2 & 0xC0) != 0x80)
2552 return 0;
2553 // return 0xFFFE;
2554
2555 uint8_t input3 = peek_input (2);
2556 if ((input3 & 0xC0) != 0x80)
2557 return 0;
2558 // return 0xFFFE;
2559
2560 uint8_t input4 = peek_input (3);
2561 if ((input4 & 0xC0) != 0x80)
2562 return 0;
2563 // return 0xFFFE;
2564
2565 /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
2566 | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
2567 return output;*/
2568 return 4;
2569 }
2570 else
2571 {
2572 rust_error_at (get_current_location (),
2573 "invalid UTF-8 [FIRST] (too long)");
2574 return 0;
2575 }
2576}
2577
2578// Returns the codepoint at the current position.
2579Codepoint
2580Lexer::peek_codepoint_input ()
2581{
2582 uint8_t input = peek_input ();
2583
2584 if ((int8_t) input == EOF)
2585 return Codepoint::eof ();
2586
2587 if (input < 128)
2588 {
2589 // ascii -- 1 byte
2590 return {input};
2591 }
2592 else if ((input & 0xC0) == 0x80)
2593 {
2594 // invalid (continuation; can't be first char)
2595 return {0xFFFE};
2596 }
2597 else if ((input & 0xE0) == 0xC0)
2598 {
2599 // 2 bytes
2600 uint8_t input2 = peek_input (1);
2601 if ((input2 & 0xC0) != 0x80)
2602 return {0xFFFE};
2603
2604 uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
2605 return {output};
2606 }
2607 else if ((input & 0xF0) == 0xE0)
2608 {
2609 // 3 bytes
2610 uint8_t input2 = peek_input (1);
2611 if ((input2 & 0xC0) != 0x80)
2612 return {0xFFFE};
2613
2614 uint8_t input3 = peek_input (2);
2615 if ((input3 & 0xC0) != 0x80)
2616 return {0xFFFE};
2617
2618 uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
2619 | ((input3 & 0x3F) << 0);
2620 return {output};
2621 }
2622 else if ((input & 0xF8) == 0xF0)
2623 {
2624 // 4 bytes
2625 uint8_t input2 = peek_input (1);
2626 if ((input2 & 0xC0) != 0x80)
2627 return {0xFFFE};
2628
2629 uint8_t input3 = peek_input (2);
2630 if ((input3 & 0xC0) != 0x80)
2631 return {0xFFFE};
2632
2633 uint8_t input4 = peek_input (3);
2634 if ((input4 & 0xC0) != 0x80)
2635 return {0xFFFE};
2636
2637 uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
2638 | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
2639 return {output};
2640 }
2641 else
2642 {
2643 rust_error_at (get_current_location (),
2644 "invalid UTF-8 [SECND] (too long)");
2645 return {0xFFFE};
2646 }
2647}
2648
2649void
2650Lexer::skip_codepoint_input ()
2651{
2652 int toSkip = get_input_codepoint_length ();
2653 gcc_assert (toSkip >= 1);
2654
2655 skip_input (toSkip - 1);
2656}
2657
2658int
2659Lexer::test_get_input_codepoint_n_length (int n_start_offset)
2660{
2661 uint8_t input = peek_input (n_start_offset);
2662
2663 if (input < 128)
2664 {
2665 // ascii -- 1 byte
2666 // return input;
2667 return 1;
2668 }
2669 else if ((input & 0xC0) == 0x80)
2670 {
2671 // invalid (continuation; can't be first char)
2672 // return 0xFFFE;
2673 return 0;
2674 }
2675 else if ((input & 0xE0) == 0xC0)
2676 {
2677 // 2 bytes
2678 uint8_t input2 = peek_input (n_start_offset + 1);
2679 if ((input2 & 0xC0) != 0x80)
2680 // return 0xFFFE;
2681 return 0;
2682
2683 // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
2684 // return output;
2685 return 2;
2686 }
2687 else if ((input & 0xF0) == 0xE0)
2688 {
2689 // 3 bytes
2690 uint8_t input2 = peek_input (n_start_offset + 1);
2691 if ((input2 & 0xC0) != 0x80)
2692 // return 0xFFFE;
2693 return 0;
2694
2695 uint8_t input3 = peek_input (n_start_offset + 2);
2696 if ((input3 & 0xC0) != 0x80)
2697 // return 0xFFFE;
2698 return 0;
2699
2700 /*uint32_t output
2701 = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
2702 0); return output;*/
2703 return 3;
2704 }
2705 else if ((input & 0xF8) == 0xF0)
2706 {
2707 // 4 bytes
2708 uint8_t input2 = peek_input (n_start_offset + 1);
2709 if ((input2 & 0xC0) != 0x80)
2710 // return 0xFFFE;
2711 return 0;
2712
2713 uint8_t input3 = peek_input (n_start_offset + 2);
2714 if ((input3 & 0xC0) != 0x80)
2715 // return 0xFFFE;
2716 return 0;
2717
2718 uint8_t input4 = peek_input (n_start_offset + 3);
2719 if ((input4 & 0xC0) != 0x80)
2720 // return 0xFFFE;
2721 return 0;
2722
2723 /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
2724 | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
2725 return output;*/
2726 return 4;
2727 }
2728 else
2729 {
2730 rust_error_at (get_current_location (),
2731 "invalid UTF-8 [THIRD] (too long)");
2732 return 0;
2733 }
2734}
2735
2736// peeks the codepoint input at n codepoints ahead of current codepoint - try
2737// not to use
2738Codepoint
2739Lexer::test_peek_codepoint_input (int n)
2740{
2741 int totalOffset = 0;
2742
2743 // add up all offsets into total offset? does this do what I want?
2744 for (int i = 0; i < n; i++)
2745 {
2746 totalOffset += test_get_input_codepoint_n_length (totalOffset);
2747 }
2748 // issues: this would have (at least) O(n) lookup time, not O(1) like the
2749 // rest?
2750
2751 // TODO: implement if still needed
2752
2753 // error out of function as it is not implemented
2754 gcc_assert (1 == 0);
2755 return {0};
2756 /*
2757 uint8_t input = peek_input();
2758
2759 if (input < 128) {
2760 // ascii -- 1 byte
2761 return input;
2762 } else if ((input & 0xC0) == 0x80) {
2763 // invalid (continuation; can't be first char)
2764 return 0xFFFE;
2765 } else if ((input & 0xE0) == 0xC0) {
2766 // 2 bytes
2767 uint8_t input2 = peek_input(1);
2768 if ((input2 & 0xC0) != 0x80)
2769 return 0xFFFE;
2770
2771 uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
2772 return output;
2773 } else if ((input & 0xF0) == 0xE0) {
2774 // 3 bytes
2775 uint8_t input2 = peek_input(1);
2776 if ((input2 & 0xC0) != 0x80)
2777 return 0xFFFE;
2778
2779 uint8_t input3 = peek_input(2);
2780 if ((input3 & 0xC0) != 0x80)
2781 return 0xFFFE;
2782
2783 uint32_t output
2784 = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 &
2785 0x3F) << 0); return output; } else if ((input & 0xF8) == 0xF0) {
2786 // 4 bytes
2787 uint8_t input2 = peek_input(1);
2788 if ((input2 & 0xC0) != 0x80)
2789 return 0xFFFE;
2790
2791 uint8_t input3 = peek_input(2);
2792 if ((input3 & 0xC0) != 0x80)
2793 return 0xFFFE;
2794
2795 uint8_t input4 = peek_input(3);
2796 if ((input4 & 0xC0) != 0x80)
2797 return 0xFFFE;
2798
2799 uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
2800 | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) <<
2801 0); return output; } else { rust_error_at(get_current_location(), "invalid
2802 UTF-8 (too long)"); return 0xFFFE;
2803 }*/
2804}
2805
2806void
2807Lexer::split_current_token (TokenId new_left, TokenId new_right)
2808{
2809 /* TODO: assert that this TokenId is a "simple token" like punctuation and not
2810 * like "IDENTIFIER"? */
2811 Location current_loc = peek_token ()->get_locus ();
2812 TokenPtr new_left_tok = Token::make (new_left, current_loc);
2813 TokenPtr new_right_tok = Token::make (new_right, current_loc + 1);
2814
2815 token_queue.replace_current_value (std::move (new_left_tok));
2816 token_queue.insert (1, std::move (new_right_tok));
2817}
2818
2819void
2820Lexer::start_line (int current_line, int current_column)
2821{
2822 if (line_map)
2823 line_map->start_line (current_line, current_column);
2824}
2825
2826} // namespace Rust