gccrs: Refactor lexer to handle UTF-8

author Raiki Tamura <tamaron1203@gmail.com>

Sun, 18 Jun 2023 14:25:31 +0000 (23:25 +0900)

committer Arthur Cohen <arthur.cohen@embecosm.com>

Tue, 16 Jan 2024 17:46:30 +0000 (18:46 +0100)
author Raiki Tamura <tamaron1203@gmail.com>
Sun, 18 Jun 2023 14:25:31 +0000 (23:25 +0900)
committer Arthur Cohen <arthur.cohen@embecosm.com>
Tue, 16 Jan 2024 17:46:30 +0000 (18:46 +0100)
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc

index 94ec67d2e66b68dce2dabd1090c16bd5fa8e6c64..aec2a96694abcc25e925e84723016a160472555b 100644 (file)
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -16,6 +16,7 @@
  // along with GCC; see the file COPYING3.  If not see
  // <http://www.gnu.org/licenses/>.
  
+#include "rust-codepoint.h"
  #include "rust-system.h"
  #include "rust-lex.h"
  #include "rust-diagnostics.h"
@@ -73,7 +74,7 @@ Codepoint::as_string ()
  /* Includes all allowable float digits EXCEPT _ and . as that needs lookahead
   * for handling. */
  bool
-is_float_digit (char number)
+is_float_digit (uint32_t number)
  {
    return ISDIGIT (number) || number == 'E' || number == 'e';
  }
@@ -81,31 +82,31 @@ is_float_digit (char number)
  /* Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or
   * whatever is different */
  bool
-is_x_digit (char number)
+is_x_digit (uint32_t number)
  {
    return ISXDIGIT (number);
  }
  
  bool
-is_octal_digit (char number)
+is_octal_digit (uint32_t number)
  {
    return number >= '0' && number <= '7';
  }
  
  bool
-is_bin_digit (char number)
+is_bin_digit (uint32_t number)
  {
    return number == '0' || number == '1';
  }
  
  bool
-check_valid_float_dot_end (char character)
+check_valid_float_dot_end (uint32_t character)
  {
    return character != '.' && character != '_' && !ISALPHA (character);
  }
  
  bool
-is_whitespace (int character)
+is_whitespace (uint32_t character)
  {
    // https://doc.rust-lang.org/reference/whitespace.html
    return character == '\t' || character == '\n' || character == '\v'
@@ -118,19 +119,19 @@ is_whitespace (int character)
  }
  
  bool
-is_non_decimal_int_literal_separator (char character)
+is_non_decimal_int_literal_separator (uint32_t character)
  {
    return character == 'x' || character == 'o' || character == 'b';
  }
  
  bool
-is_identifier_start (int codepoint)
+is_identifier_start (uint32_t codepoint)
  {
    return (cpp_check_xid_property (codepoint) & CPP_XID_START) || codepoint == '_';
  }
  
  bool
-is_identifier_continue (int codepoint)
+is_identifier_continue (uint32_t codepoint)
  {
    return cpp_check_xid_property (codepoint) & CPP_XID_CONTINUE;
  }
@@ -183,13 +184,13 @@ Lexer::get_current_location ()
      return Location ();
  }
  
-int
+Codepoint
  Lexer::peek_input (int n)
  {
    return input_queue.peek (n);
  }
  
-int
+Codepoint
  Lexer::peek_input ()
  {
    return peek_input (0);
@@ -304,17 +305,6 @@ Lexer::build_token ()
      {
        Location loc = get_current_location ();
  
-      // detect UTF8 bom
-      //
-      // Must be the first thing on the first line.
-      // There might be an optional BOM (Byte Order Mark), which for UTF-8 is
-      // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
-      if (current_line == 1 && current_column == 1 && peek_input () == 0xef
-         && peek_input (1) == 0xbb && peek_input (2) == 0xbf)
-       {
-         skip_input (2);
-       }
-
        current_char = peek_input ();
        current_char32 = peek_codepoint_input ();
        skip_codepoint_input ();
@@ -331,9 +321,8 @@ Lexer::build_token ()
           int n = 1;
           while (true)
             {
-             // TODO use utf-8 codepoint to skip whitespaces
-             int next_char = peek_input (n);
-             if (is_whitespace (next_char))
+             Codepoint next_char = peek_input (n);
+             if (is_whitespace (next_char.value))
                 n++;
               else if ((next_char == '/' && peek_input (n + 1) == '/'
                         && peek_input (n + 2) != '!'
@@ -347,7 +336,7 @@ Lexer::build_token ()
                   // (but not an inner or outer doc comment)
                   n += 2;
                   next_char = peek_input (n);
-                 while (next_char != '\n' && next_char != EOF)
+                 while (next_char != '\n' && !next_char.is_eof ())
                     {
                       n++;
                       next_char = peek_input (n);
@@ -383,7 +372,7 @@ Lexer::build_token ()
                   int level = 1;
                   while (level > 0)
                     {
-                     if (peek_input (n) == EOF)
+                     if (peek_input (n).is_eof ())
                         break;
                       else if (peek_input (n) == '/'
                                && peek_input (n + 1) == '*')
@@ -404,7 +393,7 @@ Lexer::build_token ()
               else if (next_char != '[')
                 {
                   // definitely shebang, ignore the first line
-                 while (current_char != '\n' && current_char != EOF)
+                 while (current_char != '\n' && !current_char.is_eof ())
                     {
                       current_char = peek_input ();
                       skip_input ();
@@ -423,11 +412,11 @@ Lexer::build_token ()
         }
  
        // return end of file token if end of file
-      if (current_char == EOF)
+      if (current_char.is_eof ())
         return Token::make (END_OF_FILE, loc);
  
        // if not end of file, start tokenising
-      switch (current_char)
+      switch (current_char.value)
         {
         /* ignore whitespace characters for tokens but continue updating
          * location */
@@ -566,7 +555,7 @@ Lexer::build_token ()
               current_char = peek_input ();
  
               // basically ignore until line finishes
-             while (current_char != '\n' && current_char != EOF)
+             while (current_char != '\n' && !current_char.is_eof ())
                 {
                   skip_input ();
                   current_column++; // not used
@@ -590,7 +579,7 @@ Lexer::build_token ()
                   skip_input ();
                   if (current_char == '\r')
                     {
-                     char next_char = peek_input ();
+                     Codepoint next_char = peek_input ();
                       if (next_char == '\n')
                         {
                           current_char = '\n';
@@ -601,7 +590,7 @@ Lexer::build_token ()
                       current_char = next_char;
                       continue;
                     }
-                 if (current_char == EOF)
+                 if (current_char.is_eof ())
                     {
                       rust_error_at (
                         loc, "unexpected EOF while looking for end of comment");
@@ -656,7 +645,7 @@ Lexer::build_token ()
                 {
                   current_char = peek_input ();
  
-                 if (current_char == EOF)
+                 if (current_char.is_eof ())
                     {
                       rust_error_at (
                         loc, "unexpected EOF while looking for end of comment");
@@ -720,7 +709,7 @@ Lexer::build_token ()
                 {
                   current_char = peek_input ();
  
-                 if (current_char == EOF)
+                 if (current_char.is_eof ())
                     {
                       rust_error_at (
                         loc, "unexpected EOF while looking for end of comment");
@@ -1069,11 +1058,11 @@ Lexer::build_token ()
        // raw identifiers and raw strings
        if (current_char == 'r')
         {
-         int peek = peek_input ();
-         int peek1 = peek_input (1);
+         Codepoint peek = peek_input ();
+         Codepoint peek1 = peek_input (1);
  
           // TODO (tamaron) parse Unicode ident
-         if (peek == '#' && is_identifier_start (peek1))
+         if (peek == '#' && is_identifier_start (peek1.value))
             {
               TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
               if (raw_ident_ptr != nullptr)
@@ -1095,10 +1084,10 @@ Lexer::build_token ()
         return parse_identifier_or_keyword (loc);
  
        // int and float literals
-      if (ISDIGIT (current_char))
+      if (ISDIGIT (current_char.value))
         { //  _ not allowed as first char
           if (current_char == '0'
-             && is_non_decimal_int_literal_separator (peek_input ()))
+             && is_non_decimal_int_literal_separator (peek_input ().value))
             {
               // handle binary, octal, hex literals
               TokenPtr non_dec_int_lit_ptr
@@ -1137,7 +1126,7 @@ Lexer::build_token ()
                     "unexpected character");
  
        // didn't match anything so error
-      rust_error_at (loc, "unexpected character %<%x%>", current_char);
+      rust_error_at (loc, "unexpected character %<%x%>", current_char.value);
        current_column++;
      }
  }
@@ -1152,7 +1141,7 @@ Lexer::parse_in_type_suffix ()
    int additional_length_offset = 0;
  
    // get suffix
-  while (ISALPHA (current_char) || ISDIGIT (current_char)
+  while (ISALPHA (current_char.value) || ISDIGIT (current_char.value)
          || current_char == '_')
      {
        if (current_char == '_')
@@ -1293,7 +1282,7 @@ Lexer::parse_in_decimal ()
    bool pure_decimal = true;
    int additional_length_offset = 0;
    std::string str;
-  while (ISDIGIT (current_char) || current_char == '_')
+  while (ISDIGIT (current_char.value) || current_char.value == '_')
      {
        if (current_char == '_')
         {
@@ -1329,7 +1318,7 @@ Lexer::parse_escape (char opening_char)
    current_char = peek_input ();
    additional_length_offset++;
  
-  switch (current_char)
+  switch (current_char.value)
      {
        case 'x': {
         auto hex_escape_pair = parse_partial_hex_escape ();
@@ -1382,7 +1371,8 @@ Lexer::parse_escape (char opening_char)
        return std::make_tuple (0, parse_partial_string_continue (), true);
      default:
        rust_error_at (get_current_location (),
-                    "unknown escape sequence %<\\%c%>", current_char);
+                    "unknown escape sequence %<\\%s%>",
+                    current_char.as_string ().c_str ());
        // returns false if no parsing could be done
        // return false;
        return std::make_tuple (output_char, additional_length_offset, false);
@@ -1411,7 +1401,7 @@ Lexer::parse_utf8_escape ()
    current_char = peek_input ();
    additional_length_offset++;
  
-  switch (current_char)
+  switch (current_char.value)
      {
        case 'x': {
         auto hex_escape_pair = parse_partial_hex_escape ();
@@ -1465,7 +1455,8 @@ Lexer::parse_utf8_escape ()
        return std::make_tuple (0, parse_partial_string_continue (), true);
      default:
        rust_error_at (get_current_location (),
-                    "unknown escape sequence %<\\%c%>", current_char);
+                    "unknown escape sequence %<\\%s%>",
+                    current_char.as_string ().c_str ());
        // returns false if no parsing could be done
        // return false;
        return std::make_tuple (output_char, additional_length_offset, false);
@@ -1490,7 +1481,7 @@ Lexer::parse_partial_string_continue ()
  
    // string continue
    // TODO use utf-8 codepoint to skip whitespaces
-  while (is_whitespace (current_char))
+  while (is_whitespace (current_char.value))
      {
        if (current_char == '\n')
         {
@@ -1529,29 +1520,29 @@ Lexer::parse_partial_hex_escape ()
    current_char = peek_input (1);
    int additional_length_offset = 1;
  
-  if (!is_x_digit (current_char))
+  if (!is_x_digit (current_char.value))
      {
        rust_error_at (get_current_location (),
-                    "invalid character %<\\x%c%> in \\x sequence",
-                    current_char);
+                    "invalid character %<\\x%s%> in \\x sequence",
+                    current_char.as_string ().c_str ());
        return std::make_pair (0, 0);
      }
-  hexNum[0] = current_char;
+  hexNum[0] = current_char.value;
  
    // second hex char
    skip_input ();
    current_char = peek_input (1);
    additional_length_offset++;
  
-  if (!is_x_digit (current_char))
+  if (!is_x_digit (current_char.value))
      {
        rust_error_at (get_current_location (),
-                    "invalid character %<\\x%c%c%> in \\x sequence", hexNum[0],
-                    current_char);
+                    "invalid character %<\\x%c%s%> in \\x sequence", hexNum[0],
+                    current_char.as_string ().c_str ());
        return std::make_pair (0, 1);
      }
    skip_input ();
-  hexNum[1] = current_char;
+  hexNum[1] = current_char.value;
  
    long hexLong = std::strtol (hexNum, nullptr, 16);
  
@@ -1571,7 +1562,7 @@ Lexer::parse_partial_unicode_escape ()
        rust_error_at (get_current_location (),
                      "unicode escape should start with %<{%>");
        /* Skip what should probaby have been between brackets.  */
-      while (is_x_digit (current_char) || current_char == '_')
+      while (is_x_digit (current_char.value) || current_char == '_')
         {
           skip_input ();
           current_char = peek_input ();
@@ -1599,7 +1590,7 @@ Lexer::parse_partial_unicode_escape ()
    num_str.reserve (6);
  
    // loop through to add entire hex number to string
-  while (is_x_digit (current_char) || current_char == '_')
+  while (is_x_digit (current_char.value) || current_char.value == '_')
      {
        if (current_char == '_')
         {
@@ -1634,7 +1625,7 @@ Lexer::parse_partial_unicode_escape ()
        // termination, otherwise it is a wrong character, then skip to the actual
        // terminator.
        // TODO use utf-8 codepoint to skip whitespaces
-      if (current_char == '{' || is_whitespace (current_char)
+      if (current_char == '{' || is_whitespace (current_char.value)
           || current_char == '\'' || current_char == '"')
         {
           rust_error_at (get_current_location (),
@@ -1644,11 +1635,11 @@ Lexer::parse_partial_unicode_escape ()
        else
         {
           rust_error_at (get_current_location (),
-                        "invalid character %<%c%> in unicode escape",
-                        current_char);
+                        "invalid character %<%s%> in unicode escape",
+                        current_char.as_string ().c_str ());
           // TODO use utf-8 codepoint to skip whitespaces
           while (current_char != '}' && current_char != '{'
-                && !is_whitespace (current_char) && current_char != '\''
+                && !is_whitespace (current_char.value) && current_char != '\''
                  && current_char != '"')
             {
               skip_input ();
@@ -1711,7 +1702,7 @@ Lexer::parse_byte_char (Location loc)
    int length = 1;
  
    // char to save
-  char byte_char = 0;
+  Codepoint byte_char = 0;
  
    // detect escapes
    if (current_char == '\\')
@@ -1759,7 +1750,8 @@ Lexer::parse_byte_char (Location loc)
  
    loc += length - 1;
  
-  return Token::make_byte_char (loc, byte_char);
+  // TODO: error when byte_char is non ASCII
+  return Token::make_byte_char (loc, byte_char.value);
  }
  
  // Parses a byte string.
@@ -1778,7 +1770,7 @@ Lexer::parse_byte_string (Location loc)
    int length = 1;
    current_char = peek_input ();
  
-  while (current_char != '"' && current_char != EOF)
+  while (current_char != '"' && !current_char.is_eof ())
      {
        if (current_char == '\\')
         {
@@ -1812,7 +1804,7 @@ Lexer::parse_byte_string (Location loc)
        skip_input ();
        current_char = peek_input ();
      }
-  else if (current_char == EOF)
+  else if (current_char.is_eof ())
      {
        rust_error_at (get_current_location (), "unended byte string literal");
        return Token::make (END_OF_FILE, get_current_location ());
@@ -1887,11 +1879,11 @@ Lexer::parse_raw_byte_string (Location loc)
             }
         }
  
-      if ((unsigned char) current_char > 127)
+      if (current_char.value > 127)
         {
           rust_error_at (get_current_location (),
-                        "character %<%c%> in raw byte string out of range",
-                        current_char);
+                        "character %<%s%> in raw byte string out of range",
+                        current_char.as_string ().c_str ());
           current_char = 0;
         }
  
@@ -1929,7 +1921,7 @@ Lexer::parse_raw_identifier (Location loc)
    int length = 0;
    current_char = peek_input ();
    // loop through entire name
-  while (is_identifier_continue (current_char))
+  while (is_identifier_continue (current_char.value))
      {
        length++;
  
@@ -1940,6 +1932,8 @@ Lexer::parse_raw_identifier (Location loc)
  
    current_column += length;
  
+  rust_debug ("raw ident: %s", str.c_str ());
+
    // if just a single underscore, not an identifier
    if (first_is_underscore && length == 1)
      rust_error_at (get_current_location (),
@@ -1964,9 +1958,9 @@ Lexer::parse_raw_identifier (Location loc)
  
  // skip broken string input (unterminated strings)
  void
-Lexer::skip_broken_string_input (int current_char)
+Lexer::skip_broken_string_input (Codepoint current_char)
  {
-  while (current_char != '"' && current_char != EOF)
+  while (current_char != '"' && !current_char.is_eof ())
      {
        if (current_char == '\n')
         {
@@ -2190,7 +2184,7 @@ Lexer::parse_non_decimal_int_literal (Location loc, IsDigitFunc is_digit_func,
    length++;
  
    // loop through to add entire number to string
-  while (is_digit_func (current_char) || current_char == '_')
+  while (is_digit_func (current_char.value) || current_char == '_')
      {
        if (current_char == '_')
         {
@@ -2293,7 +2287,7 @@ Lexer::parse_decimal_int_or_float (Location loc)
    length += std::get<1> (initial_decimal);
  
    // detect float literal
-  if (current_char == '.' && is_float_digit (peek_input (1)))
+  if (current_char == '.' && is_float_digit (peek_input (1).value))
      {
        // float with a '.', parse another decimal into it
  
@@ -2335,7 +2329,8 @@ Lexer::parse_decimal_int_or_float (Location loc)
        str.shrink_to_fit ();
        return Token::make_float (loc, std::move (str), type_hint);
      }
-  else if (current_char == '.' && check_valid_float_dot_end (peek_input (1)))
+  else if (current_char == '.'
+          && check_valid_float_dot_end (peek_input (1).value))
      {
        // float that is just an integer with a terminating '.' character
  
@@ -2504,338 +2499,133 @@ Lexer::parse_char_or_lifetime (Location loc)
      }
  }
  
+// TODO remove this function
  // Returns the length of the codepoint at the current position.
  int
  Lexer::get_input_codepoint_length ()
  {
-  uint8_t input = peek_input ();
-
-  if ((int8_t) input == EOF)
-    return 0;
-
-  if (input < 128)
-    {
-      // ascii -- 1 byte
-      // return input;
-
-      return 1;
-    }
-  else if ((input & 0xC0) == 0x80)
-    {
-      // invalid (continuation; can't be first char)
-      // return 0xFFFE;
-
-      return 0;
-    }
-  else if ((input & 0xE0) == 0xC0)
-    {
-      // 2 bytes
-      uint8_t input2 = peek_input (1);
-      if ((input2 & 0xC0) != 0x80)
-       return 0;
-      // return 0xFFFE;
-
-      // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
-      // return output;
-      return 2;
-    }
-  else if ((input & 0xF0) == 0xE0)
-    {
-      // 3 bytes
-      uint8_t input2 = peek_input (1);
-      if ((input2 & 0xC0) != 0x80)
-       return 0;
-      // return 0xFFFE;
-
-      uint8_t input3 = peek_input (2);
-      if ((input3 & 0xC0) != 0x80)
-       return 0;
-      // return 0xFFFE;
-
-      /*uint32_t output
-       = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
-      0); return output;*/
-      return 3;
-    }
-  else if ((input & 0xF8) == 0xF0)
-    {
-      // 4 bytes
-      uint8_t input2 = peek_input (1);
-      if ((input2 & 0xC0) != 0x80)
-       return 0;
-      // return 0xFFFE;
-
-      uint8_t input3 = peek_input (2);
-      if ((input3 & 0xC0) != 0x80)
-       return 0;
-      // return 0xFFFE;
-
-      uint8_t input4 = peek_input (3);
-      if ((input4 & 0xC0) != 0x80)
-       return 0;
-      // return 0xFFFE;
-
-      /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
-                       | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
-      return output;*/
-      return 4;
-    }
-  else
-    {
-      rust_error_at (get_current_location (),
-                    "invalid UTF-8 [FIRST] (too long)");
-      return 0;
-    }
+  return 1;
  }
  
+// TODO remove this function
  // Returns the codepoint at the current position.
  Codepoint
  Lexer::peek_codepoint_input ()
  {
-  uint8_t input = peek_input ();
-
-  if ((int8_t) input == EOF)
-    return Codepoint::eof ();
-
-  if (input < 128)
-    {
-      // ascii -- 1 byte
-      return {input};
-    }
-  else if ((input & 0xC0) == 0x80)
-    {
-      // invalid (continuation; can't be first char)
-      return {0xFFFE};
-    }
-  else if ((input & 0xE0) == 0xC0)
-    {
-      // 2 bytes
-      uint8_t input2 = peek_input (1);
-      if ((input2 & 0xC0) != 0x80)
-       return {0xFFFE};
-
-      uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
-      return {output};
-    }
-  else if ((input & 0xF0) == 0xE0)
-    {
-      // 3 bytes
-      uint8_t input2 = peek_input (1);
-      if ((input2 & 0xC0) != 0x80)
-       return {0xFFFE};
-
-      uint8_t input3 = peek_input (2);
-      if ((input3 & 0xC0) != 0x80)
-       return {0xFFFE};
-
-      uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
-                       | ((input3 & 0x3F) << 0);
-      return {output};
-    }
-  else if ((input & 0xF8) == 0xF0)
-    {
-      // 4 bytes
-      uint8_t input2 = peek_input (1);
-      if ((input2 & 0xC0) != 0x80)
-       return {0xFFFE};
-
-      uint8_t input3 = peek_input (2);
-      if ((input3 & 0xC0) != 0x80)
-       return {0xFFFE};
-
-      uint8_t input4 = peek_input (3);
-      if ((input4 & 0xC0) != 0x80)
-       return {0xFFFE};
-
-      uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
-                       | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
-      return {output};
-    }
-  else
-    {
-      rust_error_at (get_current_location (),
-                    "invalid UTF-8 [SECND] (too long)");
-      return {0xFFFE};
-    }
+  return peek_input ();
  }
  
+// TODO remove this function
  void
  Lexer::skip_codepoint_input ()
  {
-  if (peek_input () == EOF)
-    return;
-  int toSkip = get_input_codepoint_length ();
-  gcc_assert (toSkip >= 1);
-
-  skip_input (toSkip - 1);
+  skip_input ();
  }
  
-int
-Lexer::test_get_input_codepoint_n_length (int n_start_offset)
+void
+Lexer::split_current_token (TokenId new_left, TokenId new_right)
  {
-  uint8_t input = peek_input (n_start_offset);
-
-  if (input < 128)
-    {
-      // ascii -- 1 byte
-      // return input;
-      return 1;
-    }
-  else if ((input & 0xC0) == 0x80)
-    {
-      // invalid (continuation; can't be first char)
-      // return 0xFFFE;
-      return 0;
-    }
-  else if ((input & 0xE0) == 0xC0)
-    {
-      // 2 bytes
-      uint8_t input2 = peek_input (n_start_offset + 1);
-      if ((input2 & 0xC0) != 0x80)
-       // return 0xFFFE;
-       return 0;
+  /* TODO: assert that this TokenId is a "simple token" like punctuation and not
+   * like "IDENTIFIER"? */
+  Location current_loc = peek_token ()->get_locus ();
+  TokenPtr new_left_tok = Token::make (new_left, current_loc);
+  TokenPtr new_right_tok = Token::make (new_right, current_loc + 1);
  
-      // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
-      // return output;
-      return 2;
-    }
-  else if ((input & 0xF0) == 0xE0)
-    {
-      // 3 bytes
-      uint8_t input2 = peek_input (n_start_offset + 1);
-      if ((input2 & 0xC0) != 0x80)
-       // return 0xFFFE;
-       return 0;
+  token_queue.replace_current_value (std::move (new_left_tok));
+  token_queue.insert (1, std::move (new_right_tok));
+}
  
-      uint8_t input3 = peek_input (n_start_offset + 2);
-      if ((input3 & 0xC0) != 0x80)
-       // return 0xFFFE;
-       return 0;
+void
+Lexer::start_line (int current_line, int current_column)
+{
+  if (line_map)
+    line_map->start_line (current_line, current_column);
+}
  
-      /*uint32_t output
-       = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
-      0); return output;*/
-      return 3;
-    }
-  else if ((input & 0xF8) == 0xF0)
-    {
-      // 4 bytes
-      uint8_t input2 = peek_input (n_start_offset + 1);
-      if ((input2 & 0xC0) != 0x80)
-       // return 0xFFFE;
-       return 0;
+} // namespace Rust
  
-      uint8_t input3 = peek_input (n_start_offset + 2);
-      if ((input3 & 0xC0) != 0x80)
-       // return 0xFFFE;
-       return 0;
+#if CHECKING_P
  
-      uint8_t input4 = peek_input (n_start_offset + 3);
-      if ((input4 & 0xC0) != 0x80)
-       // return 0xFFFE;
-       return 0;
+namespace selftest {
  
-      /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
-                       | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
-      return output;*/
-      return 4;
-    }
-  else
+// Checks if `src` has the same contents as the given characters
+void
+assert_source_content (Rust::Lexer::InputSource &src,
+                      std::vector<uint32_t> expected)
+{
+  Rust::Codepoint src_char = src.next ();
+  for (auto expected_char : expected)
      {
-      rust_error_at (get_current_location (),
-                    "invalid UTF-8 [THIRD] (too long)");
-      return 0;
+      // Make sure that `src` is not shorter than `expected`
+      ASSERT_FALSE (src_char.is_eof ());
+      // Checks that the character just read is the expected one.
+      ASSERT_EQ (src_char.value, expected_char);
+      src_char = src.next ();
      }
+  // Checks that `src` and `expected` have the same length.
+  ASSERT_TRUE (src_char.is_eof ());
  }
  
-// peeks the codepoint input at n codepoints ahead of current codepoint - try
-// not to use
-Codepoint
-Lexer::test_peek_codepoint_input (int n)
+void
+test_buffer_input_source (std::string str, std::vector<uint32_t> expected)
  {
-  int totalOffset = 0;
-
-  // add up all offsets into total offset? does this do what I want?
-  for (int i = 0; i < n; i++)
-    {
-      totalOffset += test_get_input_codepoint_n_length (totalOffset);
-    }
-  // issues: this would have (at least) O(n) lookup time, not O(1) like the
-  // rest?
-
-  // TODO: implement if still needed
-
-  // error out of function as it is not implemented
-  gcc_assert (1 == 0);
-  return {0};
-  /*
-         uint8_t input = peek_input();
-
-         if (input < 128) {
-             // ascii -- 1 byte
-             return input;
-         } else if ((input & 0xC0) == 0x80) {
-             // invalid (continuation; can't be first char)
-             return 0xFFFE;
-         } else if ((input & 0xE0) == 0xC0) {
-             // 2 bytes
-             uint8_t input2 = peek_input(1);
-             if ((input2 & 0xC0) != 0x80)
-                 return 0xFFFE;
-
-             uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
-             return output;
-         } else if ((input & 0xF0) == 0xE0) {
-             // 3 bytes
-             uint8_t input2 = peek_input(1);
-             if ((input2 & 0xC0) != 0x80)
-                 return 0xFFFE;
-
-             uint8_t input3 = peek_input(2);
-             if ((input3 & 0xC0) != 0x80)
-                 return 0xFFFE;
-
-             uint32_t output
-               = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 &
-     0x3F) << 0); return output; } else if ((input & 0xF8) == 0xF0) {
-             // 4 bytes
-             uint8_t input2 = peek_input(1);
-             if ((input2 & 0xC0) != 0x80)
-                 return 0xFFFE;
-
-             uint8_t input3 = peek_input(2);
-             if ((input3 & 0xC0) != 0x80)
-                 return 0xFFFE;
-
-             uint8_t input4 = peek_input(3);
-             if ((input4 & 0xC0) != 0x80)
-                 return 0xFFFE;
-
-             uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
-                               | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) <<
-     0); return output; } else { rust_error_at(get_current_location(), "invalid
-     UTF-8 (too long)"); return 0xFFFE;
-         }*/
+  Rust::Lexer::BufferInputSource source (str, 0);
+  assert_source_content (source, expected);
  }
  
  void
-Lexer::split_current_token (TokenId new_left, TokenId new_right)
+test_file_input_source (std::string str, std::vector<uint32_t> expected)
  {
-  /* TODO: assert that this TokenId is a "simple token" like punctuation and not
-   * like "IDENTIFIER"? */
-  Location current_loc = peek_token ()->get_locus ();
-  TokenPtr new_left_tok = Token::make (new_left, current_loc);
-  TokenPtr new_right_tok = Token::make (new_right, current_loc + 1);
-
-  token_queue.replace_current_value (std::move (new_left_tok));
-  token_queue.insert (1, std::move (new_right_tok));
+  FILE *tmpf = tmpfile ();
+  // Write the contents, then move back to the first character
+  fputs (str.c_str (), tmpf);
+  std::rewind (tmpf);
+  Rust::Lexer::FileInputSource source (tmpf);
+  assert_source_content (source, expected);
  }
  
  void
-Lexer::start_line (int current_line, int current_column)
+rust_input_source_test ()
  {
-  if (line_map)
-    line_map->start_line (current_line, current_column);
+  // ASCII
+  std::string src = u8"_abcde\tXYZ\v\f";
+  std::vector<uint32_t> expected
+    = {'_', 'a', 'b', 'c', 'd', 'e', '\t', 'X', 'Y', 'Z', '\v', '\f'};
+  test_buffer_input_source (src, expected);
+
+  // BOM
+  src = u8"\xef\xbb\xbfOK";
+  expected = {'O', 'K'};
+  test_buffer_input_source (src, expected);
+
+  // Russian
+  src = u8"приве́т";
+  expected = {L'п',
+             L'р',
+             L'и',
+             L'в',
+             0x0435 /* CYRILLIC SMALL LETTER IE е */,
+             0x301 /* COMBINING ACUTE ACCENT ́ */,
+             L'т'};
+  test_buffer_input_source (src, expected);
+
+  src = u8"❤️🦀";
+  expected = {0x2764 /* HEAVY BLACK HEART */,
+             0xfe0f /* VARIATION SELECTOR-16 */, L'🦀'};
+  test_buffer_input_source (src, expected);
+
+  src = u8"こんにちは";
+  expected = {L'こ', L'ん', L'に', L'ち', L'は'};
+  test_file_input_source (src, expected);
+
+  src = u8"👮‍♂👩‍⚕";
+  expected
+    = {0x1f46e /* POLICE OFFICER */,   0x200d /* ZERO WIDTH JOINER */,
+       0x2642 /* MALE SIGN */,        0x1f469 /* WOMAN */,
+       0x200d /* ZERO WIDTH JOINER */, 0x2695 /* STAFF OF AESCULAPIUS */};
+  test_file_input_source (src, expected);
  }
  
-} // namespace Rust
+} // namespace selftest
+
+#endif // CHECKING_P
diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h

index a05e8fcbfe10683e52d201c5d0a5df2fb4117ca4..58ad48533c665b33d243b1e7856178f82e56fc40 100644 (file)
--- a/gcc/rust/lex/rust-lex.h
+++ b/gcc/rust/lex/rust-lex.h
@@ -23,6 +23,7 @@
  #include "rust-buffered-queue.h"
  #include "rust-token.h"
  #include "rust-optional.h"
+#include "selftest.h"
  
  namespace Rust {
  // Simple wrapper for FILE* that simplifies destruction.
@@ -119,9 +120,9 @@ private:
    void skip_input (int n);
  
    // Peeks the current char.
-  int peek_input ();
+  Codepoint peek_input ();
    // Returns char n bytes ahead of current position.
-  int peek_input (int n);
+  Codepoint peek_input (int n);
  
    // Classifies keyword (i.e. gets id for keyword).
    TokenId classify_keyword (const std::string &str);
@@ -136,12 +137,10 @@ private:
    std::pair<Codepoint, int> parse_partial_unicode_escape ();
  
    int get_input_codepoint_length ();
-  int test_get_input_codepoint_n_length (int n_start_offset);
    // Peeks the current utf-8 char
    Codepoint peek_codepoint_input ();
-  Codepoint test_peek_codepoint_input (int n);
    void skip_codepoint_input ();
-  void skip_broken_string_input (int current_char);
+  void skip_broken_string_input (Codepoint current_char);
  
    TokenPtr parse_byte_char (Location loc);
    TokenPtr parse_byte_string (Location loc);
@@ -208,37 +207,136 @@ public:
    Linemap *get_line_map () { return line_map; }
    std::string get_filename () { return std::string (input.get_filename ()); }
  
-private:
-  void start_line (int current_line, int current_column);
-
-  // File for use as input.
-  RAIIFile input;
-  // TODO is this actually required? could just have file storage in InputSource
+  // Input source wrapper thing.
+  class InputSource
+  {
+  private:
+    // position of current character
+    unsigned int pos;
+    std::vector<Codepoint> chars;
+    bool is_valid_utf8;
  
-  // Current line number.
-  int current_line;
-  // Current column number.
-  int current_column;
-  // Current character.
-  int current_char;
-  Codepoint current_char32;
-  // Line map.
-  Linemap *line_map;
+    // Returns the next raw byte of the input, or EOF once it is exhausted.
+    virtual int next_byte () = 0;
  
-  /* Max column number that can be quickly allocated - higher may require
-   * allocating new linemap */
-  static const int max_column_hint = 80;
+    // Decodes the next UTF-8 sequence delivered by next_byte ().
+    //
+    // Returns Codepoint::eof () when the input ends before a new sequence
+    // starts, and the sentinel 0xFFFE when the bytes are not a well-formed
+    // sequence: a stray continuation byte, an impossible lead byte, or a
+    // continuation byte that is missing (including input that ends in the
+    // middle of a sequence).  A byte order mark (EF BB BF) is dropped and the
+    // codepoint after it is returned instead.
+    //
+    // NOTE: overlong encodings, surrogates and values above U+10FFFF are not
+    // rejected, and a byte order mark is dropped wherever it appears, not
+    // only at the start of the input.
+    Codepoint next_codepoint ()
+    {
+      // Compare against EOF while the value is still an int.  Narrowing first
+      // would make a 0xFF byte look like EOF and silently truncate the input.
+      int raw = next_byte ();
+      if (raw == EOF)
+        return Codepoint::eof ();
+
+      uint8_t input = static_cast<uint8_t> (raw);
+
+      if (input < 128)
+        {
+          // ascii -- 1 byte
+          return {input};
+        }
+
+      // Work out from the lead byte how many continuation bytes follow and
+      // which payload bits the lead byte itself contributes.
+      int continuation_count;
+      uint32_t output;
+      if ((input & 0xE0) == 0xC0)
+        {
+          // 2 bytes
+          continuation_count = 1;
+          output = input & 0x1F;
+        }
+      else if ((input & 0xF0) == 0xE0)
+        {
+          // 3 bytes
+          continuation_count = 2;
+          output = input & 0x0F;
+        }
+      else if ((input & 0xF8) == 0xF0)
+        {
+          // 4 bytes
+          continuation_count = 3;
+          output = input & 0x07;
+        }
+      else
+        {
+          // invalid: either a continuation byte (10xxxxxx), which can't be the
+          // first byte of a sequence, or a lead byte announcing a sequence
+          // longer than 4 bytes.
+          return {0xFFFE};
+        }
+
+      for (int i = 0; i < continuation_count; i++)
+        {
+          // EOF narrows to 0xFF here, which is not a continuation byte, so a
+          // sequence cut short by the end of the input is reported as invalid.
+          uint8_t continuation = next_byte ();
+          if ((continuation & 0xC0) != 0x80)
+            return {0xFFFE};
+
+          output = (output << 6) | (continuation & 0x3F);
+        }
+
+      // UTF-8 BOM: only the exact 3-byte sequence EF BB BF is dropped.  Other
+      // sequences starting with EF BB (U+FEC0..U+FEFE) are ordinary characters.
+      if (continuation_count == 2 && output == 0xFEFF)
+        return next_codepoint ();
+
+      return {output};
+    }
  
-  Optional<std::ofstream &> dump_lex_out;
+  protected:
+    // Drains the byte source through next_codepoint () and stores every decoded
+    // codepoint in `chars`.  Decoding stops at the end of the input or at the
+    // first undecodable sequence (reported as 0xFFFE); in the latter case the
+    // source is flagged as not being valid UTF-8.
+    void init ()
+    {
+      for (;;)
+        {
+          Codepoint decoded = next_codepoint ();
+
+          if (decoded == 0xFFFE)
+            {
+              // Input source is not valid as utf-8.
+              is_valid_utf8 = false;
+              break;
+            }
+
+          if (decoded.is_eof ())
+            break;
+
+          chars.push_back (decoded);
+        }
+    }
  
-  // Input source wrapper thing.
-  class InputSource
-  {
    public:
+    // Starts with an empty codepoint buffer, positioned at its beginning and
+    // considered valid UTF-8 until init () finds an undecodable sequence.
+    InputSource () : pos (0), chars (), is_valid_utf8 (true) {}
+
      virtual ~InputSource () {}
  
-    // Overload operator () to return next char from input stream.
-    virtual int next () = 0;
+    // Reports whether the input decoded as well-formed UTF-8; init () clears
+    // the flag when it hits an undecodable sequence.  Const so that it can be
+    // queried through a const InputSource as well.
+    bool is_valid () const { return is_valid_utf8; }
+
+    // Hands out the buffered codepoints one at a time, advancing `pos` on each
+    // call; once every codepoint has been consumed it keeps returning
+    // Codepoint::eof ().
+    Codepoint next ()
+    {
+      if (pos < chars.size ())
+        return chars[pos++];
+
+      return Codepoint::eof ();
+    }
    };
  
    class FileInputSource : public InputSource
@@ -247,11 +345,15 @@ private:
      // Input source file.
      FILE *input;
  
+    // Pulls a single byte from the underlying FILE; fgetc reports EOF both at
+    // end of file and on a read error.
+    int next_byte () override
+    {
+      return fgetc (input);
+    }
+
    public:
      // Create new input source from file.
-    FileInputSource (FILE *input) : input (input) {}
-
-    int next () override { return fgetc (input); }
+    FileInputSource (FILE *input) : InputSource (), input (input)
+    {
+      // Decode the file up front (up to the first invalid sequence) into the
+      // base-class buffer.  This has to happen here rather than in the
+      // InputSource constructor: init () reaches the virtual next_byte (),
+      // which only dispatches to this class's override once the base-class
+      // constructor has finished.
+      // TODO make this better?
+      init ();
+    }
    };
  
    class BufferInputSource : public InputSource
@@ -260,26 +362,52 @@ private:
      const std::string &buffer;
      size_t offs;
  
-  public:
-    // Create new input source from file.
-    BufferInputSource (const std::string &b, size_t offset)
-      : buffer (b), offs (offset)
-    {}
-
-    int next () override
+    // Returns the next byte of the buffer as a non-negative int (the same
+    // convention as fgetc), or EOF once the buffer is exhausted.
+    int next_byte () override
      {
        if (offs >= buffer.size ())
         return EOF;
  
-      return buffer.at (offs++);
+      // Widen through unsigned char: plain char may be signed, in which case a
+      // 0xFF byte would otherwise come out as -1, which is the value of EOF.
+      return static_cast<unsigned char> (buffer.at (offs++));
      }
+
+  public:
+    // Create new input source from an in-memory buffer; reading starts at
+    // byte index `offset` of `b`.
+    BufferInputSource (const std::string &b, size_t offset)
+      : InputSource (), buffer (b), offs (offset)
+    {
+      // Decode the buffer up front (up to the first invalid sequence).  Done
+      // here rather than in the InputSource constructor because init () needs
+      // this class's next_byte () override.
+      // TODO make this better?
+      init ();
+    }
    };
  
+private:
+  void start_line (int current_line, int current_column);
+
+  // File for use as input.
+  RAIIFile input;
+  // TODO is this actually required? could just have file storage in InputSource
+
+  // Current line number.
+  int current_line;
+  // Current column number.
+  int current_column;
+  // Current character.
+  Codepoint current_char;
+  Codepoint current_char32;
+  // Line map.
+  Linemap *line_map;
+
+  /* Max column number that can be quickly allocated - higher may require
+   * allocating new linemap */
+  static const int max_column_hint = 80;
+
+  Optional<std::ofstream &> dump_lex_out;
+
    // The input source for the lexer.
    // InputSource input_source;
    // Input file queue.
    std::unique_ptr<InputSource> raw_input_source;
-  buffered_queue<int, std::reference_wrapper<InputSource>> input_queue;
+  buffered_queue<Codepoint, std::reference_wrapper<InputSource>> input_queue;
  
    // Token source wrapper thing.
    struct TokenSource
@@ -305,4 +433,14 @@ private:
  
  } // namespace Rust
  
+#if CHECKING_P
+
+namespace selftest {
+// Selftest entry point declared for the lexer; run_rust_tests () in
+// rust-lang.cc calls it.
+void
+rust_input_source_test ();
+
+} // namespace selftest
+
+#endif // CHECKING_P
+
  #endif
diff --git a/gcc/rust/rust-lang.cc b/gcc/rust/rust-lang.cc

index 084f74c9121de83a5c70467876e8cc84ff6e7a33..2032cab87be362ce6f17bac1a1c6694d8ab29f85 100644 (file)
--- a/gcc/rust/rust-lang.cc
+++ b/gcc/rust/rust-lang.cc
@@ -36,6 +36,7 @@
  #include "rust-privacy-ctx.h"
  #include "rust-ast-resolve-item.h"
  #include "rust-optional.h"
+#include "rust-lex.h"
  
  #include <mpfr.h>
  // note: header files must be in this order or else forward declarations don't
@@ -449,6 +450,7 @@ void
  run_rust_tests ()
  {
    // Call tests for the rust frontend here
+  rust_input_source_test ();
    rust_cfg_parser_test ();
    rust_privacy_ctx_test ();
    rust_crate_name_validation_test ();
author	Raiki Tamura <tamaron1203@gmail.com>
	Sun, 18 Jun 2023 14:25:31 +0000 (23:25 +0900)
committer	Arthur Cohen <arthur.cohen@embecosm.com>
	Tue, 16 Jan 2024 17:46:30 +0000 (18:46 +0100)
gcc/rust/lex/rust-lex.cc		patch \| blob \| blame \| history
gcc/rust/lex/rust-lex.h		patch \| blob \| blame \| history
gcc/rust/rust-lang.cc		patch \| blob \| blame \| history