From: Raiki Tamura <tamaron1203@gmail.com>
Date: Sun, 18 Jun 2023 14:25:31 +0000 (+0900)
Subject: gccrs: Refactor lexer to handle UTF-8
X-Git-Tag: basepoints/gcc-15~2439
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=0ee16a42074486d6514379d7faa6aee54100f0a4;p=thirdparty%2Fgcc.git

gccrs: Refactor lexer to handle UTF-8

gcc/rust/ChangeLog:

	* lex/rust-lex.cc (is_float_digit):Change types of param to `uint32_t`
	(is_x_digit):Likewise
	(is_octal_digit):Likewise
	(is_bin_digit):Likewise
	(check_valid_float_dot_end):Likewise
	(is_whitespace):Likewise
	(is_non_decimal_int_literal_separator):Likewise
	(is_identifier_start):Likewise
	(is_identifier_continue):Likewise
	(Lexer::skip_broken_string_input):Likewise
	(Lexer::build_token):Remove handling BOM
	(Lexer::parse_in_type_suffix):Modify use of `current_char`
	(Lexer::parse_in_decimal):Likewise
	(Lexer::parse_escape):Likewise
	(Lexer::parse_utf8_escape):Likewise
	(Lexer::parse_partial_string_continue):Likewise
	(Lexer::parse_partial_hex_escape):Likewise
	(Lexer::parse_partial_unicode_escape):Likewise
	(Lexer::parse_byte_char):Likewise
	(Lexer::parse_byte_string):Likewise
	(Lexer::parse_raw_byte_string):Likewise
	(Lexer::parse_raw_identifier):Likewise
	(Lexer::parse_non_decimal_int_literal):Likewise
	(Lexer::parse_decimal_int_or_float):Likewise
	(Lexer::peek_input):Change return type to `Codepoint`
	(Lexer::get_input_codepoint_length):Change to return 1
	(Lexer::peek_codepoint_input):Change to be wrapper of `peek_input`
	(Lexer::skip_codepoint_input):Change to be wrapper of `skip_input`
	(Lexer::test_get_input_codepoint_n_length):Deleted
	(Lexer::split_current_token):Deleted
	(Lexer::test_peek_codepoint_input):Deleted
	(Lexer::start_line):Move backwards
	(assert_source_content):New helper function for selftest
	(test_buffer_input_source):New helper function for selftest
	(test_file_input_source):Likewise
	(rust_input_source_test):New test
	* lex/rust-lex.h (rust_input_source_test):New test
	* rust-lang.cc (run_rust_tests):Add selftest

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
---

diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
index 94ec67d2e66b..aec2a96694ab 100644
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -16,6 +16,7 @@
 // along with GCC; see the file COPYING3.  If not see
 // <http://www.gnu.org/licenses/>.
 
+#include "rust-codepoint.h"
 #include "rust-system.h"
 #include "rust-lex.h"
 #include "rust-diagnostics.h"
@@ -73,7 +74,7 @@ Codepoint::as_string ()
 /* Includes all allowable float digits EXCEPT _ and . as that needs lookahead
  * for handling. */
 bool
-is_float_digit (char number)
+is_float_digit (uint32_t number)
 {
   return ISDIGIT (number) || number == 'E' || number == 'e';
 }
@@ -81,31 +82,31 @@ is_float_digit (char number)
 /* Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or
  * whatever is different */
 bool
-is_x_digit (char number)
+is_x_digit (uint32_t number)
 {
   return ISXDIGIT (number);
 }
 
 bool
-is_octal_digit (char number)
+is_octal_digit (uint32_t number)
 {
   return number >= '0' && number <= '7';
 }
 
 bool
-is_bin_digit (char number)
+is_bin_digit (uint32_t number)
 {
   return number == '0' || number == '1';
 }
 
 bool
-check_valid_float_dot_end (char character)
+check_valid_float_dot_end (uint32_t character)
 {
   return character != '.' && character != '_' && !ISALPHA (character);
 }
 
 bool
-is_whitespace (int character)
+is_whitespace (uint32_t character)
 {
   // https://doc.rust-lang.org/reference/whitespace.html
   return character == '\t' || character == '\n' || character == '\v'
@@ -118,19 +119,19 @@ is_whitespace (int character)
 }
 
 bool
-is_non_decimal_int_literal_separator (char character)
+is_non_decimal_int_literal_separator (uint32_t character)
 {
   return character == 'x' || character == 'o' || character == 'b';
 }
 
 bool
-is_identifier_start (int codepoint)
+is_identifier_start (uint32_t codepoint)
 {
   return (cpp_check_xid_property (codepoint) & CPP_XID_START) || codepoint == '_';
 }
 
 bool
-is_identifier_continue (int codepoint)
+is_identifier_continue (uint32_t codepoint)
 {
   return cpp_check_xid_property (codepoint) & CPP_XID_CONTINUE;
 }
@@ -183,13 +184,13 @@ Lexer::get_current_location ()
     return Location ();
 }
 
-int
+Codepoint
 Lexer::peek_input (int n)
 {
   return input_queue.peek (n);
 }
 
-int
+Codepoint
 Lexer::peek_input ()
 {
   return peek_input (0);
@@ -304,17 +305,6 @@ Lexer::build_token ()
     {
       Location loc = get_current_location ();
 
-      // detect UTF8 bom
-      //
-      // Must be the first thing on the first line.
-      // There might be an optional BOM (Byte Order Mark), which for UTF-8 is
-      // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
-      if (current_line == 1 && current_column == 1 && peek_input () == 0xef
-	  && peek_input (1) == 0xbb && peek_input (2) == 0xbf)
-	{
-	  skip_input (2);
-	}
-
       current_char = peek_input ();
       current_char32 = peek_codepoint_input ();
       skip_codepoint_input ();
@@ -331,9 +321,8 @@ Lexer::build_token ()
 	  int n = 1;
 	  while (true)
 	    {
-	      // TODO use utf-8 codepoint to skip whitespaces
-	      int next_char = peek_input (n);
-	      if (is_whitespace (next_char))
+	      Codepoint next_char = peek_input (n);
+	      if (is_whitespace (next_char.value))
 		n++;
 	      else if ((next_char == '/' && peek_input (n + 1) == '/'
 			&& peek_input (n + 2) != '!'
@@ -347,7 +336,7 @@ Lexer::build_token ()
 		  // (but not an inner or outer doc comment)
 		  n += 2;
 		  next_char = peek_input (n);
-		  while (next_char != '\n' && next_char != EOF)
+		  while (next_char != '\n' && !next_char.is_eof ())
 		    {
 		      n++;
 		      next_char = peek_input (n);
@@ -383,7 +372,7 @@ Lexer::build_token ()
 		  int level = 1;
 		  while (level > 0)
 		    {
-		      if (peek_input (n) == EOF)
+		      if (peek_input (n).is_eof ())
 			break;
 		      else if (peek_input (n) == '/'
 			       && peek_input (n + 1) == '*')
@@ -404,7 +393,7 @@ Lexer::build_token ()
 	      else if (next_char != '[')
 		{
 		  // definitely shebang, ignore the first line
-		  while (current_char != '\n' && current_char != EOF)
+		  while (current_char != '\n' && !current_char.is_eof ())
 		    {
 		      current_char = peek_input ();
 		      skip_input ();
@@ -423,11 +412,11 @@ Lexer::build_token ()
 	}
 
       // return end of file token if end of file
-      if (current_char == EOF)
+      if (current_char.is_eof ())
 	return Token::make (END_OF_FILE, loc);
 
       // if not end of file, start tokenising
-      switch (current_char)
+      switch (current_char.value)
 	{
 	/* ignore whitespace characters for tokens but continue updating
 	 * location */
@@ -566,7 +555,7 @@ Lexer::build_token ()
 	      current_char = peek_input ();
 
 	      // basically ignore until line finishes
-	      while (current_char != '\n' && current_char != EOF)
+	      while (current_char != '\n' && !current_char.is_eof ())
 		{
 		  skip_input ();
 		  current_column++; // not used
@@ -590,7 +579,7 @@ Lexer::build_token ()
 		  skip_input ();
 		  if (current_char == '\r')
 		    {
-		      char next_char = peek_input ();
+		      Codepoint next_char = peek_input ();
 		      if (next_char == '\n')
 			{
 			  current_char = '\n';
@@ -601,7 +590,7 @@ Lexer::build_token ()
 		      current_char = next_char;
 		      continue;
 		    }
-		  if (current_char == EOF)
+		  if (current_char.is_eof ())
 		    {
 		      rust_error_at (
 			loc, "unexpected EOF while looking for end of comment");
@@ -656,7 +645,7 @@ Lexer::build_token ()
 		{
 		  current_char = peek_input ();
 
-		  if (current_char == EOF)
+		  if (current_char.is_eof ())
 		    {
 		      rust_error_at (
 			loc, "unexpected EOF while looking for end of comment");
@@ -720,7 +709,7 @@ Lexer::build_token ()
 		{
 		  current_char = peek_input ();
 
-		  if (current_char == EOF)
+		  if (current_char.is_eof ())
 		    {
 		      rust_error_at (
 			loc, "unexpected EOF while looking for end of comment");
@@ -1069,11 +1058,11 @@ Lexer::build_token ()
       // raw identifiers and raw strings
       if (current_char == 'r')
 	{
-	  int peek = peek_input ();
-	  int peek1 = peek_input (1);
+	  Codepoint peek = peek_input ();
+	  Codepoint peek1 = peek_input (1);
 
 	  // TODO (tamaron) parse Unicode ident
-	  if (peek == '#' && is_identifier_start (peek1))
+	  if (peek == '#' && is_identifier_start (peek1.value))
 	    {
 	      TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
 	      if (raw_ident_ptr != nullptr)
@@ -1095,10 +1084,10 @@ Lexer::build_token ()
 	return parse_identifier_or_keyword (loc);
 
       // int and float literals
-      if (ISDIGIT (current_char))
+      if (ISDIGIT (current_char.value))
 	{ //  _ not allowed as first char
 	  if (current_char == '0'
-	      && is_non_decimal_int_literal_separator (peek_input ()))
+	      && is_non_decimal_int_literal_separator (peek_input ().value))
 	    {
 	      // handle binary, octal, hex literals
 	      TokenPtr non_dec_int_lit_ptr
@@ -1137,7 +1126,7 @@ Lexer::build_token ()
 		    "unexpected character");
 
       // didn't match anything so error
-      rust_error_at (loc, "unexpected character %<%x%>", current_char);
+      rust_error_at (loc, "unexpected character %<%x%>", current_char.value);
       current_column++;
     }
 }
@@ -1152,7 +1141,7 @@ Lexer::parse_in_type_suffix ()
   int additional_length_offset = 0;
 
   // get suffix
-  while (ISALPHA (current_char) || ISDIGIT (current_char)
+  while (ISALPHA (current_char.value) || ISDIGIT (current_char.value)
 	 || current_char == '_')
     {
       if (current_char == '_')
@@ -1293,7 +1282,7 @@ Lexer::parse_in_decimal ()
   bool pure_decimal = true;
   int additional_length_offset = 0;
   std::string str;
-  while (ISDIGIT (current_char) || current_char == '_')
+  while (ISDIGIT (current_char.value) || current_char.value == '_')
     {
       if (current_char == '_')
 	{
@@ -1329,7 +1318,7 @@ Lexer::parse_escape (char opening_char)
   current_char = peek_input ();
   additional_length_offset++;
 
-  switch (current_char)
+  switch (current_char.value)
     {
       case 'x': {
 	auto hex_escape_pair = parse_partial_hex_escape ();
@@ -1382,7 +1371,8 @@ Lexer::parse_escape (char opening_char)
       return std::make_tuple (0, parse_partial_string_continue (), true);
     default:
       rust_error_at (get_current_location (),
-		     "unknown escape sequence %<\\%c%>", current_char);
+		     "unknown escape sequence %<\\%s%>",
+		     current_char.as_string ().c_str ());
       // returns false if no parsing could be done
       // return false;
       return std::make_tuple (output_char, additional_length_offset, false);
@@ -1411,7 +1401,7 @@ Lexer::parse_utf8_escape ()
   current_char = peek_input ();
   additional_length_offset++;
 
-  switch (current_char)
+  switch (current_char.value)
     {
       case 'x': {
 	auto hex_escape_pair = parse_partial_hex_escape ();
@@ -1465,7 +1455,8 @@ Lexer::parse_utf8_escape ()
       return std::make_tuple (0, parse_partial_string_continue (), true);
     default:
       rust_error_at (get_current_location (),
-		     "unknown escape sequence %<\\%c%>", current_char);
+		     "unknown escape sequence %<\\%s%>",
+		     current_char.as_string ().c_str ());
       // returns false if no parsing could be done
       // return false;
       return std::make_tuple (output_char, additional_length_offset, false);
@@ -1490,7 +1481,7 @@ Lexer::parse_partial_string_continue ()
 
   // string continue
   // TODO use utf-8 codepoint to skip whitespaces
-  while (is_whitespace (current_char))
+  while (is_whitespace (current_char.value))
     {
       if (current_char == '\n')
 	{
@@ -1529,29 +1520,29 @@ Lexer::parse_partial_hex_escape ()
   current_char = peek_input (1);
   int additional_length_offset = 1;
 
-  if (!is_x_digit (current_char))
+  if (!is_x_digit (current_char.value))
     {
       rust_error_at (get_current_location (),
-		     "invalid character %<\\x%c%> in \\x sequence",
-		     current_char);
+		     "invalid character %<\\x%s%> in \\x sequence",
+		     current_char.as_string ().c_str ());
       return std::make_pair (0, 0);
     }
-  hexNum[0] = current_char;
+  hexNum[0] = current_char.value;
 
   // second hex char
   skip_input ();
   current_char = peek_input (1);
   additional_length_offset++;
 
-  if (!is_x_digit (current_char))
+  if (!is_x_digit (current_char.value))
     {
       rust_error_at (get_current_location (),
-		     "invalid character %<\\x%c%c%> in \\x sequence", hexNum[0],
-		     current_char);
+		     "invalid character %<\\x%c%s%> in \\x sequence", hexNum[0],
+		     current_char.as_string ().c_str ());
       return std::make_pair (0, 1);
     }
   skip_input ();
-  hexNum[1] = current_char;
+  hexNum[1] = current_char.value;
 
   long hexLong = std::strtol (hexNum, nullptr, 16);
 
@@ -1571,7 +1562,7 @@ Lexer::parse_partial_unicode_escape ()
       rust_error_at (get_current_location (),
 		     "unicode escape should start with %<{%>");
       /* Skip what should probaby have been between brackets.  */
-      while (is_x_digit (current_char) || current_char == '_')
+      while (is_x_digit (current_char.value) || current_char == '_')
 	{
 	  skip_input ();
 	  current_char = peek_input ();
@@ -1599,7 +1590,7 @@ Lexer::parse_partial_unicode_escape ()
   num_str.reserve (6);
 
   // loop through to add entire hex number to string
-  while (is_x_digit (current_char) || current_char == '_')
+  while (is_x_digit (current_char.value) || current_char.value == '_')
     {
       if (current_char == '_')
 	{
@@ -1634,7 +1625,7 @@ Lexer::parse_partial_unicode_escape ()
       // termination, otherwise it is a wrong character, then skip to the actual
       // terminator.
       // TODO use utf-8 codepoint to skip whitespaces
-      if (current_char == '{' || is_whitespace (current_char)
+      if (current_char == '{' || is_whitespace (current_char.value)
 	  || current_char == '\'' || current_char == '"')
 	{
 	  rust_error_at (get_current_location (),
@@ -1644,11 +1635,11 @@ Lexer::parse_partial_unicode_escape ()
       else
 	{
 	  rust_error_at (get_current_location (),
-			 "invalid character %<%c%> in unicode escape",
-			 current_char);
+			 "invalid character %<%s%> in unicode escape",
+			 current_char.as_string ().c_str ());
 	  // TODO use utf-8 codepoint to skip whitespaces
 	  while (current_char != '}' && current_char != '{'
-		 && !is_whitespace (current_char) && current_char != '\''
+		 && !is_whitespace (current_char.value) && current_char != '\''
 		 && current_char != '"')
 	    {
 	      skip_input ();
@@ -1711,7 +1702,7 @@ Lexer::parse_byte_char (Location loc)
   int length = 1;
 
   // char to save
-  char byte_char = 0;
+  Codepoint byte_char = 0;
 
   // detect escapes
   if (current_char == '\\')
@@ -1759,7 +1750,8 @@ Lexer::parse_byte_char (Location loc)
 
   loc += length - 1;
 
-  return Token::make_byte_char (loc, byte_char);
+  // TODO: error when byte_char is non ASCII
+  return Token::make_byte_char (loc, byte_char.value);
 }
 
 // Parses a byte string.
@@ -1778,7 +1770,7 @@ Lexer::parse_byte_string (Location loc)
   int length = 1;
   current_char = peek_input ();
 
-  while (current_char != '"' && current_char != EOF)
+  while (current_char != '"' && !current_char.is_eof ())
     {
       if (current_char == '\\')
 	{
@@ -1812,7 +1804,7 @@ Lexer::parse_byte_string (Location loc)
       skip_input ();
       current_char = peek_input ();
     }
-  else if (current_char == EOF)
+  else if (current_char.is_eof ())
     {
       rust_error_at (get_current_location (), "unended byte string literal");
       return Token::make (END_OF_FILE, get_current_location ());
@@ -1887,11 +1879,11 @@ Lexer::parse_raw_byte_string (Location loc)
 	    }
 	}
 
-      if ((unsigned char) current_char > 127)
+      if (current_char.value > 127)
 	{
 	  rust_error_at (get_current_location (),
-			 "character %<%c%> in raw byte string out of range",
-			 current_char);
+			 "character %<%s%> in raw byte string out of range",
+			 current_char.as_string ().c_str ());
 	  current_char = 0;
 	}
 
@@ -1929,7 +1921,7 @@ Lexer::parse_raw_identifier (Location loc)
   int length = 0;
   current_char = peek_input ();
   // loop through entire name
-  while (is_identifier_continue (current_char))
+  while (is_identifier_continue (current_char.value))
     {
       length++;
 
@@ -1940,6 +1932,8 @@ Lexer::parse_raw_identifier (Location loc)
 
   current_column += length;
 
+  rust_debug ("raw ident: %s", str.c_str ());
+
   // if just a single underscore, not an identifier
   if (first_is_underscore && length == 1)
     rust_error_at (get_current_location (),
@@ -1964,9 +1958,9 @@ Lexer::parse_raw_identifier (Location loc)
 
 // skip broken string input (unterminated strings)
 void
-Lexer::skip_broken_string_input (int current_char)
+Lexer::skip_broken_string_input (Codepoint current_char)
 {
-  while (current_char != '"' && current_char != EOF)
+  while (current_char != '"' && !current_char.is_eof ())
     {
       if (current_char == '\n')
 	{
@@ -2190,7 +2184,7 @@ Lexer::parse_non_decimal_int_literal (Location loc, IsDigitFunc is_digit_func,
   length++;
 
   // loop through to add entire number to string
-  while (is_digit_func (current_char) || current_char == '_')
+  while (is_digit_func (current_char.value) || current_char == '_')
     {
       if (current_char == '_')
 	{
@@ -2293,7 +2287,7 @@ Lexer::parse_decimal_int_or_float (Location loc)
   length += std::get<1> (initial_decimal);
 
   // detect float literal
-  if (current_char == '.' && is_float_digit (peek_input (1)))
+  if (current_char == '.' && is_float_digit (peek_input (1).value))
     {
       // float with a '.', parse another decimal into it
 
@@ -2335,7 +2329,8 @@ Lexer::parse_decimal_int_or_float (Location loc)
       str.shrink_to_fit ();
       return Token::make_float (loc, std::move (str), type_hint);
     }
-  else if (current_char == '.' && check_valid_float_dot_end (peek_input (1)))
+  else if (current_char == '.'
+	   && check_valid_float_dot_end (peek_input (1).value))
     {
       // float that is just an integer with a terminating '.' character
 
@@ -2504,338 +2499,133 @@ Lexer::parse_char_or_lifetime (Location loc)
     }
 }
 
+// TODO remove this function
 // Returns the length of the codepoint at the current position.
 int
 Lexer::get_input_codepoint_length ()
 {
-  uint8_t input = peek_input ();
-
-  if ((int8_t) input == EOF)
-    return 0;
-
-  if (input < 128)
-    {
-      // ascii -- 1 byte
-      // return input;
-
-      return 1;
-    }
-  else if ((input & 0xC0) == 0x80)
-    {
-      // invalid (continuation; can't be first char)
-      // return 0xFFFE;
-
-      return 0;
-    }
-  else if ((input & 0xE0) == 0xC0)
-    {
-      // 2 bytes
-      uint8_t input2 = peek_input (1);
-      if ((input2 & 0xC0) != 0x80)
-	return 0;
-      // return 0xFFFE;
-
-      // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
-      // return output;
-      return 2;
-    }
-  else if ((input & 0xF0) == 0xE0)
-    {
-      // 3 bytes
-      uint8_t input2 = peek_input (1);
-      if ((input2 & 0xC0) != 0x80)
-	return 0;
-      // return 0xFFFE;
-
-      uint8_t input3 = peek_input (2);
-      if ((input3 & 0xC0) != 0x80)
-	return 0;
-      // return 0xFFFE;
-
-      /*uint32_t output
-	= ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
-      0); return output;*/
-      return 3;
-    }
-  else if ((input & 0xF8) == 0xF0)
-    {
-      // 4 bytes
-      uint8_t input2 = peek_input (1);
-      if ((input2 & 0xC0) != 0x80)
-	return 0;
-      // return 0xFFFE;
-
-      uint8_t input3 = peek_input (2);
-      if ((input3 & 0xC0) != 0x80)
-	return 0;
-      // return 0xFFFE;
-
-      uint8_t input4 = peek_input (3);
-      if ((input4 & 0xC0) != 0x80)
-	return 0;
-      // return 0xFFFE;
-
-      /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
-			| ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
-      return output;*/
-      return 4;
-    }
-  else
-    {
-      rust_error_at (get_current_location (),
-		     "invalid UTF-8 [FIRST] (too long)");
-      return 0;
-    }
+  return 1;
 }
 
+// TODO remove this function
 // Returns the codepoint at the current position.
 Codepoint
 Lexer::peek_codepoint_input ()
 {
-  uint8_t input = peek_input ();
-
-  if ((int8_t) input == EOF)
-    return Codepoint::eof ();
-
-  if (input < 128)
-    {
-      // ascii -- 1 byte
-      return {input};
-    }
-  else if ((input & 0xC0) == 0x80)
-    {
-      // invalid (continuation; can't be first char)
-      return {0xFFFE};
-    }
-  else if ((input & 0xE0) == 0xC0)
-    {
-      // 2 bytes
-      uint8_t input2 = peek_input (1);
-      if ((input2 & 0xC0) != 0x80)
-	return {0xFFFE};
-
-      uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
-      return {output};
-    }
-  else if ((input & 0xF0) == 0xE0)
-    {
-      // 3 bytes
-      uint8_t input2 = peek_input (1);
-      if ((input2 & 0xC0) != 0x80)
-	return {0xFFFE};
-
-      uint8_t input3 = peek_input (2);
-      if ((input3 & 0xC0) != 0x80)
-	return {0xFFFE};
-
-      uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
-			| ((input3 & 0x3F) << 0);
-      return {output};
-    }
-  else if ((input & 0xF8) == 0xF0)
-    {
-      // 4 bytes
-      uint8_t input2 = peek_input (1);
-      if ((input2 & 0xC0) != 0x80)
-	return {0xFFFE};
-
-      uint8_t input3 = peek_input (2);
-      if ((input3 & 0xC0) != 0x80)
-	return {0xFFFE};
-
-      uint8_t input4 = peek_input (3);
-      if ((input4 & 0xC0) != 0x80)
-	return {0xFFFE};
-
-      uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
-			| ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
-      return {output};
-    }
-  else
-    {
-      rust_error_at (get_current_location (),
-		     "invalid UTF-8 [SECND] (too long)");
-      return {0xFFFE};
-    }
+  return peek_input ();
 }
 
+// TODO remove this function
 void
 Lexer::skip_codepoint_input ()
 {
-  if (peek_input () == EOF)
-    return;
-  int toSkip = get_input_codepoint_length ();
-  gcc_assert (toSkip >= 1);
-
-  skip_input (toSkip - 1);
+  skip_input ();
 }
 
-int
-Lexer::test_get_input_codepoint_n_length (int n_start_offset)
+void
+Lexer::split_current_token (TokenId new_left, TokenId new_right)
 {
-  uint8_t input = peek_input (n_start_offset);
-
-  if (input < 128)
-    {
-      // ascii -- 1 byte
-      // return input;
-      return 1;
-    }
-  else if ((input & 0xC0) == 0x80)
-    {
-      // invalid (continuation; can't be first char)
-      // return 0xFFFE;
-      return 0;
-    }
-  else if ((input & 0xE0) == 0xC0)
-    {
-      // 2 bytes
-      uint8_t input2 = peek_input (n_start_offset + 1);
-      if ((input2 & 0xC0) != 0x80)
-	// return 0xFFFE;
-	return 0;
+  /* TODO: assert that this TokenId is a "simple token" like punctuation and not
+   * like "IDENTIFIER"? */
+  Location current_loc = peek_token ()->get_locus ();
+  TokenPtr new_left_tok = Token::make (new_left, current_loc);
+  TokenPtr new_right_tok = Token::make (new_right, current_loc + 1);
 
-      // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
-      // return output;
-      return 2;
-    }
-  else if ((input & 0xF0) == 0xE0)
-    {
-      // 3 bytes
-      uint8_t input2 = peek_input (n_start_offset + 1);
-      if ((input2 & 0xC0) != 0x80)
-	// return 0xFFFE;
-	return 0;
+  token_queue.replace_current_value (std::move (new_left_tok));
+  token_queue.insert (1, std::move (new_right_tok));
+}
 
-      uint8_t input3 = peek_input (n_start_offset + 2);
-      if ((input3 & 0xC0) != 0x80)
-	// return 0xFFFE;
-	return 0;
+void
+Lexer::start_line (int current_line, int current_column)
+{
+  if (line_map)
+    line_map->start_line (current_line, current_column);
+}
 
-      /*uint32_t output
-	= ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
-      0); return output;*/
-      return 3;
-    }
-  else if ((input & 0xF8) == 0xF0)
-    {
-      // 4 bytes
-      uint8_t input2 = peek_input (n_start_offset + 1);
-      if ((input2 & 0xC0) != 0x80)
-	// return 0xFFFE;
-	return 0;
+} // namespace Rust
 
-      uint8_t input3 = peek_input (n_start_offset + 2);
-      if ((input3 & 0xC0) != 0x80)
-	// return 0xFFFE;
-	return 0;
+#if CHECKING_P
 
-      uint8_t input4 = peek_input (n_start_offset + 3);
-      if ((input4 & 0xC0) != 0x80)
-	// return 0xFFFE;
-	return 0;
+namespace selftest {
 
-      /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
-			| ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
-      return output;*/
-      return 4;
-    }
-  else
+// Checks if `src` has the same contents as the given characters
+void
+assert_source_content (Rust::Lexer::InputSource &src,
+		       std::vector<uint32_t> expected)
+{
+  Rust::Codepoint src_char = src.next ();
+  for (auto expected_char : expected)
     {
-      rust_error_at (get_current_location (),
-		     "invalid UTF-8 [THIRD] (too long)");
-      return 0;
+      // Make sure that `src` is not shorter than `expected`
+      ASSERT_FALSE (src_char.is_eof ());
+      // Checks skipped character is expeceted one.
+      ASSERT_EQ (src_char.value, expected_char);
+      src_char = src.next ();
     }
+  // Checks if `src` and `chars` has the same length.
+  ASSERT_TRUE (src_char.is_eof ());
 }
 
-// peeks the codepoint input at n codepoints ahead of current codepoint - try
-// not to use
-Codepoint
-Lexer::test_peek_codepoint_input (int n)
+void
+test_buffer_input_source (std::string str, std::vector<uint32_t> expected)
 {
-  int totalOffset = 0;
-
-  // add up all offsets into total offset? does this do what I want?
-  for (int i = 0; i < n; i++)
-    {
-      totalOffset += test_get_input_codepoint_n_length (totalOffset);
-    }
-  // issues: this would have (at least) O(n) lookup time, not O(1) like the
-  // rest?
-
-  // TODO: implement if still needed
-
-  // error out of function as it is not implemented
-  gcc_assert (1 == 0);
-  return {0};
-  /*
-	  uint8_t input = peek_input();
-
-	  if (input < 128) {
-	      // ascii -- 1 byte
-	      return input;
-	  } else if ((input & 0xC0) == 0x80) {
-	      // invalid (continuation; can't be first char)
-	      return 0xFFFE;
-	  } else if ((input & 0xE0) == 0xC0) {
-	      // 2 bytes
-	      uint8_t input2 = peek_input(1);
-	      if ((input2 & 0xC0) != 0x80)
-		  return 0xFFFE;
-
-	      uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
-	      return output;
-	  } else if ((input & 0xF0) == 0xE0) {
-	      // 3 bytes
-	      uint8_t input2 = peek_input(1);
-	      if ((input2 & 0xC0) != 0x80)
-		  return 0xFFFE;
-
-	      uint8_t input3 = peek_input(2);
-	      if ((input3 & 0xC0) != 0x80)
-		  return 0xFFFE;
-
-	      uint32_t output
-		= ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 &
-     0x3F) << 0); return output; } else if ((input & 0xF8) == 0xF0) {
-	      // 4 bytes
-	      uint8_t input2 = peek_input(1);
-	      if ((input2 & 0xC0) != 0x80)
-		  return 0xFFFE;
-
-	      uint8_t input3 = peek_input(2);
-	      if ((input3 & 0xC0) != 0x80)
-		  return 0xFFFE;
-
-	      uint8_t input4 = peek_input(3);
-	      if ((input4 & 0xC0) != 0x80)
-		  return 0xFFFE;
-
-	      uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
-				| ((input3 & 0x3F) << 6) | ((input4 & 0x3F) <<
-     0); return output; } else { rust_error_at(get_current_location(), "invalid
-     UTF-8 (too long)"); return 0xFFFE;
-	  }*/
+  Rust::Lexer::BufferInputSource source (str, 0);
+  assert_source_content (source, expected);
 }
 
 void
-Lexer::split_current_token (TokenId new_left, TokenId new_right)
+test_file_input_source (std::string str, std::vector<uint32_t> expected)
 {
-  /* TODO: assert that this TokenId is a "simple token" like punctuation and not
-   * like "IDENTIFIER"? */
-  Location current_loc = peek_token ()->get_locus ();
-  TokenPtr new_left_tok = Token::make (new_left, current_loc);
-  TokenPtr new_right_tok = Token::make (new_right, current_loc + 1);
-
-  token_queue.replace_current_value (std::move (new_left_tok));
-  token_queue.insert (1, std::move (new_right_tok));
+  FILE *tmpf = tmpfile ();
+  // Moves to the first character
+  fputs (str.c_str (), tmpf);
+  std::rewind (tmpf);
+  Rust::Lexer::FileInputSource source (tmpf);
+  assert_source_content (source, expected);
 }
 
 void
-Lexer::start_line (int current_line, int current_column)
+rust_input_source_test ()
 {
-  if (line_map)
-    line_map->start_line (current_line, current_column);
+  // ASCII
+  std::string src = u8"_abcde\tXYZ\v\f";
+  std::vector<uint32_t> expected
+    = {'_', 'a', 'b', 'c', 'd', 'e', '\t', 'X', 'Y', 'Z', '\v', '\f'};
+  test_buffer_input_source (src, expected);
+
+  // BOM
+  src = u8"\xef\xbb\xbfOK";
+  expected = {'O', 'K'};
+  test_buffer_input_source (src, expected);
+
+  // Russian
+  src = u8"Ð¿ÑÐ¸Ð²ÐµÌÑ";
+  expected = {L'Ð¿',
+	      L'Ñ',
+	      L'Ð¸',
+	      L'Ð²',
+	      0x0435 /* CYRILLIC SMALL LETTER IE Ðµ */,
+	      0x301 /* COMBINING ACUTE ACCENT Ì */,
+	      L'Ñ'};
+  test_buffer_input_source (src, expected);
+
+  src = u8"â¤ï¸ð¦";
+  expected = {0x2764 /* HEAVY BLACK HEART */,
+	      0xfe0f /* VARIATION SELECTOR-16 */, L'ð¦'};
+  test_buffer_input_source (src, expected);
+
+  src = u8"ããã«ã¡ã¯";
+  expected = {L'ã', L'ã', L'ã«', L'ã¡', L'ã¯'};
+  test_file_input_source (src, expected);
+
+  src = u8"ð®ââð©ââ";
+  expected
+    = {0x1f46e /* POLICE OFFICER */,   0x200d /* ZERO WIDTH JOINER */,
+       0x2642 /* MALE SIGN */,	       0x1f469 /* WOMAN */,
+       0x200d /* ZERO WIDTH JOINER */, 0x2695 /* STAFF OF AESCULAPIUS */};
+  test_file_input_source (src, expected);
 }
 
-} // namespace Rust
+} // namespace selftest
+
+#endif // CHECKING_P
diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h
index a05e8fcbfe10..58ad48533c66 100644
--- a/gcc/rust/lex/rust-lex.h
+++ b/gcc/rust/lex/rust-lex.h
@@ -23,6 +23,7 @@
 #include "rust-buffered-queue.h"
 #include "rust-token.h"
 #include "rust-optional.h"
+#include "selftest.h"
 
 namespace Rust {
 // Simple wrapper for FILE* that simplifies destruction.
@@ -119,9 +120,9 @@ private:
   void skip_input (int n);
 
   // Peeks the current char.
-  int peek_input ();
+  Codepoint peek_input ();
   // Returns char n bytes ahead of current position.
-  int peek_input (int n);
+  Codepoint peek_input (int n);
 
   // Classifies keyword (i.e. gets id for keyword).
   TokenId classify_keyword (const std::string &str);
@@ -136,12 +137,10 @@ private:
   std::pair<Codepoint, int> parse_partial_unicode_escape ();
 
   int get_input_codepoint_length ();
-  int test_get_input_codepoint_n_length (int n_start_offset);
   // Peeks the current utf-8 char
   Codepoint peek_codepoint_input ();
-  Codepoint test_peek_codepoint_input (int n);
   void skip_codepoint_input ();
-  void skip_broken_string_input (int current_char);
+  void skip_broken_string_input (Codepoint current_char);
 
   TokenPtr parse_byte_char (Location loc);
   TokenPtr parse_byte_string (Location loc);
@@ -208,37 +207,136 @@ public:
   Linemap *get_line_map () { return line_map; }
   std::string get_filename () { return std::string (input.get_filename ()); }
 
-private:
-  void start_line (int current_line, int current_column);
-
-  // File for use as input.
-  RAIIFile input;
-  // TODO is this actually required? could just have file storage in InputSource
+  // Input source wrapper thing.
+  class InputSource
+  {
+  private:
+    // position of current character
+    unsigned int pos;
+    std::vector<Codepoint> chars;
+    bool is_valid_utf8;
 
-  // Current line number.
-  int current_line;
-  // Current column number.
-  int current_column;
-  // Current character.
-  int current_char;
-  Codepoint current_char32;
-  // Line map.
-  Linemap *line_map;
+    // Overload operator () to return next char from input stream.
+    virtual int next_byte () = 0;
 
-  /* Max column number that can be quickly allocated - higher may require
-   * allocating new linemap */
-  static const int max_column_hint = 80;
+    Codepoint next_codepoint ()
+    {
+      uint8_t input = next_byte ();
+
+      if ((int8_t) input == EOF)
+	return Codepoint::eof ();
+      else if (input < 128)
+	{
+	  // ascii -- 1 byte
+	  return {input};
+	}
+      else if ((input & 0xC0) == 0x80)
+	{
+	  // invalid (continuation; can't be first char)
+	  return {0xFFFE};
+	}
+      else if ((input & 0xE0) == 0xC0)
+	{
+	  // 2 bytes
+	  uint8_t input2 = next_byte ();
+	  if ((input2 & 0xC0) != 0x80)
+	    return {0xFFFE};
+
+	  uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
+	  return output;
+	}
+      else if ((input & 0xF0) == 0xE0)
+	{
+	  // 3 bytes or UTF-8 BOM
+	  uint8_t input2 = next_byte ();
+	  // If the second byte is equal to 0xBB then the input is no longer a
+	  // valid UTF-8 char.
+	  if (input == 0xEF && input2 == 0xBB)
+	    {
+	      uint8_t input3 = next_byte ();
+	      if (input3 == 0xBF)
+		return next_codepoint ();
+	      else
+		return {0xFFFE};
+	    }
+
+	  if ((input2 & 0xC0) != 0x80)
+	    return {0xFFFE};
+
+	  uint8_t input3 = next_byte ();
+
+	  if ((input3 & 0xC0) != 0x80)
+	    return {0xFFFE};
+
+	  uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
+			    | ((input3 & 0x3F) << 0);
+	  return {output};
+	}
+      else if ((input & 0xF8) == 0xF0)
+	{
+	  // 4 bytes
+	  uint8_t input2 = next_byte ();
+	  if ((input2 & 0xC0) != 0x80)
+	    return {0xFFFE};
+
+	  uint8_t input3 = next_byte ();
+	  if ((input3 & 0xC0) != 0x80)
+	    return {0xFFFE};
+
+	  uint8_t input4 = next_byte ();
+	  if ((input4 & 0xC0) != 0x80)
+	    return {0xFFFE};
+
+	  uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
+			    | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
+	  return {output};
+	}
+      else
+	{
+	  // rust_error_at (get_current_location (),
+	  //   "invalid UTF-8 [SECND] (too long)");
+	  return {0xFFFE};
+	}
+    }
 
-  Optional<std::ofstream &> dump_lex_out;
+  protected:
+    // Check if the input source is valid as utf-8 and copy all characters to
+    // `chars`.
+    void init ()
+    {
+      Codepoint char32 = next_codepoint ();
+      while (!char32.is_eof () && char32 != 0xFFFE)
+	{
+	  chars.push_back (char32);
+	  char32 = next_codepoint ();
+	}
+
+      if (char32 == 0xFFFE)
+	{
+	  // Input source is not valid as utf-8.
+	  is_valid_utf8 = false;
+	}
+    }
 
-  // Input source wrapper thing.
-  class InputSource
-  {
   public:
+    InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {}
+
     virtual ~InputSource () {}
 
-    // Overload operator () to return next char from input stream.
-    virtual int next () = 0;
+    bool is_valid () { return is_valid_utf8; }
+
+    // get the next UTF-8 character
+    Codepoint next ()
+    {
+      if (pos >= chars.size ())
+	return Codepoint::eof ();
+      else
+	{
+	  Codepoint c = chars[pos];
+	  pos++;
+	  return c;
+	}
+    }
   };
 
   class FileInputSource : public InputSource
@@ -247,11 +345,15 @@ private:
     // Input source file.
     FILE *input;
 
+    int next_byte () override { return fgetc (input); }
+
   public:
     // Create new input source from file.
-    FileInputSource (FILE *input) : input (input) {}
-
-    int next () override { return fgetc (input); }
+    FileInputSource (FILE *input) : InputSource (), input (input)
+    {
+      // TODO make this better?
+      init ();
+    }
   };
 
   class BufferInputSource : public InputSource
@@ -260,26 +362,52 @@ private:
     const std::string &buffer;
     size_t offs;
 
-  public:
-    // Create new input source from file.
-    BufferInputSource (const std::string &b, size_t offset)
-      : buffer (b), offs (offset)
-    {}
-
-    int next () override
+    int next_byte () override
     {
       if (offs >= buffer.size ())
 	return EOF;
 
       return buffer.at (offs++);
     }
+
+  public:
+    // Create new input source from file.
+    BufferInputSource (const std::string &b, size_t offset)
+      : InputSource (), buffer (b), offs (offset)
+    {
+      // TODO make this better?
+      init ();
+    }
   };
 
+private:
+  void start_line (int current_line, int current_column);
+
+  // File for use as input.
+  RAIIFile input;
+  // TODO is this actually required? could just have file storage in InputSource
+
+  // Current line number.
+  int current_line;
+  // Current column number.
+  int current_column;
+  // Current character.
+  Codepoint current_char;
+  Codepoint current_char32;
+  // Line map.
+  Linemap *line_map;
+
+  /* Max column number that can be quickly allocated - higher may require
+   * allocating new linemap */
+  static const int max_column_hint = 80;
+
+  Optional<std::ofstream &> dump_lex_out;
+
   // The input source for the lexer.
   // InputSource input_source;
   // Input file queue.
   std::unique_ptr<InputSource> raw_input_source;
-  buffered_queue<int, std::reference_wrapper<InputSource>> input_queue;
+  buffered_queue<Codepoint, std::reference_wrapper<InputSource>> input_queue;
 
   // Token source wrapper thing.
   struct TokenSource
@@ -305,4 +433,14 @@ private:
 
 } // namespace Rust
 
+#if CHECKING_P
+
+namespace selftest {
+void
+rust_input_source_test ();
+
+} // namespace selftest
+
+#endif // CHECKING_P
+
 #endif
diff --git a/gcc/rust/rust-lang.cc b/gcc/rust/rust-lang.cc
index 084f74c9121d..2032cab87be3 100644
--- a/gcc/rust/rust-lang.cc
+++ b/gcc/rust/rust-lang.cc
@@ -36,6 +36,7 @@
 #include "rust-privacy-ctx.h"
 #include "rust-ast-resolve-item.h"
 #include "rust-optional.h"
+#include "rust-lex.h"
 
 #include <mpfr.h>
 // note: header files must be in this order or else forward declarations don't
@@ -449,6 +450,7 @@ void
 run_rust_tests ()
 {
   // Call tests for the rust frontend here
+  rust_input_source_test ();
   rust_cfg_parser_test ();
   rust_privacy_ctx_test ();
   rust_crate_name_validation_test ();