git.ipfire.org Git - thirdparty/gcc.git/commitdiff
gccrs: fix tokenizing utf-8 whitespaces
author Raiki Tamura <tamaron1203@gmail.com>
Wed, 28 Jun 2023 10:14:50 +0000 (19:14 +0900)
committer Arthur Cohen <arthur.cohen@embecosm.com>
Tue, 16 Jan 2024 17:46:31 +0000 (18:46 +0100)
gcc/rust/ChangeLog:

* lex/rust-lex.cc (Lexer::build_token): Add check for all kinds of whitespace.

gcc/testsuite/ChangeLog:

* rust/compile/torture/utf8_whitespaces.rs: New test.

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
gcc/rust/lex/rust-lex.cc
gcc/testsuite/rust/compile/torture/utf8_whitespaces.rs [new file with mode: 0644]

index aec2a96694abcc25e925e84723016a160472555b..7f7fc0c80bff7c53cb3bfb9d632acb6b8cd85d87 100644 (file)
@@ -420,7 +420,10 @@ Lexer::build_token ()
        {
        /* ignore whitespace characters for tokens but continue updating
         * location */
-       case '\n': // newline
+       case '\n':   // newline
+       case 0x0085: // next line
+       case 0x2028: // line separator
+       case 0x2029: // paragraph separator
          current_line++;
          current_column = 1;
          // tell line_table that new line starts
@@ -432,10 +435,16 @@ Lexer::build_token ()
        case ' ': // space
          current_column++;
          continue;
-       case '\t': // tab
+       case '\t': // horizontal tab
          // width of a tab is not well-defined, assume 8 spaces
          current_column += 8;
          continue;
+       case '\v':   // vertical tab
+       case 0x000c: // form feed
+       case 0x200e: // left-to-right mark
+       case 0x200f: // right-to-left mark
+         // Ignored.
+         continue;
 
        // punctuation - actual tokens
        case '=':
diff --git a/gcc/testsuite/rust/compile/torture/utf8_whitespaces.rs b/gcc/testsuite/rust/compile/torture/utf8_whitespaces.rs
new file mode 100644 (file)
index 0000000..b45c014
--- /dev/null
@@ -0,0 +1,16 @@
+fn main() {
+    // FORM FEED
+    \f
+    // LINE TABULATION (vt)
+    \v
+    // NEXT LINE (nel)
+    \85
+    // LEFT-TO-RIGHT MARK
+    ‎
+    // RIGHT-TO-LEFT MARK 
+    ‏
+    // LINE SEPARATOR
+    

+    // PARAGRAPH SEPARATOR 
+    

+}