Handle newlines during string parsing while lexing

author Nirmal Patel <nirmal@nirmal.dev>

Wed, 27 Dec 2023 22:49:58 +0000 (17:49 -0500)

committer P-E-P <32375388+P-E-P@users.noreply.github.com>

Thu, 4 Jan 2024 09:06:04 +0000 (09:06 +0000)
author Nirmal Patel <nirmal@nirmal.dev>
Wed, 27 Dec 2023 22:49:58 +0000 (17:49 -0500)
committer P-E-P <32375388+P-E-P@users.noreply.github.com>
Thu, 4 Jan 2024 09:06:04 +0000 (09:06 +0000)
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc

index 910ad0762252f5881fdff388e0cf09afcd59c6ba..30691109d2a22bbaa8abb282d738053685f4f7af 100644 (file)
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -1766,13 +1766,15 @@ Lexer::parse_byte_string (location_t loc)
    std::string str;
    str.reserve (16); // some sensible default
  
-  int length = 1;
    current_char = peek_input ();
  
+  const location_t string_begin_locus = get_current_location ();
+
    while (current_char != '"' && !current_char.is_eof ())
      {
        if (current_char == '\\')
         {
+         int length = 1;
           auto escape_length_pair = parse_escape ('"');
           char output_char = std::get<0> (escape_length_pair);
  
@@ -1784,18 +1786,25 @@ Lexer::parse_byte_string (location_t loc)
           if (output_char != 0 || !std::get<2> (escape_length_pair))
             str += output_char;
  
+         current_column += length;
+
           continue;
         }
  
-      length++;
+      current_column++;
+      if (current_char.value == '\n')
+       {
+         current_line++;
+         current_column = 1;
+         // tell line_table that a new line starts
+         start_line (current_line, max_column_hint);
+       }
  
        str += current_char;
        skip_input ();
        current_char = peek_input ();
      }
  
-  current_column += length;
-
    if (current_char == '"')
      {
        current_column++;
@@ -1805,7 +1814,7 @@ Lexer::parse_byte_string (location_t loc)
      }
    else if (current_char.is_eof ())
      {
-      rust_error_at (get_current_location (), "unended byte string literal");
+      rust_error_at (string_begin_locus, "unended byte string literal");
        return Token::make (END_OF_FILE, get_current_location ());
      }
    else
@@ -1996,14 +2005,17 @@ Lexer::parse_string (location_t loc)
    std::string str;
    str.reserve (16); // some sensible default
  
-  int length = 1;
    current_char = peek_input ();
  
+  const location_t string_begin_locus = get_current_location ();
+
    // FIXME: This fails if the input ends. How do we check for EOF?
    while (current_char.value != '"' && !current_char.is_eof ())
      {
        if (current_char.value == '\\')
         {
+         int length = 1;
+
           // parse escape
           auto utf8_escape_pair = parse_utf8_escape ();
           current_char = std::get<0> (utf8_escape_pair);
@@ -2016,21 +2028,28 @@ Lexer::parse_string (location_t loc)
           if (current_char != Codepoint (0) || !std::get<2> (utf8_escape_pair))
             str += current_char.as_string ();
  
+         current_column += length;
+
           // FIXME: should remove this but can't.
           // `parse_utf8_escape` does not update `current_char` correctly.
           current_char = peek_input ();
           continue;
         }
  
-      length++;
+      current_column++;
+      if (current_char.value == '\n')
+       {
+         current_line++;
+         current_column = 1;
+         // tell line_table that a new line starts
+         start_line (current_line, max_column_hint);
+       }
  
        str += current_char;
        skip_input ();
        current_char = peek_input ();
      }
  
-  current_column += length;
-
    if (current_char.value == '"')
      {
        current_column++;
@@ -2040,7 +2059,7 @@ Lexer::parse_string (location_t loc)
      }
    else if (current_char.is_eof ())
      {
-      rust_error_at (get_current_location (), "unended string literal");
+      rust_error_at (string_begin_locus, "unended string literal");
        return Token::make (END_OF_FILE, get_current_location ());
      }
    else
@@ -2049,7 +2068,6 @@ Lexer::parse_string (location_t loc)
      }
  
    str.shrink_to_fit ();
-  loc += length - 1;
  
    return Token::make_string (loc, std::move (str));
  }
diff --git a/gcc/testsuite/rust/compile/issue-2187.rs b/gcc/testsuite/rust/compile/issue-2187.rs

new file mode 100644 (file)

index 0000000..deef417
--- /dev/null
+++ b/gcc/testsuite/rust/compile/issue-2187.rs
@@ -0,0 +1,11 @@
+const A: &'static u8 = b"
+";
+const B: &'static str = b"
+";
+const C: &'static u8 = "
+";
+const D: &'static str = "
+";
+ERROR_TIME
+// { dg-error "unrecognised token" "" { target *-*-* } .-1 }
+// { dg-error "failed to parse item in crate" "" { target *-*-* } .-2 }
diff --git a/gcc/testsuite/rust/execute/torture/issue-2187.rs b/gcc/testsuite/rust/execute/torture/issue-2187.rs

new file mode 100644 (file)

index 0000000..b531257
--- /dev/null
+++ b/gcc/testsuite/rust/execute/torture/issue-2187.rs
@@ -0,0 +1,23 @@
+/* { dg-output "L1\n\L2\nL3\nL4" } */
+extern "C" {
+    fn printf(s: *const i8, ...);
+}
+
+fn main() -> i32 {
+    let A = b"L1
+L2\0";
+    let B = "L3
+L4\0";
+
+    unsafe {
+        let a = "%s\n\0";
+        let b = a as *const str;
+        let c = b as *const i8;
+
+        printf(c, A);
+        printf(c, B);
+    }
+
+    0
+}
+
author	Nirmal Patel <nirmal@nirmal.dev>
	Wed, 27 Dec 2023 22:49:58 +0000 (17:49 -0500)
committer	P-E-P <32375388+P-E-P@users.noreply.github.com>
	Thu, 4 Jan 2024 09:06:04 +0000 (09:06 +0000)
gcc/rust/lex/rust-lex.cc		patch \| blob \| blame \| history
gcc/testsuite/rust/compile/issue-2187.rs	[new file with mode: 0644]	patch \| blob
gcc/testsuite/rust/execute/torture/issue-2187.rs	[new file with mode: 0644]	patch \| blob