gh-105017: Include CRLF lines in strings and column numbers (#105030)

author Marta Gómez Macías <mgmacias@google.com>

Sun, 28 May 2023 14:15:53 +0000 (15:15 +0100)

committer GitHub <noreply@github.com>

Sun, 28 May 2023 14:15:53 +0000 (15:15 +0100)
author Marta Gómez Macías <mgmacias@google.com>
Sun, 28 May 2023 14:15:53 +0000 (15:15 +0100)
committer GitHub <noreply@github.com>
Sun, 28 May 2023 14:15:53 +0000 (15:15 +0100)
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py

index 293592b3fd13db49e51026d1df501dc6d0cc1ce4..cd11dddd0fe51abf95b1a34b1de93f54c6f34959 100644 (file)
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -85,11 +85,29 @@ class TokenizeTest(TestCase):
      DEDENT     ''            (5, 0) (5, 0)
      """)
  
-        self.check_tokenize("foo='bar'\r\n", """\
-    NAME       'foo'         (1, 0) (1, 3)
-    OP         '='           (1, 3) (1, 4)
-    STRING     "'bar'"       (1, 4) (1, 9)
-    NEWLINE    '\\n'          (1, 9) (1, 10)
+        self.check_tokenize("if True:\r\n    # NL\r\n    foo='bar'\r\n\r\n", """\
+    NAME       'if'          (1, 0) (1, 2)
+    NAME       'True'        (1, 3) (1, 7)
+    OP         ':'           (1, 7) (1, 8)
+    NEWLINE    '\\r\\n'        (1, 8) (1, 10)
+    COMMENT    '# NL'        (2, 4) (2, 8)
+    NL         '\\r\\n'        (2, 8) (2, 10)
+    INDENT     '    '        (3, 0) (3, 4)
+    NAME       'foo'         (3, 4) (3, 7)
+    OP         '='           (3, 7) (3, 8)
+    STRING     "\'bar\'"       (3, 8) (3, 13)
+    NEWLINE    '\\r\\n'        (3, 13) (3, 15)
+    NL         '\\r\\n'        (4, 0) (4, 2)
+    DEDENT     ''            (5, 0) (5, 0)
+            """)
+
+        self.check_tokenize("x = 1 + \\\r\n1\r\n", """\
+    NAME       'x'           (1, 0) (1, 1)
+    OP         '='           (1, 2) (1, 3)
+    NUMBER     '1'           (1, 4) (1, 5)
+    OP         '+'           (1, 6) (1, 7)
+    NUMBER     '1'           (2, 0) (2, 1)
+    NEWLINE    '\\r\\n'        (2, 1) (2, 3)
              """)
  
          indent_error_file = b"""\
@@ -1784,9 +1802,9 @@ class TestRoundtrip(TestCase):
              if support.verbose >= 2:
                  print('tokenize', testfile)
              with open(testfile, 'rb') as f:
-                # with self.subTest(file=testfile):
-                self.check_roundtrip(f)
-                self.check_line_extraction(f)
+                with self.subTest(file=testfile):
+                    self.check_roundtrip(f)
+                    self.check_line_extraction(f)
  
  
      def roundtrip(self, code):
@@ -2084,6 +2102,10 @@ b"""', """\
  b\
  c"""', """\
      STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
+    """)
+
+        self.check_tokenize(r'"hola\\\r\ndfgf"', """\
+    STRING     \'"hola\\\\\\\\\\\\r\\\\ndfgf"\' (1, 0) (1, 16)
      """)
  
          self.check_tokenize('f"abc"', """\
@@ -2120,6 +2142,12 @@ def"', """\
      FSTRING_START 'Rf"'         (1, 0) (1, 3)
      FSTRING_MIDDLE 'abc\\\\\\ndef'  (1, 3) (2, 3)
      FSTRING_END '"'           (2, 3) (2, 4)
+    """)
+
+        self.check_tokenize(r'f"hola\\\r\ndfgf"', """\
+    FSTRING_START \'f"\'          (1, 0) (1, 2)
+    FSTRING_MIDDLE 'hola\\\\\\\\\\\\r\\\\ndfgf' (1, 2) (1, 16)
+    FSTRING_END \'"\'           (1, 16) (1, 17)
      """)
  
      def test_function(self):
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-27-21-50-48.gh-issue-105017.4sDyDV.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-27-21-50-48.gh-issue-105017.4sDyDV.rst

new file mode 100644 (file)

index 0000000..02d653c
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2023-05-27-21-50-48.gh-issue-105017.4sDyDV.rst
@@ -0,0 +1 @@
+Show CRLF lines in the tokenize string attribute in both NL and NEWLINE tokens. Patch by Marta Gómez.
diff --git a/Parser/pegen.c b/Parser/pegen.c

index b031a6f5d440e85a833e47e2ea4ac99fd08dda29..b9894dd0acc54644683e3e7b3358b6a516dbe821 100644 (file)
--- a/Parser/pegen.c
+++ b/Parser/pegen.c
@@ -924,9 +924,9 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen
  
      struct tok_state *tok;
      if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
-        tok = _PyTokenizer_FromUTF8(str, exec_input);
+        tok = _PyTokenizer_FromUTF8(str, exec_input, 0);
      } else {
-        tok = _PyTokenizer_FromString(str, exec_input);
+        tok = _PyTokenizer_FromString(str, exec_input, 0);
      }
      if (tok == NULL) {
          if (PyErr_Occurred()) {
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c

index a84c2492b6b17a085b4558afcdda54fcb9bc5fbb..59c817293fbfcd9c6bbd8aaee9d7972e024fc59e 100644 (file)
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -772,7 +772,8 @@ translate_into_utf8(const char* str, const char* enc) {
  
  
  static char *
-translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
+translate_newlines(const char *s, int exec_input, int preserve_crlf,
+                   struct tok_state *tok) {
      int skip_next_lf = 0;
      size_t needed_length = strlen(s) + 2, final_length;
      char *buf, *current;
@@ -792,7 +793,7 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
                      break;
              }
          }
-        if (c == '\r') {
+        if (!preserve_crlf && c == '\r') {
              skip_next_lf = 1;
              c = '\n';
          }
@@ -822,14 +823,14 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
     inside TOK.  */
  
  static char *
-decode_str(const char *input, int single, struct tok_state *tok)
+decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
  {
      PyObject* utf8 = NULL;
      char *str;
      const char *s;
      const char *newl[2] = {NULL, NULL};
      int lineno = 0;
-    tok->input = str = translate_newlines(input, single, tok);
+    tok->input = str = translate_newlines(input, single, preserve_crlf, tok);
      if (str == NULL)
          return NULL;
      tok->enc = NULL;
@@ -881,14 +882,14 @@ decode_str(const char *input, int single, struct tok_state *tok)
  /* Set up tokenizer for string */
  
  struct tok_state *
-_PyTokenizer_FromString(const char *str, int exec_input)
+_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
  {
      struct tok_state *tok = tok_new();
      char *decoded;
  
      if (tok == NULL)
          return NULL;
-    decoded = decode_str(str, exec_input, tok);
+    decoded = decode_str(str, exec_input, tok, preserve_crlf);
      if (decoded == NULL) {
          _PyTokenizer_Free(tok);
          return NULL;
@@ -902,13 +903,13 @@ _PyTokenizer_FromString(const char *str, int exec_input)
  /* Set up tokenizer for UTF-8 string */
  
  struct tok_state *
-_PyTokenizer_FromUTF8(const char *str, int exec_input)
+_PyTokenizer_FromUTF8(const char *str, int exec_input, int preserve_crlf)
  {
      struct tok_state *tok = tok_new();
      char *translated;
      if (tok == NULL)
          return NULL;
-    tok->input = translated = translate_newlines(str, exec_input, tok);
+    tok->input = translated = translate_newlines(str, exec_input, preserve_crlf, tok);
      if (translated == NULL) {
          _PyTokenizer_Free(tok);
          return NULL;
@@ -1050,7 +1051,7 @@ tok_underflow_interactive(struct tok_state *tok) {
      }
      char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
      if (newtok != NULL) {
-        char *translated = translate_newlines(newtok, 0, tok);
+        char *translated = translate_newlines(newtok, 0, 0, tok);
          PyMem_Free(newtok);
          if (translated == NULL) {
              return 0;
@@ -1594,6 +1595,9 @@ tok_decimal_tail(struct tok_state *tok)
  static inline int
  tok_continuation_line(struct tok_state *tok) {
      int c = tok_nextc(tok);
+    if (c == '\r') {
+        c = tok_nextc(tok);
+    }
      if (c != '\n') {
          tok->done = E_LINECONT;
          return -1;
@@ -1693,7 +1697,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
              }
          }
          tok_backup(tok, c);
-        if (c == '#' || c == '\n') {
+        if (c == '#' || c == '\n' || c == '\r') {
              /* Lines with only whitespace and/or comments
                 shouldn't affect the indentation and are
                 not passed to the parser as NEWLINE tokens,
@@ -1822,7 +1826,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
          const char *prefix, *type_start;
          int current_starting_col_offset;
  
-        while (c != EOF && c != '\n') {
+        while (c != EOF && c != '\n' && c != '\r') {
              c = tok_nextc(tok);
          }
  
@@ -2002,6 +2006,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
          return MAKE_TOKEN(NAME);
      }
  
+    if (c == '\r') {
+        c = tok_nextc(tok);
+    }
+
      /* Newline */
      if (c == '\n') {
          tok->atbol = 1;
@@ -2405,7 +2413,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
              else {
                  end_quote_size = 0;
                  if (c == '\\') {
-                    tok_nextc(tok);  /* skip escaped char */
+                    c = tok_nextc(tok);  /* skip escaped char */
+                    if (c == '\r') {
+                        c = tok_nextc(tok);
+                    }
                  }
              }
          }
@@ -2696,6 +2707,9 @@ f_string_middle:
              return MAKE_TOKEN(FSTRING_MIDDLE);
          } else if (c == '\\') {
              int peek = tok_nextc(tok);
+            if (peek == '\r') {
+                peek = tok_nextc(tok);
+            }
              // Special case when the backslash is right before a curly
              // brace. We have to restore and return the control back
              // to the loop for the next iteration.
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h

index 019f533ef2a2601671ecd96f82ef72110e9534fb..02749e355da81241ec152d89d9ec6a7ee9921ff3 100644 (file)
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -135,8 +135,8 @@ struct tok_state {
  #endif
  };
  
-extern struct tok_state *_PyTokenizer_FromString(const char *, int);
-extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int);
+extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
+extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
  extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
                                                const char *, const char *);
  extern void _PyTokenizer_Free(struct tok_state *);
diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c

index 01c2215366a73633a7aede0ee52a45c6127823f6..4eced66b6177085c946155ab07c42675903f0aee 100644 (file)
--- a/Python/Python-tokenize.c
+++ b/Python/Python-tokenize.c
@@ -55,7 +55,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
      if (filename == NULL) {
          return NULL;
      }
-    self->tok = _PyTokenizer_FromUTF8(source, 1);
+    self->tok = _PyTokenizer_FromUTF8(source, 1, 1);
      if (self->tok == NULL) {
          Py_DECREF(filename);
          return NULL;
@@ -240,7 +240,12 @@ tokenizeriter_next(tokenizeriterobject *it)
              type = NAME;
          }
          else if (type == NEWLINE) {
-            str = PyUnicode_FromString("\n");
+            Py_DECREF(str);
+            if (it->tok->start[0] == '\r') {
+                str = PyUnicode_FromString("\r\n");
+            } else {
+                str = PyUnicode_FromString("\n");
+            }
              end_col_offset++;
          }
      }
author	Marta Gómez Macías <mgmacias@google.com>
	Sun, 28 May 2023 14:15:53 +0000 (15:15 +0100)
committer	GitHub <noreply@github.com>
	Sun, 28 May 2023 14:15:53 +0000 (15:15 +0100)
Lib/test/test_tokenize.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Core and Builtins/2023-05-27-21-50-48.gh-issue-105017.4sDyDV.rst	[new file with mode: 0644]	patch \| blob
Parser/pegen.c		patch \| blob \| blame \| history
Parser/tokenizer.c		patch \| blob \| blame \| history
Parser/tokenizer.h		patch \| blob \| blame \| history
Python/Python-tokenize.c		patch \| blob \| blame \| history