gh-104169: Fix test_peg_generator after tokenizer refactoring (#110727)

author Lysandros Nikolaou <lisandrosnik@gmail.com>
Thu, 12 Oct 2023 07:34:35 +0000 (09:34 +0200)
committer GitHub <noreply@github.com>
Thu, 12 Oct 2023 07:34:35 +0000 (09:34 +0200)

diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py

index 106baf959a6898ac9a29806cbd18bf148cd67d18..05a89e7705e90f42d071b6ef2545ab446012c92b 100644 (file)
--- a/Lib/test/test_exceptions.py
+++ b/Lib/test/test_exceptions.py
@@ -253,7 +253,7 @@ class ExceptionTests(unittest.TestCase):
          check('try:\n  pass\nexcept*:\n  pass', 3, 8)
          check('try:\n  pass\nexcept*:\n  pass\nexcept* ValueError:\n  pass', 3, 8)
  
-        # Errors thrown by tokenizer.c
+        # Errors thrown by the tokenizer
          check('(0x+1)', 1, 3)
          check('x = 0xI', 1, 6)
          check('0010 + 2', 1, 1)
diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py

index 27871378f1c79e146133745374bc3a93a51b891a..61b00778f8361c68ca6d4a7cb0c4fed3b40c3a84 100644 (file)
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -255,7 +255,7 @@ class UTF8ValidatorTest(unittest.TestCase):
      def test_invalid_utf8(self):
          # This is a port of test_utf8_decode_invalid_sequences in
          # test_unicode.py to exercise the separate utf8 validator in
-        # Parser/tokenizer.c used when reading source files.
+        # Parser/tokenizer/helpers.c used when reading source files.
  
          # That file is written using low-level C file I/O, so the only way to
          # test it is to write actual files to disk.
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py

index 06517acb0b24393a9aa43a821416c8a1e625e066..41b9ebe3374d622436f47228d78c695d6eba7f0f 100644 (file)
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1435,7 +1435,7 @@ class TestDetectEncoding(TestCase):
          self.assertEqual(consumed_lines, expected)
  
      def test_latin1_normalization(self):
-        # See get_normal_name() in tokenizer.c.
+        # See get_normal_name() in Parser/tokenizer/helpers.c.
          encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                       "iso-8859-1-unix", "iso-latin-1-mac")
          for encoding in encodings:
@@ -1460,7 +1460,7 @@ class TestDetectEncoding(TestCase):
  
  
      def test_utf8_normalization(self):
-        # See get_normal_name() in tokenizer.c.
+        # See get_normal_name() in Parser/tokenizer/helpers.c.
          encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
          for encoding in encodings:
              for rep in ("-", "_"):
diff --git a/Lib/tokenize.py b/Lib/tokenize.py

index c21876fb403d8f1c2e93db0b371b31150d7ffb23..0ab1893d42f72f142d35bbcd7bee5f8eb366ab8f 100644 (file)
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -298,7 +298,7 @@ def untokenize(iterable):
  
  
  def _get_normal_name(orig_enc):
-    """Imitates get_normal_name in tokenizer.c."""
+    """Imitates get_normal_name in Parser/tokenizer/helpers.c."""
      # Only care about the first 12 characters.
      enc = orig_enc[:12].lower().replace("_", "-")
      if enc == "utf-8" or enc.startswith("utf-8-"):
diff --git a/Modules/config.c.in b/Modules/config.c.in

index 6081f95759538f8c52ac4b688ce72aa72bae32b8..53b4fb285498d0e39321cb070d99c7a787ab56ef 100644 (file)
--- a/Modules/config.c.in
+++ b/Modules/config.c.in
@@ -45,7 +45,7 @@ struct _inittab _PyImport_Inittab[] = {
      /* This lives in Python/Python-ast.c */
      {"_ast", PyInit__ast},
  
-    /* This lives in Python/Python-tokenizer.c */
+    /* This lives in Python/Python-tokenize.c */
      {"_tokenize", PyInit__tokenize},
  
      /* These entries are here for sys.builtin_module_names */
diff --git a/Parser/myreadline.c b/Parser/myreadline.c

index 719a178f244a2810d9abd47a41dbb843e8a3e17c..1825665354844b98791f354239559f28a815353b 100644 (file)
--- a/Parser/myreadline.c
+++ b/Parser/myreadline.c
@@ -1,5 +1,5 @@
  
-/* Readline interface for tokenizer.c and [raw_]input() in bltinmodule.c.
+/* Readline interface for the tokenizer and [raw_]input() in bltinmodule.c.
     By default, or when stdin is not a tty device, we have a super
     simple my_readline function using fgets.
     Optionally, we can use the GNU readline library.
@@ -364,7 +364,7 @@ PyOS_StdioReadline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt)
  char *(*PyOS_ReadlineFunctionPointer)(FILE *, FILE *, const char *) = NULL;
  
  
-/* Interface used by tokenizer.c and bltinmodule.c */
+/* Interface used by file_tokenizer.c and bltinmodule.c */
  
  char *
  PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt)
diff --git a/Parser/string_parser.c b/Parser/string_parser.c

index c5f421844e9c52981dd4f21828e89e24ba481944..f1e027765c86b9712ea387f324aa06a07321254f 100644 (file)
--- a/Parser/string_parser.c
+++ b/Parser/string_parser.c
@@ -14,8 +14,9 @@ static int
  warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
  {
      unsigned char c = *first_invalid_escape;
-    if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) {  // in this case the tokenizer has already emitted a warning,
-                                                                                            // see tokenizer.c:warn_invalid_escape_sequence
+    if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) {
+        // in this case the tokenizer has already emitted a warning,
+        // see Parser/tokenizer/helpers.c:warn_invalid_escape_sequence
          return 0;
      }
  
diff --git a/Python/traceback.c b/Python/traceback.c

index 5de1bff9943c6c642645ebc31042e822568ead88..f786144eda217c18f16f8db67622adb622cc62f1 100644 (file)
--- a/Python/traceback.c
+++ b/Python/traceback.c
@@ -32,7 +32,7 @@
  #define MAX_FRAME_DEPTH 100
  #define MAX_NTHREADS 100
  
-/* Function from Parser/tokenizer.c */
+/* Function from Parser/tokenizer/file_tokenizer.c */
  extern char* _PyTokenizer_FindEncodingFilename(int, PyObject *);
  
  /*[clinic input]
diff --git a/Tools/c-analyzer/TODO b/Tools/c-analyzer/TODO

index 27a535814ea52b5fe2a2753279d2150d9ba41b1b..3d599538510bd9edf2c45fdc7eec4d4a1f383bf3 100644 (file)
--- a/Tools/c-analyzer/TODO
+++ b/Tools/c-analyzer/TODO
@@ -428,8 +428,8 @@ Objects/typeobject.c:type_new():PyId___slots__                   _Py_IDENTIFIER(
  Objects/unicodeobject.c:unicodeiter_reduce():PyId_iter           _Py_IDENTIFIER(iter)
  Objects/weakrefobject.c:proxy_bytes():PyId___bytes__             _Py_IDENTIFIER(__bytes__)
  Objects/weakrefobject.c:weakref_repr():PyId___name__             _Py_IDENTIFIER(__name__)
-Parser/tokenizer.c:fp_setreadl():PyId_open                       _Py_IDENTIFIER(open)
-Parser/tokenizer.c:fp_setreadl():PyId_readline                   _Py_IDENTIFIER(readline)
+Parser/tokenizer/file_tokenizer.c:fp_setreadl():PyId_open        _Py_IDENTIFIER(open)
+Parser/tokenizer/file_tokenizer.c:fp_setreadl():PyId_readline    _Py_IDENTIFIER(readline)
  Python/Python-ast.c:ast_type_reduce():PyId___dict__              _Py_IDENTIFIER(__dict__)
  Python/Python-ast.c:make_type():PyId___module__                  _Py_IDENTIFIER(__module__)
  Python/_warnings.c:PyId_stderr                                   _Py_IDENTIFIER(stderr)
diff --git a/Tools/peg_generator/pegen/build.py b/Tools/peg_generator/pegen/build.py

index 6b04ae9ec7025c6fa87fc30d33918bf81f1a2222..30bfb31471c7b26ad5552bf6b13347efe78e1d85 100644 (file)
--- a/Tools/peg_generator/pegen/build.py
+++ b/Tools/peg_generator/pegen/build.py
@@ -123,7 +123,14 @@ def compile_c_extension(
      common_sources = [
          str(MOD_DIR.parent.parent.parent / "Python" / "Python-ast.c"),
          str(MOD_DIR.parent.parent.parent / "Python" / "asdl.c"),
-        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer.c"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "lexer" / "lexer.c"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "lexer" / "state.c"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "lexer" / "buffer.c"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "string_tokenizer.c"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "file_tokenizer.c"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "utf8_tokenizer.c"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "readline_tokenizer.c"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer" / "helpers.c"),
          str(MOD_DIR.parent.parent.parent / "Parser" / "pegen.c"),
          str(MOD_DIR.parent.parent.parent / "Parser" / "pegen_errors.c"),
          str(MOD_DIR.parent.parent.parent / "Parser" / "action_helpers.c"),
@@ -133,6 +140,8 @@ def compile_c_extension(
      include_dirs = [
          str(MOD_DIR.parent.parent.parent / "Include" / "internal"),
          str(MOD_DIR.parent.parent.parent / "Parser"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "lexer"),
+        str(MOD_DIR.parent.parent.parent / "Parser" / "tokenizer"),
      ]
      extension = Extension(
          extension_name,
author	Lysandros Nikolaou <lisandrosnik@gmail.com>
	Thu, 12 Oct 2023 07:34:35 +0000 (09:34 +0200)
committer	GitHub <noreply@github.com>
	Thu, 12 Oct 2023 07:34:35 +0000 (09:34 +0200)
Files changed in this commit:
Lib/test/test_exceptions.py
Lib/test/test_source_encoding.py
Lib/test/test_tokenize.py
Lib/tokenize.py
Modules/config.c.in
Parser/myreadline.c
Parser/string_parser.c
Python/traceback.c
Tools/c-analyzer/TODO
Tools/peg_generator/pegen/build.py