gh-130057: Pygettext: Support translator comments (GH-130061)

author Tomas R. <tomas.roun8@gmail.com>

Mon, 17 Feb 2025 10:41:28 +0000 (11:41 +0100)

committer GitHub <noreply@github.com>

Mon, 17 Feb 2025 10:41:28 +0000 (12:41 +0200)
author Tomas R. <tomas.roun8@gmail.com>
Mon, 17 Feb 2025 10:41:28 +0000 (11:41 +0100)
committer GitHub <noreply@github.com>
Mon, 17 Feb 2025 10:41:28 +0000 (12:41 +0200)
diff --git a/Lib/test/test_tools/i18n_data/comments.pot b/Lib/test/test_tools/i18n_data/comments.pot

new file mode 100644 (file)

index 0000000..a1df46d
--- /dev/null
+++ b/Lib/test/test_tools/i18n_data/comments.pot
@@ -0,0 +1,110 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR ORGANIZATION
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: 2000-01-01 00:00+0000\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: pygettext.py 1.5\n"
+
+
+#: comments.py:4
+msgid "foo"
+msgstr ""
+
+#. i18n: This is a translator comment
+#: comments.py:7
+msgid "bar"
+msgstr ""
+
+#. i18n: This is a translator comment
+#. i18n: This is another translator comment
+#: comments.py:11
+msgid "baz"
+msgstr ""
+
+#. i18n: This is a translator comment
+#. with multiple
+#. lines
+#: comments.py:16
+msgid "qux"
+msgstr ""
+
+#. i18n: This is a translator comment
+#: comments.py:21
+msgid "quux"
+msgstr ""
+
+#. i18n: This is a translator comment
+#. with multiple lines
+#. i18n: This is another translator comment
+#. with multiple lines
+#: comments.py:27
+msgid "corge"
+msgstr ""
+
+#: comments.py:31
+msgid "grault"
+msgstr ""
+
+#. i18n: This is another translator comment
+#: comments.py:36
+msgid "garply"
+msgstr ""
+
+#: comments.py:40
+msgid "george"
+msgstr ""
+
+#. i18n: This is another translator comment
+#: comments.py:45
+msgid "waldo"
+msgstr ""
+
+#. i18n: This is a translator comment
+#. i18n: This is also a translator comment
+#. i18n: This is another translator comment
+#: comments.py:50
+msgid "waldo2"
+msgstr ""
+
+#. i18n: This is a translator comment
+#. i18n: This is another translator comment
+#. i18n: This is yet another translator comment
+#. i18n: This is a translator comment
+#. with multiple lines
+#: comments.py:53 comments.py:56 comments.py:59 comments.py:63
+msgid "fred"
+msgstr ""
+
+#: comments.py:65
+msgid "plugh"
+msgstr ""
+
+#: comments.py:67
+msgid "foobar"
+msgstr ""
+
+#. i18n: This is a translator comment
+#: comments.py:71
+msgid "xyzzy"
+msgstr ""
+
+#: comments.py:72
+msgid "thud"
+msgstr ""
+
+#. i18n: This is a translator comment
+#. i18n: This is another translator comment
+#. i18n: This is yet another translator comment
+#: comments.py:78
+msgid "foos"
+msgstr ""
+
diff --git a/Lib/test/test_tools/i18n_data/comments.py b/Lib/test/test_tools/i18n_data/comments.py

new file mode 100644 (file)

index 0000000..dca4dfa
--- /dev/null
+++ b/Lib/test/test_tools/i18n_data/comments.py
@@ -0,0 +1,78 @@
+from gettext import gettext as _
+
+# Not a translator comment
+_('foo')
+
+# i18n: This is a translator comment
+_('bar')
+
+# i18n: This is a translator comment
+# i18n: This is another translator comment
+_('baz')
+
+# i18n: This is a translator comment
+# with multiple
+# lines
+_('qux')
+
+# This comment should not be included because
+# it does not start with the prefix
+# i18n: This is a translator comment
+_('quux')
+
+# i18n: This is a translator comment
+# with multiple lines
+# i18n: This is another translator comment
+# with multiple lines
+_('corge')
+
+# i18n: This comment should be ignored
+
+_('grault')
+
+# i18n: This comment should be ignored
+
+# i18n: This is another translator comment
+_('garply')
+
+# i18n: This comment should be ignored
+x = 1
+_('george')
+
+# i18n: This comment should be ignored
+x = 1
+# i18n: This is another translator comment
+_('waldo')
+
+# i18n: This is a translator comment
+x = 1  # i18n: This is also a translator comment
+# i18n: This is another translator comment
+_('waldo2')
+
+# i18n: This is a translator comment
+_('fred')
+
+# i18n: This is another translator comment
+_('fred')
+
+# i18n: This is yet another translator comment
+_('fred')
+
+# i18n: This is a translator comment
+# with multiple lines
+_('fred')
+
+_('plugh')  # i18n: This comment should be ignored
+
+_('foo'  # i18n: This comment should be ignored
+  'bar')  # i18n: This comment should be ignored
+
+# i18n: This is a translator comment
+_('xyzzy')
+_('thud')
+
+
+## i18n: This is a translator comment
+# # i18n: This is another translator comment
+### ###    i18n: This is yet another translator comment
+_('foos')
diff --git a/Lib/test/test_tools/test_i18n.py b/Lib/test/test_tools/test_i18n.py

index f5aba31ed42c10256f63a8c063664ee2ab69957e..d97fdb116fcd1919aa7f3607a9ddbd7b91773d85 100644 (file)
--- a/Lib/test/test_tools/test_i18n.py
+++ b/Lib/test/test_tools/test_i18n.py
@@ -87,7 +87,8 @@ class Test_pygettext(unittest.TestCase):
          self.maxDiff = None
          self.assertEqual(normalize_POT_file(expected), normalize_POT_file(actual))
  
-    def extract_from_str(self, module_content, *, args=(), strict=True, with_stderr=False):
+    def extract_from_str(self, module_content, *, args=(), strict=True,
+                         with_stderr=False, raw=False):
          """Return all msgids extracted from module_content."""
          filename = 'test.py'
          with temp_cwd(None):
@@ -98,10 +99,11 @@ class Test_pygettext(unittest.TestCase):
                  self.assertEqual(res.err, b'')
              with open('messages.pot', encoding='utf-8') as fp:
                  data = fp.read()
-        msgids = self.get_msgids(data)
+        if not raw:
+            data = self.get_msgids(data)
          if not with_stderr:
-            return msgids
-        return msgids, res.err
+            return data
+        return data, res.err
  
      def extract_docstrings_from_str(self, module_content):
          """Return all docstrings extracted from module_content."""
@@ -381,7 +383,8 @@ class Test_pygettext(unittest.TestCase):
                  contents = input_file.read_text(encoding='utf-8')
                  with temp_cwd(None):
                      Path(input_file.name).write_text(contents)
-                    assert_python_ok('-Xutf8', self.script, '--docstrings', input_file.name)
+                    assert_python_ok('-Xutf8', self.script, '--docstrings',
+                                     '--add-comments=i18n:', input_file.name)
                      output = Path('messages.pot').read_text(encoding='utf-8')
  
                  expected = output_file.read_text(encoding='utf-8')
@@ -437,6 +440,51 @@ class Test_pygettext(unittest.TestCase):
              "*** test.py:3: Variable positional arguments are not allowed in gettext calls\n"
          )
  
+    def test_extract_all_comments(self):
+        """
+        Test that the --add-comments option without an
+        explicit tag extracts all translator comments.
+        """
+        for arg in ('--add-comments', '-c'):
+            with self.subTest(arg=arg):
+                data = self.extract_from_str(dedent('''\
+                # Translator comment
+                _("foo")
+                '''), args=(arg,), raw=True)
+                self.assertIn('#. Translator comment', data)
+
+    def test_comments_with_multiple_tags(self):
+        """
+        Test that multiple --add-comments tags can be specified.
+        """
+        for arg in ('--add-comments={}', '-c{}'):
+            with self.subTest(arg=arg):
+                args = (arg.format('foo:'), arg.format('bar:'))
+                data = self.extract_from_str(dedent('''\
+                # foo: comment
+                _("foo")
+
+                # bar: comment
+                _("bar")
+
+                # baz: comment
+                _("baz")
+                '''), args=args, raw=True)
+                self.assertIn('#. foo: comment', data)
+                self.assertIn('#. bar: comment', data)
+                self.assertNotIn('#. baz: comment', data)
+
+    def test_comments_not_extracted_without_tags(self):
+        """
+        Test that translator comments are not extracted without
+        specifying --add-comments.
+        """
+        data = self.extract_from_str(dedent('''\
+        # Translator comment
+        _("foo")
+        '''), raw=True)
+        self.assertNotIn('#.', data)
+
  
  def update_POT_snapshots():
      for input_file in DATA_DIR.glob('*.py'):
@@ -444,7 +492,8 @@ def update_POT_snapshots():
          contents = input_file.read_bytes()
          with temp_cwd(None):
              Path(input_file.name).write_bytes(contents)
-            assert_python_ok('-Xutf8', Test_pygettext.script, '--docstrings', input_file.name)
+            assert_python_ok('-Xutf8', Test_pygettext.script, '--docstrings',
+                             '--add-comments=i18n:', input_file.name)
              output = Path('messages.pot').read_text(encoding='utf-8')
  
          output = normalize_POT_file(output)
diff --git a/Misc/NEWS.d/next/Tools-Demos/2025-02-12-23-24-37.gh-issue-130057.TKUKI6.rst b/Misc/NEWS.d/next/Tools-Demos/2025-02-12-23-24-37.gh-issue-130057.TKUKI6.rst

new file mode 100644 (file)

index 0000000..0e89fa6
--- /dev/null
+++ b/Misc/NEWS.d/next/Tools-Demos/2025-02-12-23-24-37.gh-issue-130057.TKUKI6.rst
@@ -0,0 +1 @@
+Add support for extracting translator comments in :program:`pygettext.py` via the new ``--add-comments`` (``-c``) option.
diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py

index 4177d46048f9b9ea2a631c4f40084d362342d4e4..9654dd45067ff9e6c62739146e9e170146a6551f 100755 (executable)
--- a/Tools/i18n/pygettext.py
+++ b/Tools/i18n/pygettext.py
@@ -46,6 +46,12 @@ Options:
      --extract-all
          Extract all strings.
  
+    -cTAG
+    --add-comments=TAG
+        Extract translator comments.  Comments must start with TAG and
+        must precede the gettext call.  Multiple -cTAG options are allowed.
+        In that case, any comment matching any of the TAGs will be extracted.
+
      -d name
      --default-domain=name
          Rename the default output file from messages.pot to name.pot.
@@ -141,7 +147,9 @@ import importlib.util
  import os
  import sys
  import time
+import tokenize
  from dataclasses import dataclass, field
+from io import BytesIO
  from operator import itemgetter
  
  __version__ = '1.5'
@@ -302,12 +310,30 @@ class Message:
      msgctxt: str | None
      locations: set[Location] = field(default_factory=set)
      is_docstring: bool = False
+    comments: list[str] = field(default_factory=list)
  
-    def add_location(self, filename, lineno, msgid_plural=None, *, is_docstring=False):
+    def add_location(self, filename, lineno, msgid_plural=None, *,
+                     is_docstring=False, comments=None):
          if self.msgid_plural is None:
              self.msgid_plural = msgid_plural
          self.locations.add(Location(filename, lineno))
          self.is_docstring |= is_docstring
+        if comments:
+            self.comments.extend(comments)
+
+
+def get_source_comments(source):
+    """
+    Return a dictionary mapping line numbers to
+    comments in the source code.
+    """
+    comments = {}
+    for token in tokenize.tokenize(BytesIO(source).readline):
+        if token.type == tokenize.COMMENT:
+            # Remove any leading combination of '#' and whitespace
+            comment = token.string.lstrip('# \t')
+            comments[token.start[0]] = comment
+    return comments
  
  
  class GettextVisitor(ast.NodeVisitor):
@@ -316,10 +342,18 @@ class GettextVisitor(ast.NodeVisitor):
          self.options = options
          self.filename = None
          self.messages = {}
+        self.comments = {}
+
+    def visit_file(self, source, filename):
+        try:
+            module_tree = ast.parse(source)
+        except SyntaxError:
+            return
  
-    def visit_file(self, node, filename):
          self.filename = filename
-        self.visit(node)
+        if self.options.comment_tags:
+            self.comments = get_source_comments(source)
+        self.visit(module_tree)
  
      def visit_Module(self, node):
          self._extract_docstring(node)
@@ -372,14 +406,51 @@ class GettextVisitor(ast.NodeVisitor):
              msg_data[arg_type] = arg.value
  
          lineno = node.lineno
-        self._add_message(lineno, **msg_data)
+        comments = self._extract_comments(node)
+        self._add_message(lineno, **msg_data, comments=comments)
+
+    def _extract_comments(self, node):
+        """Extract translator comments.
+
+        Translator comments must precede the gettext call and
+        start with one of the comment prefixes defined by
+        --add-comments=TAG. See the tests for examples.
+        """
+        if not self.options.comment_tags:
+            return []
+
+        comments = []
+        lineno = node.lineno - 1
+        # Collect an unbroken sequence of comments starting from
+        # the line above the gettext call.
+        while lineno >= 1:
+            comment = self.comments.get(lineno)
+            if comment is None:
+                break
+            comments.append(comment)
+            lineno -= 1
+
+        # Find the first translator comment in the sequence and
+        # return all comments starting from that comment.
+        comments = comments[::-1]
+        first_index = next((i for i, comment in enumerate(comments)
+                            if self._is_translator_comment(comment)), None)
+        if first_index is None:
+            return []
+        return comments[first_index:]
+
+    def _is_translator_comment(self, comment):
+        return comment.startswith(self.options.comment_tags)
  
      def _add_message(
              self, lineno, msgid, msgid_plural=None, msgctxt=None, *,
-            is_docstring=False):
+            is_docstring=False, comments=None):
          if msgid in self.options.toexclude:
              return
  
+        if not comments:
+            comments = []
+
          key = self._key_for(msgid, msgctxt)
          message = self.messages.get(key)
          if message:
@@ -388,6 +459,7 @@ class GettextVisitor(ast.NodeVisitor):
                  lineno,
                  msgid_plural,
                  is_docstring=is_docstring,
+                comments=comments,
              )
          else:
              self.messages[key] = Message(
@@ -396,6 +468,7 @@ class GettextVisitor(ast.NodeVisitor):
                  msgctxt=msgctxt,
                  locations={Location(self.filename, lineno)},
                  is_docstring=is_docstring,
+                comments=comments,
              )
  
      @staticmethod
@@ -435,6 +508,10 @@ def write_pot_file(messages, options, fp):
  
      for key, locations in sorted_keys:
          msg = messages[key]
+
+        for comment in msg.comments:
+            print(f'#. {comment}', file=fp)
+
          if options.writelocations:
              # location comments are different b/w Solaris and GNU:
              if options.locationstyle == options.SOLARIS:
@@ -473,9 +550,9 @@ def main():
      try:
          opts, args = getopt.getopt(
              sys.argv[1:],
-            'ad:DEhk:Kno:p:S:Vvw:x:X:',
-            ['extract-all', 'default-domain=', 'escape', 'help',
-             'keyword=', 'no-default-keywords',
+            'ac::d:DEhk:Kno:p:S:Vvw:x:X:',
+            ['extract-all', 'add-comments=?', 'default-domain=', 'escape',
+             'help', 'keyword=', 'no-default-keywords',
               'add-location', 'no-location', 'output=', 'output-dir=',
               'style=', 'verbose', 'version', 'width=', 'exclude-file=',
               'docstrings', 'no-docstrings',
@@ -501,6 +578,7 @@ def main():
          excludefilename = ''
          docstrings = 0
          nodocstrings = {}
+        comment_tags = set()
  
      options = Options()
      locations = {'gnu' : options.GNU,
@@ -513,6 +591,8 @@ def main():
              usage(0)
          elif opt in ('-a', '--extract-all'):
              options.extractall = 1
+        elif opt in ('-c', '--add-comments'):
+            options.comment_tags.add(arg)
          elif opt in ('-d', '--default-domain'):
              options.outfile = arg + '.pot'
          elif opt in ('-E', '--escape'):
@@ -558,6 +638,8 @@ def main():
              finally:
                  fp.close()
  
+    options.comment_tags = tuple(options.comment_tags)
+
      # calculate escapes
      make_escapes(not options.escape)
  
@@ -600,12 +682,7 @@ def main():
              with open(filename, 'rb') as fp:
                  source = fp.read()
  
-        try:
-            module_tree = ast.parse(source)
-        except SyntaxError:
-            continue
-
-        visitor.visit_file(module_tree, filename)
+        visitor.visit_file(source, filename)
  
      # write the output
      if options.outfile == '-':
author	Tomas R. <tomas.roun8@gmail.com>
	Mon, 17 Feb 2025 10:41:28 +0000 (11:41 +0100)
committer	GitHub <noreply@github.com>
	Mon, 17 Feb 2025 10:41:28 +0000 (12:41 +0200)
Lib/test/test_tools/i18n_data/comments.pot	[new file with mode: 0644]	patch \| blob
Lib/test/test_tools/i18n_data/comments.py	[new file with mode: 0644]	patch \| blob
Lib/test/test_tools/test_i18n.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Tools-Demos/2025-02-12-23-24-37.gh-issue-130057.TKUKI6.rst	[new file with mode: 0644]	patch \| blob
Tools/i18n/pygettext.py		patch \| blob \| blame \| history