Extract: Merge in per-format keywords and auto_comments (#1243)

author Aarni Koskela <akx@iki.fi>

Sat, 10 Jan 2026 12:32:01 +0000 (14:32 +0200)

committer GitHub <noreply@github.com>

Sat, 10 Jan 2026 12:32:01 +0000 (14:32 +0200)
author Aarni Koskela <akx@iki.fi>
Sat, 10 Jan 2026 12:32:01 +0000 (14:32 +0200)
committer GitHub <noreply@github.com>
Sat, 10 Jan 2026 12:32:01 +0000 (14:32 +0200)
diff --git a/babel/messages/extract.py b/babel/messages/extract.py

index 3c196f0e03f5a66acbb7e6341c9a27a2054e0840..ad37085384fc2f015cac634cb761662bfeee00c5 100644 (file)
--- a/babel/messages/extract.py
+++ b/babel/messages/extract.py
@@ -316,13 +316,31 @@ def check_and_call_extract_file(
              if pathmatch(opattern, filename):
                  options = odict
                  break
+
+        # Merge keywords and comment_tags from per-format options if present.
+        file_keywords = keywords
+        file_comment_tags = comment_tags
+        if keywords_opt := options.get("keywords"):
+            if not isinstance(keywords_opt, dict):  # pragma: no cover
+                raise TypeError(
+                    f"The `keywords` option must be a dict of parsed keywords, not {keywords_opt!r}",
+                )
+            file_keywords = {**keywords, **keywords_opt}
+
+        if comments_opt := options.get("add_comments"):
+            if not isinstance(comments_opt, (list, tuple, set)):  # pragma: no cover
+                raise TypeError(
+                    f"The `add_comments` option must be a collection of comment tags, not {comments_opt!r}.",
+                )
+            file_comment_tags = tuple(set(comment_tags) | set(comments_opt))
+
          if callback:
              callback(filename, method, options)
          for message_tuple in extract_from_file(
              method,
              filepath,
-            keywords=keywords,
-            comment_tags=comment_tags,
+            keywords=file_keywords,
+            comment_tags=file_comment_tags,
              options=options,
              strip_comment_tags=strip_comment_tags,
          ):
diff --git a/babel/messages/frontend.py b/babel/messages/frontend.py

index 6ef62ec4aca6fb9b82f74fdfb69bbd3237c60c77..1e13b6cc6b24a28ed33f9b03118e910b890ac09b 100644 (file)
--- a/babel/messages/frontend.py
+++ b/babel/messages/frontend.py
@@ -23,7 +23,7 @@ import tempfile
  import warnings
  from configparser import RawConfigParser
  from io import StringIO
-from typing import BinaryIO, Iterable, Literal
+from typing import Any, BinaryIO, Iterable, Literal
  
  from babel import Locale, localedata
  from babel import __version__ as VERSION
@@ -584,7 +584,7 @@ class ExtractMessages(CommandMixin):
                      method_map, options_map = [], {}
                      for pattern, method, options in mapping:
                          method_map.append((pattern, method))
-                        options_map[pattern] = options or {}
+                        options_map[pattern] = _parse_string_options(options or {})
                  mappings.append((path, method_map, options_map))
  
          else:
@@ -1075,7 +1075,7 @@ def parse_mapping_cfg(fileobj, filename=None):
          else:
              method, pattern = (part.strip() for part in section.split(':', 1))
              method_map.append((pattern, method))
-            options_map[pattern] = dict(parser.items(section))
+            options_map[pattern] = _parse_string_options(dict(parser.items(section)))
  
      if extractors:
          for idx, (pattern, method) in enumerate(method_map):
@@ -1086,6 +1086,25 @@ def parse_mapping_cfg(fileobj, filename=None):
      return method_map, options_map
  
  
+def _parse_string_options(options: dict[str, str]) -> dict[str, Any]:
+    """
+    Parse string-formatted options from a mapping configuration.
+
+    The `keywords` and `add_comments` options are parsed into a canonical
+    internal format, so they can be merged with global keywords/comment tags
+    during extraction.
+    """
+    options: dict[str, Any] = options.copy()
+
+    if keywords_val := options.pop("keywords", None):
+        options['keywords'] = parse_keywords(listify_value(keywords_val))
+
+    if comments_val := options.pop("add_comments", None):
+        options['add_comments'] = listify_value(comments_val)
+
+    return options
+
+
  def _parse_config_object(config: dict, *, filename="(unknown)"):
      extractors = {}
      method_map = []
@@ -1140,6 +1159,26 @@ def _parse_config_object(config: dict, *, filename="(unknown)"):
          if not isinstance(pattern, list):
              pattern = [pattern]
  
+        if keywords_val := entry.pop("keywords", None):
+            if isinstance(keywords_val, str):
+                entry["keywords"] = parse_keywords(listify_value(keywords_val))
+            elif isinstance(keywords_val, list):
+                entry["keywords"] = parse_keywords(keywords_val)
+            else:
+                raise ConfigurationError(
+                    f"{filename}: mappings[{idx}]: 'keywords' must be a string or list, got {keywords_val!r}",
+                )
+
+        if comments_val := entry.pop("add_comments", None):
+            if isinstance(comments_val, str):
+                entry["add_comments"] = [comments_val]
+            elif isinstance(comments_val, list):
+                entry["add_comments"] = comments_val
+            else:
+                raise ConfigurationError(
+                    f"{filename}: mappings[{idx}]: 'add_comments' must be a string or list, got {comments_val!r}",
+                )
+
          for pat in pattern:
              if not isinstance(pat, str):
                  raise ConfigurationError(
diff --git a/docs/messages.rst b/docs/messages.rst

index 0f57eb1170fbacb02ea72cd4b91175d76002ffb6..c835d60a7dd1971e901dc8abe2dc144e2ff0a029 100644 (file)
--- a/docs/messages.rst
+++ b/docs/messages.rst
@@ -139,14 +139,6 @@ Genshi markup templates and text templates:
      [javascript: **.js]
      extract_messages = $._, jQuery._
  
-The configuration file syntax is based on the format commonly found in ``.INI``
-files on Windows systems, and as supported by the ``ConfigParser`` module in
-the Python standard library. Section names (the strings enclosed in square
-brackets) specify both the name of the extraction method, and the extended glob
-pattern to specify the files that this extraction method should be used for,
-separated by a colon. The options in the sections are passed to the extraction
-method. Which options are available is specific to the extraction method used.
-
  The extended glob patterns used in this configuration are similar to the glob
  patterns provided by most shells. A single asterisk (``*``) is a wildcard for
  any number of characters (except for the pathname component separator "/"),
@@ -155,9 +147,132 @@ two subsequent asterisk characters (``**``) can be used to make the wildcard
  match any directory level, so the pattern ``**.txt`` matches any file with the
  extension ``.txt`` in any directory.
  
+Babel supports two configuration file formats: INI and TOML.
+
+INI Configuration Format
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The INI configuration file syntax is based on the format commonly found in ``.INI``
+files on Windows systems, and as supported by the ``ConfigParser`` module in
+the Python standard library. Section names (the strings enclosed in square
+brackets) specify both the name of the extraction method, and the extended glob
+pattern to specify the files that this extraction method should be used for,
+separated by a colon. The options in the sections are passed to the extraction
+method. Which options are available is specific to the extraction method used.
+
  Lines that start with a ``#`` or ``;`` character are ignored and can be used
  for comments. Empty lines are ignored, too.
  
+TOML Configuration Format
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Babel also supports TOML format for configuration files, when the ``tomllib``
+module is available (Python 3.11+), or when the ``tomli`` package is installed
+(for Python versions prior to 3.11).
+
+TOML provides a more structured format and is particularly useful when combined
+with ``pyproject.toml``.
+
+The same configuration examples shown above can be written in TOML format:
+
+.. code-block:: toml
+
+    # Extraction from Python source files
+    [[mappings]]
+    method = "python"
+    pattern = "**.py"
+
+    # Extraction from Genshi HTML and text templates
+    [[mappings]]
+    method = "genshi"
+    pattern = "**/templates/**.html"
+    ignore_tags = "script,style"
+    include_attrs = "alt title summary"
+
+    [[mappings]]
+    method = "genshi"
+    pattern = "**/templates/**.txt"
+    template_class = "genshi.template:TextTemplate"
+    encoding = "ISO-8859-15"
+
+    # Extraction from JavaScript files
+    [[mappings]]
+    method = "javascript"
+    pattern = "**.js"
+    extract_messages = "$._, jQuery._"
+
+In TOML format, each ``[[mappings]]`` section defines a mapping. The ``method``
+and ``pattern`` fields are required. The ``pattern`` field can be a string or
+an array of strings to match multiple patterns with the same configuration.
+
+If you're using ``pyproject.toml``, nest the configuration under ``[tool.babel]``:
+
+.. code-block:: toml
+
+    [tool.babel]
+    [[tool.babel.mappings]]
+    method = "python"
+    pattern = "**.py"
+
+You can reference custom extractors in both formats. In TOML:
+
+.. code-block:: toml
+
+    [extractors]
+    custom = "mypackage.module:extract_custom"
+
+    [[mappings]]
+    method = "custom"
+    pattern = "**.ctm"
+    some_option = "foo"
+
+Common Options
+^^^^^^^^^^^^^^
+
+In addition to extractor-specific options, the following options can be specified
+in any mapping section and will be merged with global settings:
+
+``keywords``
+  A list of keywords (function names) to extract messages from.
+  This uses the same syntax as the ``--keyword`` command-line option.
+  Keywords specified here are added to (rather than replacing) the default keywords or
+  those specified via the command line.
+
+  In INI format, whitespace-separated: ``keywords = _ gettext ngettext:1,2 pgettext:1c,2``
+
+  In TOML format, use either a whitespace-separated string or an array:
+  ``keywords = "_ gettext ngettext:1,2"`` or
+  ``keywords = ["_", "gettext", "ngettext:1,2"]``
+
+``add_comments``
+  A list of comment tag prefixes to extract and include in the
+  output. This uses the same syntax as the ``--add-comments`` command-line option.
+  Comment tags specified here are added to those specified via the command line.
+
+  In INI format, whitespace-separated: ``add_comments = TRANSLATOR: NOTE:``
+
+  In TOML format, use either a string or an array:
+  ``add_comments = "TRANSLATOR NOTE:"`` (a string is treated as one single tag and is not split on whitespace!) or
+  ``add_comments = ["TRANSLATOR:", "NOTE:"]``
+
+**Example in INI format:**
+
+.. code-block:: ini
+
+    [python: **.py]
+    keywords = _ _l _n:1,2
+    add_comments = TRANSLATOR:
+
+**Example in TOML format:**
+
+.. code-block:: toml
+
+    [[mappings]]
+    method = "python"
+    pattern = "**.py"
+    keywords = ["_", "_l", "_n:1,2"]
+    add_comments = ["TRANSLATOR:"]
+
  .. note:: if you're performing message extraction using the command Babel
            provides for integration into ``setup.py`` scripts, you can also
            provide this configuration in a different way, namely as a keyword
diff --git a/tests/messages/data/mapping_with_keywords.cfg b/tests/messages/data/mapping_with_keywords.cfg

new file mode 100644 (file)

index 0000000..710e681
--- /dev/null
+++ b/tests/messages/data/mapping_with_keywords.cfg
@@ -0,0 +1,5 @@
+# Test mapping file with keywords option (issue #1224)
+
+[python: **.py]
+encoding = utf-8
+keywords = _ _l _n:1,2 _nl:1,2 _p:1c,2 _pl:1c,2 _np:1c,2,3 _npl:1c,2,3
diff --git a/tests/messages/data/mapping_with_keywords_and_comments.toml b/tests/messages/data/mapping_with_keywords_and_comments.toml

new file mode 100644 (file)

index 0000000..0a5135f
--- /dev/null
+++ b/tests/messages/data/mapping_with_keywords_and_comments.toml
@@ -0,0 +1,8 @@
+# Test mapping file with keywords and add_comments options (issue #1224)
+
+[[mappings]]
+method = "python"
+pattern = "**.py"
+encoding = "utf-8"
+keywords = ["_", "_l", "_n:1,2"]
+add_comments = ["SPECIAL:"]
diff --git a/tests/messages/data/project/issue_1224_test.py b/tests/messages/data/project/issue_1224_test.py

new file mode 100644 (file)

index 0000000..8e4f7a6
--- /dev/null
+++ b/tests/messages/data/project/issue_1224_test.py
@@ -0,0 +1,12 @@
+from myproject.i18n import lazy_gettext as _l, lazy_ngettext as _n
+
+
+class Choices:
+    # SPECIAL: This comment should be extracted
+    CHOICE_X = 1, _l("Choice X")
+    # SPECIAL: Another special comment
+    CHOICE_Y = 2, _l("Choice Y")
+    # No comment...
+    OPTION_C = 3, _l("Option C")
+    # Test for _n too! (but no comment... shush...)
+    OPTION_A = 4, (_n("Option A", "Options of the A kind", 1))
diff --git a/tests/messages/frontend/test_extract.py b/tests/messages/frontend/test_extract.py

index 1c4532f5fd5d6c69a5d0e6a1211a77beca8661ae..712200fbb8aa4cf21115b564bf273c052776f12f 100644 (file)
--- a/tests/messages/frontend/test_extract.py
+++ b/tests/messages/frontend/test_extract.py
@@ -281,3 +281,54 @@ msgstr[1] ""
  
  """
      assert expected_content == pot_file.read_text()
+
+
+def test_extraction_with_mapping_file_with_keywords(extract_cmd, pot_file):
+    """
+    Test that keywords specified in mapping config file are properly parsed,
+    and merged with default keywords.
+    """
+    extract_cmd.mapping_file = 'mapping_with_keywords.cfg'
+    extract_cmd.output_file = pot_file
+    extract_cmd.input_paths = 'project'
+
+    extract_cmd.finalize_options()
+    extract_cmd.run()
+
+    with pot_file.open() as f:
+        catalog = read_po(f)
+
+    for msgid in ('bar', 'Choice X', 'Choice Y', 'Option C', 'Option A'):
+        msg = catalog[msgid]
+        assert not msg.auto_comments  # This configuration didn't specify SPECIAL:...
+        assert msg.pluralizable == (msgid == 'Option A')
+
+
+def test_extraction_with_mapping_file_with_comments(extract_cmd, pot_file):
+    """
+    Test that add_comments specified in mapping config file are properly parsed.
+    Uses TOML format to test that code path.
+    """
+    extract_cmd.mapping_file = 'mapping_with_keywords_and_comments.toml'
+    extract_cmd.output_file = pot_file
+    extract_cmd.input_paths = 'project/issue_1224_test.py'
+
+    extract_cmd.finalize_options()
+    extract_cmd.run()
+
+    with pot_file.open() as f:
+        catalog = read_po(f)
+
+    # Check that messages were extracted and have the expected auto_comments
+    for msgid, expected_comment in [
+        ('Choice X', 'extracted'),
+        ('Choice Y', 'special'),
+        ('Option C', None),
+        ('Option A', None),
+    ]:
+        msg = catalog[msgid]
+        if expected_comment:
+            assert any('SPECIAL' in comment and expected_comment in comment for comment in msg.auto_comments)
+        else:
+            assert not msg.auto_comments
+        assert msg.pluralizable == (msgid == 'Option A')
diff --git a/tests/messages/test_toml_config.py b/tests/messages/test_toml_config.py

index 6a3c1570005a262ea26bc25bf574eacbb408f01f..1dd37a7ac7db266eb87336f34f97ab7a2cce73d3 100644 (file)
--- a/tests/messages/test_toml_config.py
+++ b/tests/messages/test_toml_config.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
  import pathlib
  from io import BytesIO
  
@@ -9,19 +11,64 @@ toml_test_cases_path = pathlib.Path(__file__).parent / "toml-test-cases"
  assert toml_test_cases_path.is_dir(), "toml-test-cases directory not found"
  
  
+def parse_toml(cfg: bytes | str):
+    if isinstance(cfg, str):
+        cfg = cfg.encode("utf-8")
+    return frontend._parse_mapping_toml(BytesIO(cfg))
+
+
  def test_toml_mapping_multiple_patterns():
      """
      Test that patterns may be specified as a list in TOML,
      and are expanded to multiple entries in the method map.
      """
-    method_map, options_map = frontend._parse_mapping_toml(BytesIO(b"""
+    method_map, options_map = parse_toml("""
  [[mappings]]
  method = "python"
  pattern = ["xyz/**.py", "foo/**.py"]
-"""))
-    assert len(method_map) == 2
-    assert method_map[0] == ('xyz/**.py', 'python')
-    assert method_map[1] == ('foo/**.py', 'python')
+""")
+    assert method_map == [
+        ('xyz/**.py', 'python'),
+        ('foo/**.py', 'python'),
+    ]
+
+
+@pytest.mark.parametrize(
+    ("keywords_val", "expected"),
+    [
+        pytest.param('"foo bar quz"', {'bar': None, 'foo': None, 'quz': None}, id='string'),
+        pytest.param('["foo", "bar", "quz"]', {'bar': None, 'foo': None, 'quz': None}, id='list'),
+        pytest.param('"foo:1,2 bar quz"', {'bar': None, 'foo': (1, 2), 'quz': None}, id='s-args'),
+        pytest.param('["bar", "foo:1,2", "quz"]', {'bar': None, 'foo': (1, 2), 'quz': None}, id='l-args'),
+        pytest.param('[]', None, id='empty'),
+    ],
+)
+def test_toml_mapping_keywords_parsing(keywords_val, expected):
+    method_map, options_map = parse_toml(f"""
+[[mappings]]
+method = "python"
+pattern = ["**.py"]
+keywords = {keywords_val}
+""")
+    assert options_map['**.py'].get('keywords') == expected
+
+
+@pytest.mark.parametrize(
+    ("add_comments_val", "expected"),
+    [
+        ('"SPECIAL SAUCE"', ['SPECIAL SAUCE']),  # TOML will allow this as a single string
+        ('["SPECIAL", "SAUCE"]', ['SPECIAL', 'SAUCE']),
+        ('[]', None),
+    ],
+)
+def test_toml_mapping_add_comments_parsing(add_comments_val, expected):
+    method_map, options_map = parse_toml(f"""
+[[mappings]]
+method = "python"
+pattern = ["**.py"]
+add_comments = {add_comments_val}
+""")
+    assert options_map['**.py'].get('add_comments') == expected
  
  
  @pytest.mark.parametrize("test_case", toml_test_cases_path.glob("bad.*.toml"), ids=lambda p: p.name)
author	Aarni Koskela <akx@iki.fi>
	Sat, 10 Jan 2026 12:32:01 +0000 (14:32 +0200)
committer	GitHub <noreply@github.com>
	Sat, 10 Jan 2026 12:32:01 +0000 (14:32 +0200)
babel/messages/extract.py		patch \| blob \| blame \| history
babel/messages/frontend.py		patch \| blob \| blame \| history
docs/messages.rst		patch \| blob \| blame \| history
tests/messages/data/mapping_with_keywords.cfg	[new file with mode: 0644]	patch \| blob
tests/messages/data/mapping_with_keywords_and_comments.toml	[new file with mode: 0644]	patch \| blob
tests/messages/data/project/issue_1224_test.py	[new file with mode: 0644]	patch \| blob
tests/messages/frontend/test_extract.py		patch \| blob \| blame \| history
tests/messages/test_toml_config.py		patch \| blob \| blame \| history