git.ipfire.org Git - thirdparty/babel.git/commitdiff
Use `ast` instead of `eval` for string extraction
author: Aarni Koskela <akx@iki.fi>
Mon, 31 Oct 2022 11:05:00 +0000 (13:05 +0200)
committer: Aarni Koskela <akx@iki.fi>
Tue, 1 Nov 2022 08:48:43 +0000 (10:48 +0200)
This is safer (as we don't actually execute anything),
and allows us to parse f-strings too.

Closes #769 (supersedes it)
Refs #715 (doesn't add an error yet, but doesn't crash on f-strings)

babel/messages/extract.py
tests/messages/test_extract.py

index c95f1cbc9aec256adabd99c89c5f239efd8a50e8..74e57a1817bcac6e313e87e180768746fb1b4cd2 100644 (file)
@@ -15,7 +15,7 @@
     :copyright: (c) 2013-2022 by the Babel Team.
     :license: BSD, see LICENSE for more details.
 """
-
+import ast
 import os
 from os.path import relpath
 import sys
@@ -487,14 +487,9 @@ def extract_python(fileobj, keywords, comment_tags, options):
                 if nested:
                     funcname = value
             elif tok == STRING:
-                # Unwrap quotes in a safe manner, maintaining the string's
-                # encoding
-                # https://sourceforge.net/tracker/?func=detail&atid=355470&
-                # aid=617979&group_id=5470
-                code = compile('# coding=%s\n%s' % (str(encoding), value),
-                               '<string>', 'eval', future_flags)
-                value = eval(code, {'__builtins__': {}}, {})
-                buf.append(value)
+                val = _parse_python_string(value, encoding, future_flags)
+                if val is not None:
+                    buf.append(val)
             elif tok == OP and value == ',':
                 if buf:
                     messages.append(''.join(buf))
@@ -516,6 +511,28 @@ def extract_python(fileobj, keywords, comment_tags, options):
             funcname = value
 
 
+def _parse_python_string(value, encoding, future_flags):
+    """Return the text of the string literal ``value`` without evaluating it.
+
+    Returns ``None`` for anything else, e.g. bytes or interpolating f-strings.
+    """
+    # Unwrap quotes in a safe manner, maintaining the string's encoding
+    # https://sourceforge.net/tracker/?func=detail&atid=355470&aid=617979&group_id=5470
+    source = f'# coding={str(encoding)}\n{value}'
+    tree = compile(source, '<string>', 'eval', ast.PyCF_ONLY_AST | future_flags)
+    if not isinstance(tree, ast.Expression):
+        return None
+    body = tree.body
+    parts = body.values if isinstance(body, ast.JoinedStr) else [body]  # f-string or plain
+    # ast.Str is deprecated (removed in Python 3.14), so test ast.Constant; the
+    # ``.s`` fallback covers the ast.Str nodes emitted before Python 3.8.
+    texts = [p.value if isinstance(p, ast.Constant) else getattr(p, 's', None) for p in parts]
+    # TODO: we could raise an error or warning when not all nodes are constants
+    if all(isinstance(text, str) for text in texts):
+        return ''.join(texts)
+    return None
+
+
 def extract_javascript(fileobj, keywords, comment_tags, options):
     """Extract messages from JavaScript source code.
 
index 47fe3060359ae22494b2732ebbeade4d0eb70c6d..3873191138b4d1e33758ca63f7625fad8b131e45 100644 (file)
@@ -528,3 +528,30 @@ nbsp = _('\xa0')
         messages = list(extract.extract('python', buf,
                                         extract.DEFAULT_KEYWORDS, [], {}))
         assert messages[0][1] == u'\xa0'
+
+    def test_f_strings(self):
+        """f-strings are extracted only when every part is a constant."""
+        source = BytesIO(br"""
+t1 = _('foobar')
+t2 = _(f'spameggs' f'feast')  # should be extracted; constant parts only
+t2 = _(f'spameggs' 'kerroshampurilainen')  # should be extracted (mixing f with no f)
+t3 = _(f'''whoa! a '''  # should be extracted (continues on following lines)
+f'flying shark'
+    '... hello'
+)
+t4 = _(f'spameggs {t1}')  # should not be extracted
+""")
+        found = list(extract.extract('python', source, extract.DEFAULT_KEYWORDS, [], {}))
+        assert [message[1] for message in found] == [
+            'foobar', 'spameggsfeast',
+            'spameggskerroshampurilainen', 'whoa! a flying shark... hello',
+        ]
+
+    def test_f_strings_non_utf8(self):
+        """A latin-1 coding cookie is honored when decoding f-string messages."""
+        source = BytesIO(b"""
+# -- coding: latin-1 --
+t2 = _(f'\xe5\xe4\xf6' f'\xc5\xc4\xd6')
+""")
+        found = list(extract.extract('python', source, extract.DEFAULT_KEYWORDS, [], {}))
+        assert [message[1] for message in found] == ['åäöÅÄÖ']