import os
import sys
import time
-import tokenize
-from collections import defaultdict
from dataclasses import dataclass, field
from operator import itemgetter
return ''.join(escapes[b] for b in s.encode(encoding))
-def is_literal_string(s):
- return s[0] in '\'"' or (s[0] in 'rRuU' and s[1] in '\'"')
-
-
-def safe_eval(s):
- # unwrap quotes, safely
- return eval(s, {'__builtins__':{}}, {})
-
-
def normalize(s, encoding):
# This converts the various Python string types into a format that is
# appropriate for .po files, namely much closer to C style.
}
-def matches_spec(message, spec):
- """Check if a message has all the keys defined by the keyword spec."""
- return all(key in message for key in spec.values())
-
-
@dataclass(frozen=True)
class Location:
filename: str
self.is_docstring |= is_docstring
-class TokenEater:
+class GettextVisitor(ast.NodeVisitor):
def __init__(self, options):
- self.__options = options
- self.__messages = {}
- self.__state = self.__waiting
- self.__data = defaultdict(str)
- self.__curr_arg = 0
- self.__curr_keyword = None
- self.__lineno = -1
- self.__freshmodule = 1
- self.__curfile = None
- self.__enclosurecount = 0
-
- def __call__(self, ttype, tstring, stup, etup, line):
- # dispatch
-## import token
-## print('ttype:', token.tok_name[ttype], 'tstring:', tstring,
-## file=sys.stderr)
- self.__state(ttype, tstring, stup[0])
-
- @property
- def messages(self):
- return self.__messages
-
- def __waiting(self, ttype, tstring, lineno):
- opts = self.__options
- # Do docstring extractions, if enabled
- if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
- # module docstring?
- if self.__freshmodule:
- if ttype == tokenize.STRING and is_literal_string(tstring):
- self.__addentry({'msgid': safe_eval(tstring)}, lineno, is_docstring=True)
- self.__freshmodule = 0
- return
- if ttype in (tokenize.COMMENT, tokenize.NL, tokenize.ENCODING):
- return
- self.__freshmodule = 0
- # class or func/method docstring?
- if ttype == tokenize.NAME and tstring in ('class', 'def'):
- self.__state = self.__suiteseen
- return
- if ttype == tokenize.NAME and tstring in ('class', 'def'):
- self.__state = self.__ignorenext
+ super().__init__()
+ self.options = options
+ self.filename = None
+ self.messages = {}
+
+ def visit_file(self, node, filename):
+ self.filename = filename
+ self.visit(node)
+
+ def visit_Module(self, node):
+ self._extract_docstring(node)
+ self.generic_visit(node)
+
+ visit_ClassDef = visit_FunctionDef = visit_AsyncFunctionDef = visit_Module
+
+ def visit_Call(self, node):
+ self._extract_message(node)
+ self.generic_visit(node)
+
+ def _extract_docstring(self, node):
+ if (not self.options.docstrings or
+ self.options.nodocstrings.get(self.filename)):
return
- if ttype == tokenize.NAME and tstring in opts.keywords:
- self.__state = self.__keywordseen
- self.__curr_keyword = tstring
+
+ docstring = ast.get_docstring(node)
+ if docstring is not None:
+ lineno = node.body[0].lineno # The first statement is the docstring
+ self._add_message(lineno, docstring, is_docstring=True)
+
+ def _extract_message(self, node):
+ func_name = self._get_func_name(node)
+ spec = self.options.keywords.get(func_name)
+ if spec is None:
+ return
+
+ max_index = max(spec)
+ has_var_positional = any(isinstance(arg, ast.Starred) for
+ arg in node.args[:max_index+1])
+ if has_var_positional:
+ print(f'*** {self.filename}:{node.lineno}: Variable positional '
+ f'arguments are not allowed in gettext calls', file=sys.stderr)
return
- if ttype == tokenize.STRING:
- maybe_fstring = ast.parse(tstring, mode='eval').body
- if not isinstance(maybe_fstring, ast.JoinedStr):
- return
- for value in filter(lambda node: isinstance(node, ast.FormattedValue),
- maybe_fstring.values):
- for call in filter(lambda node: isinstance(node, ast.Call),
- ast.walk(value)):
- func = call.func
- if isinstance(func, ast.Name):
- func_name = func.id
- elif isinstance(func, ast.Attribute):
- func_name = func.attr
- else:
- continue
-
- if func_name not in opts.keywords:
- continue
- if len(call.args) != 1:
- print((
- '*** %(file)s:%(lineno)s: Seen unexpected amount of'
- ' positional arguments in gettext call: %(source_segment)s'
- ) % {
- 'source_segment': ast.get_source_segment(tstring, call) or tstring,
- 'file': self.__curfile,
- 'lineno': lineno
- }, file=sys.stderr)
- continue
- if call.keywords:
- print((
- '*** %(file)s:%(lineno)s: Seen unexpected keyword arguments'
- ' in gettext call: %(source_segment)s'
- ) % {
- 'source_segment': ast.get_source_segment(tstring, call) or tstring,
- 'file': self.__curfile,
- 'lineno': lineno
- }, file=sys.stderr)
- continue
- arg = call.args[0]
- if not isinstance(arg, ast.Constant):
- print((
- '*** %(file)s:%(lineno)s: Seen unexpected argument type'
- ' in gettext call: %(source_segment)s'
- ) % {
- 'source_segment': ast.get_source_segment(tstring, call) or tstring,
- 'file': self.__curfile,
- 'lineno': lineno
- }, file=sys.stderr)
- continue
- if isinstance(arg.value, str):
- self.__curr_keyword = func_name
- self.__addentry({'msgid': arg.value}, lineno)
-
- def __suiteseen(self, ttype, tstring, lineno):
- # skip over any enclosure pairs until we see the colon
- if ttype == tokenize.OP:
- if tstring == ':' and self.__enclosurecount == 0:
- # we see a colon and we're not in an enclosure: end of def
- self.__state = self.__suitedocstring
- elif tstring in '([{':
- self.__enclosurecount += 1
- elif tstring in ')]}':
- self.__enclosurecount -= 1
-
- def __suitedocstring(self, ttype, tstring, lineno):
- # ignore any intervening noise
- if ttype == tokenize.STRING and is_literal_string(tstring):
- self.__addentry({'msgid': safe_eval(tstring)}, lineno, is_docstring=True)
- self.__state = self.__waiting
- elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
- tokenize.COMMENT):
- # there was no class docstring
- self.__state = self.__waiting
-
- def __keywordseen(self, ttype, tstring, lineno):
- if ttype == tokenize.OP and tstring == '(':
- self.__data.clear()
- self.__curr_arg = 0
- self.__enclosurecount = 0
- self.__lineno = lineno
- self.__state = self.__openseen
- else:
- self.__state = self.__waiting
-
- def __openseen(self, ttype, tstring, lineno):
- spec = self.__options.keywords[self.__curr_keyword]
- arg_type = spec.get(self.__curr_arg)
- expect_string_literal = arg_type is not None
-
- if ttype == tokenize.OP and self.__enclosurecount == 0:
- if tstring == ')':
- # We've seen the last of the translatable strings. Record the
- # line number of the first line of the strings and update the list
- # of messages seen. Reset state for the next batch. If there
- # were no strings inside _(), then just ignore this entry.
- if self.__data:
- self.__addentry(self.__data)
- self.__state = self.__waiting
- return
- elif tstring == ',':
- # Advance to the next argument
- self.__curr_arg += 1
- return
- if expect_string_literal:
- if ttype == tokenize.STRING and is_literal_string(tstring):
- self.__data[arg_type] += safe_eval(tstring)
- elif ttype not in (tokenize.COMMENT, tokenize.INDENT, tokenize.DEDENT,
- tokenize.NEWLINE, tokenize.NL):
- # We are inside an argument which is a translatable string and
- # we encountered a token that is not a string. This is an error.
- self.warn_unexpected_token(tstring)
- self.__enclosurecount = 0
- self.__state = self.__waiting
- elif ttype == tokenize.OP:
- if tstring in '([{':
- self.__enclosurecount += 1
- elif tstring in ')]}':
- self.__enclosurecount -= 1
-
- def __ignorenext(self, ttype, tstring, lineno):
- self.__state = self.__waiting
-
- def __addentry(self, msg, lineno=None, *, is_docstring=False):
- msgid = msg.get('msgid')
- if msgid in self.__options.toexclude:
+ if max_index >= len(node.args):
+ print(f'*** {self.filename}:{node.lineno}: Expected at least '
+ f'{max(spec) + 1} positional argument(s) in gettext call, '
+ f'got {len(node.args)}', file=sys.stderr)
return
- if not is_docstring:
- spec = self.__options.keywords[self.__curr_keyword]
- if not matches_spec(msg, spec):
+
+ msg_data = {}
+ for position, arg_type in spec.items():
+ arg = node.args[position]
+ if not self._is_string_const(arg):
+ print(f'*** {self.filename}:{arg.lineno}: Expected a string '
+ f'constant for argument {position + 1}, '
+ f'got {ast.unparse(arg)}', file=sys.stderr)
return
- if lineno is None:
- lineno = self.__lineno
- msgctxt = msg.get('msgctxt')
- msgid_plural = msg.get('msgid_plural')
+ msg_data[arg_type] = arg.value
+
+ lineno = node.lineno
+ self._add_message(lineno, **msg_data)
+
+ def _add_message(
+ self, lineno, msgid, msgid_plural=None, msgctxt=None, *,
+ is_docstring=False):
+ if msgid in self.options.toexclude:
+ return
+
key = self._key_for(msgid, msgctxt)
- if key in self.__messages:
- self.__messages[key].add_location(
- self.__curfile,
+ message = self.messages.get(key)
+ if message:
+ message.add_location(
+ self.filename,
lineno,
msgid_plural,
is_docstring=is_docstring,
)
else:
- self.__messages[key] = Message(
+ self.messages[key] = Message(
msgid=msgid,
msgid_plural=msgid_plural,
msgctxt=msgctxt,
- locations={Location(self.__curfile, lineno)},
+ locations={Location(self.filename, lineno)},
is_docstring=is_docstring,
)
return (msgctxt, msgid)
return msgid
- def warn_unexpected_token(self, token):
- print((
- '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
- ) % {
- 'token': token,
- 'file': self.__curfile,
- 'lineno': self.__lineno
- }, file=sys.stderr)
-
- def set_filename(self, filename):
- self.__curfile = filename
- self.__freshmodule = 1
+ def _get_func_name(self, node):
+ match node.func:
+ case ast.Name(id=id):
+ return id
+ case ast.Attribute(attr=attr):
+ return attr
+ case _:
+ return None
+ def _is_string_const(self, node):
+ return isinstance(node, ast.Constant) and isinstance(node.value, str)
def write_pot_file(messages, options, fp):
timestamp = time.strftime('%Y-%m-%d %H:%M%z')
args = expanded
# slurp through all the files
- eater = TokenEater(options)
+ visitor = GettextVisitor(options)
for filename in args:
if filename == '-':
if options.verbose:
print('Reading standard input')
- fp = sys.stdin.buffer
- closep = 0
+ source = sys.stdin.buffer.read()
else:
if options.verbose:
print(f'Working on {filename}')
- fp = open(filename, 'rb')
- closep = 1
+ with open(filename, 'rb') as fp:
+ source = fp.read()
+
try:
- eater.set_filename(filename)
- try:
- tokens = tokenize.tokenize(fp.readline)
- for _token in tokens:
- eater(*_token)
- except tokenize.TokenError as e:
- print('%s: %s, line %d, column %d' % (
- e.args[0], filename, e.args[1][0], e.args[1][1]),
- file=sys.stderr)
- finally:
- if closep:
- fp.close()
+ module_tree = ast.parse(source)
+ except SyntaxError:
+ continue
+
+ visitor.visit_file(module_tree, filename)
# write the output
if options.outfile == '-':
fp = open(options.outfile, 'w')
closep = 1
try:
- write_pot_file(eater.messages, options, fp)
+ write_pot_file(visitor.messages, options, fp)
finally:
if closep:
fp.close()