From: Aarni Koskela Date: Sat, 5 Apr 2025 15:21:04 +0000 (+0300) Subject: Optimizations for read_po (#1200) X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=d7a7589a6cee3aa4c68de60f4d69a9cdad50a7ff;p=thirdparty%2Fbabel.git Optimizations for read_po (#1200) * Avoid extra casts (`Message()` takes care of those) * Optimize empty normalized strings * Don't sort translations unless plural * Optimize unescape() * Optimize line processing * Optimize keyword parsing * Optimize comment parsing * Avoid hot `isinstance`ing in PO file parse loop * Add fast paths in `python_format` and `python_brace_format` * Inline distincting in `catalog.py` --- diff --git a/babel/messages/catalog.py b/babel/messages/catalog.py index de96ea57..e1d61e09 100644 --- a/babel/messages/catalog.py +++ b/babel/messages/catalog.py @@ -23,7 +23,7 @@ from babel import __version__ as VERSION from babel.core import Locale, UnknownLocaleError from babel.dates import format_datetime from babel.messages.plurals import get_plural -from babel.util import LOCALTZ, _cmp, distinct +from babel.util import LOCALTZ, _cmp if TYPE_CHECKING: from typing_extensions import TypeAlias @@ -164,7 +164,7 @@ class Message: if not string and self.pluralizable: string = ('', '') self.string = string - self.locations = list(distinct(locations)) + self.locations = list(dict.fromkeys(locations)) if locations else [] self.flags = set(flags) if id and self.python_format: self.flags.add('python-format') @@ -174,12 +174,15 @@ class Message: self.flags.add('python-brace-format') else: self.flags.discard('python-brace-format') - self.auto_comments = list(distinct(auto_comments)) - self.user_comments = list(distinct(user_comments)) - if isinstance(previous_id, str): - self.previous_id = [previous_id] + self.auto_comments = list(dict.fromkeys(auto_comments)) if auto_comments else [] + self.user_comments = list(dict.fromkeys(user_comments)) if user_comments else [] + if previous_id: + if isinstance(previous_id, str): + self.previous_id = [previous_id] + else: + self.previous_id = list(previous_id) else: - self.previous_id = list(previous_id) + self.previous_id = [] self.lineno = lineno self.context = context @@ -289,9 +292,12 @@ class Message: :type: `bool`""" ids = self.id - if not isinstance(ids, (list, tuple)): - ids = [ids] - return any(PYTHON_FORMAT.search(id) for id in ids) + if isinstance(ids, (list, tuple)): + for id in ids: # Explicit loop for performance reasons. + if PYTHON_FORMAT.search(id): + return True + return False + return bool(PYTHON_FORMAT.search(ids)) @property def python_brace_format(self) -> bool: @@ -304,9 +310,12 @@ class Message: :type: `bool`""" ids = self.id - if not isinstance(ids, (list, tuple)): - ids = [ids] - return any(_has_python_brace_format(id) for id in ids) + if isinstance(ids, (list, tuple)): + for id in ids: # Explicit loop for performance reasons. + if _has_python_brace_format(id): + return True + return False + return _has_python_brace_format(ids) class TranslationError(Exception): @@ -729,12 +738,9 @@ class Catalog: # The new message adds pluralization current.id = message.id current.string = message.string - current.locations = list(distinct(current.locations + - message.locations)) - current.auto_comments = list(distinct(current.auto_comments + - message.auto_comments)) - current.user_comments = list(distinct(current.user_comments + - message.user_comments)) + current.locations = list(dict.fromkeys([*current.locations, *message.locations])) + current.auto_comments = list(dict.fromkeys([*current.auto_comments, *message.auto_comments])) + current.user_comments = list(dict.fromkeys([*current.user_comments, *message.user_comments])) current.flags |= message.flags elif id == '': # special treatment for the header message @@ -916,8 +922,8 @@ class Catalog: assert oldmsg is not None message.string = oldmsg.string - if keep_user_comments: - message.user_comments = list(distinct(oldmsg.user_comments)) + if keep_user_comments and oldmsg.user_comments: + message.user_comments = list(dict.fromkeys(oldmsg.user_comments)) if isinstance(message.id, (list, tuple)): if not isinstance(message.string, (list, tuple)): diff --git a/babel/messages/pofile.py b/babel/messages/pofile.py index 3afdd606..8220637b 100644 --- a/babel/messages/pofile.py +++ b/babel/messages/pofile.py @@ -17,7 +17,7 @@ from typing import TYPE_CHECKING, Literal from babel.core import Locale from babel.messages.catalog import Catalog, Message -from babel.util import TextWrapper, _cmp +from babel.util import TextWrapper if TYPE_CHECKING: from typing import IO, AnyStr @@ -25,6 +25,9 @@ if TYPE_CHECKING: from _typeshed import SupportsWrite +_unescape_re = re.compile(r'\\([\\trn"])') + + def unescape(string: str) -> str: r"""Reverse `escape` the given string. @@ -45,7 +48,10 @@ def unescape(string: str) -> str: return '\r' # m is \ or " return m - return re.compile(r'\\([\\trn"])').sub(replace_escapes, string[1:-1]) + + if "\\" not in string: # Fast path: there's nothing to unescape + return string[1:-1] + return _unescape_re.sub(replace_escapes, string[1:-1]) def denormalize(string: str) -> str: @@ -73,7 +79,7 @@ def denormalize(string: str) -> str: escaped_lines = string.splitlines() if string.startswith('""'): escaped_lines = escaped_lines[1:] - return ''.join(unescape(line) for line in escaped_lines) + return ''.join(map(unescape, escaped_lines)) else: return unescape(string) @@ -132,48 +138,14 @@ class PoFileError(Exception): self.lineno = lineno -class _NormalizedString: - +class _NormalizedString(list): def __init__(self, *args: str) -> None: - self._strs: list[str] = [] - for arg in args: - self.append(arg) - - def append(self, s: str) -> None: - self._strs.append(s.strip()) + super().__init__(map(str.strip, args)) def denormalize(self) -> str: - return ''.join(unescape(s) for s in self._strs) - - def __bool__(self) -> bool: - return bool(self._strs) - - def __repr__(self) -> str: - return os.linesep.join(self._strs) - - def __cmp__(self, other: object) -> int: - if not other: - return 1 - - return _cmp(str(self), str(other)) - - def __gt__(self, other: object) -> bool: - return self.__cmp__(other) > 0 - - def __lt__(self, other: object) -> bool: - return self.__cmp__(other) < 0 - - def __ge__(self, other: object) -> bool: - return self.__cmp__(other) >= 0 - - def __le__(self, other: object) -> bool: - return self.__cmp__(other) <= 0 - - def __eq__(self, other: object) -> bool: - return self.__cmp__(other) == 0 - - def __ne__(self, other: object) -> bool: - return self.__cmp__(other) != 0 + if not self: + return "" + return ''.join(map(unescape, self)) class PoFileParser: @@ -183,13 +155,6 @@ class PoFileParser: See `read_po` for simple cases. """ - _keywords = [ - 'msgid', - 'msgstr', - 'msgctxt', - 'msgid_plural', - ] - def __init__(self, catalog: Catalog, ignore_obsolete: bool = False, abort_invalid: bool = False) -> None: self.catalog = catalog self.ignore_obsolete = ignore_obsolete @@ -216,23 +181,20 @@ class PoFileParser: Add a message to the catalog based on the current parser state and clear the state ready to process the next message. """ - self.translations.sort() if len(self.messages) > 1: msgid = tuple(m.denormalize() for m in self.messages) - else: - msgid = self.messages[0].denormalize() - if isinstance(msgid, (list, tuple)): string = ['' for _ in range(self.catalog.num_plurals)] - for idx, translation in self.translations: + for idx, translation in sorted(self.translations): if idx >= self.catalog.num_plurals: self._invalid_pofile("", self.offset, "msg has more translations than num_plurals of catalog") continue string[idx] = translation.denormalize() string = tuple(string) else: + msgid = self.messages[0].denormalize() string = self.translations[0][1].denormalize() msgctxt = self.context.denormalize() if self.context else None - message = Message(msgid, string, list(self.locations), set(self.flags), + message = Message(msgid, string, self.locations, self.flags, self.auto_comments, self.user_comments, lineno=self.offset + 1, context=msgctxt) if self.obsolete: @@ -247,27 +209,19 @@ class PoFileParser: if self.messages: if not self.translations: self._invalid_pofile("", self.offset, f"missing msgstr for msgid '{self.messages[0].denormalize()}'") - self.translations.append([0, _NormalizedString("")]) + self.translations.append([0, _NormalizedString()]) self._add_message() def _process_message_line(self, lineno, line, obsolete=False) -> None: - if line.startswith('"'): + if not line: + return + if line[0] == '"': self._process_string_continuation_line(line, lineno) else: self._process_keyword_line(lineno, line, obsolete) def _process_keyword_line(self, lineno, line, obsolete=False) -> None: - - for keyword in self._keywords: - try: - if line.startswith(keyword) and line[len(keyword)] in [' ', '[']: - arg = line[len(keyword):] - break - except IndexError: - self._invalid_pofile(line, lineno, "Keyword must be followed by a string") - else: - self._invalid_pofile(line, lineno, "Start of line didn't match any expected keyword.") - return + keyword, _, arg = line.partition(' ') if keyword in ['msgid', 'msgctxt']: self._finish_current_message() @@ -283,19 +237,23 @@ class PoFileParser: self.in_msgctxt = False self.in_msgid = True self.messages.append(_NormalizedString(arg)) + return - elif keyword == 'msgstr': + if keyword == 'msgctxt': + self.in_msgctxt = True + self.context = _NormalizedString(arg) + return + + if keyword == 'msgstr' or keyword.startswith('msgstr['): self.in_msgid = False self.in_msgstr = True - if arg.startswith('['): - idx, msg = arg[1:].split(']', 1) - self.translations.append([int(idx), _NormalizedString(msg)]) - else: - self.translations.append([0, _NormalizedString(arg)]) + kwarg, has_bracket, idxarg = keyword.partition('[') + idx = int(idxarg[:-1]) if has_bracket else 0 + s = _NormalizedString(arg) if arg != '""' else _NormalizedString() + self.translations.append([idx, s]) + return - elif keyword == 'msgctxt': - self.in_msgctxt = True - self.context = _NormalizedString(arg) + self._invalid_pofile(line, lineno, "Unknown or misformatted keyword") def _process_string_continuation_line(self, line, lineno) -> None: if self.in_msgid: @@ -307,49 +265,62 @@ class PoFileParser: else: self._invalid_pofile(line, lineno, "Got line starting with \" but not in msgid, msgstr or msgctxt") return - s.append(line) + s.append(line.strip()) # For performance reasons, `NormalizedString` doesn't strip internally def _process_comment(self, line) -> None: self._finish_current_message() - if line[1:].startswith(':'): + prefix = line[:2] + if prefix == '#:': for location in _extract_locations(line[2:]): - pos = location.rfind(':') - if pos >= 0: + a, colon, b = location.rpartition(':') + if colon: try: - lineno = int(location[pos + 1:]) + self.locations.append((a, int(b))) except ValueError: continue - self.locations.append((location[:pos], lineno)) - else: + else: # No line number specified self.locations.append((location, None)) - elif line[1:].startswith(','): - for flag in line[2:].lstrip().split(','): - self.flags.append(flag.strip()) - elif line[1:].startswith('.'): + return + + if prefix == '#,': + self.flags.extend(flag.strip() for flag in line[2:].lstrip().split(',')) + return + + if prefix == '#.': # These are called auto-comments comment = line[2:].strip() if comment: # Just check that we're not adding empty comments self.auto_comments.append(comment) - else: - # These are called user comments - self.user_comments.append(line[1:].strip()) + return + + # These are called user comments + self.user_comments.append(line[1:].strip()) def parse(self, fileobj: IO[AnyStr] | Iterable[AnyStr]) -> None: """ - Reads from the file-like object `fileobj` and adds any po file - units found in it to the `Catalog` supplied to the constructor. + Reads from the file-like object (or iterable of string-likes) `fileobj` + and adds any po file units found in it to the `Catalog` + supplied to the constructor. + + All of the items in the iterable must be the same type; either `str` + or `bytes` (decoded with the catalog charset), but not a mixture. """ + needs_decode = None for lineno, line in enumerate(fileobj): line = line.strip() - if not isinstance(line, str): - line = line.decode(self.catalog.charset) + if needs_decode is None: + # If we don't yet know whether we need to decode, + # let's find out now. + needs_decode = not isinstance(line, str) if not line: continue - if line.startswith('#'): - if line[1:].startswith('~'): + if needs_decode: + line = line.decode(self.catalog.charset) + if line[0] == '#': + if line[:2] == '#~': self._process_message_line(lineno, line[2:].lstrip(), obsolete=True) else: try: @@ -364,8 +335,8 @@ class PoFileParser: # No actual messages found, but there was some info in comments, from which # we'll construct an empty header message if not self.counter and (self.flags or self.user_comments or self.auto_comments): - self.messages.append(_NormalizedString('""')) - self.translations.append([0, _NormalizedString('""')]) + self.messages.append(_NormalizedString()) + self.translations.append([0, _NormalizedString()]) self._add_message() def _invalid_pofile(self, line, lineno, msg) -> None: diff --git a/tests/messages/test_normalized_string.py b/tests/messages/test_normalized_string.py deleted file mode 100644 index 9c95672b..00000000 --- a/tests/messages/test_normalized_string.py +++ /dev/null @@ -1,17 +0,0 @@ -from babel.messages.pofile import _NormalizedString - - -def test_normalized_string(): - ab1 = _NormalizedString('a', 'b ') - ab2 = _NormalizedString('a', ' b') - ac1 = _NormalizedString('a', 'c') - ac2 = _NormalizedString(' a', 'c ') - z = _NormalizedString() - assert ab1 == ab2 and ac1 == ac2 # __eq__ - assert ab1 < ac1 # __lt__ - assert ac1 > ab2 # __gt__ - assert ac1 >= ac2 # __ge__ - assert ab1 <= ab2 # __le__ - assert ab1 != ac1 # __ne__ - assert not z # __nonzero__ / __bool__ - assert sorted([ab1, ab2, ac1]) # the sort order is not stable so we can't really check it, just that we can sort diff --git a/tests/messages/test_pofile.py b/tests/messages/test_pofile.py index 2bcc3df8..ffc95295 100644 --- a/tests/messages/test_pofile.py +++ b/tests/messages/test_pofile.py @@ -1068,11 +1068,18 @@ def test_iterable_of_strings(): """ Test we can parse from an iterable of strings. """ - catalog = pofile.read_po(['msgid "foo"', b'msgstr "Voh"'], locale="en_US") + catalog = pofile.read_po(['msgid "foo"', 'msgstr "Voh"'], locale="en_US") assert catalog.locale == Locale("en", "US") assert catalog.get("foo").string == "Voh" +@pytest.mark.parametrize("order", [1, -1]) +def test_iterable_of_mismatching_strings(order): + # Mixing and matching byteses and strs in the same read_po call is not allowed. + with pytest.raises(Exception): # noqa: B017 (will raise either TypeError or AttributeError) + pofile.read_po(['msgid "foo"', b'msgstr "Voh"'][::order]) + + def test_issue_1087(): buf = StringIO(r''' msgid ""