 * Fixed invalid message extraction methods causing an UnboundLocalError.
 * The stripping of comment tags in comments is now optional and is
   done for each line of a comment.
-
+ * A JavaScript extractor was added.
+
Version 0.9.2
http://svn.edgewall.org/repos/babel/tags/0.9.2/
            funcname = None
        elif tok == NAME and value in keywords:
            funcname = value
+
+def extract_javascript(fileobj, keywords, comment_tags, options):
+ """Extract messages from JavaScript source code.
+
+ :param fileobj: the seekable, file-like object the messages should be
+ extracted from
+ :param keywords: a list of keywords (i.e. function names) that should be
+ recognized as translation functions
+ :param comment_tags: a list of translator tags to search for and include
+ in the results
+ :param options: a dictionary of additional options (optional)
+ :return: an iterator over ``(lineno, funcname, message, comments)`` tuples
+ :rtype: ``iterator``
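+
+    A minimal usage sketch (the input below is purely illustrative)::
+
+        buf = StringIO("// NOTE: a greeting\\nmsg = gettext('Hello')")
+        list(extract_javascript(buf, ('gettext',), ['NOTE:'], {}))
+        # -> [(2, u'gettext', u'Hello', [u'NOTE: a greeting'])]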
+ """
+    from textwrap import dedent
+    from babel.messages.jslexer import tokenize, unquote_string
+ funcname = message_lineno = None
+ messages = []
+ last_argument = None
+ translator_comments = []
+ encoding = options.get('encoding', 'utf-8')
+ last_token = None
+ call_stack = -1
+
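+    # call_stack is -1 while outside a call to one of the keywords, 0
+    # while that call's own arguments are being scanned, and greater
+    # than zero inside nested parentheses within those arguments.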
+ for token in tokenize(fileobj.read().decode(encoding)):
+ if token.type == 'operator' and token.value == '(':
+ if funcname:
+ message_lineno = token.lineno
+                call_stack += 1
+
+ elif call_stack == -1 and token.type == 'linecomment':
+ value = token.value[2:].strip()
+ if translator_comments and \
+ translator_comments[-1][0] == token.lineno - 1:
+ translator_comments.append((token.lineno, value))
+ continue
+
+ for comment_tag in comment_tags:
+ if value.startswith(comment_tag):
+                    translator_comments.append((token.lineno, value))
+ break
+
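+        # The multi-line comment branch below dedents the continuation
+        # lines of a comment as a single block, so their indentation
+        # relative to each other survives in the extracted comments.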
+ elif token.type == 'multilinecomment':
+            # only one multi-line comment may precede a translation
+ translator_comments = []
+ value = token.value[2:-2].strip()
+ for comment_tag in comment_tags:
+ if value.startswith(comment_tag):
+ lines = value.splitlines()
+ if lines:
+ lines[0] = lines[0].strip()
+ lines[1:] = dedent('\n'.join(lines[1:])).splitlines()
+ for offset, line in enumerate(lines):
+ translator_comments.append((token.lineno + offset,
+ line))
+ break
+
+ elif funcname and call_stack == 0:
+ if token.type == 'operator' and token.value == ')':
+ if last_argument is not None:
+ messages.append(last_argument)
+ if len(messages) > 1:
+ messages = tuple(messages)
+ elif messages:
+ messages = messages[0]
+ else:
+ messages = None
+
+                # Comments don't apply unless they immediately precede the
+                # message
+ if translator_comments and \
+ translator_comments[-1][0] < message_lineno - 1:
+ translator_comments = []
+
+ if messages is not None:
+ yield (message_lineno, funcname, messages,
+ [comment[1] for comment in translator_comments])
+
+ funcname = message_lineno = last_argument = None
+ translator_comments = []
+ messages = []
+ call_stack = -1
+
+ elif token.type == 'string':
+ last_argument = unquote_string(token.value)
+
+ elif token.type == 'operator' and token.value == ',':
+ if last_argument is not None:
+ messages.append(last_argument)
+ last_argument = None
+ else:
+ messages.append(None)
+
+ elif call_stack > 0 and token.type == 'operator' \
+ and token.value == ')':
+ call_stack -= 1
+
+ elif funcname and call_stack == -1:
+ funcname = None
+
+ elif call_stack == -1 and token.type == 'name' and \
+ token.value in keywords and \
+ (last_token is None or last_token.type != 'name' or
+ last_token.value != 'function'):
+ funcname = token.value
+
+ last_token = token
--- /dev/null
+++ babel/messages/jslexer.py
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2008 Edgewall Software
+# All rights reserved.
+#
+# This software is licensed as described in the file COPYING, which
+# you should have received as part of this distribution. The terms
+# are also available at http://babel.edgewall.org/wiki/License.
+#
+# This software consists of voluntary contributions made by many
+# individuals. For the exact contribution history, see the revision
+# history and logs, available at http://babel.edgewall.org/log/.
+
+"""A simple JavaScript 1.5 lexer which is used for the JavaScript
+extractor.
+"""
+
+import re
+from operator import itemgetter
+
+
+operators = [
+ '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
+ '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
+ '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
+ '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.'
+]
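+# sort the operators by decreasing length so that the regular expression
+# built from them below tries the longest match first (e.g. '>>>=' has
+# to be tried before '>>' and '>')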
+operators.sort(lambda a, b: cmp(-len(a), -len(b)))
+
+escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}
+
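+# the lexing rules, tried in order. The comment rules have to come before
+# the operator rule so that e.g. '<!--' is skipped as a comment instead
+# of being lexed as the '<' operator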
+rules = [
+ (None, re.compile(r'\s+(?u)')),
+ (None, re.compile(r'<!--.*')),
+ ('linecomment', re.compile(r'//.*')),
+ ('multilinecomment', re.compile(r'/\*.*?\*/(?us)')),
+ ('name', re.compile(r'(\$+\w*|[^\W\d]\w*)(?u)')),
+ ('number', re.compile(r'''(?x)(
+ (?:0|[1-9]\d*)
+ (\.\d+)?
+ ([eE][-+]?\d+)? |
+ (0x[a-fA-F0-9]+)
+ )''')),
+ ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
+ ('string', re.compile(r'''(?xs)(
+ '(?:[^'\\]*(?:\\.[^'\\]*)*)' |
+ "(?:[^"\\]*(?:\\.[^"\\]*)*)"
+ )'''))
+]
+
+division_re = re.compile(r'/=?')
+regex_re = re.compile(r'/.+?/[a-zA-Z]*(?s)')
+line_re = re.compile(r'(\r\n|\n|\r)')
+line_join_re = re.compile(r'\\' + line_re.pattern)
+uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
+
+
+class TokenError(ValueError):
+ """Raised if the tokenizer stumbled upon invalid tokens."""
+
+class Token(tuple):
+ """Represents a token as returned by `tokenize`."""
+ __slots__ = ()
+
+ def __new__(cls, type, value, lineno):
+ return tuple.__new__(cls, (type, value, lineno))
+
+ type = property(itemgetter(0))
+ value = property(itemgetter(1))
+ lineno = property(itemgetter(2))
+
+def indicates_division(token):
+ """A helper function that helps the tokenizer to decide if the current
+ token may be followed by a division operator.
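+
+    For example, in ``a = b / c`` the ``/`` follows a name token and is
+    lexed as a division operator, whereas in ``a = /x/g`` it follows the
+    ``=`` operator and therefore starts a regular expression literal.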
+ """
+ if token.type == 'operator':
+ return token.value in (')', ']', '}', '++', '--')
+ return token.type in ('name', 'number', 'string', 'regexp')
+
+def unquote_string(string):
+ """Unquote a string with JavaScript rules. The string has to start with
+ string delimiters (``'`` or ``"``.)
+
+ :return: a string
+ """
+ assert string and string[0] == string[-1] and string[0] in '"\'', \
+ 'string provided is not properly delimited'
+ string = line_join_re.sub('\\1', string[1:-1])
+ result = []
+ add = result.append
+ pos = 0
+
+ while 1:
+ # scan for the next escape
+ escape_pos = string.find('\\', pos)
+ if escape_pos < 0:
+ break
+ add(string[pos:escape_pos])
+
+ # check which character is escaped
+ next_char = string[escape_pos + 1]
+ if next_char in escapes:
+ add(escapes[next_char])
+
+        # unicode escapes: try to consume up to four hexadecimal
+        # characters and interpret them as a unicode code point. If
+        # there is no such code point, put all the consumed characters
+        # into the string unchanged.
+ elif next_char in 'uU':
+ escaped = uni_escape_re.match(string, escape_pos + 2)
+ if escaped is not None:
+ escaped_value = escaped.group()
+ if len(escaped_value) == 4:
+ try:
+ add(unichr(int(escaped_value, 16)))
+ except ValueError:
+ pass
+ else:
+ pos = escape_pos + 6
+ continue
+ add(next_char + escaped_value)
+ pos = escaped.end()
+ continue
+ else:
+ add(next_char)
+
+ # bogus escape. Just remove the backslash.
+ else:
+ add(next_char)
+ pos = escape_pos + 2
+
+ if pos < len(string):
+ add(string[pos:])
+
+ return u''.join(result)
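+
+# An illustration (not executed): unquote_string(u'"a\\tb\\u00e9"')
+# returns u'a\tb\xe9', i.e. the backslash escapes become a real tab
+# character and a real e-acute instead of remaining escape text.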
+
+def tokenize(source):
+ """Tokenize a JavaScript source.
+
+ :return: generator of `Token`\s
+ """
+ may_divide = False
+ pos = 0
+ lineno = 1
+ end = len(source)
+
+ while pos < end:
+ # handle regular rules first
+ for token_type, rule in rules:
+ match = rule.match(source, pos)
+ if match is not None:
+ break
+ # if we don't have a match we don't give up yet, but check for
+ # division operators or regular expression literals, based on
+ # the status of `may_divide` which is determined by the last
+ # processed non-whitespace token using `indicates_division`.
+ else:
+ if may_divide:
+ match = division_re.match(source, pos)
+ token_type = 'operator'
+ else:
+ match = regex_re.match(source, pos)
+ token_type = 'regexp'
+ if match is None:
+ raise TokenError('invalid syntax around line %d' % lineno)
+
+ token_value = match.group()
+ if token_type is not None:
+ token = Token(token_type, token_value, lineno)
+ may_divide = indicates_division(token)
+ yield token
+ lineno += len(line_re.findall(token_value))
+ pos = match.end()
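+
+# A rough illustration of the output: list(tokenize(u"msg = _('Hi')"))
+# yields the tuples ('name', u'msg', 1), ('operator', u'=', 1),
+# ('name', u'_', 1), ('operator', u'(', 1), ('string', u"'Hi'", 1) and
+# ('operator', u')', 1); whitespace produces no tokens at all.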
        self.assertEqual([u'This is a multiline comment with',
                          u'a prefix too'], messages[1][2])
+class ExtractJavaScriptTestCase(unittest.TestCase):
+
+ def test_simple_extract(self):
+ buf = StringIO("""\
+msg1 = _('simple')
+msg2 = gettext('simple')
+msg3 = ngettext('s', 'p', 42)
+ """)
+ messages = \
+ list(extract.extract('javascript', buf, extract.DEFAULT_KEYWORDS,
+ [], {}))
+
+ self.assertEqual([(1, 'simple', []),
+ (2, 'simple', []),
+ (3, ('s', 'p'), [])], messages)
+
+ def test_various_calls(self):
+ buf = StringIO("""\
+msg1 = _(i18n_arg.replace(/"/, '"'))
+msg2 = ungettext(i18n_arg.replace(/"/, '"'), multi_arg.replace(/"/, '"'), 2)
+msg3 = ungettext("Babel", multi_arg.replace(/"/, '"'), 2)
+msg4 = ungettext(i18n_arg.replace(/"/, '"'), "Babels", 2)
+msg5 = ungettext('bunny', 'bunnies', parseInt(Math.random() * 2 + 1))
+msg6 = ungettext(arg0, 'bunnies', rparseInt(Math.random() * 2 + 1))
+msg7 = _(hello.there)
+msg8 = gettext('Rabbit')
+msg9 = dgettext('wiki', model.addPage())
+msg10 = dngettext(domain, 'Page', 'Pages', 3)
+""")
+ messages = \
+ list(extract.extract('javascript', buf, extract.DEFAULT_KEYWORDS, [],
+ {}))
+ self.assertEqual([(5, (u'bunny', u'bunnies'), []),
+ (8, u'Rabbit', []),
+ (10, (u'Page', u'Pages'), [])], messages)
+
+ def test_message_with_line_comment(self):
+ buf = StringIO("""\
+// NOTE: hello
+msg = _('Bonjour à tous')
+""")
+ messages = list(extract.extract_javascript(buf, ('_',), ['NOTE:'], {}))
+ self.assertEqual(u'Bonjour à tous', messages[0][2])
+ self.assertEqual([u'NOTE: hello'], messages[0][3])
+
+ def test_message_with_multiline_comment(self):
+ buf = StringIO("""\
+/* NOTE: hello
+   and bonjour
+     and servus */
+msg = _('Bonjour à tous')
+""")
+ messages = list(extract.extract_javascript(buf, ('_',), ['NOTE:'], {}))
+ self.assertEqual(u'Bonjour à tous', messages[0][2])
+        self.assertEqual([u'NOTE: hello', 'and bonjour', '  and servus'],
+                         messages[0][3])
+
+ def test_ignore_function_definitions(self):
+ buf = StringIO("""\
+function gettext(value) {
+ return translations[language][value] || value;
+}""")
+
+ messages = list(extract.extract_javascript(buf, ('gettext',), [], {}))
+ self.assertEqual(messages, [])
+
+ def test_misplaced_comments(self):
+ buf = StringIO("""\
+/* NOTE: this won't show up */
+foo()
+
+/* NOTE: this will */
+msg = _('Something')
+
+// NOTE: this will show up
+// too.
+msg = _('Something else')
+
+// NOTE: but this won't
+bar()
+
+_('no comment here')
+""")
+ messages = list(extract.extract_javascript(buf, ('_',), ['NOTE:'], {}))
+ self.assertEqual(u'Something', messages[0][2])
+ self.assertEqual([u'NOTE: this will'], messages[0][3])
+ self.assertEqual(u'Something else', messages[1][2])
+ self.assertEqual([u'NOTE: this will show up', 'too.'], messages[1][3])
+ self.assertEqual(u'no comment here', messages[2][2])
+ self.assertEqual([], messages[2][3])
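+
+    def test_string_escapes(self):
+        # An extra check sketched on top of the tests above: escape
+        # sequences in JavaScript string literals should be decoded by
+        # the lexer, so the extracted message contains the actual
+        # characters rather than the escape text.
+        buf = StringIO(r"msg = _('a\tb\u00e9')")
+        messages = list(extract.extract_javascript(buf, ('_',), [], {}))
+        self.assertEqual(u'a\tb\xe9', messages[0][2])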
+
class ExtractTestCase(unittest.TestCase):
    def test_invalid_filter(self):
    suite = unittest.TestSuite()
    suite.addTest(doctest.DocTestSuite(extract))
    suite.addTest(unittest.makeSuite(ExtractPythonTestCase))
+    suite.addTest(unittest.makeSuite(ExtractJavaScriptTestCase))
    suite.addTest(unittest.makeSuite(ExtractTestCase))
    return suite
    [babel.extractors]
    ignore = babel.messages.extract:extract_nothing
    python = babel.messages.extract:extract_python
+    javascript = babel.messages.extract:extract_javascript
""",
cmdclass = {'build_doc': build_doc, 'test_doc': test_doc}