 * Fixed invalid message extraction methods causing an UnboundLocalError.
 * The stripping of comment tags in comments is now optional and is
   done for each line of a comment.
-
+ * A JavaScript extractor was added.
+
Version 0.9.2
http://svn.edgewall.org/repos/babel/tags/0.9.2/
            funcname = None
        elif tok == NAME and value in keywords:
            funcname = value
+
+def extract_javascript(fileobj, keywords, comment_tags, options):
+ """Extract messages from JavaScript source code.
+
+ :param fileobj: the seekable, file-like object the messages should be
+ extracted from
+ :param keywords: a list of keywords (i.e. function names) that should be
+ recognized as translation functions
+ :param comment_tags: a list of translator tags to search for and include
+ in the results
+ :param options: a dictionary of additional options (optional)
+ :return: an iterator over ``(lineno, funcname, message, comments)`` tuples
+ :rtype: ``iterator``
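+
+    A minimal usage sketch (the input below is purely illustrative)::
+
+        buf = StringIO("// NOTE: a greeting\\nmsg = gettext('Hello')")
+        list(extract_javascript(buf, ('gettext',), ['NOTE:'], {}))
+        # -> [(2, u'gettext', u'Hello', [u'NOTE: a greeting'])]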
+ """
+    from textwrap import dedent
+    from babel.messages.jslexer import tokenize, unquote_string
+ funcname = message_lineno = None
+ messages = []
+ last_argument = None
+ translator_comments = []
+ encoding = options.get('encoding', 'utf-8')
+ last_token = None
+ call_stack = -1
+
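+    # call_stack is -1 while outside a call to one of the keywords, 0
+    # while that call's own arguments are being scanned, and greater
+    # than zero inside nested parentheses within those arguments.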
+ for token in tokenize(fileobj.read().decode(encoding)):
+ if token.type == 'operator' and token.value == '(':
+ if funcname:
+ message_lineno = token.lineno
+                call_stack += 1
+
+ elif call_stack == -1 and token.type == 'linecomment':
+ value = token.value[2:].strip()
+ if translator_comments and \
+ translator_comments[-1][0] == token.lineno - 1:
+ translator_comments.append((token.lineno, value))
+ continue
+
+ for comment_tag in comment_tags:
+ if value.startswith(comment_tag):
+                    translator_comments.append((token.lineno, value))
+ break
+
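+        # The multi-line comment branch below dedents the continuation
+        # lines of a comment as a single block, so their indentation
+        # relative to each other survives in the extracted comments.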
+ elif token.type == 'multilinecomment':
+            # only one multi-line comment may precede a translation
+ translator_comments = []
+ value = token.value[2:-2].strip()
+ for comment_tag in comment_tags:
+ if value.startswith(comment_tag):
+ lines = value.splitlines()
+ if lines:
+ lines[0] = lines[0].strip()
+ lines[1:] = dedent('\n'.join(lines[1:])).splitlines()
+ for offset, line in enumerate(lines):
+ translator_comments.append((token.lineno + offset,
+ line))
+ break
+
+ elif funcname and call_stack == 0:
+ if token.type == 'operator' and token.value == ')':
+ if last_argument is not None:
+ messages.append(last_argument)
+ if len(messages) > 1:
+ messages = tuple(messages)
+ elif messages:
+ messages = messages[0]
+ else:
+ messages = None
+
+                # Comments don't apply unless they immediately precede the
+                # message
+ if translator_comments and \
+ translator_comments[-1][0] < message_lineno - 1:
+ translator_comments = []
+
+ if messages is not None:
+ yield (message_lineno, funcname, messages,
+ [comment[1] for comment in translator_comments])
+
+ funcname = message_lineno = last_argument = None
+ translator_comments = []
+ messages = []
+ call_stack = -1
+
+ elif token.type == 'string':
+ last_argument = unquote_string(token.value)
+
+ elif token.type == 'operator' and token.value == ',':
+ if last_argument is not None:
+ messages.append(last_argument)
+ last_argument = None
+ else:
+ messages.append(None)
+
+ elif call_stack > 0 and token.type == 'operator' \
+ and token.value == ')':
+ call_stack -= 1
+
+ elif funcname and call_stack == -1:
+ funcname = None
+
+ elif call_stack == -1 and token.type == 'name' and \
+ token.value in keywords and \
+ (last_token is None or last_token.type != 'name' or
+ last_token.value != 'function'):
+ funcname = token.value
+
+ last_token = token
--- /dev/null
+++ babel/messages/jslexer.py
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2008 Edgewall Software
+# All rights reserved.
+#
+# This software is licensed as described in the file COPYING, which
+# you should have received as part of this distribution. The terms
+# are also available at http://babel.edgewall.org/wiki/License.
+#
+# This software consists of voluntary contributions made by many
+# individuals. For the exact contribution history, see the revision
+# history and logs, available at http://babel.edgewall.org/log/.
+
+"""A simple JavaScript 1.5 lexer which is used for the JavaScript
+extractor.
+"""
+
+import re
+from operator import itemgetter
+
+
+operators = [
+ '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
+ '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
+ '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
+ '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.'
+]
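+# sort the operators by decreasing length so that the regular expression
+# built from them below tries the longest match first (e.g. '>>>=' has
+# to be tried before '>>' and '>')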
+operators.sort(lambda a, b: cmp(-len(a), -len(b)))
+
+escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}
+
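+# the lexing rules, tried in order. The comment rules have to come before
+# the operator rule so that e.g. '<!--' is skipped as a comment instead
+# of being lexed as the '<' operator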
+rules = [
+ (None, re.compile(r'\s+(?u)')),
+ (None, re.compile(r'<!--.*')),
+ ('linecomment', re.compile(r'//.*')),
+ ('multilinecomment', re.compile(r'/\*.*?\*/(?us)')),
+ ('name', re.compile(r'(\$+\w*|[^\W\d]\w*)(?u)')),
+ ('number', re.compile(r'''(?x)(
+ (?:0|[1-9]\d*)
+ (\.\d+)?
+ ([eE][-+]?\d+)? |
+ (0x[a-fA-F0-9]+)
+ )''')),
+ ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
+ ('string', re.compile(r'''(?xs)(
+ '(?:[^'\\]*(?:\\.[^'\\]*)*)' |
+ "(?:[^"\\]*(?:\\.[^"\\]*)*)"
+ )'''))
+]
+
+division_re = re.compile(r'/=?')
+regex_re = re.compile(r'/.+?/[a-zA-Z]*(?s)')
+line_re = re.compile(r'(\r\n|\n|\r)')
+line_join_re = re.compile(r'\\' + line_re.pattern)
+uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
+
+
+class TokenError(ValueError):
+ """Raised if the tokenizer stumbled upon invalid tokens."""
+
+class Token(tuple):
+ """Represents a token as returned by `tokenize`."""
+ __slots__ = ()
+
+ def __new__(cls, type, value, lineno):
+ return tuple.__new__(cls, (type, value, lineno))
+
+ type = property(itemgetter(0))
+ value = property(itemgetter(1))
+ lineno = property(itemgetter(2))
+
+def indicates_division(token):
+ """A helper function that helps the tokenizer to decide if the current
+ token may be followed by a division operator.
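+
+    For example, in ``a = b / c`` the ``/`` follows a name token and is
+    lexed as a division operator, whereas in ``a = /x/g`` it follows the
+    ``=`` operator and therefore starts a regular expression literal.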
+ """
+ if token.type == 'operator':
+ return token.value in (')', ']', '}', '++', '--')
+ return token.type in ('name', 'number', 'string', 'regexp')
+
+def unquote_string(string):
+ """Unquote a string with JavaScript rules. The string has to start with
+ string delimiters (``'`` or ``"``.)
+
+ :return: a string
+ """
+ assert string and string[0] == string[-1] and string[0] in '"\'', \
+ 'string provided is not properly delimited'
+ string = line_join_re.sub('\\1', string[1:-1])
+ result = []
+ add = result.append
+ pos = 0
+
+ while 1:
+ # scan for the next escape
+ escape_pos = string.find('\\', pos)
+ if escape_pos < 0:
+ break
+ add(string[pos:escape_pos])
+
+ # check which character is escaped
+ next_char = string[escape_pos + 1]
+ if next_char in escapes:
+ add(escapes[next_char])
+
+        # unicode escapes: try to consume up to four hexadecimal
+        # characters and interpret them as a unicode code point. If
+        # there is no such code point, put all the consumed characters
+        # into the string unchanged.
+ elif next_char in 'uU':
+ escaped = uni_escape_re.match(string, escape_pos + 2)
+ if escaped is not None:
+ escaped_value = escaped.group()
+ if len(escaped_value) == 4:
+ try:
+ add(unichr(int(escaped_value, 16)))
+ except ValueError:
+ pass
+ else:
+ pos = escape_pos + 6
+ continue
+ add(next_char + escaped_value)
+ pos = escaped.end()
+ continue
+ else:
+ add(next_char)
+
+ # bogus escape. Just remove the backslash.
+ else:
+ add(next_char)
+ pos = escape_pos + 2
+
+ if pos < len(string):
+ add(string[pos:])
+
+ return u''.join(result)
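+
+# An illustration (not executed): unquote_string(u'"a\\tb\\u00e9"')
+# returns u'a\tb\xe9', i.e. the backslash escapes become a real tab
+# character and a real e-acute instead of remaining escape text.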
+
+def tokenize(source):
+ """Tokenize a JavaScript source.
+
+ :return: generator of `Token`\s
+ """
+ may_divide = False
+ pos = 0
+ lineno = 1
+ end = len(source)
+
+ while pos < end:
+ # handle regular rules first
+ for token_type, rule in rules:
+ match = rule.match(source, pos)
+ if match is not None:
+ break
+ # if we don't have a match we don't give up yet, but check for
+ # division operators or regular expression literals, based on
+ # the status of `may_divide` which is determined by the last
+ # processed non-whitespace token using `indicates_division`.
+ else:
+ if may_divide:
+ match = division_re.match(source, pos)
+ token_type = 'operator'
+ else:
+ match = regex_re.match(source, pos)
+ token_type = 'regexp'
+ if match is None:
+ raise TokenError('invalid syntax around line %d' % lineno)
+
+ token_value = match.group()
+ if token_type is not None:
+ token = Token(token_type, token_value, lineno)
+ may_divide = indicates_division(token)
+ yield token
+ lineno += len(line_re.findall(token_value))
+ pos = match.end()
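+
+# A rough illustration of the output: list(tokenize(u"msg = _('Hi')"))
+# yields the tuples ('name', u'msg', 1), ('operator', u'=', 1),
+# ('name', u'_', 1), ('operator', u'(', 1), ('string', u"'Hi'", 1) and
+# ('operator', u')', 1); whitespace produces no tokens at all.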
        self.assertEqual([u'This is a multiline comment with',
                          u'a prefix too'], messages[1][2])
+class ExtractJavaScriptTestCase(unittest.TestCase):
+
+ def test_simple_extract(self):
+ buf = StringIO("""\
+msg1 = _('simple')
+msg2 = gettext('simple')
+msg3 = ngettext('s', 'p', 42)
+ """)
+ messages = \
+ list(extract.extract('javascript', buf, extract.DEFAULT_KEYWORDS,
+ [], {}))
+
+ self.assertEqual([(1, 'simple', []),
+ (2, 'simple', []),
+ (3, ('s', 'p'), [])], messages)
+
+ def test_various_calls(self):
+ buf = StringIO("""\
+msg1 = _(i18n_arg.replace(/"/, '"'))
+msg2 = ungettext(i18n_arg.replace(/"/, '"'), multi_arg.replace(/"/, '"'), 2)
+msg3 = ungettext("Babel", multi_arg.replace(/"/, '"'), 2)
+msg4 = ungettext(i18n_arg.replace(/"/, '"'), "Babels", 2)
+msg5 = ungettext('bunny', 'bunnies', parseInt(Math.random() * 2 + 1))
+msg6 = ungettext(arg0, 'bunnies', rparseInt(Math.random() * 2 + 1))
+msg7 = _(hello.there)
+msg8 = gettext('Rabbit')
+msg9 = dgettext('wiki', model.addPage())
+msg10 = dngettext(domain, 'Page', 'Pages', 3)
+""")
+ messages = \
+ list(extract.extract('javascript', buf, extract.DEFAULT_KEYWORDS, [],
+ {}))
+ self.assertEqual([(5, (u'bunny', u'bunnies'), []),
+ (8, u'Rabbit', []),
+ (10, (u'Page', u'Pages'), [])], messages)
+
+ def test_message_with_line_comment(self):
+ buf = StringIO("""\
+// NOTE: hello
+msg = _('Bonjour à tous')
+""")
+ messages = list(extract.extract_javascript(buf, ('_',), ['NOTE:'], {}))
+ self.assertEqual(u'Bonjour à tous', messages[0][2])
+ self.assertEqual([u'NOTE: hello'], messages[0][3])
+
+ def test_message_with_multiline_comment(self):
+ buf = StringIO("""\
+/* NOTE: hello
+   and bonjour
+     and servus */
+msg = _('Bonjour à tous')
+""")
+ messages = list(extract.extract_javascript(buf, ('_',), ['NOTE:'], {}))
+ self.assertEqual(u'Bonjour à tous', messages[0][2])
+        self.assertEqual([u'NOTE: hello', 'and bonjour', '  and servus'],
+                         messages[0][3])
+
+ def test_ignore_function_definitions(self):
+ buf = StringIO("""\
+function gettext(value) {
+ return translations[language][value] || value;
+}""")
+
+ messages = list(extract.extract_javascript(buf, ('gettext',), [], {}))
+ self.assertEqual(messages, [])
+
+ def test_misplaced_comments(self):
+ buf = StringIO("""\
+/* NOTE: this won't show up */
+foo()
+
+/* NOTE: this will */
+msg = _('Something')
+
+// NOTE: this will show up
+// too.
+msg = _('Something else')
+
+// NOTE: but this won't
+bar()
+
+_('no comment here')
+""")
+ messages = list(extract.extract_javascript(buf, ('_',), ['NOTE:'], {}))
+ self.assertEqual(u'Something', messages[0][2])
+ self.assertEqual([u'NOTE: this will'], messages[0][3])
+ self.assertEqual(u'Something else', messages[1][2])
+ self.assertEqual([u'NOTE: this will show up', 'too.'], messages[1][3])
+ self.assertEqual(u'no comment here', messages[2][2])
+ self.assertEqual([], messages[2][3])
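+
+    def test_string_escapes(self):
+        # An extra check sketched on top of the tests above: escape
+        # sequences in JavaScript string literals should be decoded by
+        # the lexer, so the extracted message contains the actual
+        # characters rather than the escape text.
+        buf = StringIO(r"msg = _('a\tb\u00e9')")
+        messages = list(extract.extract_javascript(buf, ('_',), [], {}))
+        self.assertEqual(u'a\tb\xe9', messages[0][2])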
+
class ExtractTestCase(unittest.TestCase):
    def test_invalid_filter(self):
    suite = unittest.TestSuite()
    suite.addTest(doctest.DocTestSuite(extract))
    suite.addTest(unittest.makeSuite(ExtractPythonTestCase))
+    suite.addTest(unittest.makeSuite(ExtractJavaScriptTestCase))
    suite.addTest(unittest.makeSuite(ExtractTestCase))
    return suite
    [babel.extractors]
    ignore = babel.messages.extract:extract_nothing
    python = babel.messages.extract:extract_python
+    javascript = babel.messages.extract:extract_javascript
""",
cmdclass = {'build_doc': build_doc, 'test_doc': test_doc}