import sys
from tokenize import generate_tokens, COMMENT, NAME, OP, STRING
-from babel.util import pathmatch, relpath
+from babel.util import parse_encoding, pathmatch, relpath
__all__ = ['extract', 'extract_from_dir', 'extract_from_file']
__docformat__ = 'restructuredtext en'
>>> from StringIO import StringIO
>>> for message in extract('python', StringIO(source)):
... print message
- (3, 'Hello, world!', [])
+ (3, u'Hello, world!', [])
:param method: a string specifying the extraction method (e.g. "python")
:param fileobj: the file-like object the messages should be extracted from
def extract_python(fileobj, keywords, comment_tags, options):
"""Extract messages from Python source code.
- :param fileobj: the file-like object the messages should be extracted from
+ :param fileobj: the seekable, file-like object the messages should be
+ extracted from
:param keywords: a list of keywords (i.e. function names) that should be
recognized as translation functions
:param comment_tags: a list of translator tags to search for and include
in_args = False
in_translator_comments = False
+ encoding = parse_encoding(fileobj) or options.get('encoding', 'ascii')
+
tokens = generate_tokens(fileobj.readline)
for tok, value, (lineno, _), _, _ in tokens:
if funcname and tok == OP and value == '(':
in_args = True
elif tok == COMMENT:
# Strip the comment token from the line
- value = value[1:].strip()
+ value = value.decode(encoding)[1:].strip()
if in_translator_comments and \
translator_comments[-1][0] == lineno - 1:
# We're already inside a translator comment, continue appending
messages = []
translator_comments = []
elif tok == STRING:
- # Unwrap quotes in a safe manner
- buf.append(eval(value, {'__builtins__':{}}, {}))
+ # Unwrap quotes in a safe manner, maintaining the string's
+ # encoding
+ # https://sourceforge.net/tracker/?func=detail&atid=355470&aid=617979&group_id=5470
+ value = eval('# coding=%s\n%s' % (encoding, value),
+ {'__builtins__':{}}, {})
+ if isinstance(value, str):
+ value = value.decode(encoding)
+ buf.append(value)
elif tok == OP and value == ',':
messages.append(''.join(buf))
del buf[:]
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://babel.edgewall.org/log/.
+import codecs
import doctest
from StringIO import StringIO
import unittest
def test_unicode_string_arg(self):
buf = StringIO("msg = _(u'Foo Bar')")
messages = list(extract.extract_python(buf, ('_',), [], {}))
- self.assertEqual('Foo Bar', messages[0][2])
+ self.assertEqual(u'Foo Bar', messages[0][2])
def test_comment_tag(self):
buf = StringIO("""
msg = _(u'Foo Bar')
""")
messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
- self.assertEqual('Foo Bar', messages[0][2])
- self.assertEqual(['A translation comment'], messages[0][3])
+ self.assertEqual(u'Foo Bar', messages[0][2])
+ self.assertEqual([u'A translation comment'], messages[0][3])
def test_comment_tag_multiline(self):
buf = StringIO("""
msg = _(u'Foo Bar')
""")
messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
- self.assertEqual('Foo Bar', messages[0][2])
- self.assertEqual(['A translation comment', 'with a second line'],
+ self.assertEqual(u'Foo Bar', messages[0][2])
+ self.assertEqual([u'A translation comment', u'with a second line'],
messages[0][3])
def test_translator_comments_with_previous_non_translator_comments(self):
msg = _(u'Foo Bar')
""")
messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
- self.assertEqual('Foo Bar', messages[0][2])
- self.assertEqual(['A translation comment', 'with a second line'],
+ self.assertEqual(u'Foo Bar', messages[0][2])
+ self.assertEqual([u'A translation comment', u'with a second line'],
messages[0][3])
def test_comment_tags_not_on_start_of_comment(self):
msg = _(u'Foo Bar')
""")
messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
- self.assertEqual('Foo Bar', messages[0][2])
- self.assertEqual(['This one will be'], messages[0][3])
+ self.assertEqual(u'Foo Bar', messages[0][2])
+ self.assertEqual([u'This one will be'], messages[0][3])
def test_multiple_comment_tags(self):
buf = StringIO("""
""")
messages = list(extract.extract_python(buf, ('_',),
['NOTE1:', 'NOTE2:'], {}))
- self.assertEqual('Foo Bar1', messages[0][2])
- self.assertEqual(['A translation comment for tag1',
- 'with a second line'], messages[0][3])
- self.assertEqual('Foo Bar2', messages[1][2])
- self.assertEqual(['A translation comment for tag2'], messages[1][3])
+ self.assertEqual(u'Foo Bar1', messages[0][2])
+ self.assertEqual([u'A translation comment for tag1',
+ u'with a second line'], messages[0][3])
+ self.assertEqual(u'Foo Bar2', messages[1][2])
+ self.assertEqual([u'A translation comment for tag2'], messages[1][3])
def test_two_succeeding_comments(self):
buf = StringIO("""
msg = _(u'Foo Bar')
""")
messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
- self.assertEqual('Foo Bar', messages[0][2])
- self.assertEqual(['one', 'NOTE: two'], messages[0][3])
+ self.assertEqual(u'Foo Bar', messages[0][2])
+ self.assertEqual([u'one', u'NOTE: two'], messages[0][3])
def test_invalid_translator_comments(self):
buf = StringIO("""
msg = _(u'Foo Bar')
""")
messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
- self.assertEqual('Foo Bar', messages[0][2])
+ self.assertEqual(u'Foo Bar', messages[0][2])
self.assertEqual([], messages[0][3])
def test_invalid_translator_comments2(self):
hello = _('Hello')
""")
messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
- self.assertEqual('Hi there!', messages[0][2])
- self.assertEqual(['Hi!'], messages[0][3])
- self.assertEqual('Hello', messages[1][2])
+ self.assertEqual(u'Hi there!', messages[0][2])
+ self.assertEqual([u'Hi!'], messages[0][3])
+ self.assertEqual(u'Hello', messages[1][2])
self.assertEqual([], messages[1][3])
def test_invalid_translator_comments3(self):
hithere = _('Hi there!')
""")
messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
- self.assertEqual('Hi there!', messages[0][2])
+ self.assertEqual(u'Hi there!', messages[0][2])
self.assertEqual([], messages[0][3])
+ def test_utf8_message(self):
+ buf = StringIO("""
+# NOTE: hello
+msg = _('Bonjour à tous')
+""")
+ messages = list(extract.extract_python(buf, ('_',), ['NOTE:'],
+ {'encoding': 'utf-8'}))
+ self.assertEqual(u'Bonjour à tous', messages[0][2])
+ self.assertEqual([u'hello'], messages[0][3])
+
+ def test_utf8_message_with_magic_comment(self):
+ buf = StringIO("""# -*- coding: utf-8 -*-
+# NOTE: hello
+msg = _('Bonjour à tous')
+""")
+ messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
+ self.assertEqual(u'Bonjour à tous', messages[0][2])
+ self.assertEqual([u'hello'], messages[0][3])
+
+ def test_utf8_message_with_utf8_bom(self):
+ buf = StringIO(codecs.BOM_UTF8 + """
+# NOTE: hello
+msg = _('Bonjour à tous')
+""")
+ messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
+ self.assertEqual(u'Bonjour à tous', messages[0][2])
+ self.assertEqual([u'hello'], messages[0][3])
+
+ def test_utf8_raw_strings_match_unicode_strings(self):
+ buf = StringIO(codecs.BOM_UTF8 + """
+msg = _('Bonjour à tous')
+msgu = _(u'Bonjour à tous')
+""")
+ messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
+ self.assertEqual(u'Bonjour à tous', messages[0][2])
+ self.assertEqual(messages[0][2], messages[1][2])
+
def suite():
suite = unittest.TestSuite()
suite.addTest(doctest.DocTestSuite(extract))