Made the Python extractor detect source file encodings from the magic encoding comment

author Philip Jenvey <pjenvey@underboss.org>

Fri, 22 Jun 2007 00:38:54 +0000 (00:38 +0000)

committer Philip Jenvey <pjenvey@underboss.org>

Fri, 22 Jun 2007 00:38:54 +0000 (00:38 +0000)
author Philip Jenvey <pjenvey@underboss.org>
Fri, 22 Jun 2007 00:38:54 +0000 (00:38 +0000)
committer Philip Jenvey <pjenvey@underboss.org>
Fri, 22 Jun 2007 00:38:54 +0000 (00:38 +0000)
diff --git a/babel/messages/extract.py b/babel/messages/extract.py

index 69f4941e55c3723bf652cc62cccadd745730bd1a..1c95c20570e83c66934a436e343d4c2c05176201 100644 (file)
--- a/babel/messages/extract.py
+++ b/babel/messages/extract.py
@@ -29,7 +29,7 @@ except NameError:
  import sys
  from tokenize import generate_tokens, COMMENT, NAME, OP, STRING
  
-from babel.util import pathmatch, relpath
+from babel.util import parse_encoding, pathmatch, relpath
  
  __all__ = ['extract', 'extract_from_dir', 'extract_from_file']
  __docformat__ = 'restructuredtext en'
@@ -195,7 +195,7 @@ def extract(method, fileobj, keywords=DEFAULT_KEYWORDS, comment_tags=(),
      >>> from StringIO import StringIO
      >>> for message in extract('python', StringIO(source)):
      ...     print message
-    (3, 'Hello, world!', [])
+    (3, u'Hello, world!', [])
      
      :param method: a string specifying the extraction method (.e.g. "python")
      :param fileobj: the file-like object the messages should be extracted from
@@ -238,7 +238,8 @@ def extract_nothing(fileobj, keywords, comment_tags, options):
  def extract_python(fileobj, keywords, comment_tags, options):
      """Extract messages from Python source code.
      
-    :param fileobj: the file-like object the messages should be extracted from
+    :param fileobj: the seekable, file-like object the messages should be
+                    extracted from
      :param keywords: a list of keywords (i.e. function names) that should be
                       recognized as translation functions
      :param comment_tags: a list of translator tags to search for and include
@@ -255,13 +256,15 @@ def extract_python(fileobj, keywords, comment_tags, options):
      in_args = False
      in_translator_comments = False
  
+    encoding = parse_encoding(fileobj) or options.get('encoding', 'ascii')
+
      tokens = generate_tokens(fileobj.readline)
      for tok, value, (lineno, _), _, _ in tokens:
          if funcname and tok == OP and value == '(':
              in_args = True
          elif tok == COMMENT:
              # Strip the comment token from the line
-            value = value[1:].strip()
+            value = value.decode(encoding)[1:].strip()
              if in_translator_comments and \
                      translator_comments[-1][0] == lineno - 1:
                  # We're already inside a translator comment, continue appending
@@ -300,8 +303,14 @@ def extract_python(fileobj, keywords, comment_tags, options):
                  messages = []
                  translator_comments = []
              elif tok == STRING:
-                # Unwrap quotes in a safe manner
-                buf.append(eval(value, {'__builtins__':{}}, {}))
+                # Unwrap quotes in a safe manner, maintaining the string's
+                # encoding
+                # https://sourceforge.net/tracker/?func=detail&atid=355470&aid=617979&group_id=5470
+                value = eval('# coding=%s\n%s' % (encoding, value),
+                             {'__builtins__':{}}, {})
+                if isinstance(value, str):
+                    value = value.decode(encoding)
+                buf.append(value)
              elif tok == OP and value == ',':
                  messages.append(''.join(buf))
                  del buf[:]
diff --git a/babel/messages/tests/extract.py b/babel/messages/tests/extract.py

index ce6d70c7b727d4a19c4bf51e0b878d8c5b5b68cf..e4abb20ed4001dec5f611d2c7fc22fb3028e62ae 100644 (file)
--- a/babel/messages/tests/extract.py
+++ b/babel/messages/tests/extract.py
@@ -11,6 +11,7 @@
  # individuals. For the exact contribution history, see the revision
  # history and logs, available at http://babel.edgewall.org/log/.
  
+import codecs
  import doctest
  from StringIO import StringIO
  import unittest
@@ -23,7 +24,7 @@ class ExtractPythonTestCase(unittest.TestCase):
      def test_unicode_string_arg(self):
          buf = StringIO("msg = _(u'Foo Bar')")
          messages = list(extract.extract_python(buf, ('_',), [], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
+        self.assertEqual(u'Foo Bar', messages[0][2])
  
      def test_comment_tag(self):
          buf = StringIO("""
@@ -31,8 +32,8 @@ class ExtractPythonTestCase(unittest.TestCase):
  msg = _(u'Foo Bar')
  """)
          messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
-        self.assertEqual(['A translation comment'], messages[0][3])
+        self.assertEqual(u'Foo Bar', messages[0][2])
+        self.assertEqual([u'A translation comment'], messages[0][3])
  
      def test_comment_tag_multiline(self):
          buf = StringIO("""
@@ -41,8 +42,8 @@ msg = _(u'Foo Bar')
  msg = _(u'Foo Bar')
  """)
          messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
-        self.assertEqual(['A translation comment', 'with a second line'],
+        self.assertEqual(u'Foo Bar', messages[0][2])
+        self.assertEqual([u'A translation comment', u'with a second line'],
                           messages[0][3])
          
      def test_translator_comments_with_previous_non_translator_comments(self):
@@ -54,8 +55,8 @@ msg = _(u'Foo Bar')
  msg = _(u'Foo Bar')
  """)
          messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
-        self.assertEqual(['A translation comment', 'with a second line'],
+        self.assertEqual(u'Foo Bar', messages[0][2])
+        self.assertEqual([u'A translation comment', u'with a second line'],
                           messages[0][3])
  
      def test_comment_tags_not_on_start_of_comment(self):
@@ -67,8 +68,8 @@ msg = _(u'Foo Bar')
  msg = _(u'Foo Bar')
  """)
          messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
-        self.assertEqual(['This one will be'], messages[0][3])
+        self.assertEqual(u'Foo Bar', messages[0][2])
+        self.assertEqual([u'This one will be'], messages[0][3])
  
      def test_multiple_comment_tags(self):
          buf = StringIO("""
@@ -81,11 +82,11 @@ msg = _(u'Foo Bar2')
  """)
          messages = list(extract.extract_python(buf, ('_',),
                                                 ['NOTE1:', 'NOTE2:'], {}))
-        self.assertEqual('Foo Bar1', messages[0][2])
-        self.assertEqual(['A translation comment for tag1',
-                          'with a second line'], messages[0][3])
-        self.assertEqual('Foo Bar2', messages[1][2])
-        self.assertEqual(['A translation comment for tag2'], messages[1][3])
+        self.assertEqual(u'Foo Bar1', messages[0][2])
+        self.assertEqual([u'A translation comment for tag1',
+                          u'with a second line'], messages[0][3])
+        self.assertEqual(u'Foo Bar2', messages[1][2])
+        self.assertEqual([u'A translation comment for tag2'], messages[1][3])
  
      def test_two_succeeding_comments(self):
          buf = StringIO("""
@@ -94,8 +95,8 @@ msg = _(u'Foo Bar2')
  msg = _(u'Foo Bar')
  """)
          messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
-        self.assertEqual(['one', 'NOTE: two'], messages[0][3])
+        self.assertEqual(u'Foo Bar', messages[0][2])
+        self.assertEqual([u'one', u'NOTE: two'], messages[0][3])
          
      def test_invalid_translator_comments(self):
          buf = StringIO("""
@@ -105,7 +106,7 @@ hello = 'there'
  msg = _(u'Foo Bar')
  """)
          messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
+        self.assertEqual(u'Foo Bar', messages[0][2])
          self.assertEqual([], messages[0][3])
  
      def test_invalid_translator_comments2(self):
@@ -120,9 +121,9 @@ rows = [[v for v in range(0,10)] for row in range(0,10)]
  hello = _('Hello')
  """)
          messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Hi there!', messages[0][2])
-        self.assertEqual(['Hi!'], messages[0][3])
-        self.assertEqual('Hello', messages[1][2])
+        self.assertEqual(u'Hi there!', messages[0][2])
+        self.assertEqual([u'Hi!'], messages[0][3])
+        self.assertEqual(u'Hello', messages[1][2])
          self.assertEqual([], messages[1][3])
  
      def test_invalid_translator_comments3(self):
@@ -133,9 +134,46 @@ hello = _('Hello')
  hithere = _('Hi there!')
  """)
          messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Hi there!', messages[0][2])
+        self.assertEqual(u'Hi there!', messages[0][2])
          self.assertEqual([], messages[0][3])
  
+    def test_utf8_message(self):
+        buf = StringIO("""
+# NOTE: hello
+msg = _('Bonjour à tous')
+""")
+        messages = list(extract.extract_python(buf, ('_',), ['NOTE:'],
+                                               {'encoding': 'utf-8'}))
+        self.assertEqual(u'Bonjour à tous', messages[0][2])
+        self.assertEqual([u'hello'], messages[0][3])
+
+    def test_utf8_message_with_magic_comment(self):
+        buf = StringIO("""# -*- coding: utf-8 -*-
+# NOTE: hello
+msg = _('Bonjour à tous')
+""")
+        messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
+        self.assertEqual(u'Bonjour à tous', messages[0][2])
+        self.assertEqual([u'hello'], messages[0][3])
+
+    def test_utf8_message_with_utf8_bom(self):
+        buf = StringIO(codecs.BOM_UTF8 + """
+# NOTE: hello
+msg = _('Bonjour à tous')
+""")
+        messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
+        self.assertEqual(u'Bonjour à tous', messages[0][2])
+        self.assertEqual([u'hello'], messages[0][3])
+
+    def test_utf8_raw_strings_match_unicode_strings(self):
+        buf = StringIO(codecs.BOM_UTF8 + """
+msg = _('Bonjour à tous')
+msgu = _(u'Bonjour à tous')
+""")
+        messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
+        self.assertEqual(u'Bonjour à tous', messages[0][2])
+        self.assertEqual(messages[0][2], messages[1][2])
+
  def suite():
      suite = unittest.TestSuite()
      suite.addTest(doctest.DocTestSuite(extract))
diff --git a/babel/util.py b/babel/util.py

index 55bdddbc428eb404e882868e3fe5a26aa084e1fa..3580bd01f69f524e9187413d94f4e6bb84f6b939 100644 (file)
--- a/babel/util.py
+++ b/babel/util.py
@@ -13,14 +13,65 @@
  
  """Various utility classes and functions."""
  
+import codecs
  from datetime import timedelta, tzinfo
  import os
+import parser
  import re
  import time
  
  __all__ = ['pathmatch', 'relpath', 'UTC', 'LOCALTZ']
  __docformat__ = 'restructuredtext en'
  
+# Regexp to match python magic encoding line
+PYTHON_MAGIC_COMMENT_re = re.compile(
+    r'[ \t\f]* \# .* coding[=:][ \t]*([-\w.]+)', re.VERBOSE)
+def parse_encoding(fp):
+    """Deduce the encoding of a source file from magic comment.
+
+    It does this in the same way as the `Python interpreter`__
+
+    .. __: http://docs.python.org/ref/encodings.html
+
+    The ``fp`` argument should be a seekable file object.
+
+    (From Jeff Dairiki)
+    """
+    pos = fp.tell()
+    fp.seek(0)
+    try:
+        line1 = fp.readline()
+        has_bom = line1.startswith(codecs.BOM_UTF8)
+        if has_bom:
+            line1 = line1[len(codecs.BOM_UTF8):]
+
+        m = PYTHON_MAGIC_COMMENT_re.match(line1)
+        if not m:
+            try:
+                parser.suite(line1)
+            except SyntaxError:
+                # Either it's a real syntax error, in which case the source is
+                # not valid python source, or line2 is a continuation of line1,
+                # in which case we don't want to scan line2 for a magic
+                # comment.
+                pass
+            else:
+                line2 = fp.readline()
+                m = PYTHON_MAGIC_COMMENT_re.match(line2)
+
+        if has_bom:
+            if m:
+                raise SyntaxError(
+                    "python refuses to compile code with both a UTF8 "
+                    "byte-order-mark and a magic encoding comment")
+            return 'utf_8'
+        elif m:
+            return m.group(1)
+        else:
+            return None
+    finally:
+        fp.seek(pos)
+
  def pathmatch(pattern, filename):
      """Extended pathname pattern matching.
author	Philip Jenvey <pjenvey@underboss.org>
	Fri, 22 Jun 2007 00:38:54 +0000 (00:38 +0000)
committer	Philip Jenvey <pjenvey@underboss.org>
	Fri, 22 Jun 2007 00:38:54 +0000 (00:38 +0000)
babel/messages/extract.py		patch \| blob \| blame \| history
babel/messages/tests/extract.py		patch \| blob \| blame \| history
babel/util.py		patch \| blob \| blame \| history