Add support for UTF-16 and UTF-8-with-BOM to CSV locale.

author Ben Darnell <ben@bendarnell.com>

Sun, 5 Jul 2015 03:54:13 +0000 (23:54 -0400)

committer Ben Darnell <ben@bendarnell.com>

Sun, 5 Jul 2015 03:56:40 +0000 (23:56 -0400)
author Ben Darnell <ben@bendarnell.com>
Sun, 5 Jul 2015 03:54:13 +0000 (23:54 -0400)
committer Ben Darnell <ben@bendarnell.com>
Sun, 5 Jul 2015 03:56:40 +0000 (23:56 -0400)
diff --git a/tornado/locale.py b/tornado/locale.py

index a668765bbc4c6a57f8fd24120a63f03bad8f84d8..e93fa1a9e2a0ab755601aaaf450530f2ff92bf5b 100644 (file)
--- a/tornado/locale.py
+++ b/tornado/locale.py
@@ -41,8 +41,10 @@ the `Locale.translate` method will simply return the original string.
  
  from __future__ import absolute_import, division, print_function, with_statement
  
+import codecs
  import csv
  import datetime
+from io import BytesIO
  import numbers
  import os
  import re
@@ -86,7 +88,7 @@ def set_default_locale(code):
      _supported_locales = frozenset(list(_translations.keys()) + [_default_locale])
  
  
-def load_translations(directory):
+def load_translations(directory, encoding=None):
      """Loads translations from CSV files in a directory.
  
      Translations are strings with optional Python-style named placeholders
@@ -106,12 +108,20 @@ def load_translations(directory):
      The file is read using the `csv` module in the default "excel" dialect.
      In this format there should not be spaces after the commas.
  
+    If no ``encoding`` parameter is given, the encoding will be
+    detected automatically (among UTF-8 and UTF-16) if the file
+    contains a byte-order mark (BOM), defaulting to UTF-8 if no BOM
+    is present.
+
      Example translation ``es_LA.csv``::
  
          "I love you","Te amo"
          "%(name)s liked this","A %(name)s les gustó esto","plural"
          "%(name)s liked this","A %(name)s le gustó esto","singular"
  
+    .. versionchanged:: 4.3
+       Added ``encoding`` parameter. Added support for BOM-based encoding
+       detection, UTF-16, and UTF-8-with-BOM.
      """
      global _translations
      global _supported_locales
@@ -125,13 +135,29 @@ def load_translations(directory):
                            os.path.join(directory, path))
              continue
          full_path = os.path.join(directory, path)
+        if encoding is None:
+            # Try to autodetect encoding based on the BOM.
+            with open(full_path, 'rb') as f:
+                data = f.read(len(codecs.BOM_UTF16_LE))
+            if data in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
+                encoding = 'utf-16'
+            else:
+                # utf-8-sig is "utf-8 with optional BOM". It's discouraged
+                # in most cases but is common with CSV files because Excel
+                # cannot read utf-8 files without a BOM.
+                encoding = 'utf-8-sig'
          try:
              # python 3: csv.reader requires a file open in text mode.
              # Force utf8 to avoid dependence on $LANG environment variable.
-            f = open(full_path, "r", encoding="utf-8")
+            f = open(full_path, "r", encoding=encoding)
          except TypeError:
-            # python 2: files return byte strings, which are decoded below.
-            f = open(full_path, "r")
+            # python 2: csv can only handle byte strings (in ascii-compatible
+            # encodings), which we decode below. Transcode everything into
+            # utf8 before passing it to csv.reader.
+            f = BytesIO()
+            with codecs.open(full_path, "r", encoding=encoding) as infile:
+                f.write(escape.utf8(infile.read()))
+            f.seek(0)
          _translations[locale] = {}
          for i, row in enumerate(csv.reader(f)):
              if not row or len(row) < 2:
diff --git a/tornado/test/locale_test.py b/tornado/test/locale_test.py

index 31c57a6194c9632931b6435efc9267d149918b3c..44726644d2b1e6785dd3a71e09cacd1975e9a5f6 100644 (file)
--- a/tornado/test/locale_test.py
+++ b/tornado/test/locale_test.py
@@ -2,8 +2,11 @@ from __future__ import absolute_import, division, print_function, with_statement
  
  import datetime
  import os
+import shutil
+import tempfile
+
  import tornado.locale
-from tornado.escape import utf8
+from tornado.escape import utf8, to_unicode
  from tornado.test.util import unittest
  from tornado.util import u, unicode_type
  
@@ -34,6 +37,26 @@ class TranslationLoaderTest(unittest.TestCase):
          self.assertTrue(isinstance(locale, tornado.locale.CSVLocale))
          self.assertEqual(locale.translate("school"), u("\u00e9cole"))
  
+    def test_csv_bom(self):
+        with open(os.path.join(os.path.dirname(__file__), 'csv_translations',
+                               'fr_FR.csv'), 'rb') as f:
+            char_data = to_unicode(f.read())
+        # Re-encode our input data (which is utf-8 without BOM) in
+        # encodings that use the BOM and ensure that we can still load
+        # it. Note that utf-16-le and utf-16-be do not write a BOM,
+        # so we only test whichever variant is native to our platform.
+        for encoding in ['utf-8-sig', 'utf-16']:
+            tmpdir = tempfile.mkdtemp()
+            try:
+                with open(os.path.join(tmpdir, 'fr_FR.csv'), 'wb') as f:
+                    f.write(char_data.encode(encoding))
+                tornado.locale.load_translations(tmpdir)
+                locale = tornado.locale.get('fr_FR')
+                self.assertIsInstance(locale, tornado.locale.CSVLocale)
+                self.assertEqual(locale.translate("school"), u("\u00e9cole"))
+            finally:
+                shutil.rmtree(tmpdir)
+
      def test_gettext(self):
          tornado.locale.load_gettext_translations(
              os.path.join(os.path.dirname(__file__), 'gettext_translations'),
author	Ben Darnell <ben@bendarnell.com>
	Sun, 5 Jul 2015 03:54:13 +0000 (23:54 -0400)
committer	Ben Darnell <ben@bendarnell.com>
	Sun, 5 Jul 2015 03:56:40 +0000 (23:56 -0400)
tornado/locale.py		patch \| blob \| blame \| history
tornado/test/locale_test.py		patch \| blob \| blame \| history