From 85484f66da15edc9436f8bd5c9d109ef88733138 Mon Sep 17 00:00:00 2001
From: Ben Darnell <ben@bendarnell.com>
Date: Sat, 4 Jul 2015 23:54:13 -0400
Subject: [PATCH] Add support for UTF-16 and UTF-8-with-BOM to CSV locale.

Auto-detect encoding based on BOM when possible.

Closes #1449
---
 tornado/locale.py           | 34 ++++++++++++++++++++++++++++++----
 tornado/test/locale_test.py | 25 ++++++++++++++++++++++++-
 2 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/tornado/locale.py b/tornado/locale.py
index a668765bb..e93fa1a9e 100644
--- a/tornado/locale.py
+++ b/tornado/locale.py
@@ -41,8 +41,10 @@ the `Locale.translate` method will simply return the original string.
 
 from __future__ import absolute_import, division, print_function, with_statement
 
+import codecs
 import csv
 import datetime
+from io import BytesIO
 import numbers
 import os
 import re
@@ -86,7 +88,7 @@ def set_default_locale(code):
     _supported_locales = frozenset(list(_translations.keys()) + [_default_locale])
 
 
-def load_translations(directory):
+def load_translations(directory, encoding=None):
     """Loads translations from CSV files in a directory.
 
     Translations are strings with optional Python-style named placeholders
@@ -106,12 +108,20 @@ def load_translations(directory):
     The file is read using the `csv` module in the default "excel" dialect.
     In this format there should not be spaces after the commas.
 
+    If no ``encoding`` parameter is given, the encoding will be
+    detected automatically (among UTF-8 and UTF-16) if the file
+    contains a byte-order marker (BOM), defaulting to UTF-8 if no BOM
+    is present.
+
     Example translation ``es_LA.csv``::
 
         "I love you","Te amo"
         "%(name)s liked this","A %(name)s les gustó esto","plural"
         "%(name)s liked this","A %(name)s le gustó esto","singular"
 
+    .. versionchanged:: 4.3
+       Added ``encoding`` parameter. Added support for BOM-based encoding
+       detection, UTF-16, and UTF-8-with-BOM.
     """
     global _translations
     global _supported_locales
@@ -125,13 +135,29 @@ def load_translations(directory):
                           os.path.join(directory, path))
             continue
         full_path = os.path.join(directory, path)
+        if encoding is None:
+            # Try to autodetect encoding based on the BOM.
+            with open(full_path, 'rb') as f:
+                data = f.read(len(codecs.BOM_UTF16_LE))
+            if data in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
+                encoding = 'utf-16'
+            else:
+                # utf-8-sig is "utf-8 with optional BOM". It's discouraged
+                # in most cases but is common with CSV files because Excel
+                # cannot read utf-8 files without a BOM.
+                encoding = 'utf-8-sig'
         try:
             # python 3: csv.reader requires a file open in text mode.
             # Force utf8 to avoid dependence on $LANG environment variable.
-            f = open(full_path, "r", encoding="utf-8")
+            f = open(full_path, "r", encoding=encoding)
         except TypeError:
-            # python 2: files return byte strings, which are decoded below.
-            f = open(full_path, "r")
+            # python 2: csv can only handle byte strings (in ascii-compatible
+            # encodings), which we decode below. Transcode everything into
+            # utf8 before passing it to csv.reader.
+            f = BytesIO()
+            with codecs.open(full_path, "r", encoding=encoding) as infile:
+                f.write(escape.utf8(infile.read()))
+            f.seek(0)
         _translations[locale] = {}
         for i, row in enumerate(csv.reader(f)):
             if not row or len(row) < 2:
diff --git a/tornado/test/locale_test.py b/tornado/test/locale_test.py
index 31c57a619..44726644d 100644
--- a/tornado/test/locale_test.py
+++ b/tornado/test/locale_test.py
@@ -2,8 +2,11 @@ from __future__ import absolute_import, division, print_function, with_statement
 
 import datetime
 import os
+import shutil
+import tempfile
+
 import tornado.locale
-from tornado.escape import utf8
+from tornado.escape import utf8, to_unicode
 from tornado.test.util import unittest
 from tornado.util import u, unicode_type
 
@@ -34,6 +37,26 @@ class TranslationLoaderTest(unittest.TestCase):
         self.assertTrue(isinstance(locale, tornado.locale.CSVLocale))
         self.assertEqual(locale.translate("school"), u("\u00e9cole"))
 
+    def test_csv_bom(self):
+        with open(os.path.join(os.path.dirname(__file__), 'csv_translations',
+                               'fr_FR.csv'), 'rb') as f:
+            char_data = to_unicode(f.read())
+        # Re-encode our input data (which is utf-8 without BOM) in
+        # encodings that use the BOM and ensure that we can still load
+        # it. Note that utf-16-le and utf-16-be do not write a BOM,
+        # so we only test whichever variant is native to our platform.
+        for encoding in ['utf-8-sig', 'utf-16']:
+            tmpdir = tempfile.mkdtemp()
+            try:
+                with open(os.path.join(tmpdir, 'fr_FR.csv'), 'wb') as f:
+                    f.write(char_data.encode(encoding))
+                tornado.locale.load_translations(tmpdir)
+                locale = tornado.locale.get('fr_FR')
+                self.assertIsInstance(locale, tornado.locale.CSVLocale)
+                self.assertEqual(locale.translate("school"), u("\u00e9cole"))
+            finally:
+                shutil.rmtree(tmpdir)
+
     def test_gettext(self):
         tornado.locale.load_gettext_translations(
             os.path.join(os.path.dirname(__file__), 'gettext_translations'),
-- 
2.47.2