From 85484f66da15edc9436f8bd5c9d109ef88733138 Mon Sep 17 00:00:00 2001 From: Ben Darnell Date: Sat, 4 Jul 2015 23:54:13 -0400 Subject: [PATCH] Add support for UTF-16 and UTF-8-with-BOM to CSV locale. Auto-detect encoding based on BOM when possible. Closes #1449 --- tornado/locale.py | 34 ++++++++++++++++++++++++++++++---- tornado/test/locale_test.py | 25 ++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/tornado/locale.py b/tornado/locale.py index a668765bb..e93fa1a9e 100644 --- a/tornado/locale.py +++ b/tornado/locale.py @@ -41,8 +41,10 @@ the `Locale.translate` method will simply return the original string. from __future__ import absolute_import, division, print_function, with_statement +import codecs import csv import datetime +from io import BytesIO import numbers import os import re @@ -86,7 +88,7 @@ def set_default_locale(code): _supported_locales = frozenset(list(_translations.keys()) + [_default_locale]) -def load_translations(directory): +def load_translations(directory, encoding=None): """Loads translations from CSV files in a directory. Translations are strings with optional Python-style named placeholders @@ -106,12 +108,20 @@ def load_translations(directory): The file is read using the `csv` module in the default "excel" dialect. In this format there should not be spaces after the commas. + If no ``encoding`` parameter is given, the encoding will be + detected automatically (among UTF-8 and UTF-16) if the file + contains a byte-order marker (BOM), defaulting to UTF-8 if no BOM + is present. + Example translation ``es_LA.csv``:: "I love you","Te amo" "%(name)s liked this","A %(name)s les gustó esto","plural" "%(name)s liked this","A %(name)s le gustó esto","singular" + .. versionchanged:: 4.3 + Added ``encoding`` parameter. Added support for BOM-based encoding + detection, UTF-16, and UTF-8-with-BOM. 
""" global _translations global _supported_locales @@ -125,13 +135,29 @@ def load_translations(directory): os.path.join(directory, path)) continue full_path = os.path.join(directory, path) + if encoding is None: + # Try to autodetect encoding based on the BOM. + with open(full_path, 'rb') as f: + data = f.read(len(codecs.BOM_UTF16_LE)) + if data in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE): + encoding = 'utf-16' + else: + # utf-8-sig is "utf-8 with optional BOM". It's discouraged + # in most cases but is common with CSV files because Excel + # cannot read utf-8 files without a BOM. + encoding = 'utf-8-sig' try: # python 3: csv.reader requires a file open in text mode. # Force utf8 to avoid dependence on $LANG environment variable. - f = open(full_path, "r", encoding="utf-8") + f = open(full_path, "r", encoding=encoding) except TypeError: - # python 2: files return byte strings, which are decoded below. - f = open(full_path, "r") + # python 2: csv can only handle byte strings (in ascii-compatible + # encodings), which we decode below. Transcode everything into + # utf8 before passing it to csv.reader. 
+ f = BytesIO() + with codecs.open(full_path, "r", encoding=encoding) as infile: + f.write(escape.utf8(infile.read())) + f.seek(0) _translations[locale] = {} for i, row in enumerate(csv.reader(f)): if not row or len(row) < 2: diff --git a/tornado/test/locale_test.py b/tornado/test/locale_test.py index 31c57a619..44726644d 100644 --- a/tornado/test/locale_test.py +++ b/tornado/test/locale_test.py @@ -2,8 +2,11 @@ from __future__ import absolute_import, division, print_function, with_statement import datetime import os +import shutil +import tempfile + import tornado.locale -from tornado.escape import utf8 +from tornado.escape import utf8, to_unicode from tornado.test.util import unittest from tornado.util import u, unicode_type @@ -34,6 +37,26 @@ class TranslationLoaderTest(unittest.TestCase): self.assertTrue(isinstance(locale, tornado.locale.CSVLocale)) self.assertEqual(locale.translate("school"), u("\u00e9cole")) + def test_csv_bom(self): + with open(os.path.join(os.path.dirname(__file__), 'csv_translations', + 'fr_FR.csv'), 'rb') as f: + char_data = to_unicode(f.read()) + # Re-encode our input data (which is utf-8 without BOM) in + # encodings that use the BOM and ensure that we can still load + # it. Note that utf-16-le and utf-16-be do not write a BOM, + # so we only test whichever variant is native to our platform. + for encoding in ['utf-8-sig', 'utf-16']: + tmpdir = tempfile.mkdtemp() + try: + with open(os.path.join(tmpdir, 'fr_FR.csv'), 'wb') as f: + f.write(char_data.encode(encoding)) + tornado.locale.load_translations(tmpdir) + locale = tornado.locale.get('fr_FR') + self.assertIsInstance(locale, tornado.locale.CSVLocale) + self.assertEqual(locale.translate("school"), u("\u00e9cole")) + finally: + shutil.rmtree(tmpdir) + def test_gettext(self): tornado.locale.load_gettext_translations( os.path.join(os.path.dirname(__file__), 'gettext_translations'), -- 2.47.2