from __future__ import absolute_import, division, print_function, with_statement
+import codecs
import csv
import datetime
+from io import BytesIO
import numbers
import os
import re
_supported_locales = frozenset(list(_translations.keys()) + [_default_locale])
-def load_translations(directory):
+def load_translations(directory, encoding=None):
"""Loads translations from CSV files in a directory.
Translations are strings with optional Python-style named placeholders
The file is read using the `csv` module in the default "excel" dialect.
In this format there should not be spaces after the commas.
+ If no ``encoding`` parameter is given, the encoding will be
+ detected automatically (among UTF-8 and UTF-16) if the file
+ contains a byte-order marker (BOM), defaulting to UTF-8 if no BOM
+ is present.
+
Example translation ``es_LA.csv``::
"I love you","Te amo"
"%(name)s liked this","A %(name)s les gustó esto","plural"
"%(name)s liked this","A %(name)s le gustó esto","singular"
+ .. versionchanged:: 4.3
+ Added ``encoding`` parameter. Added support for BOM-based encoding
+ detection, UTF-16, and UTF-8-with-BOM.
"""
global _translations
global _supported_locales
os.path.join(directory, path))
continue
full_path = os.path.join(directory, path)
+ if encoding is None:
+ # Try to autodetect encoding based on the BOM.
+ # Both UTF-16 BOMs are 2 bytes, so reading len(BOM_UTF16_LE)
+ # bytes is enough to distinguish LE/BE; a UTF-8 BOM (EF BB BF)
+ # does not match either and falls through to utf-8-sig below.
+ with open(full_path, 'rb') as f:
+ data = f.read(len(codecs.BOM_UTF16_LE))
+ if data in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
+ encoding = 'utf-16'
+ else:
+ # utf-8-sig is "utf-8 with optional BOM". It's discouraged
+ # in most cases but is common with CSV files because Excel
+ # cannot read utf-8 files without a BOM.
+ encoding = 'utf-8-sig'
+ # NOTE(review): this assignment rebinds the ``encoding`` parameter, so
+ # after the first file is autodetected, later files in the directory
+ # appear to reuse that encoding instead of being re-detected — confirm
+ # against the (not visible here) enclosing per-file loop.
try:
# python 3: csv.reader requires a file open in text mode.
# Force utf8 to avoid dependence on $LANG environment variable.
- f = open(full_path, "r", encoding="utf-8")
+ f = open(full_path, "r", encoding=encoding)
except TypeError:
- # python 2: files return byte strings, which are decoded below.
- f = open(full_path, "r")
+ # python 2: csv can only handle byte strings (in ascii-compatible
+ # encodings), which we decode below. Transcode everything into
+ # utf8 before passing it to csv.reader.
+ f = BytesIO()
+ with codecs.open(full_path, "r", encoding=encoding) as infile:
+ f.write(escape.utf8(infile.read()))
+ f.seek(0)
_translations[locale] = {}
for i, row in enumerate(csv.reader(f)):
if not row or len(row) < 2:
import datetime
import os
+import shutil
+import tempfile
+
import tornado.locale
-from tornado.escape import utf8
+from tornado.escape import utf8, to_unicode
from tornado.test.util import unittest
from tornado.util import u, unicode_type
self.assertTrue(isinstance(locale, tornado.locale.CSVLocale))
self.assertEqual(locale.translate("school"), u("\u00e9cole"))
+ def test_csv_bom(self):
+ # Verify that load_translations autodetects BOM-carrying encodings.
+ with open(os.path.join(os.path.dirname(__file__), 'csv_translations',
+ 'fr_FR.csv'), 'rb') as f:
+ char_data = to_unicode(f.read())
+ # Re-encode our input data (which is utf-8 without BOM) in
+ # encodings that use the BOM and ensure that we can still load
+ # it. Note that utf-16-le and utf-16-be do not write a BOM,
+ # so we only test whichever variant is native to our platform.
+ for encoding in ['utf-8-sig', 'utf-16']:
+ tmpdir = tempfile.mkdtemp()
+ try:
+ with open(os.path.join(tmpdir, 'fr_FR.csv'), 'wb') as f:
+ f.write(char_data.encode(encoding))
+ tornado.locale.load_translations(tmpdir)
+ locale = tornado.locale.get('fr_FR')
+ self.assertIsInstance(locale, tornado.locale.CSVLocale)
+ self.assertEqual(locale.translate("school"), u("\u00e9cole"))
+ finally:
+ # Clean up the temp directory even if an assertion fails.
+ shutil.rmtree(tmpdir)
+
def test_gettext(self):
tornado.locale.load_gettext_translations(
os.path.join(os.path.dirname(__file__), 'gettext_translations'),