From: Ben Darnell <ben@bendarnell.com>
Date: Wed, 27 Jun 2012 07:02:58 +0000 (-0700)
Subject: Fix (and test and document) logging on py2 with non-ascii bytestrings.
X-Git-Tag: v2.4.0~49
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=83b38823ee48a6e5f3cafe604cb75ffd39bf45cf;p=thirdparty%2Ftornado.git

Fix (and test and document) logging on py2 with non-ascii bytestrings.

The immediate motivation was an error in request logging when the
request contains non-ascii characters, but a more general fix is
desirable since there are probably more cases where we log utf8 byte
strings, and in general a logging system that blows up on data that
has a perfectly useful representation is not ideal.

Closes #544.
---

diff --git a/tornado/options.py b/tornado/options.py
index 537891e0f..a3d74d68d 100644
--- a/tornado/options.py
+++ b/tornado/options.py
@@ -421,6 +421,7 @@ class _LogFormatter(logging.Formatter):
             record.message = record.getMessage()
         except Exception, e:
             record.message = "Bad message (%r): %r" % (e, record.__dict__)
+        assert isinstance(record.message, basestring)  # guaranteed by logging
         record.asctime = time.strftime(
             "%y%m%d %H:%M:%S", self.converter(record.created))
         prefix = '[%(levelname)1.1s %(asctime)s %(module)s:%(lineno)d]' % \
@@ -428,7 +429,29 @@ class _LogFormatter(logging.Formatter):
         if self._color:
             prefix = (self._colors.get(record.levelno, self._normal) +
                       prefix + self._normal)
-        formatted = prefix + " " + record.message
+
+        # Encoding notes:  The logging module prefers to work with character
+        # strings, but only enforces that log messages are instances of
+        # basestring.  In python 2, non-ascii bytestrings will make
+        # their way through the logging framework until they blow up with
+        # an unhelpful decoding error (with this formatter it happens
+        # when we attach the prefix, but there are other opportunities for
+        # exceptions further along in the framework).
+        #
+        # If a byte string makes it this far, convert it to unicode to
+        # ensure it will make it out to the logs.  Use repr() as a fallback
+        # to ensure that all byte strings can be converted successfully,
+        # but don't do it by default so we don't add extra quotes to ascii
+        # bytestrings.  This is a bit of a hacky place to do this, but
+        # it's worth it since the encoding errors that would otherwise
+        # result are so useless (and tornado is fond of using utf8-encoded
+        # byte strings whereever possible).
+        try:
+            message = _unicode(record.message)
+        except UnicodeDecodeError:
+            message = repr(record.message)
+
+        formatted = prefix + " " + message
         if record.exc_info:
             if not record.exc_text:
                 record.exc_text = self.formatException(record.exc_info)
diff --git a/tornado/test/options_test.py b/tornado/test/options_test.py
index 80b192920..e5411d945 100644
--- a/tornado/test/options_test.py
+++ b/tornado/test/options_test.py
@@ -1,7 +1,13 @@
 from __future__ import absolute_import, division, with_statement
+import logging
+import os
+import re
+import tempfile
 import unittest
 
-from tornado.options import _Options
+from tornado.escape import utf8
+from tornado.options import _Options, _LogFormatter
+from tornado.util import b, bytes_type
 
 
 class OptionsTest(unittest.TestCase):
@@ -17,3 +23,80 @@ class OptionsTest(unittest.TestCase):
     def test_parse_command_line(self):
         self.options.parse_command_line(["main.py", "--port=443"])
         self.assertEqual(self.options.port, 443)
+
+
+class LogFormatterTest(unittest.TestCase):
+    LINE_RE = re.compile(b("\x01\\[E [0-9]{6} [0-9]{2}:[0-9]{2}:[0-9]{2} options_test:[0-9]+\\]\x02 (.*)"))
+
+    def setUp(self):
+        self.formatter = _LogFormatter(color=False)
+        # Fake color support.  We can't guarantee anything about the $TERM
+        # variable when the tests are run, so just patch in some values
+        # for testing.  (testing with color off fails to expose some potential
+        # encoding issues from the control characters)
+        self.formatter._colors = {
+            logging.ERROR: u"\u0001",
+            }
+        self.formatter._normal = u"\u0002"
+        self.formatter._color = True
+        # construct a Logger directly to bypass getLogger's caching
+        self.logger = logging.Logger('LogFormatterTest')
+        self.logger.propagate = False
+        self.tempdir = tempfile.mkdtemp()
+        self.filename = os.path.join(self.tempdir, 'log.out')
+        self.handler = self.make_handler(self.filename)
+        self.handler.setFormatter(self.formatter)
+        self.logger.addHandler(self.handler)
+
+    def tearDown(self):
+        os.unlink(self.filename)
+        os.rmdir(self.tempdir)
+
+    def make_handler(self, filename):
+        # Base case: default setup without explicit encoding.
+        # In python 2, supports arbitrary byte strings and unicode objects
+        # that contain only ascii.  In python 3, supports ascii-only unicode
+        # strings (but byte strings will be repr'd automatically.
+        return logging.FileHandler(filename)
+
+    def get_output(self):
+        with open(self.filename, "rb") as f:
+            line = f.read().strip()
+            m = LogFormatterTest.LINE_RE.match(line)
+            if m:
+                return m.group(1)
+            else:
+                raise Exception("output didn't match regex: %r" % line)
+
+    def test_basic_logging(self):
+        self.logger.error("foo")
+        self.assertEqual(self.get_output(), b("foo"))
+
+    def test_bytes_logging(self):
+        self.logger.error(b("\xe9"))
+        # This will be "\xe9" on python 2 or "b'\xe9'" on python 3
+        self.assertEqual(self.get_output(), utf8(repr(b("\xe9"))))
+
+    def test_utf8_logging(self):
+        self.logger.error(u"\u00e9".encode("utf8"))
+        if issubclass(bytes_type, basestring):
+            # on python 2, utf8 byte strings (and by extension ascii byte
+            # strings) are passed through as-is.
+            self.assertEqual(self.get_output(), utf8(u"\u00e9"))
+        else:
+            # on python 3, byte strings always get repr'd even if
+            # they're ascii-only, so this degenerates into another
+            # copy of test_bytes_logging.
+            self.assertEqual(self.get_output(), utf8(repr(utf8(u"\u00e9"))))
+
+
+class UnicodeLogFormatterTest(LogFormatterTest):
+    def make_handler(self, filename):
+        # Adding an explicit encoding configuration allows non-ascii unicode
+        # strings in both python 2 and 3, without changing the behavior
+        # for byte strings.
+        return logging.FileHandler(filename, encoding="utf8")
+
+    def test_unicode_logging(self):
+        self.logger.error(u"\u00e9")
+        self.assertEqual(self.get_output(), utf8(u"\u00e9"))