From: Ben Darnell Date: Wed, 27 Jun 2012 07:02:58 +0000 (-0700) Subject: Fix (and test and document) logging on py2 with non-ascii bytestrings. X-Git-Tag: v2.4.0~49 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=83b38823ee48a6e5f3cafe604cb75ffd39bf45cf;p=thirdparty%2Ftornado.git Fix (and test and document) logging on py2 with non-ascii bytestrings. The immediate motivation was an error in request logging when the request contains non-ascii characters, but a more general fix is desirable since there are probably more cases where we log utf8 byte strings, and in general a logging system that blows up on data that has a perfectly useful representation is not ideal. Closes #544. --- diff --git a/tornado/options.py b/tornado/options.py index 537891e0f..a3d74d68d 100644 --- a/tornado/options.py +++ b/tornado/options.py @@ -421,6 +421,7 @@ class _LogFormatter(logging.Formatter): record.message = record.getMessage() except Exception, e: record.message = "Bad message (%r): %r" % (e, record.__dict__) + assert isinstance(record.message, basestring) # guaranteed by logging record.asctime = time.strftime( "%y%m%d %H:%M:%S", self.converter(record.created)) prefix = '[%(levelname)1.1s %(asctime)s %(module)s:%(lineno)d]' % \ @@ -428,7 +429,29 @@ class _LogFormatter(logging.Formatter): if self._color: prefix = (self._colors.get(record.levelno, self._normal) + prefix + self._normal) - formatted = prefix + " " + record.message + + # Encoding notes: The logging module prefers to work with character + # strings, but only enforces that log messages are instances of + # basestring. In python 2, non-ascii bytestrings will make + # their way through the logging framework until they blow up with + # an unhelpful decoding error (with this formatter it happens + # when we attach the prefix, but there are other opportunities for + # exceptions further along in the framework). + # + # If a byte string makes it this far, convert it to unicode to + # ensure it will make it out to the logs. Use repr() as a fallback + # to ensure that all byte strings can be converted successfully, + # but don't do it by default so we don't add extra quotes to ascii + # bytestrings. This is a bit of a hacky place to do this, but + # it's worth it since the encoding errors that would otherwise + # result are so useless (and tornado is fond of using utf8-encoded + # byte strings whereever possible). + try: + message = _unicode(record.message) + except UnicodeDecodeError: + message = repr(record.message) + + formatted = prefix + " " + message if record.exc_info: if not record.exc_text: record.exc_text = self.formatException(record.exc_info) diff --git a/tornado/test/options_test.py b/tornado/test/options_test.py index 80b192920..e5411d945 100644 --- a/tornado/test/options_test.py +++ b/tornado/test/options_test.py @@ -1,7 +1,13 @@ from __future__ import absolute_import, division, with_statement +import logging +import os +import re +import tempfile import unittest -from tornado.options import _Options +from tornado.escape import utf8 +from tornado.options import _Options, _LogFormatter +from tornado.util import b, bytes_type class OptionsTest(unittest.TestCase): @@ -17,3 +23,80 @@ class OptionsTest(unittest.TestCase): def test_parse_command_line(self): self.options.parse_command_line(["main.py", "--port=443"]) self.assertEqual(self.options.port, 443) + + +class LogFormatterTest(unittest.TestCase): + LINE_RE = re.compile(b("\x01\\[E [0-9]{6} [0-9]{2}:[0-9]{2}:[0-9]{2} options_test:[0-9]+\\]\x02 (.*)")) + + def setUp(self): + self.formatter = _LogFormatter(color=False) + # Fake color support. We can't guarantee anything about the $TERM + # variable when the tests are run, so just patch in some values + # for testing. (testing with color off fails to expose some potential + # encoding issues from the control characters) + self.formatter._colors = { + logging.ERROR: u"\u0001", + } + self.formatter._normal = u"\u0002" + self.formatter._color = True + # construct a Logger directly to bypass getLogger's caching + self.logger = logging.Logger('LogFormatterTest') + self.logger.propagate = False + self.tempdir = tempfile.mkdtemp() + self.filename = os.path.join(self.tempdir, 'log.out') + self.handler = self.make_handler(self.filename) + self.handler.setFormatter(self.formatter) + self.logger.addHandler(self.handler) + + def tearDown(self): + os.unlink(self.filename) + os.rmdir(self.tempdir) + + def make_handler(self, filename): + # Base case: default setup without explicit encoding. + # In python 2, supports arbitrary byte strings and unicode objects + # that contain only ascii. In python 3, supports ascii-only unicode + # strings (but byte strings will be repr'd automatically. + return logging.FileHandler(filename) + + def get_output(self): + with open(self.filename, "rb") as f: + line = f.read().strip() + m = LogFormatterTest.LINE_RE.match(line) + if m: + return m.group(1) + else: + raise Exception("output didn't match regex: %r" % line) + + def test_basic_logging(self): + self.logger.error("foo") + self.assertEqual(self.get_output(), b("foo")) + + def test_bytes_logging(self): + self.logger.error(b("\xe9")) + # This will be "\xe9" on python 2 or "b'\xe9'" on python 3 + self.assertEqual(self.get_output(), utf8(repr(b("\xe9")))) + + def test_utf8_logging(self): + self.logger.error(u"\u00e9".encode("utf8")) + if issubclass(bytes_type, basestring): + # on python 2, utf8 byte strings (and by extension ascii byte + # strings) are passed through as-is. + self.assertEqual(self.get_output(), utf8(u"\u00e9")) + else: + # on python 3, byte strings always get repr'd even if + # they're ascii-only, so this degenerates into another + # copy of test_bytes_logging. + self.assertEqual(self.get_output(), utf8(repr(utf8(u"\u00e9")))) + + +class UnicodeLogFormatterTest(LogFormatterTest): + def make_handler(self, filename): + # Adding an explicit encoding configuration allows non-ascii unicode + # strings in both python 2 and 3, without changing the behavior + # for byte strings. + return logging.FileHandler(filename, encoding="utf8") + + def test_unicode_logging(self): + self.logger.error(u"\u00e9") + self.assertEqual(self.get_output(), utf8(u"\u00e9"))