#18044: Fix parsing of encoded words of the form =?utf8?q?=XX...?=

author R David Murray <rdmurray@bitdance.com>

Thu, 11 Jul 2013 19:52:57 +0000 (15:52 -0400)

committer R David Murray <rdmurray@bitdance.com>

Thu, 11 Jul 2013 19:52:57 +0000 (15:52 -0400)
author R David Murray <rdmurray@bitdance.com>
Thu, 11 Jul 2013 19:52:57 +0000 (15:52 -0400)
committer R David Murray <rdmurray@bitdance.com>
Thu, 11 Jul 2013 19:52:57 +0000 (15:52 -0400)
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py

index 26cfa52723fe2db10419f618a259800ffa68316f..a01d845110f8f1b0fb483ec7cbb95011ddf8e533 100644 (file)
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -69,6 +69,7 @@ XXX: provide complete list of token types.
  
  import re
  import urllib   # For urllib.parse.unquote
+from string import hexdigits
  from collections import namedtuple, OrderedDict
  from email import _encoded_words as _ew
  from email import errors
@@ -392,10 +393,6 @@ class UnstructuredTokenList(TokenList):
      token_type = 'unstructured'
  
      def _fold(self, folded):
-        if any(x.token_type=='encoded-word' for x in self):
-            return self._fold_encoded(folded)
-        # Here we can have either a pure ASCII string that may or may not
-        # have surrogateescape encoded bytes, or a unicode string.
          last_ew = None
          for part in self.parts:
              tstr = str(part)
@@ -1389,35 +1386,6 @@ def _get_ptext_to_endchars(value, endchars):
          pos = pos + 1
      return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
  
-def _decode_ew_run(value):
-    """ Decode a run of RFC2047 encoded words.
-
-        _decode_ew_run(value) -> (text, value, defects)
-
-    Scans the supplied value for a run of tokens that look like they are RFC
-    2047 encoded words, decodes those words into text according to RFC 2047
-    rules (whitespace between encoded words is discarded), and returns the text
-    and the remaining value (including any leading whitespace on the remaining
-    value), as well as a list of any defects encountered while decoding.  The
-    input value may not have any leading whitespace.
-
-    """
-    res = []
-    defects = []
-    last_ws = ''
-    while value:
-        try:
-            tok, ws, value = _wsp_splitter(value, 1)
-        except ValueError:
-            tok, ws, value = value, '', ''
-        if not (tok.startswith('=?') and tok.endswith('?=')):
-            return ''.join(res), last_ws + tok + ws + value, defects
-        text, charset, lang, new_defects = _ew.decode(tok)
-        res.append(text)
-        defects.extend(new_defects)
-        last_ws = ws
-    return ''.join(res), last_ws, defects
-
  def get_fws(value):
      """FWS = 1*WSP
  
@@ -1443,7 +1411,8 @@ def get_encoded_word(value):
          raise errors.HeaderParseError(
              "expected encoded word but found {}".format(value))
      remstr = ''.join(remainder)
-    if remstr[:2].isdigit():
+    if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
+        # The ? after the CTE was followed by an encoded word escape (=XX).
          rest, *remainder = remstr.split('?=', 1)
          tok = tok + '?=' + rest
      if len(tok.split()) > 1:
@@ -1491,8 +1460,8 @@ def get_unstructured(value):
  
      """
      # XXX: but what about bare CR and LF?  They might signal the start or
-    # end of an encoded word.  YAGNI for now, since out current parsers
-    # will never send us strings with bard CR or LF.
+    # end of an encoded word.  YAGNI for now, since our current parsers
+    # will never send us strings with bare CR or LF.
  
      unstructured = UnstructuredTokenList()
      while value:
@@ -1504,6 +1473,8 @@ def get_unstructured(value):
              try:
                  token, value = get_encoded_word(value)
              except errors.HeaderParseError:
+                # XXX: Need to figure out how to register defects when
+                # appropriate here.
                  pass
              else:
                  have_ws = True
diff --git a/Lib/test/test_email/test__encoded_words.py b/Lib/test/test_email/test__encoded_words.py

index 14395fed40d7ed0a1678a37d4f1d2f3cd8c5a12c..f8e380dc5549978fbaa013e5689b23543448d21c 100644 (file)
--- a/Lib/test/test_email/test__encoded_words.py
+++ b/Lib/test/test_email/test__encoded_words.py
@@ -122,6 +122,11 @@ class TestDecode(TestEmailBase):
                     # XXX Should this be a new Defect instead?
                     defects = [errors.CharsetError])
  
+    def test_q_nonascii(self):
+        self._test('=?utf-8?q?=C3=89ric?=',
+                   'Éric',
+                   charset='utf-8')
+
  
  class TestEncodeQ(TestEmailBase):
  
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py

index 6101e191c0107b877f2ed21e1d9d297a7ba70995..8917447217c53f84218e7fdc4aea33cec441be6c 100644 (file)
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -170,6 +170,15 @@ class TestParser(TestParserMixin, TestEmailBase):
                          [],
                          '')
  
+    def test_get_encoded_word_quopri_utf_escape_follows_cte(self):
+        # Issue 18044
+        self._test_get_x(parser.get_encoded_word,
+                        '=?utf-8?q?=C3=89ric?=',
+                        'Éric',
+                        'Éric',
+                        [],
+                        '')
+
      # get_unstructured
  
      def _get_unst(self, value):
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py

index c0c81c1caa07d3c632576cb251853e6ba05549c7..80f1c0238e483d2bd8e86865b9eb55505520287f 100644 (file)
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -123,12 +123,45 @@ class TestBaseHeaderFeatures(TestHeaderBase):
      #    self.assertEqual(h, value)
      #    self.assertDefectsEqual(h.defects, [errors.ObsoleteHeaderDefect])
  
-    def test_RFC2047_value_decoded(self):
-        value = '=?utf-8?q?this_is_a_test?='
-        h = self.make_header('subject', value)
-        self.assertEqual(h, 'this is a test')
  
+@parameterize
+class TestUnstructuredHeader(TestHeaderBase):
  
+    def string_as_value(self,
+                        source,
+                        decoded,
+                        *args):
+        l = len(args)
+        defects = args[0] if l>0 else []
+        header = 'Subject:' + (' ' if source else '')
+        folded = header + (args[1] if l>1 else source) + '\n'
+        h = self.make_header('Subject', source)
+        self.assertEqual(h, decoded)
+        self.assertDefectsEqual(h.defects, defects)
+        self.assertEqual(h.fold(policy=policy.default), folded)
+
+    string_params = {
+
+        'rfc2047_simple_quopri': (
+            '=?utf-8?q?this_is_a_test?=',
+            'this is a test',
+            [],
+            'this is a test'),
+
+        'rfc2047_gb2312_base64': (
+            '=?gb2312?b?1eLKx9bQzsSy4srUo6E=?=',
+            '\u8fd9\u662f\u4e2d\u6587\u6d4b\u8bd5\uff01',
+            [],
+            '=?utf-8?b?6L+Z5piv5Lit5paH5rWL6K+V77yB?='),
+
+        'rfc2047_simple_nonascii_quopri': (
+            '=?utf-8?q?=C3=89ric?=',
+            'Éric'),
+
+    }
+
+
+@parameterize
  class TestDateHeader(TestHeaderBase):
  
      datestring = 'Sun, 23 Sep 2001 20:10:55 -0700'
diff --git a/Misc/NEWS b/Misc/NEWS

index 3ff4d3e35f28b39377e48fe3025b5898437a9063..c068ed81ee9e83e9652db7dd25f7c8d631ee0da9 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -47,6 +47,10 @@ Core and Builtins
  Library
  -------
  
+- Issue #18044: The new email header parser was mis-parsing encoded words where
+  an encoded character immediately followed the '?' that follows the CTE
+  character, resulting in a decoding failure.  They are now decoded correctly.
+
  - Issue #18101: Tcl.split() now processes strings nested in a tuple as it
    does with byte strings.
author	R David Murray <rdmurray@bitdance.com>
	Thu, 11 Jul 2013 19:52:57 +0000 (15:52 -0400)
committer	R David Murray <rdmurray@bitdance.com>
	Thu, 11 Jul 2013 19:52:57 +0000 (15:52 -0400)
Lib/email/_header_value_parser.py		patch \| blob \| blame \| history
Lib/test/test_email/test__encoded_words.py		patch \| blob \| blame \| history
Lib/test/test_email/test__header_value_parser.py		patch \| blob \| blame \| history
Lib/test/test_email/test_headerregistry.py		patch \| blob \| blame \| history
Misc/NEWS		patch \| blob \| blame \| history