bpo-21315: Fix parsing of encoded words with missing leading ws. (GH-13425) (#13846)

author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>

Thu, 6 Jun 2019 17:08:43 +0000 (10:08 -0700)

committer Barry Warsaw <barry@python.org>

Thu, 6 Jun 2019 17:08:43 +0000 (10:08 -0700)
author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Thu, 6 Jun 2019 17:08:43 +0000 (10:08 -0700)
committer Barry Warsaw <barry@python.org>
Thu, 6 Jun 2019 17:08:43 +0000 (10:08 -0700)
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py

index 958ef5018c259945205c18fced63276d58325da1..18aecbffa71a83d7d5e718b1279d5054f07811be 100644 (file)
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -97,6 +97,18 @@ EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
  def quote_string(value):
      return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
  
+# Match an RFC 2047 encoded word, which looks like =?utf-8?q?someword?=
+rfc2047_matcher = re.compile(r'''
+   =\?            # literal =?
+   [^?]*          # charset
+   \?             # literal ?
+   [qQbB]         # literal 'q' or 'b', case insensitive
+   \?             # literal ?
+  .*?             # encoded word
+  \?=             # literal ?=
+''', re.VERBOSE | re.MULTILINE)
+
+
  #
  # TokenList and its subclasses
  #
@@ -1050,6 +1062,10 @@ def get_encoded_word(value):
          _validate_xtext(vtext)
          ew.append(vtext)
          text = ''.join(remainder)
+    # Encoded words should be followed by a WS
+    if value and value[0] not in WSP:
+        ew.defects.append(errors.InvalidHeaderDefect(
+            "missing trailing whitespace after encoded-word"))
      return ew, value
  
  def get_unstructured(value):
@@ -1102,6 +1118,11 @@ def get_unstructured(value):
                  unstructured.append(token)
                  continue
          tok, *remainder = _wsp_splitter(value, 1)
+        # Split in the middle of an atom if there is a rfc2047 encoded word
+        # which does not have WSP on both sides. The defect will be registered
+        # the next time through the loop.
+        if rfc2047_matcher.search(tok):
+            tok, *remainder = value.partition('=?')
          vtext = ValueTerminal(tok, 'vtext')
          _validate_xtext(vtext)
          unstructured.append(vtext)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py

index 676732bb3d026160543c64fc1fdce4babfff649b..693487bc960fc06aae1912d7cefd6efccc80b2eb 100644 (file)
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -118,7 +118,7 @@ class TestParser(TestParserMixin, TestEmailBase):
                           '=?us-ascii?q?first?==?utf-8?q?second?=',
                           'first',
                           'first',
-                         [],
+                         [errors.InvalidHeaderDefect],
                           '=?utf-8?q?second?=')
  
      def test_get_encoded_word_sets_extra_attributes(self):
@@ -361,6 +361,25 @@ class TestParser(TestParserMixin, TestEmailBase):
              '=?utf-8?q?foo?==?utf-8?q?bar?=',
              'foobar',
              'foobar',
+            [errors.InvalidHeaderDefect,
+            errors.InvalidHeaderDefect],
+            '')
+
+    def test_get_unstructured_ew_without_leading_whitespace(self):
+        self._test_get_x(
+            self._get_unst,
+            'nowhitespace=?utf-8?q?somevalue?=',
+            'nowhitespacesomevalue',
+            'nowhitespacesomevalue',
+            [errors.InvalidHeaderDefect],
+            '')
+
+    def test_get_unstructured_ew_without_trailing_whitespace(self):
+        self._test_get_x(
+            self._get_unst,
+            '=?utf-8?q?somevalue?=nowhitespace',
+            'somevaluenowhitespace',
+            'somevaluenowhitespace',
              [errors.InvalidHeaderDefect],
              '')
  
@@ -546,7 +565,8 @@ class TestParser(TestParserMixin, TestEmailBase):
              '"=?utf-8?Q?not_really_valid?="',
              '"not really valid"',
              'not really valid',
-            [errors.InvalidHeaderDefect],
+            [errors.InvalidHeaderDefect,
+             errors.InvalidHeaderDefect],
              '')
  
      # get_comment
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py

index d1007099f666c914b574c1aa42395112c7c1b0a4..e6db3acedcc139248a7eafadb0d687045369987d 100644 (file)
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -1180,7 +1180,8 @@ class TestAddressHeader(TestHeaderBase):
  
          'rfc2047_atom_in_quoted_string_is_decoded':
              ('"=?utf-8?q?=C3=89ric?=" <foo@example.com>',
-            [errors.InvalidHeaderDefect],
+            [errors.InvalidHeaderDefect,
+            errors.InvalidHeaderDefect],
              'Éric <foo@example.com>',
              'Éric',
              'foo@example.com',
diff --git a/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst

new file mode 100644 (file)

index 0000000..dd0dd7f
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst
@@ -0,0 +1,4 @@
+Email headers containing RFC 2047 encoded words are now parsed even when the
+leading whitespace is missing, and a defect is registered. Missing trailing
+whitespace after an encoded word is now also registered as a defect.
+
author	Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
	Thu, 6 Jun 2019 17:08:43 +0000 (10:08 -0700)
committer	Barry Warsaw <barry@python.org>
	Thu, 6 Jun 2019 17:08:43 +0000 (10:08 -0700)
Lib/email/_header_value_parser.py		patch \| blob \| blame \| history
Lib/test/test_email/test__header_value_parser.py		patch \| blob \| blame \| history
Lib/test/test_email/test_headerregistry.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst	[new file with mode: 0644]	patch \| blob