parser: parse headers containing invalid characters or codings

author Daniel Axtens <dja@axtens.net>

Mon, 19 Sep 2016 14:23:58 +0000 (00:23 +1000)

committer Stephen Finucane <stephen@that.guru>

Sat, 8 Oct 2016 19:42:33 +0000 (20:42 +0100)
author Daniel Axtens <dja@axtens.net>
Mon, 19 Sep 2016 14:23:58 +0000 (00:23 +1000)
committer Stephen Finucane <stephen@that.guru>
Sat, 8 Oct 2016 19:42:33 +0000 (20:42 +0100)
diff --git a/patchwork/bin/parsemail.py b/patchwork/bin/parsemail.py

index dcb4fe00c0814d51b0bbdaa00679e61f6b7abd34..fa6e42ee5b9632473b33b8784a9993cb3ed61fbb 100755 (executable)
--- a/patchwork/bin/parsemail.py
+++ b/patchwork/bin/parsemail.py
@@ -25,12 +25,10 @@ import argparse
  import codecs
  import datetime
  from email import message_from_file
-from email.header import Header, decode_header
+from email.header import decode_header, make_header
  from email.utils import parsedate_tz, mktime_tz
  from fnmatch import fnmatch
-from functools import reduce
  import logging
-import operator
  import re
  import sys
  
@@ -39,7 +37,6 @@ from django.conf import settings
  from django.contrib.auth.models import User
  from django.utils.log import AdminEmailHandler
  from django.utils import six
-from django.utils.six.moves import map
  
  from patchwork.models import (Patch, Project, Person, Comment, State,
                                DelegationRule, get_default_initial_patch_state)
@@ -63,19 +60,84 @@ def normalise_space(str):
      return whitespace_re.sub(' ', str).strip()
  
  
+def sanitise_header(header_contents, header_name=None):
+    """Clean an individual mail header.
+
+    Given a header with header_contents, optionally labelled
+    header_name, decode it with decode_header, sanitise it to make
+    sure it decodes correctly and contains no invalid characters,
+    then encode the result with make_header()
+    """
+
+    # We have some Py2/Py3 issues here.
+    #
+    # Firstly, the email parser (before we get here)
+    # Python 3: headers with weird chars are email.header.Header
+    #           class, others as str
+    # Python 2: every header is an str
+    #
+    # Secondly, the behaviour of decode_header:
+    # Python 3: weird headers are labelled as unknown-8bit
+    # Python 2: weird headers are not labelled differently
+    #
+    # Lastly, making matters worse, in Python 2, unknown-8bit doesn't
+    # seem to be supported as an input to make_header, so not only do
+    # we have to detect dodgy headers, we have to fix them ourselves.
+    #
+    # We solve this by catching any Unicode errors, and then manually
+    # handling any interesting headers.
+
+    value = decode_header(header_contents)
+    try:
+        header = make_header(value,
+                             header_name=header_name,
+                             continuation_ws='\t')
+    except UnicodeDecodeError:
+        # At least one of the parts cannot be encoded as ascii.
+        # Find out which one and fix it somehow.
+        #
+        # We get here under Py2 when there's non-7-bit chars in header,
+        # or under Py2 or Py3 where decoding with the coding hint fails.
+
+        new_value = []
+
+        for (part, coding) in value:
+            # We have random bytes that aren't properly coded.
+            # If we had a coding hint, it failed to help.
+            if six.PY3:
+                # python3 - force coding to unknown-8bit
+                new_value += [(part, 'unknown-8bit')]
+            else:
+                # python2 - no support in make_header for unknown-8bit
+                # We should do unknown-8bit coding ourselves.
+                # For now, we're just going to replace any dubious
+                # chars with ?.
+                #
+                # TODO: replace it with a proper QP unknown-8bit codec.
+                new_value += [(part.decode('ascii', errors='replace')
+                               .encode('ascii', errors='replace'),
+                               None)]
+
+        header = make_header(new_value,
+                             header_name=header_name,
+                             continuation_ws='\t')
+
+    return header
+
+
  def clean_header(header):
      """Decode (possibly non-ascii) headers."""
-    def decode(fragment):
-        (frag_str, frag_encoding) = fragment
-        if frag_encoding:
-            return frag_str.decode(frag_encoding)
-        elif isinstance(frag_str, six.binary_type):  # python 2
-            return frag_str.decode()
-        return frag_str
  
-    fragments = list(map(decode, decode_header(header)))
+    sane_header = sanitise_header(header)
  
-    return normalise_space(u' '.join(fragments))
+    # on Py2, we want to do unicode(), on Py3, str().
+    # That gets us the decoded, un-wrapped header.
+    if six.PY2:
+        header_str = unicode(sane_header)
+    else:
+        header_str = str(sane_header)
+
+    return normalise_space(header_str)
  
  
  def find_project_by_id(list_id):
@@ -168,10 +230,13 @@ def mail_date(mail):
  
  
  def mail_headers(mail):
-    return reduce(operator.__concat__,
-                  ['%s: %s\n' % (k, Header(v, header_name=k,
-                                           continuation_ws='\t').encode())
-                   for (k, v) in list(mail.items())])
+    headers = [(key, sanitise_header(value, header_name=key))
+               for key, value in mail.items()]
+
+    strings = [('%s: %s' % (key, header.encode()))
+               for (key, header) in headers]
+
+    return '\n'.join(strings)
  
  
  def find_pull_request(content):
author	Daniel Axtens <dja@axtens.net>
	Mon, 19 Sep 2016 14:23:58 +0000 (00:23 +1000)
committer	Stephen Finucane <stephen@that.guru>
	Sat, 8 Oct 2016 19:42:33 +0000 (20:42 +0100)