From: Stephen Finucane Date: Fri, 6 May 2022 17:21:02 +0000 (+0100) Subject: parser: Ignore CFWS in Message-ID header X-Git-Tag: v3.1.0~11 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ad999401442718920bd91608345978eb952ef38e;p=thirdparty%2Fpatchwork.git parser: Ignore CFWS in Message-ID header We recently started stripping comments and folding white space from the In-Reply-To and References headers. Do so also for the Message-ID header. Because of the importance of the Message-ID header, we accept even non-compliant headers and because we now have a pattern for this, we also start (re-)accepting non-compliant In-Reply-To headers. Signed-off-by: Stephen Finucane Related: #399 --- diff --git a/patchwork/parser.py b/patchwork/parser.py index 779a519f..537f8bc0 100644 --- a/patchwork/parser.py +++ b/patchwork/parser.py @@ -246,15 +246,14 @@ def _find_series_by_references(project, mail): name, prefixes = clean_subject(subject, [project.linkname]) version = parse_version(name, prefixes) - refs = find_references(mail) - h = clean_header(mail.get('Message-Id')) - if h: - refs = [h] + refs + msg_id = find_message_id(mail) + refs = [msg_id] + find_references(mail) for ref in refs: try: series = SeriesReference.objects.get( - msgid=ref[:255], project=project + msgid=ref[:255], + project=project, ).series if series.version != version: @@ -492,6 +491,34 @@ def find_headers(mail): return '\n'.join(strings) +def find_message_id(mail): + """Extract the 'message-id' headers from a given mail and validate it. + + The validation here is simply checking that the Message-ID is correctly + formatted per RFC-2822. However, even if it's not we'll attempt to use what + we're given because a patch tracked in Patchwork with janky threading is + better than no patch whatsoever. + """ + header = clean_header(mail.get('Message-Id')) + if not header: + raise ValueError("Broken 'Message-Id' header") + + msgid = _msgid_re.search(header) + if msgid: + msgid = msgid.group(0) + else: + # This is only info level since the admin likely can't do anything + # about this + logger.info( + "Malformed 'Message-Id' header. The 'msg-id' component should be " + "surrounded by angle brackets. Saving raw header. This may " + "include comments and extra whitespace." + ) + msgid = header.strip() + + return msgid[:255] + + def find_references(mail): """Construct a list of possible reply message ids. @@ -501,19 +528,33 @@ def find_references(mail): """ refs = [] - if 'In-Reply-To' in mail: - for in_reply_to in mail.get_all('In-Reply-To'): - ref = _msgid_re.search(clean_header(in_reply_to)) - if ref: - refs.append(ref.group(0)) + for header in mail.get_all('In-Reply-To', []): + header = clean_header(header) + if not header: + continue + ref = _msgid_re.search(header) + if ref: + ref = ref.group(0) + else: + logger.info( + "Malformed 'In-Reply-To' header. The 'msg-id' component " + "should be surrounded by angle brackets. Saving raw header. " + "This may include comments and extra whitespace." + ) + ref = header.strip() + refs.append(ref) - if 'References' in mail: - for references_header in mail.get_all('References'): - references = _msgid_re.findall(clean_header(references_header)) - references.reverse() - for ref in references: - if ref not in refs: - refs.append(ref) + for header in mail.get_all('References', []): + header = clean_header(header) + if not header: + continue + # NOTE: We can't really implement a fallback here since without angle + # brackets there is no obvious way to delimit headers. + references = _msgid_re.findall(header) + references.reverse() + for ref in references: + if ref not in refs: + refs.append(ref) return refs @@ -1085,11 +1126,7 @@ def parse_mail(mail, list_id=None): # parse metadata - msgid = clean_header(mail.get('Message-Id')) - if not msgid: - raise ValueError("Broken 'Message-Id' header") - msgid = msgid[:255] - + msgid = find_message_id(mail) subject = mail.get('Subject') name, prefixes = clean_subject(subject, [project.linkname]) is_comment = subject_check(subject) diff --git a/patchwork/tests/test_parser.py b/patchwork/tests/test_parser.py index b03e49d4..ec368108 100644 --- a/patchwork/tests/test_parser.py +++ b/patchwork/tests/test_parser.py @@ -1338,6 +1338,37 @@ class DuplicateMailTest(TestCase): self.assertEqual(Cover.objects.count(), 1) +class TestFindMessageID(TestCase): + def test_find_message_id__missing_header(self): + email = create_email('test') + del email['Message-Id'] + email['Message-Id'] = '' + + with self.assertRaises(ValueError) as cm: + parser.find_message_id(email) + self.assertIn("Broken 'Message-Id' header", str(cm.exeception)) + + def test_find_message_id__header_with_comments(self): + """Test that we strip comments from the Message-ID field.""" + message_id = ' (message ID with a comment)' + email = create_email('test', msgid=message_id) + + expected = '' + actual = parser.find_message_id(email) + + self.assertEqual(expected, actual) + + def test_find_message_id__invalid_header_fallback(self): + """Test that we accept badly formatted Message-ID fields.""" + message_id = '5899d592-8c87-47d9-92b6-d34260ce1aa4@radware.com>' + email = create_email('test', msgid=message_id) + + expected = '5899d592-8c87-47d9-92b6-d34260ce1aa4@radware.com>' + actual = parser.find_message_id(email) + + self.assertEqual(expected, actual) + + class TestFindReferences(TestCase): def test_find_references__header_with_comments(self): """Test that we strip comments from References, In-Reply-To fields.""" @@ -1383,6 +1414,19 @@ class TestFindReferences(TestCase): self.assertEqual(expected, actual) + def test_find_references__invalid_header_fallback(self): + """Test that we accept badly formatted In-Reply-To fields.""" + message_id = '' # noqa: E501 + in_reply_to = '5899d592-8c87-47d9-92b6-d34260ce1aa4@radware.com>' + email = create_email('test', msgid=message_id, in_reply_to=in_reply_to) + + expected = [ + '5899d592-8c87-47d9-92b6-d34260ce1aa4@radware.com>', + ] + actual = parser.find_references(email) + + self.assertEqual(expected, actual) + class TestCommentCorrelation(TestCase): def test_find_patch_for_comment__no_reply(self):