git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
optimize regex
author phail <phail@hacknology.de>
Sun, 20 Nov 2022 11:48:03 +0000 (12:48 +0100)
committer phail <phail@hacknology.de>
Sun, 20 Nov 2022 11:48:03 +0000 (12:48 +0100)
src/paperless_mail/parsers.py
src/paperless_mail/tests/test_parsers.py

index c4ecaf8615b367c871c8e21aac1fbed46a9fa4ad..902619fd7ba550cb4e139177e5dc0cca0ed19f64 100644 (file)
@@ -105,9 +105,8 @@ class MailDocumentParser(DocumentParser):
 
     def parse(self, document_path, mime_type, file_name=None):
         def strip_text(text: str):
-            text = re.sub("\t", " ", text)
-            text = re.sub(" +", " ", text)
-            text = re.sub("(\n *)+", "\n", text)
+            text = re.sub(r"\s+", " ", text)
+            text = re.sub(r"(\n *)+", "\n", text)
             return text.strip()
 
         mail = self.get_parsed(document_path)
index 1a348b472b8aeec48b3c33077029ebdf6b34ab05..5cd614197b3d47f5aaf2a9a0521c580f934bf71e 100644 (file)
@@ -227,7 +227,7 @@ class TestParser(TestCase):
     @mock.patch("paperless_mail.parsers.MailDocumentParser.generate_pdf")
     def test_parse_html_eml(self, n, mock_tika_parse: mock.MagicMock):
         # Validate parsing returns the expected results
-        text_expected = "Some Text\nand an embedded image.\n\nSubject: HTML Message\n\nFrom: Name <someone@example.de>\n\nTo: someone@example.de\n\nAttachments: IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (600.24 KiB)\n\nHTML content: tika return"
+        text_expected = "Some Text and an embedded image.\n\nSubject: HTML Message\n\nFrom: Name <someone@example.de>\n\nTo: someone@example.de\n\nAttachments: IntM6gnXFm00FEV5.png (6.89 KiB), 600+kbfile.txt (600.24 KiB)\n\nHTML content: tika return"
         mock_tika_parse.return_value = "tika return"
 
         self.parser.parse(os.path.join(self.SAMPLE_FILES, "html.eml"), "message/rfc822")