git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-106628: email parsing speedup (gh-106629)
author: CF Bolz-Tereick <cfbolz@gmx.de>
Thu, 13 Jul 2023 06:12:56 +0000 (08:12 +0200)
committer: GitHub <noreply@github.com>
Thu, 13 Jul 2023 06:12:56 +0000 (15:12 +0900)
Lib/email/feedparser.py
Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst [new file with mode: 0644]

index 885097c7dda06739fd2bb0d73ff8deb264996a71..53d71f50225152ad6e640fd6d7ff070119afba2e 100644 (file)
@@ -37,6 +37,8 @@ NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
 headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
 EMPTYSTRING = ''
 NL = '\n'
+boundaryendRE = re.compile(
+    r'(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
 
 NeedMoreData = object()
 
@@ -327,9 +329,10 @@ class FeedParser:
             # this onto the input stream until we've scanned past the
             # preamble.
             separator = '--' + boundary
-            boundaryre = re.compile(
-                '(?P<sep>' + re.escape(separator) +
-                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
+            def boundarymatch(line):
+                if not line.startswith(separator):
+                    return None
+                return boundaryendRE.match(line, len(separator))
             capturing_preamble = True
             preamble = []
             linesep = False
@@ -341,7 +344,7 @@ class FeedParser:
                     continue
                 if line == '':
                     break
-                mo = boundaryre.match(line)
+                mo = boundarymatch(line)
                 if mo:
                     # If we're looking at the end boundary, we're done with
                     # this multipart.  If there was a newline at the end of
@@ -373,13 +376,13 @@ class FeedParser:
                         if line is NeedMoreData:
                             yield NeedMoreData
                             continue
-                        mo = boundaryre.match(line)
+                        mo = boundarymatch(line)
                         if not mo:
                             self._input.unreadline(line)
                             break
                     # Recurse to parse this subpart; the input stream points
                     # at the subpart's first line.
-                    self._input.push_eof_matcher(boundaryre.match)
+                    self._input.push_eof_matcher(boundarymatch)
                     for retval in self._parsegen():
                         if retval is NeedMoreData:
                             yield NeedMoreData
diff --git a/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst b/Misc/NEWS.d/next/Library/2023-07-11-16-36-22.gh-issue-106628.Kx8Zvc.rst
new file mode 100644 (file)
index 0000000..6fa276e
--- /dev/null
@@ -0,0 +1,2 @@
+Speed up parsing of emails by about 20% by not compiling a new regular
+expression for every single email.