git.ipfire.org Git - thirdparty/public-inbox.git/commitdiff
searchidx: don't index Base-85 w/ CRLF endings
author Eric Wong <e@80x24.org>
Wed, 19 Feb 2025 10:10:32 +0000 (10:10 +0000)
committer Eric Wong <e@80x24.org>
Thu, 20 Feb 2025 08:02:50 +0000 (08:02 +0000)
I encountered a false positive search result from a CRLF message
with a Base-85 patch in it.  It turns out our Base-85 filtering
code didn't account for the possibility of "\r" showing up in
patch messages, so just ignore all trailing whitespace (not just
horizontal whitespace) in index_diff().

While we're at it, exclude horizontal whitespace and CR
consistently from Base-85-looking quoted text in
index_body_text(), too, since I'm sure there are messages with
CRCRLF in the wild...

lib/PublicInbox/SearchIdx.pm

index 51c8b9c5263279fd3a4dcf469a6e1837a3229c7b..1e8246bb4900bdd354142e376d58941e28e19835 100644 (file)
@@ -285,7 +285,7 @@ sub index_diff ($$$) {
                                push @$xnq, shift(@l);
 
                                # skip base85 and empty lines
-                               while (@l && ($l[0] =~ /\A$BASE85\h*\z/o ||
+                               while (@l && ($l[0] =~ /\A$BASE85\s*\z/o ||
                                                $l[0] !~ /\S/)) {
                                        shift @l;
                                }
@@ -386,8 +386,8 @@ sub index_body_text {
                        if ($txt =~ /^[>\t ]+GIT binary patch\r?/sm) {
                                # get rid of Base-85 noise
                                $txt =~ s/^([>\h]+(?:literal|delta)
-                                               \x20[0-9]+\r?\n)
-                                       (?:[>\h]+$BASE85\h*\r?\n)+/$1/gsmx;
+                                               \x20[0-9]+\h*\r*\n)
+                                       (?:[>\h]+$BASE85\h*\r*\n)+/$1/gsmx;
                        }
                        index_text($self, $txt, 0, 'XQUOT');
                } else { # does it look like a diff?