From: Eric Wong Date: Wed, 19 Feb 2025 10:10:32 +0000 (+0000) Subject: searchidx: don't index Base-85 w/ CRLF endings X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=719dd657b349dc43fb44db2764d145b3d04deb9e;p=thirdparty%2Fpublic-inbox.git searchidx: don't index Base-85 w/ CRLF endings I encountered a false positive search result from a CRLF message with a Base-85 patch in it. It turns out our Base-85 filtering code didn't account for the possibility of "\r" showing up in patch messages, so just ignore all trailing whitespace (not just horizontal whitespace) in index_diff(). While we're at it, exclude horizontal whitespace and CR consistently from Base-85-looking quoted text in index_body_text(), too, since I'm sure there are messages with CRCRLF in the wild, too... --- diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 51c8b9c52..1e8246bb4 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -285,7 +285,7 @@ sub index_diff ($$$) { push @$xnq, shift(@l); # skip base85 and empty lines - while (@l && ($l[0] =~ /\A$BASE85\h*\z/o || + while (@l && ($l[0] =~ /\A$BASE85\s*\z/o || $l[0] !~ /\S/)) { shift @l; } @@ -386,8 +386,8 @@ sub index_body_text { if ($txt =~ /^[>\t ]+GIT binary patch\r?/sm) { # get rid of Base-85 noise $txt =~ s/^([>\h]+(?:literal|delta) - \x20[0-9]+\r?\n) - (?:[>\h]+$BASE85\h*\r?\n)+/$1/gsmx; + \x20[0-9]+\h*\r*\n) + (?:[>\h]+$BASE85\h*\r*\n)+/$1/gsmx; } index_text($self, $txt, 0, 'XQUOT'); } else { # does it look like a diff?