git.ipfire.org Git - thirdparty/public-inbox.git/commitdiff
searchidx: do not index quoted Base-85 patches
author: Eric Wong <e@80x24.org>, Mon, 20 Feb 2023 09:21:50 +0000 (09:21 +0000)
committer: Eric Wong <e@80x24.org>, Mon, 20 Feb 2023 17:20:59 +0000 (17:20 +0000)
Base-85 binary patches were a source of false positives in search
results, and we've filtered them out of non-quoted text since July 2022.
Unfortunately, people were quoting binary patch contents
in replies (*sigh*) and triggering false positives in search
results.  So we must filter out base-85-looking contents from
quoted text, too.

Followup-to: 8fda04081acde705 (search: do not index base-85 binary patches, 2022-06-20)
Followup-to: 840785917bc74c8e (searchidx: skip "delta $N" sections for base-85, 2022-07-19)
lib/PublicInbox/SearchIdx.pm
t/search.t

index 257b83a5681693e9c03596a2698343706186fcdb..fc4643838a9798928049f75f593c933ce3b3d289 100644 (file)
@@ -37,7 +37,7 @@ our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff :
        # typical 32-bit system:
        (($Config{ptrsize} >= 8 ? 8192 : 1024) * 1024);
 use constant DEBUG => !!$ENV{DEBUG};
-my $BASE85 = qr/\A[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+\z/;
+my $BASE85 = qr/[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+/;
 my $xapianlevels = qr/\A(?:full|medium)\z/;
 my $hex = '[a-f0-9]';
 my $OID = $hex .'{40,}';
@@ -270,7 +270,7 @@ sub index_diff ($$$) {
                                push @$xnq, shift(@l);
 
                                # skip base85 and empty lines
-                               while (@l && ($l[0] =~ /$BASE85/o ||
+                               while (@l && ($l[0] =~ /\A$BASE85\h*\z/o ||
                                                $l[0] !~ /\S/)) {
                                        shift @l;
                                }
@@ -389,6 +389,12 @@ sub index_xapian { # msg_iter callback
        undef $s; # free memory
        for my $txt (@sections) {
                if ($txt =~ /\A>/) {
+                       if ($txt =~ /^[>\t ]+GIT binary patch\r?/sm) {
+                               # get rid of Base-85 noise
+                               $txt =~ s/^([>\h]+(?:literal|delta)
+                                               \x20[0-9]+\r?\n)
+                                       (?:[>\h]+$BASE85\h*\r?\n)+/$1/gsmx;
+                       }
                        index_text($self, $txt, 0, 'XQUOT');
                } else {
                        # does it look like a diff?
index dded6c40c2fbc0a40b8a169aafc0d3b59e36f53f..cf639a6deee69a51893d7f75a44b7615d4b33ce3 100644 (file)
@@ -534,7 +534,15 @@ $ibx->with_umask(sub {
                '20200418222508.GA13918@dcvr',
                'Subject search reaches inside message/rfc822');
 
-       $doc_id = $rw->add_message(eml_load('t/data/binary.patch'));
+       my $eml = eml_load('t/data/binary.patch');
+       my $body = $eml->body;
+       $rw->add_message($eml);
+
+       $body =~ s/^/> /gsm;
+       $eml = PublicInbox::Eml->new($eml->header_obj->as_string."\n".$body);
+       $eml->header_set('Message-ID', '<binary-patch-reply@example>');
+       $rw->add_message($eml);
+
        $rw->commit_txn_lazy;
        $ibx->search->reopen;
        my $res = $query->('HcmV');
@@ -542,8 +550,9 @@ $ibx->with_umask(sub {
        $res = $query->('IcmZPo000310RR91');
        is_deeply($res, [], 'no results against 1-byte binary patch');
        $res = $query->('"GIT binary patch"');
-       is(scalar(@$res), 1, 'got binary result from "GIT binary patch"');
+       is(scalar(@$res), 2, 'got binary results from "GIT binary patch"');
        is($res->[0]->{mid}, 'binary-patch-test@example', 'msgid for binary');
+       is($res->[1]->{mid}, 'binary-patch-reply@example', 'msgid for reply');
        my $s = $query->('"literal 1"');
        is_deeply($s, $res, 'got binary result from exact literal size');
        $s = $query->('"literal 2"');