From: Eric Wong Date: Thu, 20 Feb 2025 22:14:30 +0000 (+0000) Subject: search: index References: for thread:GHOST-MSGID X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a896657a6e22dc6fc28db3a1e3ec6e69022260a5;p=thirdparty%2Fpublic-inbox.git search: index References: for thread:GHOST-MSGID To search for messages in a thread with a ghost msgid, we need to be able to search against msgids in the References: header since (by definition) ghosts don't show up as any Message-ID: we've indexed. This should make our implementation of `thread:MSGID' queries equivalent in capability to `thread:THREADID' of notmuch. --- diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index cb1661016..0e288cf04 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -36,7 +36,7 @@ use constant { # 4 - change "Re: " normalization, avoid circular Reference ghosts # 5 - subject_path drops trailing '.' # 6 - preserve References: order in document data - # 7 - remove references and inreplyto terms + # 7 - remove references and inreplyto terms (restored in 15 (v2.0)) # 8 - remove redundant/unneeded document data # 9 - disable Message-ID compression (SHA-1) # 10 - optimize doc for NNTP overviews @@ -53,6 +53,7 @@ use constant { # * "lid:" and "l:" for List-Id searches # # v1.6.0 adds BYTES, UID and THREADID values + # v2.0.0 re-adds "references:" SCHEMA_VERSION => 15, # we may have up to 8 FDs per shard (depends on Xapian *shrug*) @@ -151,6 +152,7 @@ our %PATCH_BOOL_COMMON = ( my %bool_pfx_external = ( mid => 'Q', # Message-ID (full/exact), this is mostly uniQue lid => 'G', # newsGroup (or similar entity), just inside <> + references => 'XRF', %PATCH_BOOL_COMMON ); diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 1e8246bb4..db4fcf76b 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -481,6 +481,9 @@ sub eml2doc ($$$;$) { $doc->add_boolean_term('O'.$ekey) if ($ekey // '.') ne '.'; msg_iter($eml, \&index_xapian, [ $self, $doc ]); index_ids($self, $doc, $eml, $mids); + for (@{$smsg->parse_references($eml, $mids)}) { + $doc->add_boolean_term('XRF'.$_) + } # by default, we maintain compatibility with v1.5.0 and earlier # by writing to docdata.glass, users who never expect to downgrade can @@ -488,9 +491,7 @@ sub eml2doc ($$$;$) { if (!$self->{-skip_docdata}) { # WWW doesn't need {to} or {cc}, only NNTP $smsg->{to} = $smsg->{cc} = ''; - $smsg->parse_references($eml, $mids); - my $data = $smsg->to_doc_data; - $doc->set_data($data); + $doc->set_data($smsg->to_doc_data); } my $xtra = defined $ekey ? $self->{"-extra\t$ekey"} : undef; $xtra //= $self->{-extra}; diff --git a/lib/PublicInbox/xh_thread_fp.h b/lib/PublicInbox/xh_thread_fp.h index c7d36c362..2c88401c8 100644 --- a/lib/PublicInbox/xh_thread_fp.h +++ b/lib/PublicInbox/xh_thread_fp.h @@ -64,7 +64,9 @@ Xapian::Query ThreadFieldProcessor::operator()(const std::string &str) Xapian::Query qry; if (str.at(0) != '{') { // thread:$MSGID (no `{'/`}' encasement) - qry = Xapian::Query("Q" + str); + qry = Xapian::Query(Xapian::Query::OP_OR, + Xapian::Query("Q" + str), + Xapian::Query("XRF" + str)); } else if (str.size() <= 1 || str.at(str.size() - 1) != '}') { throw Xapian::QueryParserError("missing } in '" + str + "'"); } else { // thread:"{hello world}" diff --git a/t/search.t b/t/search.t index 8938e6c6d..a0f257699 100644 --- a/t/search.t +++ b/t/search.t @@ -135,6 +135,16 @@ my $query = sub { my $second = $res->[0]; isnt($first, $second, "offset returned different result from limit"); + + for my $f (qw(references)) { + $res = $query->($f . ':root@s'); + @res = filter_mids($res); + is_deeply \@res, [ 'last@s' ], + "got expected results for $f: match"; + diag explain(\@res); + $res = $query->($f . ':root'); + is scalar(@$res), 0, "no partial mid match"; + } } # ghost vivication diff --git a/t/xap_helper.t b/t/xap_helper.t index 3e8176a01..e87c9da8e 100644 --- a/t/xap_helper.t +++ b/t/xap_helper.t @@ -40,7 +40,7 @@ my $v2 = create_inbox 'v2', indexlevel => 'medium', version => 2, } }; -my $thr = create_inbox 'thr', indexlevel => 'medium', version => 2, +my $thr = create_inbox 'thr-ref+', indexlevel => 'medium', version => 2, tmpdir => "$tmp/thr", sub { my ($im) = @_; my $common = <('thread:ghost-root@example'); + is scalar(@art), 6, + 'expected number of results for thread:GHOST-MSGID'; + is scalar(grep { $_->{references} =~ /ghost-root/ } @art), + scalar(@art), + 'thread:MSGID works on ghosts'; + my $nr = $ENV{TEST_LEAK_NR} or skip 'TEST_LEAK_NR unset', 1; $ENV{VALGRIND} or diag "W: `VALGRIND=' unset w/ TEST_LEAK_NR (using -fsanitize?)";