From: Eric Wong Date: Wed, 13 Aug 2025 21:42:46 +0000 (+0000) Subject: extindex: reduce IPC for cross-posted messages X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=29c04518fc3c915f10126eb58820275e351f7cbf;p=thirdparty%2Fpublic-inbox.git extindex: reduce IPC for cross-posted messages When dealing with cross-posted messages, we can reduce IPC traffic by only sending the List-ID header(s) across the pipe instead of all headers of a message. List-ID headers only take up a small portion of all message headers, so the IPC traffic reduction can be helpful in saving memory bandwidth to improve performance. This change will also make it easier to journal work for Xapian and perform all work for cross-posted messages in the same transaction. This ought to reduce how often a given document is updated across multiple transactions and hopefully reduce storage device wear while improving performance. --- diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 748743fe7..923de7873 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -199,8 +199,10 @@ sub _unref_doc ($$$$$;$) { if ($ibx) { my $ekey = $ibx->{-gc_eidx_key} // $ibx->eidx_key; my $idx = $self->idx_shard($docid); - $idx->ipc_do('remove_eidx_info', $docid, $ekey, $eml); - } # else: we can't remove_eidx_info in reindex-only path + my @list_ids = $eml->header_raw('List-Id'); + $idx->ipc_do('remove_eidx_info_raw', $docid, $ekey, + @list_ids); + } # else: we can't remove_eidx_info_raw in reindex-only path # replace invalidated blob ASAP with something which should be # readable since we may commit the transaction on checkpoint. 
@@ -230,7 +232,7 @@ sub do_xpost ($$) { my $xnum = $req->{xnum}; $self->{oidx}->add_xref3($docid, $xnum, $oid, $eidx_key); my $idx = $self->idx_shard($docid); - $idx->ipc_do('add_eidx_info', $docid, $eidx_key, $eml); + $idx->add_eidx_info($docid, $eidx_key, $eml); apply_boost($req, $smsg) if $self->{boost_in_use}; } else { # 'd' no {xnum} $self->git->async_wait_all; @@ -588,9 +590,9 @@ sub _reindex_finalize ($$$) { $smsg->{eidx_key} = $ibx->eidx_key; $idx->index_eml($eml, $smsg); for my $x (reverse @$stable) { - $ibx = _ibx_for $self, $x; - my $hdr = delete $x->{hdr} // die 'BUG: no {hdr}'; - $idx->ipc_do('add_eidx_info', $docid, $ibx->eidx_key, $hdr); + my $lid = delete $x->{lid} // die 'BUG: no {lid}'; + @$lid and $idx->ipc_do('add_eidx_info_raw', $docid, + _ibx_for($self, $x)->eidx_key, @$lid); } return if $nr == 1; # likely, all good @@ -653,7 +655,7 @@ sub _reindex_oid { # git->cat_async callback $re_smsg->{chash} = $chash; $re_smsg->{xnum} = $req->{xr3r}->[$req->{ix}]->[1]; $re_smsg->{ibx_id} = $req->{xr3r}->[$req->{ix}]->[0]; - $re_smsg->{hdr} = $eml->header_obj; + @{$re_smsg->{lid}} = $eml->header_raw('List-Id'); push @{$req->{by_chash}->{$chash}}, $re_smsg; if (my $next_oid = $req->{xr3r}->[++$req->{ix}]->[2]) { $self->git->cat_async($next_oid, \&_reindex_oid, $req); diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm index 5b2c55878..46460c569 100644 --- a/lib/PublicInbox/LeiStore.pm +++ b/lib/PublicInbox/LeiStore.pm @@ -456,7 +456,7 @@ sub add_eml { $smsg->{-merge_vmd} = 1; $idx->index_eml($eml, $smsg); } else { # lse fuzzy hit off ale - $idx->ipc_do('add_eidx_info', $docid, '.', $eml); + $idx->add_eidx_info($docid, '.', $eml); } for my $oid (keys %$xoids) { $oidx->add_xref3($docid, -1, $oid, '.'); @@ -470,7 +470,7 @@ sub add_eml { my $idx = $eidx->idx_shard($docid); $oidx->add_xref3($docid, -1, $smsg->{blob}, '.'); # add_eidx_info for List-Id - $idx->ipc_do('add_eidx_info', $docid, '.', $eml); + $idx->add_eidx_info($docid, '.', $eml); 
_add_vmd($self, $idx, $docid, $vmd) if $vmd; } _docids_and_maybe_kw $self, \@docids; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index f4df9d0af..125f43f79 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -446,17 +446,22 @@ sub index_xapian { # msg_iter callback index_body_text($self, $doc, \$s); } -sub index_list_id ($$$) { - my ($self, $doc, $hdr) = @_; - for my $l ($hdr->header_raw('List-Id')) { +sub index_list_id_raw ($$@) { + my ($self, $doc, @list_ids) = @_; + for my $l (@list_ids) { $l =~ /<([^>]+)>/ or next; my $lid = lc $1; $lid =~ tr/\n\t\r\0//d; # same rules as Message-ID add_bool_term $doc, 'G' . $lid; - index_phrase($self, $lid, 1, 'XL'); # probabilistic + index_phrase $self, $lid, 1, 'XL'; # probabilistic } } +sub index_list_id ($$$) { + my ($self, $doc, $hdr) = @_; + index_list_id_raw $self, $doc, $hdr->header_raw('List-Id'); +} + sub index_ids ($$$$) { my ($self, $doc, $hdr, $mids) = @_; for my $mid (@$mids) { @@ -470,7 +475,7 @@ sub index_ids ($$$$) { } } add_bool_term($doc, 'Q'.$_) for @$mids; - index_list_id($self, $doc, $hdr); + index_list_id $self, $doc, $hdr; } sub eml2doc ($$$;$) { @@ -601,8 +606,8 @@ sub _get_doc ($$) { } } -sub add_eidx_info { - my ($self, $docid, $eidx_key, $eml) = @_; +sub add_eidx_info_raw { + my ($self, $docid, $eidx_key, @list_ids) = @_; begin_txn_lazy($self); my $doc = _get_doc($self, $docid) or return; term_generator($self)->set_document($doc); @@ -610,7 +615,7 @@ sub add_eidx_info { # '.' 
is special for lei_store add_bool_term($doc, 'O'.$eidx_key) if $eidx_key ne '.'; - index_list_id($self, $doc, $eml); + index_list_id_raw $self, $doc, @list_ids; $self->{xdb}->replace_document($docid, $doc); } @@ -620,13 +625,13 @@ sub get_terms { xap_terms($pfx, $self->{xdb}, $docid); } -sub remove_eidx_info { - my ($self, $docid, $eidx_key, $eml) = @_; +sub remove_eidx_info_raw { + my ($self, $docid, $eidx_key, @list_ids) = @_; begin_txn_lazy($self); my $doc = _get_doc($self, $docid) or return; eval { $doc->remove_term('O'.$eidx_key) }; warn "W: ->remove_term O$eidx_key: $@\n" if $@; - for my $l ($eml ? $eml->header_raw('List-Id') : ()) { + for my $l (@list_ids) { $l =~ /<([^>]+)>/ or next; my $lid = lc $1; eval { $doc->remove_term('G' . $lid) }; diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm index 7ee8a1214..108aaaeb7 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -53,6 +53,12 @@ sub index_eml { $self->ipc_do('add_xapian', $eml, $smsg); } +sub add_eidx_info { + my ($self, $docid, $eidx_key, $eml) = @_; + my @list_ids = $eml->header_raw('List-Id'); + $self->ipc_do('add_eidx_info_raw', $docid, $eidx_key, @list_ids); +} + # wait for return to determine when ipc_do('commit_txn_lazy') is done sub echo { shift;