From: Eric Wong Date: Thu, 7 Aug 2025 00:49:31 +0000 (+0000) Subject: *index: don't try to index boolean terms >245 bytes long X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=fa9c112306844defa83042808a72c68be9813916;p=thirdparty%2Fpublic-inbox.git *index: don't try to index boolean terms >245 bytes long Xapian's flint, chert, and glass backends only support a maximum term length of 245 bytes including the upper-case term prefix. Thus we can't index things like insanely long Message-IDs in References: or overly long pathnames or newsgroup names. The work-in-progress honey backend doesn't support updates yet (it can only be created from an existing glass DB), and I doubt the limit will increase since excessive term lengths are usually the result of mangled whitespace or a broken/spammy client somewhere. --- diff --git a/Documentation/public-inbox-config.pod b/Documentation/public-inbox-config.pod index 0e3569614..1497219b0 100644 --- a/Documentation/public-inbox-config.pod +++ b/Documentation/public-inbox-config.pod @@ -50,6 +50,9 @@ public-inbox. This must be specified once. This was previously known as "mainrepo", which remains supported, but "inboxdir" takes precedence. +Absolute pathnames longer than 244 bytes cannot be indexed +with L + Default: none, required =item publicinbox..url @@ -74,8 +77,9 @@ Omitting this for a given inbox will prevent the inbox from being served by L, L, and/or L -Newsgroup names should be all lowercase. Uppercase characters are -converted to lowercase for compatibility with IMAP, POP3, and our +Newsgroup names should be all lowercase and have a maximum +length of 244 bytes. Uppercase characters are converted to +lowercase for compatibility with IMAP, POP3, and our L and L tools starting with public-inbox 2.0+ (they were unusable before). 
@@ -369,6 +373,8 @@ Default: none =item coderepo..dir The path to a git repository for "publicinbox..coderepo" +Absolute pathnames longer than 244 bytes cannot be indexed +with L =item coderepo..cgitUrl diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm index 6dc2a7a9f..ddb598030 100644 --- a/lib/PublicInbox/CodeSearchIdx.pm +++ b/lib/PublicInbox/CodeSearchIdx.pm @@ -61,7 +61,7 @@ use File::Spec::Functions qw(canonpath); use List::Util qw(max); use PublicInbox::SHA qw(sha256_hex sha_all); use PublicInbox::Search qw(xap_terms); -use PublicInbox::SearchIdx qw(add_val); +use PublicInbox::SearchIdx qw(add_val add_bool_term); use PublicInbox::Config qw(glob2re rel2abs_collapsed); use PublicInbox::Spawn qw(which spawn popen_rd); use PublicInbox::OnDestroy; @@ -182,7 +182,7 @@ sub update_commit ($$$) { join(', ', map { "#$_" } @extra), "\n"; $self->{xdb}->delete_document($_) for @extra; my $doc = $PublicInbox::Search::X{Document}->new; - $doc->add_boolean_term($x); + $doc->add_boolean_term($x); # "Q$COMMIT_OIDHEX" $doc->add_boolean_term('G'.$_) for @$roots; $doc->add_boolean_term('XP'.$_) for split(/ /, $cmt->{P}); $doc->add_boolean_term('T'.'c'); @@ -247,7 +247,7 @@ sub store_repo { # wq_io_do, sends docid back $self->{xdb}->delete_document($_) for @{$repo->{to_delete}}; my $doc = $PublicInbox::Search::X{Document}->new; add_val($doc, PublicInbox::CodeSearch::CT, $repo->{ct}); - $doc->add_boolean_term("P$repo->{git_dir}"); + add_bool_term($doc, "P$repo->{git_dir}"); $doc->add_boolean_term('T'.'r'); $doc->add_boolean_term('G'.$_) for @{$repo->{roots}}; $doc->set_data($repo->{fp}); # \n delimited @@ -257,7 +257,7 @@ sub store_repo { # wq_io_do, sends docid back $OFMT2HEXLEN{$fmt} // warn <{git_dir} EOM - $doc->add_boolean_term('H'.$fmt); + add_bool_term($doc, 'H'.$fmt); my $did = $repo->{docid}; $did ? 
$self->{xdb}->replace_document($did, $doc) : ($did = $self->{xdb}->add_document($doc)); @@ -908,7 +908,7 @@ sub store_objfmt { # via wq_do - make early cidx users happy warn "BUG? #$docid for $git_dir has no P(ath)"; @p == 1 or return warn "BUG? #$docid $git_dir multi: @p"; $p[0] eq $git_dir or return warn "BUG? #$docid $git_dir != @p"; - $doc->add_boolean_term('H'.$fmt); + add_bool_term($doc, 'H'.$fmt); $self->{xdb}->replace_document($docid, $doc); # wait for prune_commit to commit... } diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm index 6708527d7..e2ed9d111 100644 --- a/lib/PublicInbox/MiscIdx.pm +++ b/lib/PublicInbox/MiscIdx.pm @@ -15,7 +15,7 @@ use strict; use v5.10.1; use PublicInbox::InboxWritable; use PublicInbox::Search; # for SWIG Xapian and Search::Xapian compat -use PublicInbox::SearchIdx qw(index_text term_generator add_val); +use PublicInbox::SearchIdx qw(index_text term_generator add_val add_bool_term); use Carp qw(croak); use File::Path (); use PublicInbox::MiscSearch; @@ -105,7 +105,7 @@ EOF add_val($doc, $PublicInbox::MiscSearch::MODIFIED, $ibx->modified); add_val($doc, $PublicInbox::MiscSearch::UIDVALIDITY, $ibx->uidvalidity); - $doc->add_boolean_term('Q'.$eidx_key); # uniQue id + add_bool_term($doc, 'Q'.$eidx_key); # uniQue id $doc->add_boolean_term('T'.'inbox'); # Type # force reread from disk, {description} could be loaded from {misc} diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index ca9953ce0..f4df9d0af 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -29,7 +29,8 @@ use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); use PublicInbox::Address; use Config; our @EXPORT_OK = qw(log2stack is_ancestor check_size prepare_stack - index_text term_generator add_val is_bad_blob update_checkpoint); + index_text term_generator add_val is_bad_blob update_checkpoint + add_bool_term); my $X = \%PublicInbox::Search::X; our ($DB_CREATE_OR_OPEN, $DB_OPEN); our $DB_NO_SYNC = 
0; @@ -39,7 +40,10 @@ our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff : # assume a typical 64-bit system has 8x more RAM than a # typical 32-bit system: (($Config{ptrsize} >= 8 ? 8192 : 1024) * 1024); -use constant DEBUG => !!$ENV{DEBUG}; +use constant { + DEBUG => !!$ENV{DEBUG}, + MAX_TERM_SIZE => 245, # Xapian limitation, includes prefix +}; my $BASE85 = qr/[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+/; my $xapianlevels = qr/\A(?:full|medium)\z/; my $hex = '[a-f0-9]'; @@ -191,10 +195,19 @@ sub index_text1 { # called by various ->index_extra $self->{term_generator}->index_text_without_positions($text, 1, $pfx); } +sub add_bool_term ($$) { + my ($doc, $pfx_term) = @_; + if (length($pfx_term) > MAX_TERM_SIZE) { + carp "W: skipping term: `$pfx_term'.length > ", + MAX_TERM_SIZE, "\n"; + } else { + $doc->add_boolean_term($pfx_term); + } +} + sub index_boolean_term { # called by various ->index_extra my ($self, $pfx, $term) = @_; - my $doc = $self->{term_generator}->get_document; - $doc->add_boolean_term($pfx.$term); + add_bool_term($self->{term_generator}->get_document, $pfx.$term); } sub index_text ($$$$) { @@ -439,7 +452,7 @@ sub index_list_id ($$$) { $l =~ /<([^>]+)>/ or next; my $lid = lc $1; $lid =~ tr/\n\t\r\0//d; # same rules as Message-ID - $doc->add_boolean_term('G' . $lid); + add_bool_term $doc, 'G' . $lid; index_phrase($self, $lid, 1, 'XL'); # probabilistic } } @@ -456,7 +469,7 @@ sub index_ids ($$$$) { index_phrase($self, join(' ', @long), 1, 'XM'); } } - $doc->add_boolean_term('Q' . 
$_) for @$mids; + add_bool_term($doc, 'Q'.$_) for @$mids; index_list_id($self, $doc, $hdr); } @@ -478,11 +491,11 @@ sub eml2doc ($$$;$) { index_headers($self, $smsg); my $ekey = $smsg->{eidx_key}; - $doc->add_boolean_term('O'.$ekey) if ($ekey // '.') ne '.'; + add_bool_term($doc, 'O'.$ekey) if ($ekey // '.') ne '.'; msg_iter($eml, \&index_xapian, [ $self, $doc ]); index_ids($self, $doc, $eml, $mids); for (@{$smsg->parse_references($eml, $mids)}) { - $doc->add_boolean_term('XRF'.$_) + add_bool_term $doc, 'XRF'.$_; } # by default, we maintain compatibility with v1.5.0 and earlier @@ -510,7 +523,7 @@ sub add_xapian ($$$$) { my @x = @VMD_MAP; while (my ($field, $pfx) = splice(@x, 0, 2)) { for my $term (xap_terms($pfx, $old)) { - $doc->add_boolean_term($pfx.$term); + add_bool_term $doc, $pfx.$term; } } } @@ -595,7 +608,7 @@ sub add_eidx_info { term_generator($self)->set_document($doc); # '.' is special for lei_store - $doc->add_boolean_term('O'.$eidx_key) if $eidx_key ne '.'; + add_bool_term($doc, 'O'.$eidx_key) if $eidx_key ne '.'; index_list_id($self, $doc, $eml); $self->{xdb}->replace_document($docid, $doc); @@ -657,7 +670,7 @@ sub set_vmd { } return unless scalar(@rm) || scalar(@add); $doc->remove_term($_) for @rm; - $doc->add_boolean_term($_) for @add; + add_bool_term($doc, $_) for @add; $self->{xdb}->replace_document($docid, $doc); } @@ -674,7 +687,7 @@ sub apply_vmd_mod ($$) { }; } for my $val (@{$vmd_mod->{"+$field"} // []}) { - $doc->add_boolean_term($pfx . $val); + add_bool_term($doc, $pfx . $val); ++$updated; } } @@ -689,7 +702,7 @@ sub add_vmd { my $updated = 0; while (my ($field, $pfx) = splice(@x, 0, 2)) { my $add = $vmd->{$field} // next; - $doc->add_boolean_term($pfx . $_) for @$add; + add_bool_term($doc, $pfx . 
$_) for @$add; $updated += scalar(@$add); } $updated += apply_vmd_mod($doc, $vmd); diff --git a/t/search.t b/t/search.t index e793f55b1..0c6aace80 100644 --- a/t/search.t +++ b/t/search.t @@ -567,6 +567,21 @@ $ibx->with_umask(sub { is_deeply($s, $res, 'got binary result from exact literal size'); $s = $query->('"literal 2"'); is_deeply($s, [], 'no results for wrong size'); + + { + my @w; + $eml->header_set('References', '<'.('x' x 241).'@x>'); + $eml->header_set('Message-ID', ''); + local $SIG{__WARN__} = sub { push @w, @_; }; + $rw->add_message($eml); + like "@w", qr/\bskipping term:/, 'excessively long term skipped'; + @w = (); + $eml->header_set('References', '<'.('x' x 240).'@x>'); + $eml->header_set('Message-ID', ''); + $rw->add_message($eml); + is_deeply \@w, [], 'no warnings on barely-fitting references'; + } + $rw->commit_txn_lazy; }); SKIP: {