This was previously known as "mainrepo", which remains supported,
but "inboxdir" takes precedence.
+Absolute pathnames longer than 244 bytes cannot be indexed
+with L<public-inbox-extindex(1)>.
+
Default: none, required
=item publicinbox.<name>.url
being served by L<public-inbox-nntpd(1)>,
L<public-inbox-imapd(1)>, and/or L<public-inbox-pop3d(1)>
-Newsgroup names should be all lowercase. Uppercase characters are
-converted to lowercase for compatibility with IMAP, POP3, and our
+Newsgroup names should be all lowercase and have a maximum
+length of 244 bytes. Uppercase characters are converted to
+lowercase for compatibility with IMAP, POP3, and our
L<public-inbox-extindex(1)> and L<public-inbox-cindex(1)> tools
starting with public-inbox 2.0+ (they were unusable before).
=item coderepo.<nick>.dir
The path to a git repository for "publicinbox.<name>.coderepo"
+Absolute pathnames longer than 244 bytes cannot be indexed
+with L<public-inbox-cindex(1)>.
+
=item coderepo.<nick>.cgitUrl
use List::Util qw(max);
use PublicInbox::SHA qw(sha256_hex sha_all);
use PublicInbox::Search qw(xap_terms);
-use PublicInbox::SearchIdx qw(add_val);
+use PublicInbox::SearchIdx qw(add_val add_bool_term);
use PublicInbox::Config qw(glob2re rel2abs_collapsed);
use PublicInbox::Spawn qw(which spawn popen_rd);
use PublicInbox::OnDestroy;
join(', ', map { "#$_" } @extra), "\n";
$self->{xdb}->delete_document($_) for @extra;
my $doc = $PublicInbox::Search::X{Document}->new;
- $doc->add_boolean_term($x);
+ $doc->add_boolean_term($x); # "Q$COMMIT_OIDHEX"
$doc->add_boolean_term('G'.$_) for @$roots;
$doc->add_boolean_term('XP'.$_) for split(/ /, $cmt->{P});
$doc->add_boolean_term('T'.'c');
$self->{xdb}->delete_document($_) for @{$repo->{to_delete}};
my $doc = $PublicInbox::Search::X{Document}->new;
add_val($doc, PublicInbox::CodeSearch::CT, $repo->{ct});
- $doc->add_boolean_term("P$repo->{git_dir}");
+ add_bool_term($doc, "P$repo->{git_dir}");
$doc->add_boolean_term('T'.'r');
$doc->add_boolean_term('G'.$_) for @{$repo->{roots}};
$doc->set_data($repo->{fp}); # \n delimited
$OFMT2HEXLEN{$fmt} // warn <<EOM; # store unknown formats anyways
E: unknown extensions.objectFormat=$fmt in $repo->{git_dir}
EOM
- $doc->add_boolean_term('H'.$fmt);
+ add_bool_term($doc, 'H'.$fmt);
my $did = $repo->{docid};
$did ? $self->{xdb}->replace_document($did, $doc)
: ($did = $self->{xdb}->add_document($doc));
warn "BUG? #$docid for $git_dir has no P(ath)";
@p == 1 or return warn "BUG? #$docid $git_dir multi: @p";
$p[0] eq $git_dir or return warn "BUG? #$docid $git_dir != @p";
- $doc->add_boolean_term('H'.$fmt);
+ add_bool_term($doc, 'H'.$fmt);
$self->{xdb}->replace_document($docid, $doc);
# wait for prune_commit to commit...
}
use v5.10.1;
use PublicInbox::InboxWritable;
use PublicInbox::Search; # for SWIG Xapian and Search::Xapian compat
-use PublicInbox::SearchIdx qw(index_text term_generator add_val);
+use PublicInbox::SearchIdx qw(index_text term_generator add_val add_bool_term);
use Carp qw(croak);
use File::Path ();
use PublicInbox::MiscSearch;
add_val($doc, $PublicInbox::MiscSearch::MODIFIED, $ibx->modified);
add_val($doc, $PublicInbox::MiscSearch::UIDVALIDITY, $ibx->uidvalidity);
- $doc->add_boolean_term('Q'.$eidx_key); # uniQue id
+ add_bool_term($doc, 'Q'.$eidx_key); # uniQue id
$doc->add_boolean_term('T'.'inbox'); # Type
# force reread from disk, {description} could be loaded from {misc}
use PublicInbox::Address;
use Config;
our @EXPORT_OK = qw(log2stack is_ancestor check_size prepare_stack
- index_text term_generator add_val is_bad_blob update_checkpoint);
+ index_text term_generator add_val is_bad_blob update_checkpoint
+ add_bool_term);
my $X = \%PublicInbox::Search::X;
our ($DB_CREATE_OR_OPEN, $DB_OPEN);
our $DB_NO_SYNC = 0;
# assume a typical 64-bit system has 8x more RAM than a
# typical 32-bit system:
(($Config{ptrsize} >= 8 ? 8192 : 1024) * 1024);
-use constant DEBUG => !!$ENV{DEBUG};
+use constant {
+ DEBUG => !!$ENV{DEBUG}, # boolean, from the DEBUG environment variable
+ # add_bool_term refuses terms longer than this; the documented
+ # 244-byte limits on pathnames and newsgroup names are this value
+ # minus a 1-byte term prefix
+ MAX_TERM_SIZE => 245, # Xapian limitation, includes prefix
+};
my $BASE85 = qr/[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+/;
my $xapianlevels = qr/\A(?:full|medium)\z/;
my $hex = '[a-f0-9]';
$self->{term_generator}->index_text_without_positions($text, 1, $pfx);
}
+# add_bool_term($doc, $pfx_term)
+# Adds the boolean term $pfx_term (prefix already prepended) to the
+# Xapian document $doc via $doc->add_boolean_term.  Terms larger than
+# MAX_TERM_SIZE (the Xapian term size limit, prefix included) are not
+# added; a warning naming the skipped term is emitted instead.
+sub add_bool_term ($$) {
+	my ($doc, $pfx_term) = @_;
+	# The limit is expressed in bytes, whereas plain length() counts
+	# characters on character-flagged strings.  `use bytes' makes
+	# length() report the size of the string's internal octet buffer
+	# (presumably what the Xapian bindings hand to the library --
+	# NOTE(review): confirm against the binding in use).
+	my $nbytes = do { use bytes; length($pfx_term) };
+	if ($nbytes > MAX_TERM_SIZE) {
+		# plain warn: the trailing "\n" suppresses the file/line
+		# suffix, which carp would append regardless of the newline
+		warn "W: skipping term: `$pfx_term'.length > ",
+			MAX_TERM_SIZE, "\n";
+	} else {
+		$doc->add_boolean_term($pfx_term);
+	}
+}
+
sub index_boolean_term { # called by various ->index_extra
my ($self, $pfx, $term) = @_;
- my $doc = $self->{term_generator}->get_document;
- $doc->add_boolean_term($pfx.$term);
+ # go through add_bool_term so that a $pfx.$term exceeding
+ # MAX_TERM_SIZE is skipped with a warning instead of being added
+ add_bool_term($self->{term_generator}->get_document, $pfx.$term);
}
sub index_text ($$$$) {
$l =~ /<([^>]+)>/ or next;
my $lid = lc $1;
$lid =~ tr/\n\t\r\0//d; # same rules as Message-ID
- $doc->add_boolean_term('G' . $lid);
+ add_bool_term $doc, 'G' . $lid;
index_phrase($self, $lid, 1, 'XL'); # probabilistic
}
}
index_phrase($self, join(' ', @long), 1, 'XM');
}
}
- $doc->add_boolean_term('Q' . $_) for @$mids;
+ add_bool_term($doc, 'Q'.$_) for @$mids;
index_list_id($self, $doc, $hdr);
}
index_headers($self, $smsg);
my $ekey = $smsg->{eidx_key};
- $doc->add_boolean_term('O'.$ekey) if ($ekey // '.') ne '.';
+ add_bool_term($doc, 'O'.$ekey) if ($ekey // '.') ne '.';
msg_iter($eml, \&index_xapian, [ $self, $doc ]);
index_ids($self, $doc, $eml, $mids);
for (@{$smsg->parse_references($eml, $mids)}) {
- $doc->add_boolean_term('XRF'.$_)
+ add_bool_term $doc, 'XRF'.$_;
}
# by default, we maintain compatibility with v1.5.0 and earlier
my @x = @VMD_MAP;
while (my ($field, $pfx) = splice(@x, 0, 2)) {
for my $term (xap_terms($pfx, $old)) {
- $doc->add_boolean_term($pfx.$term);
+ add_bool_term $doc, $pfx.$term;
}
}
}
term_generator($self)->set_document($doc);
# '.' is special for lei_store
- $doc->add_boolean_term('O'.$eidx_key) if $eidx_key ne '.';
+ add_bool_term($doc, 'O'.$eidx_key) if $eidx_key ne '.';
index_list_id($self, $doc, $eml);
$self->{xdb}->replace_document($docid, $doc);
}
return unless scalar(@rm) || scalar(@add);
$doc->remove_term($_) for @rm;
- $doc->add_boolean_term($_) for @add;
+ add_bool_term($doc, $_) for @add;
$self->{xdb}->replace_document($docid, $doc);
}
};
}
for my $val (@{$vmd_mod->{"+$field"} // []}) {
- $doc->add_boolean_term($pfx . $val);
+ add_bool_term($doc, $pfx . $val);
++$updated;
}
}
my $updated = 0;
while (my ($field, $pfx) = splice(@x, 0, 2)) {
my $add = $vmd->{$field} // next;
- $doc->add_boolean_term($pfx . $_) for @$add;
+ add_bool_term($doc, $pfx . $_) for @$add;
$updated += scalar(@$add);
}
$updated += apply_vmd_mod($doc, $vmd);
is_deeply($s, $res, 'got binary result from exact literal size');
$s = $query->('"literal 2"');
is_deeply($s, [], 'no results for wrong size');
+
+ {
+ # References are indexed as 'XRF'.$mid terms and the term size
+ # limit is 245 including the prefix.  Assuming the angle brackets
+ # are not part of the term: 3 ('XRF') + 241 + 2 ('@x') = 246,
+ # one over the limit, so a "skipping term" warning is expected.
+ my @w;
+ $eml->header_set('References', '<'.('x' x 241).'@x>');
+ $eml->header_set('Message-ID', '<references-too-long@x>');
+ local $SIG{__WARN__} = sub { push @w, @_; };
+ $rw->add_message($eml);
+ like "@w", qr/\bskipping term:/, 'excessively long term skipped';
+ @w = ();
+ # one 'x' fewer: 3 + 240 + 2 = 245, exactly at the limit, so the
+ # term must be accepted without any warning
+ $eml->header_set('References', '<'.('x' x 240).'@x>');
+ $eml->header_set('Message-ID', '<references-barely-fit@x>');
+ $rw->add_message($eml);
+ is_deeply \@w, [], 'no warnings on barely-fitting references';
+ }
+ $rw->commit_txn_lazy;
});
SKIP: {