Default: none
+=item publicinbox.<name>.altid
+
+Index by an alternative ID mechanism as a Xapian search prefix e.g.
+C<gmane:1234>. This is useful to allow looking up legacy serial IDs
+(e.g. gmane article numbers).
+
+It must be specified in the form of
+C<serial:$USER_PREFIX:file=$SQLITE_FILENAME> where C<$USER_PREFIX> is a
+lowercase prefix like C<gmane> for search queries, and
+C<$SQLITE_FILENAME> points to an SQLite DB. C<$SQLITE_FILENAME> may
+be an absolute path or a path relative to C<INBOXDIR> for v2 inboxes or
+C<INBOXDIR/public-inbox> for v1 inboxes.
+
+The schema of C<$SQLITE_FILENAME> should be the same as a
+C<msgmap.sqlite3>. See C<scripts/xhdr-num2mid> in the public-inbox
+source tree for an example of how to generate such a mapping
+via NNTP.
+
+This is a noop with C<indexlevel=basic>.
+
+Default: none
+
+=item publicinbox.<name>.indexheader
+
+Supports indexing of arbitrary mail headers in Xapian.
+
+It must be specified in the form of
+C<$TYPE:$USER_PREFIX:$MAIL_HEADER:$PARAMS>
+where C<$TYPE> determines how it's indexed and queried;
+C<$USER_PREFIX> is a lowercase prefix for search queries,
+C<$MAIL_HEADER> is the header to index (e.g. C<X-Label>),
+C<$PARAMS> is a URL-style query string for optional parameters.
+
+Valid C<$TYPE> values (in ascending order of storage cost) are as follows:
+
+* C<boolean_term> - index for simple filtering (not sortable by relevance)
+
+* C<text> - add frequency information to allow sorting by relevance
+
+* C<phrase> - add positional information to match sentences or phrases
+
+In other words: C<phrase> forces indexing of a particular header to
+behave like it used C<indexlevel=full>; while C<text> indexes as if
+that header used C<indexlevel=medium>.
+
+Valid keys in C<$PARAMS> include:
+
+* raw - do not perform RFC2047 decoding of headers
+
+Example:
+
+ [publicinbox "foo"]
+ indexheader = boolean_term:xlabel:X-Label:raw=1
+
+Support for other parameters is not finalized and subject to change.
+
+This is a noop with C<indexlevel=basic>.
+
+New in public-inbox 2.0.0 (PENDING)
+
+Default: none
+
=item publicinbox.<name>.replyto
May be used to control how reply instructions in the PSGI
lib/PublicInbox/Inbox.pm
lib/PublicInbox/InboxIdle.pm
lib/PublicInbox/InboxWritable.pm
+lib/PublicInbox/IndexHeader.pm
lib/PublicInbox/Inotify.pm
lib/PublicInbox/Inotify3.pm
lib/PublicInbox/InputPipe.pm
t/view.t
t/watch_filter_rubylang.t
t/watch_imap.t
+t/watch_indexheader.t
t/watch_maildir.t
t/watch_maildir_v2.t
t/watch_mh.t
-# Copyright (C) 2016-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
# Used for giving serial numbers to messages. This can be tied to
# it leads to reliance on centralization. However, being able
# to use existing serial numbers is beneficial.
package PublicInbox::AltId;
-use strict;
-use warnings;
-use URI::Escape qw(uri_unescape);
-use PublicInbox::Msgmap;
+use v5.12;
+use parent qw(PublicInbox::IndexHeader);
# spec: TYPE:PREFIX:param1=value1&param2=value2&...
# The PREFIX will be a searchable boolean prefix in Xapian
# Example: serial:gmane:file=/path/to/altmsgmap.sqlite3
sub new {
my ($class, $ibx, $spec, $writable) = @_;
- my ($type, $prefix, $query) = split(/:/, $spec, 3);
- $type eq 'serial' or die "non-serial not supported, yet\n";
- $prefix =~ /\A\w+\z/ or warn "non-word prefix not searchable\n";
- my %params = map {
- my ($k, $v) = split(/=/, uri_unescape($_), 2);
- $v = '' unless defined $v;
- ($k, $v);
- } split(/[&;]/, $query);
- my $f = $params{file} or die "file: required for $type spec $spec\n";
+ my ($type, $pfx, $query) = split /:/, $spec, 3;
+ $type eq 'serial' or die "E: non-serial not supported, yet ($spec)\n";
+ my $self = bless {}, $class;
+ my $params = $self->extra_indexer_new_common($spec, $pfx, $query);
+ my $f = delete $params->{file} or
+ die "E: file= required for $type spec $spec\n";
unless (index($f, '/') == 0) {
if ($ibx->version == 1) {
$f = "$ibx->{inboxdir}/public-inbox/$f";
$f = "$ibx->{inboxdir}/$f";
}
}
- bless {
- filename => $f,
- writable => $writable,
- prefix => $prefix,
- xprefix => 'X'.uc($prefix),
- }, $class;
+ my @k = keys %$params;
+ warn "W: unknown params in `$spec': ", join(', ', @k), "\n" if @k;
+ $self->{filename} = $f;
+ $self->{writable} = $writable if $writable;
+ $self;
}
-sub mm_alt {
+sub mm_alt ($) {
my ($self) = @_;
$self->{mm_alt} ||= eval {
- my $f = $self->{filename};
- my $writable = $self->{writable};
- PublicInbox::Msgmap->new_file($f, $writable);
+ require PublicInbox::Msgmap;
+ PublicInbox::Msgmap->new_file(@$self{qw(filename writable)});
};
}
-sub mid2alt {
- my ($self, $mid) = @_;
- $self->mm_alt->num_for($mid);
+sub index_extra { # for PublicInbox::SearchIdx
+ my ($self, $sidx, $eml, $mids) = @_;
+ for my $mid (@$mids) {
+ my $id = mm_alt($self)->num_for($mid) // next;
+ $sidx->index_boolean_term($self->{xprefix}, $id);
+ }
}
+# Help entry for PublicInbox::Search.  Returns a two-element list: the
+# user-facing search prefix (with a trailing colon) and a one-line,
+# newline-terminated description of that prefix.
+sub user_help {
+	my $pfx = $_[0]->{prefix};
+	my $desc = "alternate serial number e.g. ${pfx}:12345 (boolean)\n";
+	("${pfx}:", $desc);
+}
+
+# Callback for PublicInbox::Search: names the query parser method used to
+# register this object's prefix.  Alternate serial IDs are always
+# registered as boolean prefixes.
+sub query_parser_method { return 'add_boolean_prefix' }
+
1;
# more things to encourage decentralization
for my $k (qw(address altid nntpmirror imapmirror
coderepo hide listid url
- infourl watchheader
+ infourl watchheader indexheader
nntpserver imapserver pop3server)) {
my $v = $self->{"$pfx.$k"} // next;
$ibx->{$k} = _array($v);
--- /dev/null
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# allow searching on arbitrary headers as text
+package PublicInbox::IndexHeader;
+use v5.12;
+use URI::Escape qw(uri_unescape);
+
+# Maps each supported indexheader $TYPE to the PublicInbox::SearchIdx
+# method used to index a header value of that type.  Listed in the same
+# order as the documentation (ascending storage cost).
+my %T2IDX = (
+	boolean_term => 'index_boolean_term',
+	text => 'index_text1',
+	phrase => 'index_phrase1',
+);
+
+# Shared constructor logic for extra indexers; also called by AltId->new.
+#
+# Arguments:
+#   $self  - object being constructed (hashref), updated in place
+#   $spec  - the full configuration spec, only used in diagnostics
+#   $pfx   - user-facing search prefix (e.g. "gmane")
+#   $query - URL-style parameter string ("k=v&k2=v2"), may be undef
+#
+# Sets $self->{prefix} and $self->{xprefix}.  The Xapian prefix defaults
+# to "X" followed by the upper-cased user prefix unless an `index_prefix'
+# parameter overrides it.
+#
+# Returns a hashref of the remaining (not yet consumed) parameters so the
+# caller can pick out its own keys and warn about unknown ones.
+#
+# Dies if the user prefix is missing or the Xapian prefix is not
+# upper-case alphanumeric; merely warns if the user prefix is not a
+# lower-case word, since such a prefix is not searchable.
+sub extra_indexer_new_common ($$$$) {
+	my ($self, $spec, $pfx, $query) = @_;
+	# fail early with a clear message instead of matching against undef
+	$pfx // die "E: `$spec' has no user prefix\n";
+	$pfx =~ /\A[a-z][a-z0-9]*\z/ or
+		warn "W: non-word prefix in `$spec' not searchable\n";
+	$self->{prefix} = $pfx;
+	# keys given without `=value' get an empty string as their value
+	my %params = map {
+		my ($k, $v) = split /=/, uri_unescape($_), 2;
+		($k, $v // '');
+	} split /[&;]/, $query // '';
+	my $xpfx = delete($params{index_prefix}) // "X\U$pfx";
+	$xpfx =~ /\A[A-Z][A-Z0-9]*\z/ or
+		die "E: `index_prefix' in `$spec' must be ALL CAPS\n";
+	$self->{xprefix} = $xpfx;
+	\%params;
+}
+
+# Constructor for one `indexheader' spec of an inbox.
+#
+# $spec has the form $TYPE:$USER_PREFIX:$MAIL_HEADER[:$PARAMS].  Dies when
+# the user prefix or mail header is missing, or when $TYPE is not a key
+# of %T2IDX; warns about parameters which are not understood.  The `raw'
+# parameter selects `header_raw' instead of `header' as the method used
+# to read the header when indexing.  $ibx is accepted but not used here.
+sub new {
+	my ($cls, $ibx, $spec) = @_;
+	my ($type, $pfx, $header, $query) = split(/:/, $spec, 4);
+	defined($pfx) or die "E: `$spec' has no user prefix\n";
+	defined($header) or die "E: `$spec' has no mail header\n";
+	my $self = bless { header => $header, type => $type }, $cls;
+	my $params = extra_indexer_new_common($self, $spec, $pfx, $query);
+	my $raw = delete $params->{raw};
+	$self->{hdr_method} = $raw ? 'header_raw' : 'header';
+	# anything still left in $params was not consumed by anybody
+	if (my @unknown = keys %$params) {
+		warn "W: unknown params in `$spec': ",
+			join(', ', @unknown), "\n";
+	}
+	defined($T2IDX{$type}) or die
+		"E: `$type' not supported in $spec, must be one of: ",
+		join(', ', sort keys %T2IDX), "\n";
+	$self;
+}
+
+# Called by PublicInbox::SearchIdx for every message being indexed.
+# Reads every value of the configured header from $eml and hands each one
+# to the $sidx method matching this object's type, together with the
+# Xapian prefix.  The method name is looked up once and cached in
+# $self->{-idx_method}.  $mids is unused here.
+sub index_extra {
+	my ($self, $sidx, $eml, $mids) = @_;
+	$self->{-idx_method} //= $T2IDX{$self->{type}};
+	my ($index, $fetch) = @$self{qw(-idx_method hdr_method)};
+	for my $hdr_val ($eml->$fetch($self->{header})) {
+		$sidx->$index($self->{xprefix}, $hdr_val);
+	}
+}
+
+# Help entry for PublicInbox::Search.  Returns a two-element list: the
+# user-facing search prefix (with a trailing colon) and a one-line,
+# newline-terminated description naming the indexed mail header.
+sub user_help {
+	my ($header, $pfx) = @{$_[0]}{qw(header prefix)};
+	my $desc = sprintf("the `%s' mail header e.g. %s:stable\n",
+				$header, $pfx);
+	("${pfx}:", $desc);
+}
+
+# Maps each indexheader $TYPE to the name of the query parser method used
+# to register its prefix: only `boolean_term' uses a boolean prefix.
+my %TYPE_2_QPMETHOD = (
+	boolean_term => 'add_boolean_prefix',
+	text => 'add_prefix',
+	phrase => 'add_prefix',
+);
+
+# Callback for PublicInbox::Search: returns the query parser method name
+# for this object's type (undef for a type missing from the table).
+sub query_parser_method {
+	my ($self) = @_;
+	$TYPE_2_QPMETHOD{$self->{type}};
+}
+
+1;
};
}
+# Instantiates the extra indexers configured for $ibx.
+#
+# For each of PublicInbox::IndexHeader and PublicInbox::AltId, the specs
+# are read from the lower-cased config key of $ibx (`indexheader',
+# `altid').  The class is only loaded when specs exist, and one object is
+# created per spec.  The objects are stored as an arrayref in
+# $self->{-extra}; that key is left unset when nothing is configured.
+# Dies if a class fails to load (constructors may die as well).
+sub load_extra_indexers ($$) {
+	my ($self, $ibx) = @_;
+	my @extra;
+	for my $name (qw(IndexHeader AltId)) {
+		my $specs = $ibx->{lc $name};
+		next unless defined $specs;
+		my $cls = 'PublicInbox::'.$name;
+		eval "require $cls" or die $@;
+		for my $spec (@$specs) {
+			push @extra, $cls->new($ibx, $spec);
+		}
+	}
+	$self->{-extra} = \@extra if @extra;
+}
+
sub new {
my ($class, $ibx) = @_;
ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx";
my $xap = $ibx->version > 1 ? 'xap' : 'public-inbox/xapian';
my $xpfx = "$ibx->{inboxdir}/$xap".SCHEMA_VERSION;
my $self = bless { xpfx => $xpfx }, $class;
- $self->{altid} = $ibx->{altid} if defined($ibx->{altid});
+ $self->load_extra_indexers($ibx);
$self;
}
$xhc;
}
+my %QPMETHOD_2_SYM = (add_prefix => ':', add_boolean_prefix => '=');
+
sub xh_opt ($$) {
my ($self, $opt) = @_;
my $lim = $opt->{limit} || 50;
push @ret, '-O', $opt->{eidx_key} if defined $opt->{eidx_key};
my $apfx = $self->{-alt_pfx} //= do {
my @tmp;
- for (grep /\Aserial:/, @{$self->{altid} // []}) {
- my (undef, $pfx) = split /:/, $_;
- push @tmp, '-Q', "$pfx=X\U$pfx";
+ for my $x (@{$self->{-extra} // []}) {
+ my $sym = $QPMETHOD_2_SYM{$x->query_parser_method};
+ push @tmp, '-Q', $x->{prefix}.$sym.$x->{xprefix};
}
# TODO: arbitrary header indexing goes here
\@tmp;
$qp->add_boolean_prefix($name, $_) foreach split(/ /, $prefix);
}
- # we do not actually create AltId objects,
- # just parse the spec to avoid the extra DB handles for now.
- if (my $altid = $self->{altid}) {
+ if (my $extra = $self->{-extra}) {
my $user_pfx = $self->{-user_pfx} = [];
- for (@$altid) {
- # $_ = 'serial:gmane:/path/to/gmane.msgmap.sqlite3'
- # note: Xapian supports multibyte UTF-8, /^[0-9]+$/,
- # and '_' with prefixes matching \w+
- /\Aserial:(\w+):/ or next;
- my $pfx = $1;
- push @$user_pfx, "$pfx:", <<EOF;
-alternate serial number e.g. $pfx:12345 (boolean)
-EOF
- # gmane => XGMANE
- $qp->add_boolean_prefix($pfx, 'X'.uc($pfx));
+ for my $x (@$extra) {
+ push @$user_pfx, $x->user_help;
+ my $m = $x->query_parser_method;
+ $qp->$m(@$x{qw(prefix xprefix)});
}
chomp @$user_pfx;
}
sub help {
my ($self) = @_;
- $self->{qp} // $self->qparse_new; # parse altids
+ $self->{qp} // $self->qparse_new; # parse altids + indexheaders
my @ret = @HELP;
if (my $user_pfx = $self->{-user_pfx}) {
push @ret, @$user_pfx;
my $inboxdir = $ibx->{inboxdir};
my $version = $ibx->version;
my $indexlevel = 'full';
- my $altid = $ibx->{altid};
- if ($altid) {
- require PublicInbox::AltId;
- $altid = [ map { PublicInbox::AltId->new($ibx, $_); } @$altid ];
- }
if ($ibx->{indexlevel}) {
if ($ibx->{indexlevel} =~ $INDEXLEVELS) {
$indexlevel = $ibx->{indexlevel};
my $self = PublicInbox::Search->new($ibx);
bless $self, $class;
$self->{ibx} = $ibx;
- $self->{-altid} = $altid;
+ $self->load_extra_indexers($ibx);
$self->{indexlevel} = $indexlevel;
$self->{-set_indexlevel_once} = 1 if $indexlevel eq 'medium';
if ($ibx->{-skip_docdata}) {
$self->{term_generator}->increase_termpos;
}
+# Called by various ->index_extra implementations: adapter taking
+# (prefix, text) which forwards to index_phrase with a fixed third
+# argument of 1 and the prefix last.
+sub index_phrase1 {
+	my ($self, $xpfx, $val) = @_;
+	index_phrase($self, $val, 1, $xpfx);
+}
+
+# Called by various ->index_extra implementations: indexes $val under the
+# given prefix via the term generator's index_text_without_positions,
+# passing 1 as its second argument.
+sub index_text1 {
+	my ($self, $xpfx, $val) = @_;
+	my $tg = $self->{term_generator};
+	$tg->index_text_without_positions($val, 1, $xpfx);
+}
+
+# Called by various ->index_extra implementations: adds the concatenation
+# of prefix and term as a boolean term to the document currently held by
+# the term generator.
+sub index_boolean_term {
+	my ($self, $xpfx, $term) = @_;
+	$self->{term_generator}->get_document->add_boolean_term($xpfx.$term);
+}
+
sub index_text ($$$$) {
my ($self, $text, $wdf_inc, $prefix) = @_;
$doc->set_data($data);
}
- if (my $altid = $self->{-altid}) {
- foreach my $alt (@$altid) {
- my $pfx = $alt->{xprefix};
- foreach my $mid (@$mids) {
- my $id = $alt->mid2alt($mid);
- next unless defined $id;
- $doc->add_boolean_term($pfx . $id);
- }
- }
+ for my $extra (@{$self->{-extra} // []}) {
+ $extra->index_extra($self, $eml, $mids);
}
$doc;
}
--- /dev/null
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use v5.12;
+use autodie;
+use PublicInbox::TestCommon;
+use PublicInbox::Eml;
+use PublicInbox::Emergency;
+use PublicInbox::IO qw(write_file);
+use PublicInbox::InboxIdle;
+use PublicInbox::Inbox;
+use PublicInbox::DS;
+use PublicInbox::Config;
+require_mods(qw(DBD::SQLite Xapian));
+# End-to-end test: messages dropped into watched Maildirs must become
+# searchable through a prefix configured with `indexheader'.
+# Everything (config file, inboxes, Maildirs) lives under $tmpdir.
+my $tmpdir = tmpdir;
+my $config = "$tmpdir/pi_config";
+local $ENV{PI_CONFIG} = $config;
+delete local $ENV{PI_DIR};
+# inbox versions under test; 2 is appended below once require_git passes
+my @V = (1);
+# NOTE(review): the trailing empty sub is presumably the (no-op) import
+# callback expected by create_inbox -- confirm against TestCommon
+my @creat_opt = (indexlevel => 'medium', sub {});
+my $v1 = create_inbox 'v1', tmpdir => "$tmpdir/v1", @creat_opt;
+my $fh = write_file '>', $config, <<EOM;
+[publicinbox "v1"]
+ inboxdir = $v1->{inboxdir}
+ address = v1\@example.com
+ watch = maildir:$tmpdir/v1-md
+ indexheader = boolean_term:xarchiveshash:X-Archives-Hash
+EOM
+
+# NOTE(review): require_git(v2.6, 1) presumably leaves this SKIP block
+# when git is older than 2.6, so only v1 gets tested -- confirm
+SKIP: {
+ require_git(v2.6, 1);
+ push @V, 2;
+ my $v2 = create_inbox 'v2', tmpdir => "$tmpdir/v2", @creat_opt;
+ print $fh <<EOM;
+[publicinbox "v2"]
+ inboxdir = $tmpdir/v2
+ address = v2\@example.com
+ watch = maildir:$tmpdir/v2-md
+ indexheader = boolean_term:xarchiveshash:X-Archives-Hash
+EOM
+}
+close $fh;
+my $cfg = PublicInbox::Config->new;
+# create each watched Maildir ('' is the top-level directory itself)
+for my $v (@V) { for ('', qw(cur new tmp)) { mkdir "$tmpdir/v$v-md/$_" } }
+my $wm = start_script([qw(-watch)]);
+# value of the X-Archives-Hash header searched for at the end
+my $h1 = 'deadbeef' x 4;
+# prepare (but do not yet commit) one message per inbox version
+my @em = map {
+ my $v = $_;
+ my $em = PublicInbox::Emergency->new("$tmpdir/v$v-md");
+ $em->prepare(\(PublicInbox::Eml->new(<<EOM)->as_string));
+From: x\@example.com
+Message-ID: <i-1$v\@example.com>
+To: <v$v\@example.com>
+Date: Sat, 02 Oct 2010 00:00:00 +0000
+X-Archives-Hash: $h1
+
+EOM
+ $em;
+} @V;
+
+# counts callback invocations; one is expected per inbox in @V
+my $delivered = 0;
+my $cb = sub {
+ diag "message delivered to `$_[0]->{name}'";
+ ++$delivered;
+};
+PublicInbox::DS->Reset;
+my $ii = PublicInbox::InboxIdle->new($cfg);
+my $obj = bless \$cb, 'PublicInbox::TestCommon::InboxWakeup';
+$cfg->each_inbox(sub { $_[0]->subscribe_unlock('ident', $obj) });
+# NOTE(review): the event loop presumably keeps running while this
+# returns true, i.e. until every inbox got its message -- confirm
+local @PublicInbox::DS::post_loop_do = (sub { $delivered != @V });
+$_->commit for @em;
+diag 'waiting for -watch to import new message(s)';
+PublicInbox::DS::event_loop();
+$wm->join('TERM');
+$ii->close;
+
+# verify each inbox: a miss, a hit on the indexed header value, the
+# stored header itself and the -Q switch generated by xh_opt
+$cfg->each_inbox(sub {
+ my ($ibx) = @_;
+ my $srch = $ibx->search;
+ my $mset = $srch->mset('xarchiveshash:miss');
+ is($mset->size, 0, 'got xarchiveshash:miss non-result');
+ $mset = $srch->mset("xarchiveshash:$h1");
+ is($mset->size, 1, 'got xarchiveshash: hit result') or return;
+ my $num = $srch->mset_to_artnums($mset);
+ my $eml = $ibx->smsg_eml($ibx->over->get_art($num->[0]));
+ is($eml->header_raw('X-Archives-Hash'), $h1,
+ 'stored message with X-Archives-Hash');
+ my @opt = $srch->xh_opt;
+ is $opt[-2], '-Q', 'xap_helper -Q switch';
+ is $opt[-1], 'xarchiveshash=XXARCHIVESHASH', 'xap_helper -Q arg';
+});
+
+done_testing;