$dig->add($s);
}
-sub content_digest ($;$) {
- my ($eml, $dig) = @_;
+sub content_digest ($;$$) {
+ my ($eml, $dig, $hash_mids) = @_;
$dig //= Digest::SHA->new(256);
# References: and In-Reply-To: get used interchangeably
# in some "duplicates" in LKML. We treat them the same
# in SearchIdx, so treat them the same for this:
# do NOT consider the Message-ID as part of the content_hash
- # if we got here, we've already got Message-ID reuse
- my %seen = map { $_ => 1 } @{mids($eml)};
+ # if we got here, we've already got Message-ID reuse for v2.
+ #
+ # However, `lei q --dedupe=content' does use $hash_mids, since
+ # it has no other dedupe step that accounts for the Message-ID.
+ my $mids = mids($eml);
+ if ($hash_mids) {
+ $dig->add("mid\0$_\0") for @$mids;
+ }
+ my %seen = map { $_ => 1 } @$mids;
for (grep { !$seen{$_}++ } @{references($eml)}) {
utf8::encode($_);
$dig->add("ref\0$_\0");
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
package PublicInbox::LeiDedupe;
use v5.12;
-use PublicInbox::ContentHash qw(content_hash git_sha);
+use PublicInbox::ContentHash qw(content_hash content_digest git_sha);
use PublicInbox::SHA qw(sha256);
# n.b. mutt sets most of these headers; not sure about Bytes
my ($skv) = @_;
(sub { # may be called in a child process
my ($eml) = @_; # $oidhex = $_[1], ignored
- $skv->set_maybe(content_hash($eml), '');
+
+ # we must account for Message-ID via hash_mids, since
+ # (unlike v2 dedupe) Message-ID is not accounted for elsewhere:
+ $skv->set_maybe(content_digest($eml, PublicInbox::SHA->new(256),
+ 1 # hash_mids
+ )->digest, '');
}, sub {
my ($smsg) = @_;
$skv->set_maybe(smsg_hash($smsg), '');
#!perl -w
-# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
use strict;
use v5.10.1;
require_mods(qw(DBD::SQLite));
use_ok 'PublicInbox::LeiDedupe';
my $eml = eml_load('t/plack-qp.eml');
+my $sameish = eml_load('t/plack-qp.eml');
+$sameish->header_set('Message-ID', '<cuepee@example.com>');
my $mid = $eml->header_raw('Message-ID');
my $different = eml_load('t/msg_iter-order.eml');
$different->header_set('Message-ID', $mid);
ok(!$dd->is_dup($different), "different is_dup with $desc dedupe");
ok(!$dd->is_smsg_dup($smsg), "is_smsg_dup pass w/ $desc dedupe");
ok($dd->is_smsg_dup($smsg), "is_smsg_dup reject w/ $desc dedupe");
+ ok(!$dd->is_dup($sameish),
+ "Message-ID accounted for w/ same content otherwise");
}
$lei->{opt}->{dedupe} = 'bogus';
eval { PublicInbox::LeiDedupe->new($lei) };