From: Eric Wong Date: Tue, 25 Jun 2024 18:49:37 +0000 (+0000) Subject: speedup $EXTRACT_DIFFS callers by 1% X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=410a19a8088101d8aafd14dc019fbb2e9a71d5f9;p=thirdparty%2Fpublic-inbox.git speedup $EXTRACT_DIFFS callers by 1% While Perl docs recommend against using //o, we know the regexp won't change at runtime and there's a measurable improvement to be found. The included perf test on a packed mirror of meta@public-inbox.org shows a consistent ~1% improvement on my system. --- diff --git a/MANIFEST b/MANIFEST index 3fc01a435..af65a86e8 100644 --- a/MANIFEST +++ b/MANIFEST @@ -662,6 +662,7 @@ xt/net_nntp_socks.t xt/net_writer-imap.t xt/nntpd-validate.t xt/over-fsck.perl +xt/perf-extract-diffs.t xt/perf-msgview.t xt/perf-nntpd.t xt/perf-threading.t diff --git a/lib/PublicInbox/LeiRediff.pm b/lib/PublicInbox/LeiRediff.pm index 66359dd44..59fee3f6f 100644 --- a/lib/PublicInbox/LeiRediff.pm +++ b/lib/PublicInbox/LeiRediff.pm @@ -158,7 +158,7 @@ sub extract_oids { # Eml each_part callback $self->{dqre} && $s =~ s/$self->{dqre}//g && $lei->{opt}->{drq} and local $lei->{1} = requote($lei, $1); - my @top = split($PublicInbox::ViewDiff::EXTRACT_DIFFS, $s); + my @top = split(/$PublicInbox::ViewDiff::EXTRACT_DIFFS/o, $s); undef $s; my $blobs = $self->{blobs}; # blobs to resolve my $ctxq; diff --git a/lib/PublicInbox/LeiViewText.pm b/lib/PublicInbox/LeiViewText.pm index 6510b19ec..fa608ca2b 100644 --- a/lib/PublicInbox/LeiViewText.pm +++ b/lib/PublicInbox/LeiViewText.pm @@ -187,7 +187,7 @@ EOF sub flush_text_diff ($$) { my ($self, $cur) = @_; - my @top = split($PublicInbox::ViewDiff::EXTRACT_DIFFS, $$cur); + my @top = split(/$PublicInbox::ViewDiff::EXTRACT_DIFFS/o, $$cur); undef $$cur; # free memory my $dctx; my $obuf = $self->{obuf}; diff --git a/lib/PublicInbox/ViewDiff.pm b/lib/PublicInbox/ViewDiff.pm index 7a6d9a2b9..9ddf85152 100644 --- a/lib/PublicInbox/ViewDiff.pm +++ b/lib/PublicInbox/ViewDiff.pm @@ -183,7 +183,7 @@ sub flush_diff ($$) { my ($ctx, $cur) = @_; my ($subj) = ($$cur =~ /^Subject:\s*\[[^\]]+\]\s*(.+?)$/sm); - my @top = split($EXTRACT_DIFFS, $$cur); + my @top = split(/$EXTRACT_DIFFS/o, $$cur); undef $$cur; # free memory $ctx->{-qry_subj} = $subj if $subj; my $lnk = $ctx->{-linkify}; diff --git a/xt/perf-extract-diffs.t b/xt/perf-extract-diffs.t new file mode 100644 index 000000000..72c39ed16 --- /dev/null +++ b/xt/perf-extract-diffs.t @@ -0,0 +1,80 @@ +#!perl -w +# Copyright (C) all contributors +# License: AGPL-3.0+ +use v5.12; +use PublicInbox::TestCommon; +use Benchmark qw(:all :hireswallclock); +use PublicInbox::Inbox; +use PublicInbox::ViewDiff; +use PublicInbox::MsgIter qw(msg_part_text); +my $nr = $ENV{NR} // 5; +my $inboxdir = $ENV{GIANT_INBOX_DIR}; +plan skip_all => "GIANT_INBOX_DIR not defined for $0" unless $inboxdir; + +my @cat = qw(cat-file --buffer --batch-check --batch-all-objects); +if (require_git(v2.19, 1)) { + push @cat, '--unordered'; +} else { + warn +"git <2.19, cat-file lacks --unordered, locality suffers\n"; +} +my $ibx = PublicInbox::Inbox->new({ inboxdir => $inboxdir, name => 'name' }); +my $git = $ibx->git; +my ($eml, $res, $oid, $type, $n, $m); +my ($part, $s, $err, @top); +sub text_part { + $part = $_[0]->[0]; + ($s, $err) = msg_part_text($part, $part->content_type || 'text/plain'); + $s // return; + $s =~ s/\r+\n/\n/sg; +} + +my %extract_cb = ( + var => sub { # callback for Eml->each_part + text_part(@_) // return; + my @top = split($PublicInbox::ViewDiff::EXTRACT_DIFFS, $s); + }, + slash => sub { # callback for Eml->each_part + text_part(@_) // return; + my @top = split(/$PublicInbox::ViewDiff::EXTRACT_DIFFS/, $s); + }, + slash_o => sub { # callback for Eml->each_part + text_part(@_) // return; + my @top = split(/$PublicInbox::ViewDiff::EXTRACT_DIFFS/o, $s); + }, +); + +my $oid_cb = sub { + my ($bref, undef, undef, undef, $cb) = @_; + ++$m; + $eml = PublicInbox::Eml->new($bref); + $eml->each_part($cb); +}; + +# ensure all --batch-check processes are ready +my @cats = map { + my $fh = $git->popen(@cat); + vec(my $vec = '', fileno($fh), 1) = 1; + select($vec, undef, undef, 60) or + xbail 'timed out waiting for --batch-check'; + $fh +} (1..((scalar keys %extract_cb) * $nr)); + +my $time; +while (my ($name, $eml_cb) = each %extract_cb) { + $time->{$name} = sub { + my $fh = shift @cats // xbail "no --batch-check for $name"; + $n = $m = 0; + while (<$fh>) { + ($oid, $type) = split / /; + next if $type ne 'blob'; + ++$n; + $git->cat_async($oid, $oid_cb, $eml_cb); + } + $git->async_wait_all; + is $n, $m, "$n of $m messages scanned ($name)"; + }; +} + +timethese($nr, $time); +done_testing;