]> git.ipfire.org Git - thirdparty/public-inbox.git/commitdiff
contrib: PSGI RejectBots middleware
author Eric Wong <e@80x24.org>
Sun, 13 Jul 2025 00:08:36 +0000 (00:08 +0000)
committer Eric Wong <e@80x24.org>
Wed, 16 Jul 2025 09:21:20 +0000 (09:21 +0000)
RejectBots rejects certain user-agents and forces a meta-refresh
while requiring the use of persistent connections.  While the
first two techniques are relatively well-known, the persistent
connection requirement means public-inbox-(netd|httpd) must be
directly connected to the remote client, without something
like (haproxy|nginx) in front of it.

This middleware is used by public-inbox.org and the
yhbt.net/lore mirror.

MANIFEST
contrib/RejectBots.pm [new file with mode: 0644]

index cc105c191e8ff2d6a216cd3ba7e2235894686759..aa4d8465edab467b6870ed5b3c204a635b66a17e 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -114,6 +114,7 @@ certs/create-certs.perl
 ci/README
 ci/profiles.perl
 ci/run.sh
+contrib/RejectBots.pm
 contrib/completion/lei-completion.bash
 contrib/css/216dark.css
 contrib/css/216light.css
diff --git a/contrib/RejectBots.pm b/contrib/RejectBots.pm
new file mode 100644 (file)
index 0000000..28561fa
--- /dev/null
@@ -0,0 +1,42 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: GPL-3.0+ <https://www.gnu.org/licenses/gpl-3.0.txt>
+#
+# Plack/PSGI middleware to reject aggressive scrapers, requires
+# public-inbox-(httpd|netd) to detect persistent connections
+# via $env->{'pi-httpd.request_nr'}.
+package RejectBots;
+use v5.12;
+use parent qw(Plack::Middleware);
+
+# User-Agent fragments identifying crawlers that are refused outright.
+# Each entry is a regex fragment (note 'awario.*bot'), not a literal;
+# the fragments are OR-ed together and matched case-insensitively
+# anywhere in the header value.
+my @bad_ua_frags = (
+      'Bytespider', 'meta-externalagent', 'petalbot',
+      'dataforseo', 'mj12bot', 'yandex', 'zoominfobot',
+      'amazonbot', 'barkrowler', 'oai-searchbot', 'chatgpt',
+      'semrushbot', 'ahrefsbot', 'alibaba',
+      'gptbot', 'awario.*bot', 'magesiftbot', 'serpstatbot',
+      'claudebot', 'google-extended', 'seekport crawler',
+      'blexbot', 'turnitin', 'Scrapy', 'bingbot');
+my $bad_ua = do {
+      my $alternation = join '|', @bad_ua_frags;
+      qr/(?:$alternation)/i };
+
+# PSGI entry point: gate each request before handing it to the wrapped app.
+#
+# $env is the PSGI environment hashref.  Returns a PSGI response:
+#   - 403 with an empty body when the User-Agent matches $bad_ua;
+#   - a 200 text/plain notice carrying a "Refresh: 1" header when a
+#     browser-like client (User-Agent starting with Mozilla/ or Opera/)
+#     requests a non-CSS path while $env->{'pi-httpd.request_nr'} is
+#     false, unless its Referer already ends with the requested URI;
+#   - otherwise whatever the wrapped app returns.
+sub call {
+       my ($self, $env) = @_;
+       my $ua = $env->{HTTP_USER_AGENT} // '';
+       # PSGI requires a Content-Type on every response except 1xx/204/304
+       return [ 403, [ 'Content-Type' => 'text/plain',
+                       'Content-Length' => 0 ], [] ] if $ua =~ $bad_ua;
+
+       # Decide before calling the wrapped app, so a bounced request never
+       # pays for a response that would only be thrown away.
+       # NOTE(review): a false pi-httpd.request_nr is treated as "this
+       # connection has not been reused yet"; the value comes from
+       # public-inbox-(httpd|netd) -- confirm its exact semantics there.
+       # NOTE(review): the Referer test presumably lets a client through
+       # when its refresh arrives on a fresh connection -- confirm.
+       my $uri = $env->{REQUEST_URI};
+       return $self->app->($env) if $env->{'pi-httpd.request_nr'} ||
+                       !defined($uri) ||
+                       $env->{PATH_INFO} =~ /\.css\z/ ||
+                       $ua !~ m!\A(?:Mozilla|Opera)/! ||
+                       ($env->{HTTP_REFERER} // '') =~ /\Q$uri\E\z/;
+
+       my $body = <<EOM;
+Requiring persistent connection to access: $uri ...
+EOM
+       # $uri is client-controlled and echoed back: pin the type to plain
+       # text so the body is never left to content sniffing.
+       return [ 200, [ 'Refresh' => 1, 'Content-Type' => 'text/plain',
+                       'Content-Length' => length($body) ], [ $body ] ];
+}
+
+1;