From: Eric Wong Date: Sun, 13 Jul 2025 00:08:36 +0000 (+0000) Subject: contrib: PSGI RejectBots middleware X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c849f08a150fd278997607c8e36ea4f6b2ac7718;p=thirdparty%2Fpublic-inbox.git contrib: PSGI RejectBots middleware RejectBots rejects certain user-agents and forces meta-refresh while requiring the use of persistent connections. While the first two techniques are relatively well-known, the persistent connection requirement requires the public-inbox-(netd|httpd) to be directly connected with the remote client without something like (haproxy|nginx) in front of it. This middleware is used by public-inbox.org and the yhbt.net/lore mirror. --- diff --git a/MANIFEST b/MANIFEST index cc105c191..aa4d8465e 100644 --- a/MANIFEST +++ b/MANIFEST @@ -114,6 +114,7 @@ certs/create-certs.perl ci/README ci/profiles.perl ci/run.sh +contrib/RejectBots.pm contrib/completion/lei-completion.bash contrib/css/216dark.css contrib/css/216light.css diff --git a/contrib/RejectBots.pm b/contrib/RejectBots.pm new file mode 100644 index 000000000..28561fa79 --- /dev/null +++ b/contrib/RejectBots.pm @@ -0,0 +1,42 @@ +# Copyright (C) all contributors +# License: GPL-3.0+ +# +# Plack/PSGI middleware to reject aggressive scrapers, requires +# public-inbox-(httpd|netd) to detect persistent connections +# via $env->{'pi-httpd.request_nr'}. 
+package RejectBots;
+use v5.12;
+use parent qw(Plack::Middleware);
+# User-Agent patterns that are refused outright; the entries are joined into a single alternation.
+my $bad_ua = join '|',
+    'Bytespider', 'meta-externalagent', 'petalbot',
+    'dataforseo', 'mj12bot', 'yandex', 'zoominfobot',
+    'amazonbot', 'barkrowler', 'oai-searchbot', 'chatgpt',
+    'semrushbot', 'ahrefsbot', 'alibaba',
+    'gptbot', 'awario.*bot', 'magesiftbot', 'serpstatbot', # entries are regex fragments, not literals (note the .* here)
+    'claudebot', 'google-extended', 'seekport crawler',
+    'blexbot', 'turnitin', 'Scrapy', 'bingbot';
+$bad_ua = qr/(?:$bad_ua)/i; # compiled once, case-insensitive and unanchored: a match anywhere in the header counts
+# call($self, $env): middleware entry point; takes the PSGI environment hash and yields a PSGI response.
+sub call {
+    my ($self, $env) = @_;
+    my $ua = $env->{HTTP_USER_AGENT} // ''; # a request without a User-Agent header is handled as an empty string
+    return [ 403, [], [] ] if $ua =~ /$bad_ua/o; # listed agents get a 403 with no headers and no body; the wrapped app is not called
+    my $res = $self->app->($env); # the wrapped app runs before the checks below, even when its response ends up unused
+    my $uri;
+    if ($env->{PATH_INFO} !~ /\.css\z/ && # path does not end in .css
+        $ua =~ m!\A(?:Mozilla|Opera)/! && # User-Agent begins with Mozilla/ or Opera/
+        defined($uri = $env->{REQUEST_URI}) && # REQUEST_URI is present; keep it in $uri
+        ($env->{HTTP_REFERER} // '') !~ /\Q$uri\E\z/ && # Referer does not end with this URI (\Q..\E makes the comparison literal); presumably lets the follow-up request through - TODO confirm
+        !$env->{'pi-httpd.request_nr'}) { # request_nr is unset or false; presumably the first request on a connection - verify against public-inbox-httpd
+        my $body = < 1, # NOTE(review): this span looks truncated by extraction - the heredoc for $body and the start of the response array are missing; restore from the upstream commit before using this code
+            'Content-Length' => length($body) ], [ $body ] ] # visible tail of the substitute response: Content-Length is taken from length($body) and $body is the only body element
+    } else {
+        $res; # otherwise the wrapped app response is the value of the sub (last expression evaluated)
+    }
+}
+# A module file has to end with a true value.
+1;