git.ipfire.org Git - thirdparty/public-inbox.git/commitdiff
contrib/reject_bots: drop persistent connection requirement
author Eric Wong <e@80x24.org>
Fri, 24 Oct 2025 17:34:09 +0000 (17:34 +0000)
committer Eric Wong <e@80x24.org>
Sat, 25 Oct 2025 09:37:03 +0000 (09:37 +0000)
Like many measures against aggressive scraper bots which ignore
or insufficiently support robots.txt, requiring persistent
connections no longer seems effective.  The
$env->{'pi-httpd.request_nr'} field remains for logging,
but will probably be removed soon.

contrib/RejectBots.pm
lib/PublicInbox/HTTP.pm

index 81c574c06fb4f05cf4994ec824e8fa3e9f52a357..a36ee186c09c8267dddf1b9ee57cfd4dbd1014c3 100644 (file)
@@ -1,9 +1,7 @@
 # Copyright (C) all contributors <meta@public-inbox.org>
 # License: GPL-3.0+ <https://www.gnu.org/licenses/gpl-3.0.txt>
 #
-# Plack/PSGI middleware to reject aggressive scrapers, requires
-# public-inbox-(httpd|netd) to detect persistent connections
-# via $env->{'pi-httpd.request_nr'}.
+# Plack/PSGI middleware to reject aggressive bots
 package RejectBots;
 use v5.12;
 use parent qw(Plack::Middleware);
@@ -22,20 +20,7 @@ sub call {
        my ($self, $env) = @_;
        my $ua = $env->{HTTP_USER_AGENT} // '';
        return [ 403, [], [] ] if $ua =~ /$bad_ua/o;
-       my $uri;
-       if ($env->{PATH_INFO} !~ m!(?:/\.well-known/|\.css\z)! &&
-                       $ua =~ m!\A(?:Mozilla|Opera)/! &&
-                       defined($uri = $env->{REQUEST_URI}) &&
-                       ($env->{HTTP_REFERER} // '') !~ /\Q$uri\E\z/ &&
-                       !$env->{'pi-httpd.request_nr'}) {
-               my $body = <<EOM;
-Requiring persistent connection to access: $uri ...
-EOM
-               [ 200, [ 'Refresh' => 1, 'Content-Type' => 'text/plain',
-                       'Content-Length' => length($body) ], [ $body ] ]
-       } else {
-               $self->app->($env);
-       }
+       $self->app->($env);
 }
 
 1;
index 3cb3b35fb81c3a34d5b65584577868dd279435e7..d61b89511f64d4e252c3679634f81017b1e00434 100644 (file)
@@ -144,7 +144,7 @@ sub app_dispatch {
        my ($self, $input, $rbuf) = @_;
        $self->rbuf_idle($rbuf);
        my $env = $self->{env};
-       $env->{'pi-httpd.request_nr'} = $self->{request_nr}++;
+       $env->{'pi-httpd.request_nr'} = $self->{request_nr}++; # TODO remove?
        $self->{env} = undef; # for exists() check in ->busy
        $env->{REMOTE_ADDR} = $self->{remote_addr} // '127.0.0.1';
        $env->{REMOTE_PORT} = $self->{remote_port};