From b3e430bc4518282873b5115bb48b26df3da958dc Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Fri, 24 Oct 2025 17:34:09 +0000 Subject: [PATCH] contrib/reject_bots: drop persistent connection requirement Like many measures against aggressive scraper bots which ignore or insufficiently support robots.txt, requiring persistent connections no longer seems effective. The $env->{'pi-httpd.request_nr'} field remains for logging, but will probably be removed soon. --- contrib/RejectBots.pm | 19 ++----------------- lib/PublicInbox/HTTP.pm | 2 +- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/contrib/RejectBots.pm b/contrib/RejectBots.pm index 81c574c06..a36ee186c 100644 --- a/contrib/RejectBots.pm +++ b/contrib/RejectBots.pm @@ -1,9 +1,7 @@ # Copyright (C) all contributors # License: GPL-3.0+ # -# Plack/PSGI middleware to reject aggressive scrapers, requires -# public-inbox-(httpd|netd) to detect persistent connections -# via $env->{'pi-httpd.request_nr'}. +# Plack/PSGI middleware to reject aggressive bots package RejectBots; use v5.12; use parent qw(Plack::Middleware); @@ -22,20 +20,7 @@ sub call { my ($self, $env) = @_; my $ua = $env->{HTTP_USER_AGENT} // ''; return [ 403, [], [] ] if $ua =~ /$bad_ua/o; - my $uri; - if ($env->{PATH_INFO} !~ m!(?:/\.well-known/|\.css\z)! && - $ua =~ m!\A(?:Mozilla|Opera)/! 
&& - defined($uri = $env->{REQUEST_URI}) && - ($env->{HTTP_REFERER} // '') !~ /\Q$uri\E\z/ && - !$env->{'pi-httpd.request_nr'}) { - my $body = < 1, 'Content-Type' => 'text/plain', - 'Content-Length' => length($body) ], [ $body ] ] - } else { - $self->app->($env); - } + $self->app->($env); } 1; diff --git a/lib/PublicInbox/HTTP.pm b/lib/PublicInbox/HTTP.pm index 3cb3b35fb..d61b89511 100644 --- a/lib/PublicInbox/HTTP.pm +++ b/lib/PublicInbox/HTTP.pm @@ -144,7 +144,7 @@ sub app_dispatch { my ($self, $input, $rbuf) = @_; $self->rbuf_idle($rbuf); my $env = $self->{env}; - $env->{'pi-httpd.request_nr'} = $self->{request_nr}++; + $env->{'pi-httpd.request_nr'} = $self->{request_nr}++; # TODO remove? $self->{env} = undef; # for exists() check in ->busy $env->{REMOTE_ADDR} = $self->{remote_addr} // '127.0.0.1'; $env->{REMOTE_PORT} = $self->{remote_port}; -- 2.47.3