git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-79638: Treat an unreachable robots.txt as "disallow all" (GH-138555)
author: Serhiy Storchaka <storchaka@gmail.com>
Thu, 7 May 2026 19:06:57 +0000 (22:06 +0300)
committer: GitHub <noreply@github.com>
Thu, 7 May 2026 19:06:57 +0000 (22:06 +0300)
Disallow all access in urllib.robotparser if the robots.txt file is
unreachable due to server or network errors.

Lib/test/test_robotparser.py
Lib/urllib/robotparser.py
Misc/NEWS.d/next/Library/2025-09-05-20-50-35.gh-issue-79638.Y-JfaH.rst [new file with mode: 0644]

index 3ea0ec66fbfbe9e453c5cbc8c63167af2b3ac49d..65bfe815705e0a0da1b21c946802dc798f3636b1 100644 (file)
@@ -646,26 +646,23 @@ Disallow: /spam\
 )
 class BaseLocalNetworkTestCase:
 
-    def setUp(self):
+    @classmethod
+    def setUpClass(cls):
         # clear _opener global variable
-        self.addCleanup(urllib.request.urlcleanup)
+        cls.addClassCleanup(urllib.request.urlcleanup)
 
-        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
+        cls.server = HTTPServer((socket_helper.HOST, 0), cls.RobotHandler)
+        cls.addClassCleanup(cls.server.server_close)
 
-        self.t = threading.Thread(
+        t = threading.Thread(
             name='HTTPServer serving',
-            target=self.server.serve_forever,
+            target=cls.server.serve_forever,
             # Short poll interval to make the test finish quickly.
             # Time between requests is short enough that we won't wake
             # up spuriously too many times.
             kwargs={'poll_interval':0.01})
-        self.t.daemon = True  # In case this function raises.
-        self.t.start()
-
-    def tearDown(self):
-        self.server.shutdown()
-        self.t.join()
-        self.server.server_close()
+        cls.enterClassContext(threading_helper.start_threads([t]))
+        cls.addClassCleanup(cls.server.shutdown)
 
 
 SAMPLE_ROBOTS_TXT = b'''\
@@ -687,7 +684,6 @@ class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
         def log_message(self, format, *args):
             pass
 
-    @threading_helper.reap_threads
     def testRead(self):
         # Test that reading a weird robots.txt doesn't fail.
         addr = self.server.server_address
@@ -709,17 +705,21 @@ class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
         self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
 
 
-class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+class HttpErrorsTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
     class RobotHandler(BaseHTTPRequestHandler):
 
         def do_GET(self):
-            self.send_error(403, "Forbidden access")
+            self.send_error(self.server.return_code)
 
         def log_message(self, format, *args):
             pass
 
-    @threading_helper.reap_threads
+    def setUp(self):
+        # Make sure that a valid code is set in the test.
+        self.server.return_code = None
+
     def testPasswordProtectedSite(self):
+        self.server.return_code = 403
         addr = self.server.server_address
         url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
         robots_url = url + "/robots.txt"
@@ -727,6 +727,40 @@ class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase)
         parser.set_url(url)
         parser.read()
         self.assertFalse(parser.can_fetch("*", robots_url))
+        self.assertFalse(parser.can_fetch("*", url + '/some/file.html'))
+
+    def testNotFound(self):
+        self.server.return_code = 404
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertTrue(parser.can_fetch("*", robots_url))
+        self.assertTrue(parser.can_fetch("*", url + '/path/file.html'))
+
+    def testTeapot(self):
+        self.server.return_code = 418
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertTrue(parser.can_fetch("*", robots_url))
+        self.assertTrue(parser.can_fetch("*", url + '/pot-1?milk-type=Cream'))
+
+    def testServiceUnavailable(self):
+        self.server.return_code = 503
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + "/robots.txt"
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(url)
+        parser.read()
+        self.assertFalse(parser.can_fetch("*", robots_url))
+        self.assertFalse(parser.can_fetch("*", url + '/path/file.html'))
 
 
 @support.requires_working_socket()
index e70eae80036784006fa28bc5c05632fa46ece69d..0c3e5d9289093583a53b453b6d056cfaf47eb79e 100644 (file)
@@ -65,9 +65,17 @@ class RobotFileParser:
             f = urllib.request.urlopen(self.url)
         except urllib.error.HTTPError as err:
             if err.code in (401, 403):
+                # If access to robots.txt has the status Unauthorized/Forbidden,
+                # then most likely this applies to the entire site.
                 self.disallow_all = True
-            elif err.code >= 400 and err.code < 500:
+            elif 400 <= err.code < 500:
+                # RFC 9309, Section 2.3.1.3: the crawler MAY access any
+                # resources on the server.
                 self.allow_all = True
+            elif 500 <= err.code < 600:
+                # RFC 9309, Section 2.3.1.4: the crawler MUST assume
+                # complete disallow.
+                self.disallow_all = True
             err.close()
         else:
             raw = f.read()
diff --git a/Misc/NEWS.d/next/Library/2025-09-05-20-50-35.gh-issue-79638.Y-JfaH.rst b/Misc/NEWS.d/next/Library/2025-09-05-20-50-35.gh-issue-79638.Y-JfaH.rst
new file mode 100644 (file)
index 0000000..bd9fff0
--- /dev/null
@@ -0,0 +1,2 @@
+Disallow all access in :mod:`urllib.robotparser` if the ``robots.txt`` file
+is unreachable due to server or network errors.