]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
bpo-35922: Fix RobotFileParser when robots.txt has no relevant crawl delay or request...
authorMiss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Sun, 16 Jun 2019 07:10:06 +0000 (00:10 -0700)
committerGitHub <noreply@github.com>
Sun, 16 Jun 2019 07:10:06 +0000 (00:10 -0700)
Co-Authored-By: Tal Einat <taleinat+github@gmail.com>
(cherry picked from commit 8047e0e1c620f69cc21f9ca48b24bf2cdd5c3668)

Co-authored-by: Rémi Lapeyre <remi.lapeyre@henki.fr>
Lib/test/test_robotparser.py
Lib/urllib/robotparser.py
Misc/NEWS.d/next/Library/2019-06-11-19-34-29.bpo-35922.rxpzWr.rst [new file with mode: 0644]

index 140636590aa8600429638eda6569353e72263564..d478e7f127fd54242b551545476e2a0f835fd25a 100644 (file)
@@ -76,30 +76,38 @@ Disallow: /
 
 
 class BaseRequestRateTest(BaseRobotTest):
+    request_rate = None
+    crawl_delay = None
 
     def test_request_rate(self):
+        parser = self.parser
         for url in self.good + self.bad:
             agent, url = self.get_agent_and_url(url)
             with self.subTest(url=url, agent=agent):
-                if self.crawl_delay:
-                    self.assertEqual(
-                        self.parser.crawl_delay(agent), self.crawl_delay
-                    )
-                if self.request_rate:
+                self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)
+
+                parsed_request_rate = parser.request_rate(agent)
+                self.assertEqual(parsed_request_rate, self.request_rate)
+                if self.request_rate is not None:
                     self.assertIsInstance(
-                        self.parser.request_rate(agent),
+                        parsed_request_rate,
                         urllib.robotparser.RequestRate
                     )
                     self.assertEqual(
-                        self.parser.request_rate(agent).requests,
+                        parsed_request_rate.requests,
                         self.request_rate.requests
                     )
                     self.assertEqual(
-                        self.parser.request_rate(agent).seconds,
+                        parsed_request_rate.seconds,
                         self.request_rate.seconds
                     )
 
 
+class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = ''
+    good = ['/foo']
+
+
 class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
     robots_txt = """\
 User-agent: figtree
@@ -120,10 +128,6 @@ Disallow: /%7ejoe/index.html
 
 class DifferentAgentTest(CrawlDelayAndRequestRateTest):
     agent = 'FigTree Robot libwww-perl/5.04'
-    # these are not actually tested, but we still need to parse it
-    # in order to accommodate the input parameters
-    request_rate = None
-    crawl_delay = None
 
 
 class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
index 883ef249210ebcede3e84d11c12f224530b2a645..f3bd806f072683d2448c441b2130fef70014449a 100644 (file)
@@ -179,7 +179,9 @@ class RobotFileParser:
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.delay
-        return self.default_entry.delay
+        if self.default_entry:
+            return self.default_entry.delay
+        return None
 
     def request_rate(self, useragent):
         if not self.mtime():
@@ -187,7 +189,9 @@ class RobotFileParser:
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.req_rate
-        return self.default_entry.req_rate
+        if self.default_entry:
+            return self.default_entry.req_rate
+        return None
 
     def __str__(self):
         entries = self.entries
diff --git a/Misc/NEWS.d/next/Library/2019-06-11-19-34-29.bpo-35922.rxpzWr.rst b/Misc/NEWS.d/next/Library/2019-06-11-19-34-29.bpo-35922.rxpzWr.rst
new file mode 100644 (file)
index 0000000..5271a49
--- /dev/null
@@ -0,0 +1,4 @@
+Fix :meth:`RobotFileParser.crawl_delay` and
+:meth:`RobotFileParser.request_rate` to return ``None`` rather than
+raise :exc:`AttributeError` when no relevant rule is defined in the
+robots.txt file.  Patch by Rémi Lapeyre.