good = []
bad = []
site_maps = None
+ expected_output = None
def __init_subclass__(cls):
super().__init_subclass__()
# Remove tests that do nothing.
- if not cls.good:
- cls.test_good_urls = None
- if not cls.bad:
- cls.test_bad_urls = None
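+ # Only prune test methods on concrete TestCase subclasses; pruning
+ # them on a mixin base would hide the tests from all its subclasses.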
+ if issubclass(cls, unittest.TestCase):
+ if not cls.good:
+ cls.test_good_urls = None
+ if not cls.bad:
+ cls.test_bad_urls = None
+ if cls.expected_output is None:
+ cls.test_string_formatting = None
def setUp(self):
lines = io.StringIO(self.robots_txt).readlines()
def test_site_maps(self):
self.assertEqual(self.parser.site_maps(), self.site_maps)
+ def test_string_formatting(self):
+ self.assertEqual(str(self.parser), self.expected_output)
class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
good = ['/', '/test.html']
bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']
+class SimpleExampleTest(BaseRobotTest, unittest.TestCase):
+ # Example from RFC 9309, section 5.1.
+ robots_txt = """\
+User-Agent: *
+Disallow: *.gif$
+Disallow: /example/
+Allow: /publications/
+
+User-Agent: foobot
+Disallow:/
+Allow:/example/page.html
+Allow:/example/allowed.gif
+
+User-Agent: barbot
+User-Agent: bazbot
+Disallow: /example/page.html
+
+User-Agent: quxbot
+ """
+ good = [
+ '/', '/publications/',
+ ('foobot', '/example/page.html'), ('foobot', '/example/allowed.gif'),
+ ('barbot', '/'), ('barbot', '/example/'),
+ ('barbot', '/example/allowed.gif'),
+ ('barbot', '/example/disallowed.gif'),
+ ('barbot', '/publications/'),
+ ('barbot', '/publications/allowed.gif'),
+ ('bazbot', '/'), ('bazbot', '/example/'),
+ ('bazbot', '/example/allowed.gif'),
+ ('bazbot', '/example/disallowed.gif'),
+ ('bazbot', '/publications/'),
+ ('bazbot', '/publications/allowed.gif'),
+ ('quxbot', '/'), ('quxbot', '/example/'),
+ ('quxbot', '/example/page.html'), ('quxbot', '/example/allowed.gif'),
+ ('quxbot', '/example/disallowed.gif'),
+ ('quxbot', '/publications/'),
+ ('quxbot', '/publications/allowed.gif'),
+ ]
+ bad = [
+ '/example/', '/example/page.html', '/example/allowed.gif',
+ '/example/disallowed.gif',
+ '/publications/allowed.gif',
+ ('foobot', '/'), ('foobot', '/example/'),
+ ('foobot', '/example/disallowed.gif'),
+ ('foobot', '/publications/'),
+ ('foobot', '/publications/allowed.gif'),
+ ('barbot', '/example/page.html'),
+ ('bazbot', '/example/page.html'),
+ ]
+
class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Disallow: /
"""
- good = []
+ good = ['/robots.txt']
bad = ['/cyberworld/map/index.html', '/', '/tmp/']
class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
robots_txt = ''
good = ['/foo']
+ expected_output = ''
class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
- # the order of User-agent should be correct. note
- # that this file is incorrect because "Googlebot" is a
- # substring of "Googlebot-Mobile"
+ # The order of the User-agent groups should not matter.
robots_txt = """\
User-agent: Googlebot
Disallow: /
+Allow: /folder1/
User-agent: Googlebot-Mobile
Allow: /
+Disallow: /folder1/
"""
agent = 'Googlebot'
bad = ['/something.jpg']
+ good = ['/folder1/myfile.html']
class UserAgentGoogleMobileTest(UserAgentOrderingTest):
- agent = 'Googlebot-Mobile'
+ agent = 'Googlebot-mobile'
+ bad = ['/folder1/myfile.html']
+ good = ['/something.jpg']
-class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
- # Google also got the order wrong. You need
- # to specify the URLs from more specific to more general
+class LongestMatchTest(BaseRobotTest, unittest.TestCase):
+ # Based on the example in RFC 9309, section 5.2.
robots_txt = """\
-User-agent: Googlebot
-Allow: /folder1/myfile.html
-Disallow: /folder1/
+User-agent: *
+Allow: /example/page/
+Disallow: /example/page/disallowed.gif
+Allow: /example/
"""
- agent = 'googlebot'
- good = ['/folder1/myfile.html']
- bad = ['/folder1/anotherfile.html']
+ good = ['/example/', '/example/page/']
+ bad = ['/example/page/disallowed.gif']
+
+
+class LongestMatchWildcardTest(BaseRobotTest, unittest.TestCase):
+ robots_txt = """\
+User-agent: *
+Allow: /example/page/
+Disallow: *.gif
+Allow: /example/
+ """
+ good = ['/example/', '/example/page/']
+ bad = ['/example/page/disallowed.gif', '/x.gif']
+
+
+class AllowWinsEqualMatchTest(BaseRobotTest, unittest.TestCase):
+ robots_txt = """\
+User-agent: *
+Disallow: /spam
+Allow: /spam
+Disallow: /spam
+ """
+ good = ['/spam', '/spam/']
+
+
+class AllowWinsEqualFullMatchTest(BaseRobotTest, unittest.TestCase):
+ robots_txt = """\
+User-agent: *
+Disallow: /spam
+Allow: /spam$
+Disallow: /spam
+Disallow: /eggs$
+Allow: /eggs
+Disallow: /eggs$
+ """
+ good = ['/spam', '/eggs', '/eggs/']
+ bad = ['/spam/']
+
+
+class AllowWinsEqualMatchWildcardTest(BaseRobotTest, unittest.TestCase):
+ robots_txt = """\
+User-agent: *
+Disallow: /spam
+Allow: *am
+Disallow: /spam
+Disallow: *gs
+Allow: /eggs
+Disallow: *gs
+ """
+ good = ['/spam', '/eggs', '/spam/', '/eggs/']
+
+
+class MergeGroupsTest(BaseRobotTest, unittest.TestCase):
+ robots_txt = """\
+User-agent: spambot
+Disallow: /some/path
+
+User-agent: spambot
+Disallow: /another/path
+ """
+ agent = 'spambot'
+ bad = ['/some/path', '/another/path']
+
+
+class UserAgentStartsGroupTest(BaseRobotTest, unittest.TestCase):
+ robots_txt = """\
+User-agent: spambot
+Disallow: /some/path
+User-agent: eggsbot
+Disallow: /another/path
+ """
+ good = [('spambot', '/'), ('spambot', '/another/path'),
+ ('eggsbot', '/'), ('eggsbot', '/some/path')]
+ bad = [('spambot', '/some/path'), ('eggsbot', '/another/path')]
+ expected_output = """\
+User-agent: spambot
+Disallow: /some/path
+
+User-agent: eggsbot
+Disallow: /another/path\
+"""
+
+class IgnoreEmptyLinesTest(BaseRobotTest, unittest.TestCase):
+ robots_txt = """\
+User-agent: spambot
+
+User-agent: eggsbot
+Disallow: /some/path
+
+Disallow: /another/path
+ """
+ good = [('spambot', '/'), ('eggsbot', '/')]
+ bad = [
+ ('spambot', '/some/path'), ('spambot', '/another/path'),
+ ('eggsbot', '/some/path'), ('eggsbot', '/another/path'),
+ ]
+ expected_output = """\
+User-agent: spambot
+User-agent: eggsbot
+Disallow: /some/path
+Disallow: /another/path\
+"""
+
+
+class IgnoreRulesWithoutUserAgentTest(BaseRobotTest, unittest.TestCase):
+ robots_txt = """\
+Disallow: /some/path
+
+User-agent: *
+Disallow: /another/path
+ """
+ good = ['/', '/some/path']
+ bad = ['/another/path']
+ expected_output = """\
+User-agent: *
+Disallow: /another/path\
+"""
+
+
+class EmptyGroupTest(BaseRobotTest, unittest.TestCase):
+ robots_txt = """\
+User-agent: *
+Disallow: /some/path
+
+User-agent: spambot
+ """
+ agent = 'spambot'
+ good = ['/', '/some/path']
+ expected_output = """\
+User-agent: *
+Disallow: /some/path
+
+User-agent: spambot
+Allow:\
+"""
+
+
+class WeirdPathTest(BaseRobotTest, unittest.TestCase):
+ robots_txt = """\
+User-agent: *
+Disallow: /a$$$
+Disallow: /b$z
+Disallow: /c***
+Disallow: /d***z
+Disallow: /e*$**$$
+Disallow: /f*$**$$z
+Disallow: /g$*$$**
+Disallow: /h$*$$**z
+ """
+ good = ['/ax', '/a$$', '/b', '/bz', '/b$z', '/d', '/f', '/fz',
+ '/f$$$z', '/fx$y$$z', '/gx', '/g$$$', '/g$x$$y', '/h', '/hz',
+ '/h$$$z', '/h$x$$yz']
+ bad = ['/a', '/c', '/cxy', '/dz', '/dxyz', '/dxzy', '/e', '/exy',
+ '/e$$', '/ex$y$', '/g']
+ expected_output = """\
+User-agent: *
+Disallow: /a$
+Disallow: /c*
+Disallow: /d*z
+Disallow: /e*$
+Disallow: /g$\
+"""
+
+
+class PathWithManyWildcardsTest(BaseRobotTest, unittest.TestCase):
+ # This test would take many years if a naive translation to a
+ # regular expression (* -> .*) were used.
+ N = 50
+ robots_txt = f"""\
+User-agent: *
+Disallow: /{'*a'*N}*b
+ """
+ good = ['/' + 'a'*N + 'a']
+ bad = ['/' + 'a'*N + 'b']
class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
good = ['/some/path', '/some/path?',
'/some/path%3Fname=value', '/some/path?name%3Dvalue',
'/another/path', '/another/path%3F',
- '/yet/one/path?name=value%26more']
+ '/yet/one/path?name=value%26more',
+ '/some/pathxname=value']
- bad = ['/some/path?name=value'
+ bad = ['/some/path?name=value',
'/another/path?', '/another/path?name=value',
'/yet/one/path?name=value&more']
-class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
- # obey first * entry (#4108)
- robots_txt = """\
-User-agent: *
-Disallow: /some/path
-
-User-agent: *
-Disallow: /another/path
- """
- good = ['/another/path']
- bad = ['/some/path']
-
-
class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
"""
expected_output = """\
-User-agent: cybermapper
-Disallow: /some/path
-
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
-Disallow: /cyberworld/map/\
+Disallow: /cyberworld/map/
+
+User-agent: cybermapper
+Disallow: /some/path\
"""
- def test_string_formatting(self):
- self.assertEqual(str(self.parser), self.expected_output)
+
+class ConstructedStringFormattingTest(unittest.TestCase):
+ def test_empty(self):
+ parser = urllib.robotparser.RobotFileParser()
+ self.assertEqual(str(parser), '')
+
+ def test_group_without_rules(self):
+ parser = urllib.robotparser.RobotFileParser()
+ entry = urllib.robotparser.Entry()
+ entry.useragents = ['spambot']
+ parser._add_entry(entry)
+ entry = urllib.robotparser.Entry()
+ entry.useragents = ['hambot']
+ entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)]
+ parser._add_entry(entry)
+ entry = urllib.robotparser.Entry()
+ entry.useragents = ['eggsbot']
+ parser._add_entry(entry)
+ self.assertEqual(str(parser), """\
+User-agent: spambot
+Allow:
+
+User-agent: hambot
+Disallow: /ham
+
+User-agent: eggsbot
+Allow:\
+""")
+
+ def test_group_without_user_agent(self):
+ parser = urllib.robotparser.RobotFileParser()
+ entry = urllib.robotparser.Entry()
+ entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)]
+ parser._add_entry(entry)
+ entry = urllib.robotparser.Entry()
+ entry.useragents = ['spambot']
+ entry.rulelines = [urllib.robotparser.RuleLine('/spam', False)]
+ parser._add_entry(entry)
+ entry = urllib.robotparser.Entry()
+ entry.rulelines = [urllib.robotparser.RuleLine('/eggs', False)]
+ parser._add_entry(entry)
+ self.assertEqual(str(parser), """\
+User-agent: spambot
+Disallow: /spam\
+""")
@unittest.skipUnless(
def test_can_fetch(self):
self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
- self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
+ self.assertTrue(self.parser.can_fetch('Nutch', self.url('brian')))
self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
self.assertTrue(self.parser.can_fetch('*', self.base_url))
2) PSF license for Python 2.2
The robots.txt Exclusion Protocol is implemented as specified in
- http://www.robotstxt.org/norobots-rfc.txt
+ RFC 9309
"""
import collections
RequestRate = collections.namedtuple("RequestRate", "requests seconds")
-def normalize(path):
- unquoted = urllib.parse.unquote(path, errors='surrogateescape')
- return urllib.parse.quote(unquoted, errors='surrogateescape')
-
-def normalize_path(path):
- path, sep, query = path.partition('?')
- path = normalize(path)
- if sep:
- query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
- path += '?' + query
- return path
-
-
class RobotFileParser:
""" This class provides a set of methods to read, parse and answer
questions about a single robots.txt file.
def __init__(self, url=''):
self.entries = []
+ self.groups = {}
self.sitemaps = []
self.default_entry = None
self.disallow_all = False
self.parse(raw.decode("utf-8", "surrogateescape").splitlines())
def _add_entry(self, entry):
- if "*" in entry.useragents:
- # the default entry is considered last
- if self.default_entry is None:
- # the first default entry wins
- self.default_entry = entry
- else:
- self.entries.append(entry)
+ self.entries.append(entry)
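+ # Keep entries in file order for __str__(), but also index groups
+ # by lower-cased product token so that groups naming the same agent
+ # are merged (RFC 9309, section 2.2.1).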
+ for agent in entry.useragents:
+ agent = agent.lower()
+ if agent not in self.groups:
+ self.groups[agent] = entry
+ else:
+ self.groups[agent] = merge_entries(self.groups[agent], entry)
def parse(self, lines):
"""Parse the input lines from a robots.txt file.
- We allow that a user-agent: line is not preceded by
- one or more blank lines.
+ Blank lines do not terminate a group (RFC 9309,
+ section 2.2); a new group starts at a User-agent line
+ that follows one or more rule lines.
"""
# states:
# 0: start state
# 1: saw user-agent line
self.modified()
for line in lines:
- if not line:
- if state == 1:
- entry = Entry()
- state = 0
- elif state == 2:
- self._add_entry(entry)
- entry = Entry()
- state = 0
# remove optional comment and strip line
i = line.find('#')
if i >= 0:
if state == 2:
self._add_entry(entry)
entry = Entry()
- entry.useragents.append(line[1])
+ product_token = line[1]
+ entry.useragents.append(product_token)
state = 1
elif line[0] == "disallow":
if state != 0:
- entry.rulelines.append(RuleLine(line[1], False))
state = 2
+ try:
+ entry.rulelines.append(RuleLine(line[1], False))
+ except ValueError:
+ pass
elif line[0] == "allow":
if state != 0:
- entry.rulelines.append(RuleLine(line[1], True))
state = 2
+ try:
+ entry.rulelines.append(RuleLine(line[1], True))
+ except ValueError:
+ pass
elif line[0] == "crawl-delay":
if state != 0:
# before trying to convert to int we need to make
# so it doesn't matter where you place it in your file."
# Therefore we do not change the state of the parser.
self.sitemaps.append(line[1])
- if state == 2:
+ if state != 0:
self._add_entry(entry)
+ def _find_entry(self, useragent):
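+ # Prefer an exact (case-insensitive) product token match, then the
+ # first group whose token is a substring of the agent name, and
+ # fall back to the '*' group.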
+ entry = self.groups.get(useragent.lower())
+ if entry is not None:
+ return entry
+ for entry in self.groups.values():
+ if entry.applies_to(useragent):
+ return entry
+ return self.groups.get('*')
+
def can_fetch(self, useragent, url):
"""using the parsed robots.txt decide if useragent can fetch url"""
if self.disallow_all:
# the first match counts
parsed_url = urllib.parse.urlsplit(url)
url = urllib.parse.urlunsplit(('', '', *parsed_url[2:]))
- url = normalize_path(url)
+ url = normalize_uri(url)
if not url:
url = "/"
- for entry in self.entries:
- if entry.applies_to(useragent):
- return entry.allowance(url)
- # try the default entry last
- if self.default_entry:
- return self.default_entry.allowance(url)
- # agent not found ==> access granted
- return True
+ if url == '/robots.txt':
+ # The /robots.txt URI is implicitly allowed.
+ return True
+ entry = self._find_entry(useragent)
+ if entry is None:
+ return True
+ return entry.allowance(url)
def crawl_delay(self, useragent):
if not self.mtime():
return None
- for entry in self.entries:
- if entry.applies_to(useragent):
- return entry.delay
- if self.default_entry:
- return self.default_entry.delay
- return None
+ entry = self._find_entry(useragent)
+ if entry is None:
+ return None
+ return entry.delay
def request_rate(self, useragent):
if not self.mtime():
return None
- for entry in self.entries:
- if entry.applies_to(useragent):
- return entry.req_rate
- if self.default_entry:
- return self.default_entry.req_rate
- return None
+ entry = self._find_entry(useragent)
+ if entry is None:
+ return None
+ return entry.req_rate
def site_maps(self):
if not self.sitemaps:
entries = self.entries
if self.default_entry is not None:
entries = entries + [self.default_entry]
- return '\n\n'.join(map(str, entries))
+ return '\n\n'.join(filter(None, map(str, entries)))
class RuleLine:
"""A rule line is a single "Allow:" (allowance==True) or "Disallow:"
if path == '' and not allowance:
# an empty value means allow all
allowance = True
- self.path = normalize_path(path)
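+ # Collapse redundant wildcards: runs of '*' become a single '*',
+ # and '$' followed by only '*' or '$' becomes a trailing '$'.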
+ path = re.sub(r'[*]{2,}', '*', path)
+ path = re.sub(r'[$][$*]+', '$', path)
+ path = normalize_pattern(path)
+ self.fullmatch = path.endswith('$')
+ path = path.rstrip('$')
+ if '$' in path:
+ raise ValueError('$ not at the end of path')
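+ # Patterns containing '*' are matched with a compiled regular
+ # expression; plain prefixes use str.startswith() in applies_to().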
+ self.matcher = None
+ if '*' in path:
+ pattern = re.compile(translate_pattern(path), re.DOTALL)
+ if self.fullmatch:
+ self.matcher = pattern.fullmatch
+ else:
+ self.matcher = pattern.match
+ self.path = path
self.allowance = allowance
def applies_to(self, filename):
- return self.path == "*" or filename.startswith(self.path)
+ # If the filename matches the rule, return the matching length plus 1.
+ # If it does not match, return 0.
+ if self.matcher is not None:
+ m = self.matcher(filename)
+ if m:
+ return m.end() + 1
+ else:
+ if self.fullmatch:
+ if filename == self.path:
+ return len(self.path) + 1
+ else:
+ if filename.startswith(self.path):
+ return len(self.path) + 1
+ return 0
def __str__(self):
- return ("Allow" if self.allowance else "Disallow") + ": " + self.path
+ return (("Allow" if self.allowance else "Disallow") + ": " + self.path
+ + ('$' if self.fullmatch else ''))
class Entry:
self.req_rate = None
def __str__(self):
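+ # A group without a User-agent line cannot be expressed in
+ # robots.txt syntax; render it as an empty string so that
+ # RobotFileParser.__str__() can filter it out.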
+ if not self.useragents:
+ return ''
ret = []
for agent in self.useragents:
ret.append(f"User-agent: {agent}")
if self.req_rate is not None:
rate = self.req_rate
ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
- ret.extend(map(str, self.rulelines))
+ if self.rulelines:
+ ret.extend(map(str, self.rulelines))
+ else:
+ ret.append("Allow:")
return '\n'.join(ret)
def applies_to(self, useragent):
"""check if this entry applies to the specified agent"""
+ if useragent is None:
+ return '*' in self.useragents
# split the name token and make it lower case
useragent = useragent.split("/")[0].lower()
for agent in self.useragents:
- if agent == '*':
- # we have the catch-all agent
- return True
- agent = agent.lower()
- if agent in useragent:
- return True
+ if agent != '*':
+ agent = agent.lower()
+ if agent in useragent:
+ return True
return False
def allowance(self, filename):
"""Preconditions:
- our agent applies to this entry
- - filename is URL encoded"""
+ - filename is URL encoded
+ """
+ best_match = -1
+ allowance = True
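+ # The most specific (longest) matching rule wins; if an Allow
+ # and a Disallow rule match with equal specificity, Allow wins
+ # (RFC 9309, section 2.2.2).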
for line in self.rulelines:
- if line.applies_to(filename):
- return line.allowance
- return True
+ m = line.applies_to(filename)
+ if m:
+ if m > best_match:
+ best_match = m
+ allowance = line.allowance
+ elif m == best_match and not allowance:
+ allowance = line.allowance
+ return allowance
+
+
+def normalize(path):
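+ # Canonicalize percent-encoding so that equivalent escapes
+ # compare equal.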
+ unquoted = urllib.parse.unquote(path, errors='surrogateescape')
+ return urllib.parse.quote(unquoted, errors='surrogateescape')
+
+def normalize_uri(path):
+ path, sep, query = path.partition('?')
+ path = normalize(path)
+ if sep:
+ query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
+ path += '?' + query
+ return path
+
+def normalize_pattern(path):
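+ # Like normalize_uri(), but keep '*' and '$' unquoted so they
+ # retain their special meaning in rule patterns.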
+ path, sep, query = path.partition('?')
+ path = re.sub(r'[^*$]+', lambda m: normalize(m[0]), path)
+ if sep:
+ query = re.sub(r'[^=&*$]+', lambda m: normalize(m[0]), query)
+ path += '?' + query
+ return path
+
+def translate_pattern(path):
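+ # '*' splits the pattern into literal segments. Inner segments are
+ # wrapped in atomic groups (?>.*?...) so the regex engine commits
+ # to the leftmost occurrence and cannot backtrack exponentially on
+ # patterns with many wildcards.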
+ parts = list(map(re.escape, path.split('*')))
+ for i in range(1, len(parts)-1):
+ parts[i] = f'(?>.*?{parts[i]})'
+ parts[-1] = f'.*{parts[-1]}'
+ return ''.join(parts)
+
+def merge_entries(e1, e2):
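+ # Merge two groups sharing a product token: keep the agents they
+ # have in common, concatenate their rule lines, and let the later
+ # group override Crawl-delay and Request-rate.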
+ entry = Entry()
+ entry.useragents = list(filter(set(e2.useragents).__contains__, e1.useragents))
+ entry.rulelines = e1.rulelines + e2.rulelines
+ entry.delay = e1.delay if e2.delay is None else e2.delay
+ entry.req_rate = e1.req_rate if e2.req_rate is None else e2.req_rate
+ return entry