From: Senthil Kumaran Date: Wed, 28 Jul 2010 16:27:56 +0000 (+0000) Subject: Fix Issue6325 - robotparser to honor URLs with query strings. X-Git-Tag: v3.2a1~79 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=3f8ab965f722b3bda679c9271fb8907e2bbcdc64;p=thirdparty%2FPython%2Fcpython.git Fix Issue6325 - robotparser to honor URLs with query strings. --- diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index 4c3b5363fd1f..9d3040547e50 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -205,6 +205,17 @@ bad = ['/folder1/anotherfile.html'] RobotTest(13, doc, good, bad, agent="googlebot") +# 14. For issue #6325 (query string support) +doc = """ +User-agent: * +Disallow: /some/path?name=value +""" + +good = ['/some/path'] +bad = ['/some/path?name=value'] + +RobotTest(14, doc, good, bad) + class NetworkTestCase(unittest.TestCase): diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index bafb611a1c56..30baa055d2a8 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -129,8 +129,10 @@ class RobotFileParser: return True # search for given user agent matches # the first match counts - url = urllib.parse.quote( - urllib.parse.urlparse(urllib.parse.unquote(url))[2]) + parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url)) + url = urllib.parse.urlunparse(('','',parsed_url.path, + parsed_url.params,parsed_url.query, parsed_url.fragment)) + url = urllib.parse.quote(url) if not url: url = "/" for entry in self.entries: