From: Senthil Kumaran Date: Wed, 29 May 2013 12:58:47 +0000 (-0700) Subject: #17403: urllib.parse.robotparser normalizes the urls before adding to ruleline. X-Git-Tag: v2.7.6rc1~372 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=2c4810efa2421c1a3e0042888b71193a917b39c5;p=thirdparty%2FPython%2Fcpython.git #17403: urllib.parse.robotparser normalizes the urls before adding to ruleline. This helps in handling certain types of invalid urls in a conservative manner. --- diff --git a/Lib/robotparser.py b/Lib/robotparser.py index 1722863d144d..ad3be9471158 100644 --- a/Lib/robotparser.py +++ b/Lib/robotparser.py @@ -160,6 +160,7 @@ class RuleLine: if path == '' and not allowance: # an empty value means allow all allowance = True + path = urlparse.urlunparse(urlparse.urlparse(path)) self.path = urllib.quote(path) self.allowance = allowance diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index b3d4a46056bb..651301bde2a3 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -228,6 +228,18 @@ bad = ['/some/path'] RobotTest(15, doc, good, bad) +# 16. Empty query (issue #17403). Normalizing the url first. +doc = """ +User-agent: * +Allow: /some/path? +Disallow: /another/path? +""" + +good = ['/some/path?'] +bad = ['/another/path?'] + +RobotTest(16, doc, good, bad) + class NetworkTestCase(unittest.TestCase): diff --git a/Misc/NEWS b/Misc/NEWS index 00448c45af4b..7e901f700667 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -15,6 +15,10 @@ Core and Builtins Library ------- +- Issue #17403: urllib.parse.robotparser normalizes the urls before adding to + ruleline. This helps in handling certain types invalid urls in a conservative + manner. Patch contributed by Mher Movsisyan. + - Implement inequality on weakref.WeakSet. - Issue #17981: Closed socket on error in SysLogHandler.