#17403: urllib.robotparser normalizes the URLs before adding them to ruleline.

author Senthil Kumaran <senthil@uthcode.com>

Wed, 29 May 2013 12:54:31 +0000 (05:54 -0700)

committer Senthil Kumaran <senthil@uthcode.com>

Wed, 29 May 2013 12:54:31 +0000 (05:54 -0700)
author Senthil Kumaran <senthil@uthcode.com>
Wed, 29 May 2013 12:54:31 +0000 (05:54 -0700)
committer Senthil Kumaran <senthil@uthcode.com>
Wed, 29 May 2013 12:54:31 +0000 (05:54 -0700)
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py

index 8c09e7452c562708acbcd7ba38f1d72af7634a81..d1dfd9eeec026d4d501b1debca900bef6be1e50c 100644 (file)
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -234,6 +234,18 @@ bad = ['/some/path']
  
  RobotTest(15, doc, good, bad)
  
+# 16. Empty query (issue #17403). Normalizing the url first.
+doc = """
+User-agent: *
+Allow: /some/path?
+Disallow: /another/path?
+"""
+
+good = ['/some/path?']
+bad = ['/another/path?']
+
+RobotTest(16, doc, good, bad)
+
  
  class NetworkTestCase(unittest.TestCase):
  
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py

index 75be4af4091806af2928d92a0ce943b5c5f1d267..978ba58d84a60bb3a7e72c58e4fdb06234ee24e5 100644 (file)
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -157,6 +157,7 @@ class RuleLine:
          if path == '' and not allowance:
              # an empty value means allow all
              allowance = True
+        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
          self.path = urllib.parse.quote(path)
          self.allowance = allowance
  
diff --git a/Misc/NEWS b/Misc/NEWS

index 828e240c5bef66fe5da14ff73ccef6753cc22a9b..be6fd578db26a50741c7b8bb237d254c412d5fe5 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -24,6 +24,10 @@ Core and Builtins
  Library
  -------
  
+- Issue #17403: urllib.robotparser normalizes the URLs before adding them to
+  ruleline. This helps in handling certain types of invalid URLs in a
+  conservative manner.
+
  - Issue #18025: Fixed a segfault in io.BufferedIOBase.readinto() when raw
    stream's read() returns more bytes than requested.
author	Senthil Kumaran <senthil@uthcode.com>
	Wed, 29 May 2013 12:54:31 +0000 (05:54 -0700)
committer	Senthil Kumaran <senthil@uthcode.com>
	Wed, 29 May 2013 12:54:31 +0000 (05:54 -0700)
Lib/test/test_robotparser.py		patch \| blob \| blame \| history
Lib/urllib/robotparser.py		patch \| blob \| blame \| history
Misc/NEWS		patch \| blob \| blame \| history