#18155: Regex-escape delimiter, in case it is a regex special char.

author R David Murray <rdmurray@bitdance.com>

Sat, 29 Jun 2013 22:43:59 +0000 (18:43 -0400)

committer R David Murray <rdmurray@bitdance.com>

Sat, 29 Jun 2013 22:43:59 +0000 (18:43 -0400)
author R David Murray <rdmurray@bitdance.com>
Sat, 29 Jun 2013 22:43:59 +0000 (18:43 -0400)
committer R David Murray <rdmurray@bitdance.com>
Sat, 29 Jun 2013 22:43:59 +0000 (18:43 -0400)
diff --git a/Lib/csv.py b/Lib/csv.py

index 984ed7e581b9a7ac15d47f4711081d06de1e67cf..98480ba16a1d1802e96320e6b0f390952daccebc 100644 (file)
--- a/Lib/csv.py
+++ b/Lib/csv.py
@@ -261,8 +261,9 @@ class Sniffer:
  
          # if we see an extra quote between delimiters, we've got a
          # double quoted format
-        dq_regexp = re.compile(r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
-                               {'delim':delim, 'quote':quotechar}, re.MULTILINE)
+        dq_regexp = re.compile(
+                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
+                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
  
  
  
diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py

index 53ca5ab12f1a2c6d985feea22dd2ae3ccf9f08d9..3f8266512923cea633585e536091158a7ca9e98f 100644 (file)
--- a/Lib/test/test_csv.py
+++ b/Lib/test/test_csv.py
@@ -914,7 +914,7 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
  'Tommy''s Place':'Blue Island':'IL':'12/28/02':'Blue Sunday/White Crow'
  'Stonecutters ''Seafood'' and Chop House':'Lemont':'IL':'12/19/02':'Week Back'
  """
-    header = '''\
+    header1 = '''\
  "venue","city","state","date","performers"
  '''
      sample3 = '''\
@@ -933,10 +933,35 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
      sample6 = "a|b|c\r\nd|e|f\r\n"
      sample7 = "'a'|'b'|'c'\r\n'd'|e|f\r\n"
  
+# Issue 18155: Use a delimiter that is a special char to regex:
+
+    header2 = '''\
+"venue"+"city"+"state"+"date"+"performers"
+'''
+    sample8 = """\
+Harry's+ Arlington Heights+ IL+ 2/1/03+ Kimi Hayes
+Shark City+ Glendale Heights+ IL+ 12/28/02+ Prezence
+Tommy's Place+ Blue Island+ IL+ 12/28/02+ Blue Sunday/White Crow
+Stonecutters Seafood and Chop House+ Lemont+ IL+ 12/19/02+ Week Back
+"""
+    sample9 = """\
+'Harry''s'+ Arlington Heights'+ 'IL'+ '2/1/03'+ 'Kimi Hayes'
+'Shark City'+ Glendale Heights'+' IL'+ '12/28/02'+ 'Prezence'
+'Tommy''s Place'+ Blue Island'+ 'IL'+ '12/28/02'+ 'Blue Sunday/White Crow'
+'Stonecutters ''Seafood'' and Chop House'+ 'Lemont'+ 'IL'+ '12/19/02'+ 'Week Back'
+"""
+
      def test_has_header(self):
          sniffer = csv.Sniffer()
          self.assertEqual(sniffer.has_header(self.sample1), False)
-        self.assertEqual(sniffer.has_header(self.header+self.sample1), True)
+        self.assertEqual(sniffer.has_header(self.header1 + self.sample1),
+                         True)
+
+    def test_has_header_regex_special_delimiter(self):
+        sniffer = csv.Sniffer()
+        self.assertEqual(sniffer.has_header(self.sample8), False)
+        self.assertEqual(sniffer.has_header(self.header2 + self.sample8),
+                         True)
  
      def test_sniff(self):
          sniffer = csv.Sniffer()
@@ -970,13 +995,24 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
          dialect = sniffer.sniff(self.sample7)
          self.assertEqual(dialect.delimiter, "|")
          self.assertEqual(dialect.quotechar, "'")
+        dialect = sniffer.sniff(self.sample8)
+        self.assertEqual(dialect.delimiter, '+')
+        dialect = sniffer.sniff(self.sample9)
+        self.assertEqual(dialect.delimiter, '+')
+        self.assertEqual(dialect.quotechar, "'")
  
      def test_doublequote(self):
          sniffer = csv.Sniffer()
-        dialect = sniffer.sniff(self.header)
+        dialect = sniffer.sniff(self.header1)
+        self.assertFalse(dialect.doublequote)
+        dialect = sniffer.sniff(self.header2)
          self.assertFalse(dialect.doublequote)
          dialect = sniffer.sniff(self.sample2)
          self.assertTrue(dialect.doublequote)
+        dialect = sniffer.sniff(self.sample8)
+        self.assertFalse(dialect.doublequote)
+        dialect = sniffer.sniff(self.sample9)
+        self.assertTrue(dialect.doublequote)
  
  if not hasattr(sys, "gettotalrefcount"):
      if test_support.verbose: print "*** skipping leakage tests ***"
diff --git a/Misc/ACKS b/Misc/ACKS

index f8e588e1dd71cef3176209dfece3a7fdff13ff7c..67f0c97eafa9b7d2f443d6c2d5173fc956da4d8d 100644 (file)
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -545,6 +545,7 @@ Jeff Knupp
  Greg Kochanski
  Damon Kohler
  Marko Kohtala
+Vajrasky Kok
  Guido Kollerie
  Peter A. Koren
  Joseph Koshy
diff --git a/Misc/NEWS b/Misc/NEWS

index fff0032c26898a91d50f8e53b707ff8083ef1966..175fa9569b2968421e9547cb3328f7cc6b0772f4 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -24,11 +24,15 @@ Core and Builtins
  Library
  -------
  
+- Issue #18155: The csv module now correctly handles csv files that use
+  a delimiter character that has a special meaning in regexes, instead of
+  throwing an exception.
+
  - Issue #18135: ssl.SSLSocket.write() now raises an OverflowError if the input
    string is longer than 2 gigabytes. The ssl module does not support partial
    write.
  
-- Issue #18167: cgi.FieldStorage no more fails to handle multipart/form-data
+- Issue #18167: cgi.FieldStorage no longer fails to handle multipart/form-data
    when \r\n appears at end of 65535 bytes without other newlines.
  
  - Issue #17403: urllib.parse.robotparser normalizes the urls before adding to
author	R David Murray <rdmurray@bitdance.com>
	Sat, 29 Jun 2013 22:43:59 +0000 (18:43 -0400)
committer	R David Murray <rdmurray@bitdance.com>
	Sat, 29 Jun 2013 22:43:59 +0000 (18:43 -0400)
Lib/csv.py		patch | blob | blame | history
Lib/test/test_csv.py		patch | blob | blame | history
Misc/ACKS		patch | blob | blame | history
Misc/NEWS		patch | blob | blame | history