git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-137627: Make `csv.Sniffer.sniff()` delimiter detection 1.6x faster (#137628)
author Maurycy Pawłowski-Wieroński <5383+maurycy@users.noreply.github.com>
Thu, 23 Oct 2025 12:28:29 +0000 (14:28 +0200)
committer GitHub <noreply@github.com>
Thu, 23 Oct 2025 12:28:29 +0000 (15:28 +0300)
Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
Doc/whatsnew/3.15.rst
Lib/csv.py
Lib/test/test_csv.py
Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst [new file with mode: 0644]
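
The 1.6x figure refers to the delimiter-detection path inside csv.Sniffer.sniff(). A rough way to check a claim like this locally is a timeit micro-benchmark along the lines of the sketch below; the sample data and repetition count are arbitrary illustrative choices, not the benchmark behind this commit.

import csv
import timeit

# Hypothetical sample: 200 comma-separated rows with no quoting, so sniff()
# falls through to the delimiter-frequency heuristic touched by this commit.
sample = "\n".join(",".join(f"f{i}_{j}" for j in range(8)) for i in range(200))

sniffer = csv.Sniffer()
elapsed = timeit.timeit(lambda: sniffer.sniff(sample), number=100)
print(f"{elapsed:.3f}s for 100 calls to sniff()")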

diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst
index a9543bdd13e83f824eb7460de62458c8df14616e..cbca20cba5c28471f4bf3a35649603308de7b914 100644 (file)
@@ -652,11 +652,11 @@ zlib
 Optimizations
 =============
 
-module_name
------------
-
-* TODO
+csv
+---
 
+* :meth:`csv.Sniffer.sniff` delimiter detection is now up to 1.6x faster.
+  (Contributed by Maurycy Pawłowski-Wieroński in :gh:`137628`.)
 
 
 Removed
diff --git a/Lib/csv.py b/Lib/csv.py
index 98eab01429a8ec10daa5ef5e4262c17604bcff16..b2aaf5fd9fa91e6a866c82fb02f07c95cd17b114 100644 (file)
@@ -362,31 +362,33 @@ class Sniffer:
         try and evaluate the smallest portion of the data possible, evaluating
         additional chunks as necessary.
         """
+        from collections import Counter, defaultdict
 
         data = list(filter(None, data.split('\n')))
 
-        ascii = [chr(c) for c in range(127)] # 7-bit ASCII
-
         # build frequency tables
         chunkLength = min(10, len(data))
         iteration = 0
-        charFrequency = {}
+        num_lines = 0
+        # {char -> {count_per_line -> num_lines_with_that_count}}
+        char_frequency = defaultdict(Counter)
         modes = {}
         delims = {}
         start, end = 0, chunkLength
         while start < len(data):
             iteration += 1
             for line in data[start:end]:
-                for char in ascii:
-                    metaFrequency = charFrequency.get(char, {})
-                    # must count even if frequency is 0
-                    freq = line.count(char)
-                    # value is the mode
-                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
-                    charFrequency[char] = metaFrequency
-
-            for char in charFrequency.keys():
-                items = list(charFrequency[char].items())
+                num_lines += 1
+                for char, count in Counter(line).items():
+                    if char.isascii():
+                        char_frequency[char][count] += 1
+
+            for char, counts in char_frequency.items():
+                items = list(counts.items())
+                missed_lines = num_lines - sum(counts.values())
+                if missed_lines:
+                    # Store the number of lines 'char' was missing from.
+                    items.append((0, missed_lines))
                 if len(items) == 1 and items[0][0] == 0:
                     continue
                 # get the mode of the frequencies
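
The hunk above replaces the old per-line scan over all 127 seven-bit ASCII characters with a single Counter pass over the characters that actually occur, then reconstructs the implied zero counts from the number of lines a character was missing from. A minimal standalone sketch of that counting idea follows; the function and variable names are illustrative, not csv.py internals.

from collections import Counter, defaultdict

def build_frequency_tables(lines):
    # {char -> {per-line count -> number of lines with that count}}
    char_frequency = defaultdict(Counter)
    num_lines = 0
    for line in lines:
        num_lines += 1
        # Only characters present in the line are touched, unlike the old
        # loop that called line.count() once per ASCII character per line.
        for char, count in Counter(line).items():
            if char.isascii():
                char_frequency[char][count] += 1
    # Lines where a character never appeared are implied rather than stored;
    # recover them as an explicit zero bucket before mode detection.
    tables = {}
    for char, counts in char_frequency.items():
        items = dict(counts)
        missed = num_lines - sum(counts.values())
        if missed:
            items[0] = missed
        tables[char] = items
    return tables

tables = build_frequency_tables(["a,b", "c,d", "e|f"])
print(tables[","])  # {1: 2, 0: 1}
print(tables["|"])  # {1: 1, 0: 2}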
diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py
index 50431b562f90bafb72f54e1a42e7cda661cb42b4..6be6a7ae222f02c625d8c47ba82c81e77f8c74a9 100644 (file)
@@ -1437,6 +1437,56 @@ ghi\0jkl
         dialect = sniffer.sniff(self.sample9)
         self.assertTrue(dialect.doublequote)
 
+    def test_guess_delimiter_crlf_not_chosen(self):
+        # Ensure that we pick the real delimiter ("|") over "\r" in a tie.
+        sniffer = csv.Sniffer()
+        sample = "a|b\r\nc|d\r\ne|f\r\n"
+        self.assertEqual(sniffer.sniff(sample).delimiter, "|")
+        self.assertNotEqual(sniffer.sniff(sample).delimiter, "\r")
+
+    def test_zero_mode_tie_order_independence(self):
+        sniffer = csv.Sniffer()
+        # ":" appears in half the rows (1, 0, 1, 0) - a tie between
+        #     0 and 1 per line.
+        # "," appears once every row (true delimiter).
+        #
+        # Even if the zero-frequency bucket is appended vs. inserted, the tie
+        # yields an adjusted score of 0, so ":" should not be promoted and
+        # "," must be selected.
+        sample = (
+            "a,b:c\n"
+            "d,e\n"
+            "f,g:c\n"
+            "h,i\n"
+        )
+        dialect = sniffer.sniff(sample)
+        self.assertEqual(dialect.delimiter, ",")
+
+    def test_zero_mode_tie_order_comma_first(self):
+        sniffer = csv.Sniffer()
+        pattern = (
+            "a,b\n"
+            "c:d\n"
+            "e,f\n"
+            "g:h\n"
+        )
+        sample = pattern * 10
+        with self.assertRaisesRegex(csv.Error, "Could not determine delimiter"):
+            sniffer.sniff(sample)
+
+    def test_zero_mode_tie_order_colon_first(self):
+        sniffer = csv.Sniffer()
+        pattern = (
+            "a:b\n"
+            "c,d\n"
+            "e:f\n"
+            "g,h\n"
+        )
+        sample = pattern * 10
+        with self.assertRaisesRegex(csv.Error, "Could not determine delimiter"):
+            sniffer.sniff(sample)
+
+
 class NUL:
     def write(s, *args):
         pass
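
The tie-order tests added above exercise the new explicit zero bucket: the number of lines a character was missing from is now appended as a (0, missed_lines) entry, instead of a zero count being recorded for every character on every line as in the old loop, so mode selection and tie-breaking must not depend on where that zero entry sits. A small re-derivation of the adjusted score for ":" in test_zero_mode_tie_order_independence, following the scoring described in the test comments rather than the exact csv.py code:

from collections import Counter

# Sample from test_zero_mode_tie_order_independence: ":" appears on half
# the lines, "," on every line.
lines = ["a,b:c", "d,e", "f,g:c", "h,i"]

counts = Counter(line.count(":") for line in lines)  # Counter({1: 2, 0: 2})
items = list(counts.items())
mode = max(items, key=lambda kv: kv[1])              # tie; max() keeps the first item
items.remove(mode)
adjusted = mode[1] - sum(n for _, n in items)        # 2 - 2 = 0
print(adjusted)  # 0 -> ":" is never promoted over ","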
diff --git a/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst b/Misc/NEWS.d/next/Library/2025-08-11-04-52-18.gh-issue-137627.Ku5Yi2.rst
new file mode 100644 (file)
index 0000000..855070e
--- /dev/null
@@ -0,0 +1 @@
+Speed up :meth:`csv.Sniffer.sniff` delimiter detection by up to 1.6x.