try and evaluate the smallest portion of the data possible, evaluating
additional chunks as necessary.
"""
+ from collections import Counter, defaultdict
data = list(filter(None, data.split('\n')))
- ascii = [chr(c) for c in range(127)] # 7-bit ASCII
-
# build frequency tables
chunkLength = min(10, len(data))
iteration = 0
- charFrequency = {}
+ num_lines = 0
+ # {char -> {count_per_line -> num_lines_with_that_count}}
+ char_frequency = defaultdict(Counter)
modes = {}
delims = {}
start, end = 0, chunkLength
while start < len(data):
iteration += 1
for line in data[start:end]:
- for char in ascii:
- metaFrequency = charFrequency.get(char, {})
- # must count even if frequency is 0
- freq = line.count(char)
- # value is the mode
- metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
- charFrequency[char] = metaFrequency
-
- for char in charFrequency.keys():
- items = list(charFrequency[char].items())
+ num_lines += 1
+ for char, count in Counter(line).items():
+ if char.isascii():
+ char_frequency[char][count] += 1
+
+ for char, counts in char_frequency.items():
+ items = list(counts.items())
+ missed_lines = num_lines - sum(counts.values())
+ if missed_lines:
+ # Store the number of lines 'char' was missing from.
+ items.append((0, missed_lines))
if len(items) == 1 and items[0][0] == 0:
continue
# get the mode of the frequencies
dialect = sniffer.sniff(self.sample9)
self.assertTrue(dialect.doublequote)
+ def test_guess_delimiter_crlf_not_chosen(self):
+     # Rows are terminated by "\r\n", so once the sample is split on "\n"
+     # both "|" and "\r" occur exactly once per row.  The sniffer must
+     # report the real delimiter ("|") and never the stray "\r".
+     sample = "a|b\r\nc|d\r\ne|f\r\n"
+     # Sniff once and reuse the resulting dialect for both checks instead
+     # of re-running the detection for each assertion.
+     dialect = csv.Sniffer().sniff(sample)
+     self.assertEqual(dialect.delimiter, "|")
+     self.assertNotEqual(dialect.delimiter, "\r")
+
+ def test_zero_mode_tie_order_independence(self):
+     # "," occurs exactly once in every row, so it is the true delimiter.
+     # ":" occurs in only every other row (1, 0, 1, 0 times), which ties
+     # the per-line counts 0 and 1 for that character.
+     #
+     # The expected outcome must not depend on where the zero-count
+     # bucket ends up in the frequency list (appended vs. inserted):
+     # ":" must not be promoted and "," must be selected.
+     rows = (
+         "a,b:c\n",
+         "d,e\n",
+         "f,g:c\n",
+         "h,i\n",
+     )
+     sample = "".join(rows)
+     detected = csv.Sniffer().sniff(sample)
+     self.assertEqual(detected.delimiter, ",")
+
+ def test_zero_mode_tie_order_comma_first(self):
+     # Rows alternate between "," and ":" as the only punctuation,
+     # starting with ",", so each candidate appears in exactly half of
+     # the 40 rows.  Sniffing is expected to fail rather than pick one.
+     rows = ("a,b\n", "c:d\n", "e,f\n", "g:h\n")
+     sample = "".join(rows) * 10
+     sniffer = csv.Sniffer()
+     with self.assertRaisesRegex(csv.Error, "Could not determine delimiter"):
+         sniffer.sniff(sample)
+
+ def test_zero_mode_tie_order_colon_first(self):
+     # Same alternating layout as the comma-first variant, but the first
+     # row uses ":" instead.  Each candidate still appears in exactly
+     # half of the 40 rows, and sniffing is expected to fail.
+     rows = ("a:b\n", "c,d\n", "e:f\n", "g,h\n")
+     sample = "".join(rows) * 10
+     sniffer = csv.Sniffer()
+     with self.assertRaisesRegex(csv.Error, "Could not determine delimiter"):
+         sniffer.sniff(sample)
+
+
class NUL:
def write(s, *args):
pass