Issue #1395: Universal mode used to duplicate newlines when using read(1).
author     Amaury Forgeot d'Arc <amauryfa@gmail.com>
           Mon, 19 Nov 2007 20:34:10 +0000 (20:34 +0000)
committer  Amaury Forgeot d'Arc <amauryfa@gmail.com>
           Mon, 19 Nov 2007 20:34:10 +0000 (20:34 +0000)
"Universal newline" is now an incremental decoder wrapping the initial one,
with its own additional buffer (if '\r' is seen at the end of the input).

A decoder allows the tell() funtion to record the state of the translation.
This also simplifies the readline() process.

Now test_netrc passes on Windows, as well as many new tests in test_io.py

Lib/io.py
Lib/test/test_io.py
Modules/_fileio.c
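
For context, the failure mode behind Issue 1395 was that reading a universal-newlines
file one character at a time could translate a trailing '\r' into '\n' and then emit
the following '\n' again. With the new decoder the pending '\r' is simply held back
until the next chunk, so each "\r\n" pair is translated exactly once. A minimal sketch
of the intended behaviour, feeding the wrapper one byte at a time much like the new
test_newline_decoder test does (the class and property names are the ones added to
Lib/io.py below):

    import codecs
    import io

    # Wrap a plain incremental decoder in the newline-translating one.
    decoder = io.IncrementalNewlineDecoder(
        codecs.getincrementaldecoder("utf-8")(), translate=True)

    out = ""
    for b in b"AA\r\nBB":               # feed one byte at a time, like read(1)
        out += decoder.decode(bytes([b]))
    out += decoder.decode(b"", final=True)

    assert out == "AA\nBB"              # the \r\n pair becomes exactly one \n
    assert decoder.newlines == "\r\n"   # only \r\n was seen in the input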

index 74076d387e0ae3d1fd4cc002e825eb68ded00614..d2d2fbcf887748f8e60d27cce99d278f2706acc1 100644
--- a/Lib/io.py
+++ b/Lib/io.py
@@ -1041,6 +1041,84 @@ class TextIOBase(IOBase):
         return None
 
 
+class IncrementalNewlineDecoder(codecs.IncrementalDecoder):
+    """Codec used when reading a file in universal newlines mode.
+    It wraps another incremental decoder, translating \\r\\n and \\r into \\n.
+    It also records the types of newlines encountered.
+    When used with translate=False, it ensures that the newline sequence is
+    returned in one piece.
+    """
+    def __init__(self, decoder, translate, errors='strict'):
+        codecs.IncrementalDecoder.__init__(self, errors=errors)
+        self.buffer = b''
+        self.translate = translate
+        self.decoder = decoder
+        self.seennl = 0
+
+    def decode(self, input, final=False):
+        # decode input (prepending any \r retained from a previous pass)
+        if self.buffer:
+            input = self.buffer + input
+
+        output = self.decoder.decode(input, final=final)
+
+        # retain last \r even when not translating data:
+        # then readline() is sure to get \r\n in one pass
+        if output.endswith("\r") and not final:
+            output = output[:-1]
+            self.buffer = b'\r'
+        else:
+            self.buffer = b''
+
+        # Record which newlines are read
+        crlf = output.count('\r\n')
+        cr = output.count('\r') - crlf
+        lf = output.count('\n') - crlf
+        self.seennl |= (lf and self._LF) | (cr and self._CR) \
+                    | (crlf and self._CRLF)
+
+        if self.translate:
+            if crlf:
+                output = output.replace("\r\n", "\n")
+            if cr:
+                output = output.replace("\r", "\n")
+
+        return output
+
+    def getstate(self):
+        buf, flag = self.decoder.getstate()
+        return buf + self.buffer, flag
+
+    def setstate(self, state):
+        buf, flag = state
+        if buf.endswith(b'\r'):
+            self.buffer = b'\r'
+            buf = buf[:-1]
+        else:
+            self.buffer = b''
+        self.decoder.setstate((buf, flag))
+
+    def reset(self):
+        self.buffer = b''
+        self.decoder.reset()
+
+    _LF = 1
+    _CR = 2
+    _CRLF = 4
+
+    @property
+    def newlines(self):
+        return (None,
+                "\n",
+                "\r",
+                ("\r", "\n"),
+                "\r\n",
+                ("\n", "\r\n"),
+                ("\r", "\r\n"),
+                ("\r", "\n", "\r\n")
+               )[self.seennl]
+
+
 class TextIOWrapper(TextIOBase):
 
     """Buffered text stream.
@@ -1077,7 +1155,6 @@ class TextIOWrapper(TextIOBase):
         self._readnl = newline
         self._writetranslate = newline != ''
         self._writenl = newline or os.linesep
-        self._seennl = 0
         self._decoder = None
         self._pending = ""
         self._snapshot = None
@@ -1124,6 +1201,7 @@ class TextIOWrapper(TextIOBase):
         if not isinstance(s, str):
             raise TypeError("can't write %s to text stream" %
                             s.__class__.__name__)
+        length = len(s)
         haslf = "\n" in s
         if haslf and self._writetranslate and self._writenl != "\n":
             s = s.replace("\n", self._writenl)
@@ -1132,15 +1210,20 @@ class TextIOWrapper(TextIOBase):
         self.buffer.write(b)
         if haslf and self.isatty():
             self.flush()
-        self._snapshot = self._decoder = None
-        return len(s)
+        self._snapshot = None
+        if self._decoder:
+            self._decoder.reset()
+        return length
 
     def _get_decoder(self):
         make_decoder = codecs.getincrementaldecoder(self._encoding)
         if make_decoder is None:
             raise IOError("Can't find an incremental decoder for encoding %s" %
                           self._encoding)
-        decoder = self._decoder = make_decoder()  # XXX: errors
+        decoder = make_decoder()  # XXX: errors
+        if self._readuniversal:
+            decoder = IncrementalNewlineDecoder(decoder, self._readtranslate)
+        self._decoder = decoder
         return decoder
 
     def _read_chunk(self):
@@ -1220,7 +1303,8 @@ class TextIOWrapper(TextIOBase):
             pos = self.buffer.seek(0, 2)
             self._snapshot = None
             self._pending = ""
-            self._decoder = None
+            if self._decoder:
+                self._decoder.reset()
             return pos
         if whence != 0:
             raise ValueError("Invalid whence (%r, should be 0, 1 or 2)" %
@@ -1234,7 +1318,8 @@ class TextIOWrapper(TextIOBase):
             self.buffer.seek(pos)
             self._snapshot = None
             self._pending = ""
-            self._decoder = None
+            if self._decoder:
+                self._decoder.reset()
             return pos
         decoder = self._decoder or self._get_decoder()
         decoder.set_state(("", ds))
@@ -1253,7 +1338,7 @@ class TextIOWrapper(TextIOBase):
             res += decoder.decode(self.buffer.read(), True)
             self._pending = ""
             self._snapshot = None
-            return self._replacenl(res)
+            return res
         else:
             while len(res) < n:
                 readahead, pending = self._read_chunk()
@@ -1261,7 +1346,7 @@ class TextIOWrapper(TextIOBase):
                 if not readahead:
                     break
             self._pending = res[n:]
-            return self._replacenl(res[:n])
+            return res[:n]
 
     def __next__(self):
         self._telling = False
@@ -1285,62 +1370,55 @@ class TextIOWrapper(TextIOBase):
 
         line = self._pending
         start = 0
-        cr_eof = False
         decoder = self._decoder or self._get_decoder()
 
         pos = endpos = None
-        ending = None
         while True:
-            if self._readuniversal:
+            if self._readtranslate:
+                # Newlines are already translated, only search for \n
+                pos = line.find('\n', start)
+                if pos >= 0:
+                    endpos = pos + 1
+                    break
+                else:
+                    start = len(line)
+
+            elif self._readuniversal:
                 # Universal newline search. Find any of \r, \r\n, \n
+                # The decoder ensures that \r\n is never split in two pieces
 
                 # In C we'd look for these in parallel of course.
                 nlpos = line.find("\n", start)
                 crpos = line.find("\r", start)
                 if crpos == -1:
                     if nlpos == -1:
+                        # Nothing found
                         start = len(line)
                     else:
                         # Found \n
-                        pos = nlpos
-                        endpos = pos + 1
-                        ending = self._LF
+                        endpos = nlpos + 1
                         break
                 elif nlpos == -1:
-                    if crpos == len(line) - 1:
-                        # Found \r at end of buffer, must keep reading
-                        start = crpos
-                        cr_eof = True
-                    else:
-                        # Found lone \r
-                        ending = self._CR
-                        pos = crpos
-                        endpos = pos + 1
-                        break
+                    # Found lone \r
+                    endpos = crpos + 1
+                    break
                 elif nlpos < crpos:
                     # Found \n
-                    pos = nlpos
-                    endpos = pos + 1
-                    ending = self._LF
+                    endpos = nlpos + 1
                     break
                 elif nlpos == crpos + 1:
                     # Found \r\n
-                    ending = self._CRLF
-                    pos = crpos
-                    endpos = pos + 2
+                    endpos = crpos + 2
                     break
                 else:
                     # Found \r
-                    pos = crpos
-                    endpos = pos + 1
-                    ending = self._CR
+                    endpos = crpos + 1
                     break
             else:
                 # non-universal
                 pos = line.find(self._readnl)
                 if pos >= 0:
-                    endpos = pos+len(self._readnl)
-                    ending = self._nlflag(self._readnl)
+                    endpos = pos + len(self._readnl)
                     break
 
             # No line ending seen yet - get more data
@@ -1356,65 +1434,14 @@ class TextIOWrapper(TextIOBase):
                 # end of file
                 self._pending = ''
                 self._snapshot = None
-                if cr_eof:
-                    self._seennl |= self._CR
-                    return line[:-1] + '\n'
-                else:
-                    return line
+                return line
 
         self._pending = line[endpos:]
-        if self._readtranslate:
-            self._seennl |= ending
-            if ending != self._LF:
-                return line[:pos] + '\n'
-            else:
-                return line[:endpos]
-        else:
-            return line[:endpos]
+        return line[:endpos]
 
-    def _replacenl(self, data):
-        # Replace newlines in data as needed and record that they have
-        # been seen.
-        if not self._readtranslate:
-            return data
-        if self._readuniversal:
-            crlf = data.count('\r\n')
-            cr = data.count('\r') - crlf
-            lf = data.count('\n') - crlf
-            self._seennl |= (lf and self._LF) | (cr and self._CR) \
-                         | (crlf and self._CRLF)
-            if crlf:
-                data = data.replace("\r\n", "\n")
-            if cr:
-                data = data.replace("\r", "\n")
-        elif self._readnl == '\n':
-            # Only need to detect if \n was seen.
-            if data.count('\n'):
-                self._seennl |= self._LF
-        else:
-            newdata = data.replace(self._readnl, '\n')
-            if newdata is not data:
-                self._seennl |= self._nlflag(self._readnl)
-            data = newdata
-        return data
-
-    _LF = 1
-    _CR = 2
-    _CRLF = 4
     @property
     def newlines(self):
-        return (None,
-                "\n",
-                "\r",
-                ("\r", "\n"),
-                "\r\n",
-                ("\n", "\r\n"),
-                ("\r", "\r\n"),
-                ("\r", "\n", "\r\n")
-               )[self._seennl]
-
-    def _nlflag(self, nlstr):
-        return [None, "\n", "\r", None, "\r\n"].index(nlstr)
+        return self._decoder.newlines if self._decoder else None
 
 class StringIO(TextIOWrapper):
 
index dace6427cdf620d5155f3b9861aa1ae4c4a96278..697f69e5b2efc6dc2f6884b02103166bf1b7d1a7 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -489,6 +489,10 @@ class BufferedRandomTest(unittest.TestCase):
 
 class TextIOWrapperTest(unittest.TestCase):
 
+    def setUp(self):
+        self.testdata = b"AAA\r\nBBB\rCCC\r\nDDD\nEEE\r\n"
+        self.normalized = b"AAA\nBBB\nCCC\nDDD\nEEE\n".decode("ascii")
+
     def tearDown(self):
         test_support.unlink(test_support.TESTFN)
 
@@ -496,14 +500,14 @@ class TextIOWrapperTest(unittest.TestCase):
         testdata = b"AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG"
         normalized = testdata.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
         for newline, expected in [
-            (None, normalized.decode("ASCII").splitlines(True)),
-            ("", testdata.decode("ASCII").splitlines(True)),
+            (None, normalized.decode("ascii").splitlines(True)),
+            ("", testdata.decode("ascii").splitlines(True)),
             ("\n", ["AAA\n", "BBB\n", "CCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
             ("\r\n", ["AAA\nBBB\nCCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
             ("\r",  ["AAA\nBBB\nCCC\r", "DDD\r", "EEE\r", "\nFFF\r", "\nGGG"]),
             ]:
             buf = io.BytesIO(testdata)
-            txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline)
+            txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
             self.assertEquals(txt.readlines(), expected)
             txt.seek(0)
             self.assertEquals(txt.read(), "".join(expected))
@@ -518,7 +522,7 @@ class TextIOWrapperTest(unittest.TestCase):
         tests = [(None, testdict[os.linesep])] + sorted(testdict.items())
         for newline, expected in tests:
             buf = io.BytesIO()
-            txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline)
+            txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
             txt.write("AAA\nB")
             txt.write("BB\nCCC\n")
             txt.write("X\rY\r\nZ")
@@ -568,14 +572,14 @@ class TextIOWrapperTest(unittest.TestCase):
         testdata = b"AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG"
         normalized = testdata.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
         for newline, expected in [
-            (None, normalized.decode("ASCII").splitlines(True)),
-            ("", testdata.decode("ASCII").splitlines(True)),
+            (None, normalized.decode("ascii").splitlines(True)),
+            ("", testdata.decode("ascii").splitlines(True)),
             ("\n", ["AAA\n", "BBB\n", "CCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
             ("\r\n", ["AAA\nBBB\nCCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
             ("\r",  ["AAA\nBBB\nCCC\r", "DDD\r", "EEE\r", "\nFFF\r", "\nGGG"]),
             ]:
             buf = io.BytesIO(testdata)
-            txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline)
+            txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
             self.assertEquals(txt.readlines(), expected)
             txt.seek(0)
             self.assertEquals(txt.read(), "".join(expected))
@@ -600,7 +604,7 @@ class TextIOWrapperTest(unittest.TestCase):
                 ("\r\n", "\r\n", data_crlf),
                 ]:
                 buf = io.BytesIO()
-                txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline)
+                txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
                 txt.write(data)
                 txt.close()
                 self.assertEquals(buf.getvalue(), expected)
@@ -745,6 +749,114 @@ class TextIOWrapperTest(unittest.TestCase):
                 print("Reading using readline(): %6.3f seconds" % (t3-t2))
                 print("Using readline()+tell():  %6.3f seconds" % (t4-t3))
 
+    def testReadOneByOne(self):
+        txt = io.TextIOWrapper(io.BytesIO(b"AA\r\nBB"))
+        reads = ""
+        while True:
+            c = txt.read(1)
+            if not c:
+                break
+            reads += c
+        self.assertEquals(reads, "AA\nBB")
+
+    # read in amounts equal to TextIOWrapper._CHUNK_SIZE which is 128.
+    def testReadByChunk(self):
+        # make sure "\r\n" straddles 128 char boundary.
+        txt = io.TextIOWrapper(io.BytesIO(b"A" * 127 + b"\r\nB"))
+        reads = ""
+        while True:
+            c = txt.read(128)
+            if not c:
+                break
+            reads += c
+        self.assertEquals(reads, "A"*127+"\nB")
+
+    def test_issue1395_1(self):
+        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
+
+        # read one char at a time
+        reads = ""
+        while True:
+            c = txt.read(1)
+            if not c:
+                break
+            reads += c
+        self.assertEquals(reads, self.normalized)
+
+    def test_issue1395_2(self):
+        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
+        txt._CHUNK_SIZE = 4
+
+        reads = ""
+        while True:
+            c = txt.read(4)
+            if not c:
+                break
+            reads += c
+        self.assertEquals(reads, self.normalized)
+
+    def test_issue1395_3(self):
+        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
+        txt._CHUNK_SIZE = 4
+
+        reads = txt.read(4)
+        reads += txt.read(4)
+        reads += txt.readline()
+        reads += txt.readline()
+        reads += txt.readline()
+        self.assertEquals(reads, self.normalized)
+
+    def test_issue1395_4(self):
+        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
+        txt._CHUNK_SIZE = 4
+
+        reads = txt.read(4)
+        reads += txt.read()
+        self.assertEquals(reads, self.normalized)
+
+    def test_issue1395_5(self):
+        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
+        txt._CHUNK_SIZE = 4
+
+        reads = txt.read(4)
+        pos = txt.tell()
+        txt.seek(0)
+        txt.seek(pos)
+        self.assertEquals(txt.read(4), "BBB\n")
+
+    def test_newline_decoder(self):
+        import codecs
+        decoder = codecs.getincrementaldecoder("utf-8")()
+        decoder = io.IncrementalNewlineDecoder(decoder, translate=True)
+
+        self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888")
+
+        self.assertEquals(decoder.decode(b'\xe8'), "")
+        self.assertEquals(decoder.decode(b'\xa2'), "")
+        self.assertEquals(decoder.decode(b'\x88'), "\u8888")
+
+        self.assertEquals(decoder.decode(b'\xe8'), "")
+        self.assertRaises(UnicodeDecodeError, decoder.decode, b'', final=True)
+
+        decoder.setstate((b'', 0))
+        self.assertEquals(decoder.decode(b'\n'), "\n")
+        self.assertEquals(decoder.decode(b'\r'), "")
+        self.assertEquals(decoder.decode(b'', final=True), "\n")
+        self.assertEquals(decoder.decode(b'\r', final=True), "\n")
+
+        self.assertEquals(decoder.decode(b'\r'), "")
+        self.assertEquals(decoder.decode(b'a'), "\na")
+
+        self.assertEquals(decoder.decode(b'\r\r\n'), "\n\n")
+        self.assertEquals(decoder.decode(b'\r'), "")
+        self.assertEquals(decoder.decode(b'\r'), "\n")
+        self.assertEquals(decoder.decode(b'\na'), "\na")
+
+        self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r\n'), "\u8888\n")
+        self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888")
+        self.assertEquals(decoder.decode(b'\n'), "\n")
+        self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r'), "\u8888")
+        self.assertEquals(decoder.decode(b'\n'), "\n")
 
 # XXX Tests for open()
 
index 0fd8b6685716a01382626c02d47356edfdaf2030..68b28d40cd7dccf7f9876ea2e1b3dab86ecdb71e 100644
--- a/Modules/_fileio.c
+++ b/Modules/_fileio.c
@@ -867,7 +867,7 @@ static PyGetSetDef fileio_getsetlist[] = {
 
 PyTypeObject PyFileIO_Type = {
        PyVarObject_HEAD_INIT(&PyType_Type, 0)
-       "FileIO",
+       "_FileIO",
        sizeof(PyFileIOObject),
        0,
        (destructor)fileio_dealloc,             /* tp_dealloc */