Issue #1395: Universal mode used to duplicate newlines when using read(1).
author     Amaury Forgeot d'Arc <amauryfa@gmail.com>
           Mon, 19 Nov 2007 20:34:10 +0000 (20:34 +0000)
committer  Amaury Forgeot d'Arc <amauryfa@gmail.com>
           Mon, 19 Nov 2007 20:34:10 +0000 (20:34 +0000)
"Universal newline" is now an incremental decoder wrapping the initial one,
with its own additional buffer (if '\r' is seen at the end of the input).

A decoder allows the tell() funtion to record the state of the translation.
This also simplifies the readline() process.

Now test_netrc passes on Windows, as well as many new tests in test_io.py

Lib/io.py
Lib/test/test_io.py
Modules/_fileio.c
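
For context, the failure mode behind Issue 1395 was that reading a universal-newlines
file one character at a time could translate a trailing '\r' into '\n' and then emit
the following '\n' again. With the new decoder the pending '\r' is simply held back
until the next chunk, so each "\r\n" pair is translated exactly once. A minimal sketch
of the intended behaviour, feeding the wrapper one byte at a time much like the new
test_newline_decoder test does (the class and property names are the ones added to
Lib/io.py below):

    import codecs
    import io

    # Wrap a plain incremental decoder in the newline-translating one.
    decoder = io.IncrementalNewlineDecoder(
        codecs.getincrementaldecoder("utf-8")(), translate=True)

    out = ""
    for b in b"AA\r\nBB":               # feed one byte at a time, like read(1)
        out += decoder.decode(bytes([b]))
    out += decoder.decode(b"", final=True)

    assert out == "AA\nBB"              # the \r\n pair becomes exactly one \n
    assert decoder.newlines == "\r\n"   # only \r\n was seen in the input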

index 74076d387e0ae3d1fd4cc002e825eb68ded00614..d2d2fbcf887748f8e60d27cce99d278f2706acc1 100644
--- a/Lib/io.py
+++ b/Lib/io.py
@@ -1041,6 +1041,84 @@ class TextIOBase(IOBase):
         return None
 
 
+class IncrementalNewlineDecoder(codecs.IncrementalDecoder):
+    """Codec used when reading a file in universal newlines mode.
+    It wraps another incremental decoder, translating \\r\\n and \\r into \\n.
+    It also records the types of newlines encountered.
+    When used with translate=False, it ensures that the newline sequence is
+    returned in one piece.
+    """
+    def __init__(self, decoder, translate, errors='strict'):
+        codecs.IncrementalDecoder.__init__(self, errors=errors)
+        self.buffer = b''
+        self.translate = translate
+        self.decoder = decoder
+        self.seennl = 0
+
+    def decode(self, input, final=False):
+        # decode input (prepending any \r retained from a previous pass)
+        if self.buffer:
+            input = self.buffer + input
+
+        output = self.decoder.decode(input, final=final)
+
+        # retain last \r even when not translating data:
+        # then readline() is sure to get \r\n in one pass
+        if output.endswith("\r") and not final:
+            output = output[:-1]
+            self.buffer = b'\r'
+        else:
+            self.buffer = b''
+
+        # Record which newlines are read
+        crlf = output.count('\r\n')
+        cr = output.count('\r') - crlf
+        lf = output.count('\n') - crlf
+        self.seennl |= (lf and self._LF) | (cr and self._CR) \
+                    | (crlf and self._CRLF)
+
+        if self.translate:
+            if crlf:
+                output = output.replace("\r\n", "\n")
+            if cr:
+                output = output.replace("\r", "\n")
+
+        return output
+
+    def getstate(self):
+        buf, flag = self.decoder.getstate()
+        return buf + self.buffer, flag
+
+    def setstate(self, state):
+        buf, flag = state
+        if buf.endswith(b'\r'):
+            self.buffer = b'\r'
+            buf = buf[:-1]
+        else:
+            self.buffer = b''
+        self.decoder.setstate((buf, flag))
+
+    def reset(self):
+        self.buffer = b''
+        self.decoder.reset()
+
+    _LF = 1
+    _CR = 2
+    _CRLF = 4
+
+    @property
+    def newlines(self):
+        return (None,
+                "\n",
+                "\r",
+                ("\r", "\n"),
+                "\r\n",
+                ("\n", "\r\n"),
+                ("\r", "\r\n"),
+                ("\r", "\n", "\r\n")
+               )[self.seennl]
+
+
 class TextIOWrapper(TextIOBase):
 
     """Buffered text stream.
@@ -1077,7 +1155,6 @@ class TextIOWrapper(TextIOBase):
         self._readnl = newline
         self._writetranslate = newline != ''
         self._writenl = newline or os.linesep
-        self._seennl = 0
         self._decoder = None
         self._pending = ""
         self._snapshot = None
@@ -1124,6 +1201,7 @@ class TextIOWrapper(TextIOBase):
         if not isinstance(s, str):
             raise TypeError("can't write %s to text stream" %
                             s.__class__.__name__)
+        length = len(s)
         haslf = "\n" in s
         if haslf and self._writetranslate and self._writenl != "\n":
             s = s.replace("\n", self._writenl)
@@ -1132,15 +1210,20 @@ class TextIOWrapper(TextIOBase):
         self.buffer.write(b)
         if haslf and self.isatty():
             self.flush()
-        self._snapshot = self._decoder = None
-        return len(s)
+        self._snapshot = None
+        if self._decoder:
+            self._decoder.reset()
+        return length
 
     def _get_decoder(self):
         make_decoder = codecs.getincrementaldecoder(self._encoding)
         if make_decoder is None:
             raise IOError("Can't find an incremental decoder for encoding %s" %
                           self._encoding)
-        decoder = self._decoder = make_decoder()  # XXX: errors
+        decoder = make_decoder()  # XXX: errors
+        if self._readuniversal:
+            decoder = IncrementalNewlineDecoder(decoder, self._readtranslate)
+        self._decoder = decoder
         return decoder
 
     def _read_chunk(self):
@@ -1220,7 +1303,8 @@ class TextIOWrapper(TextIOBase):
             pos = self.buffer.seek(0, 2)
             self._snapshot = None
             self._pending = ""
-            self._decoder = None
+            if self._decoder:
+                self._decoder.reset()
             return pos
         if whence != 0:
             raise ValueError("Invalid whence (%r, should be 0, 1 or 2)" %
@@ -1234,7 +1318,8 @@ class TextIOWrapper(TextIOBase):
             self.buffer.seek(pos)
             self._snapshot = None
             self._pending = ""
-            self._decoder = None
+            if self._decoder:
+                self._decoder.reset()
             return pos
         decoder = self._decoder or self._get_decoder()
         decoder.set_state(("", ds))
@@ -1253,7 +1338,7 @@ class TextIOWrapper(TextIOBase):
             res += decoder.decode(self.buffer.read(), True)
             self._pending = ""
             self._snapshot = None
-            return self._replacenl(res)
+            return res
         else:
             while len(res) < n:
                 readahead, pending = self._read_chunk()
@@ -1261,7 +1346,7 @@ class TextIOWrapper(TextIOBase):
                 if not readahead:
                     break
             self._pending = res[n:]
-            return self._replacenl(res[:n])
+            return res[:n]
 
     def __next__(self):
         self._telling = False
@@ -1285,62 +1370,55 @@ class TextIOWrapper(TextIOBase):
 
         line = self._pending
         start = 0
-        cr_eof = False
         decoder = self._decoder or self._get_decoder()
 
         pos = endpos = None
-        ending = None
         while True:
-            if self._readuniversal:
+            if self._readtranslate:
+                # Newlines are already translated, only search for \n
+                pos = line.find('\n', start)
+                if pos >= 0:
+                    endpos = pos + 1
+                    break
+                else:
+                    start = len(line)
+
+            elif self._readuniversal:
                 # Universal newline search. Find any of \r, \r\n, \n
+                # The decoder ensures that \r\n is never split in two pieces
 
                 # In C we'd look for these in parallel of course.
                 nlpos = line.find("\n", start)
                 crpos = line.find("\r", start)
                 if crpos == -1:
                     if nlpos == -1:
+                        # Nothing found
                         start = len(line)
                     else:
                         # Found \n
-                        pos = nlpos
-                        endpos = pos + 1
-                        ending = self._LF
+                        endpos = nlpos + 1
                         break
                 elif nlpos == -1:
-                    if crpos == len(line) - 1:
-                        # Found \r at end of buffer, must keep reading
-                        start = crpos
-                        cr_eof = True
-                    else:
-                        # Found lone \r
-                        ending = self._CR
-                        pos = crpos
-                        endpos = pos + 1
-                        break
+                    # Found lone \r
+                    endpos = crpos + 1
+                    break
                 elif nlpos < crpos:
                     # Found \n
-                    pos = nlpos
-                    endpos = pos + 1
-                    ending = self._LF
+                    endpos = nlpos + 1
                     break
                 elif nlpos == crpos + 1:
                     # Found \r\n
-                    ending = self._CRLF
-                    pos = crpos
-                    endpos = pos + 2
+                    endpos = crpos + 2
                     break
                 else:
                     # Found \r
-                    pos = crpos
-                    endpos = pos + 1
-                    ending = self._CR
+                    endpos = crpos + 1
                     break
             else:
                 # non-universal
                 pos = line.find(self._readnl)
                 if pos >= 0:
-                    endpos = pos+len(self._readnl)
-                    ending = self._nlflag(self._readnl)
+                    endpos = pos + len(self._readnl)
                     break
 
             # No line ending seen yet - get more data
@@ -1356,65 +1434,14 @@ class TextIOWrapper(TextIOBase):
                 # end of file
                 self._pending = ''
                 self._snapshot = None
-                if cr_eof:
-                    self._seennl |= self._CR
-                    return line[:-1] + '\n'
-                else:
-                    return line
+                return line
 
         self._pending = line[endpos:]
-        if self._readtranslate:
-            self._seennl |= ending
-            if ending != self._LF:
-                return line[:pos] + '\n'
-            else:
-                return line[:endpos]
-        else:
-            return line[:endpos]
+        return line[:endpos]
 
-    def _replacenl(self, data):
-        # Replace newlines in data as needed and record that they have
-        # been seen.
-        if not self._readtranslate:
-            return data
-        if self._readuniversal:
-            crlf = data.count('\r\n')
-            cr = data.count('\r') - crlf
-            lf = data.count('\n') - crlf
-            self._seennl |= (lf and self._LF) | (cr and self._CR) \
-                         | (crlf and self._CRLF)
-            if crlf:
-                data = data.replace("\r\n", "\n")
-            if cr:
-                data = data.replace("\r", "\n")
-        elif self._readnl == '\n':
-            # Only need to detect if \n was seen.
-            if data.count('\n'):
-                self._seennl |= self._LF
-        else:
-            newdata = data.replace(self._readnl, '\n')
-            if newdata is not data:
-                self._seennl |= self._nlflag(self._readnl)
-            data = newdata
-        return data
-
-    _LF = 1
-    _CR = 2
-    _CRLF = 4
     @property
     def newlines(self):
-        return (None,
-                "\n",
-                "\r",
-                ("\r", "\n"),
-                "\r\n",
-                ("\n", "\r\n"),
-                ("\r", "\r\n"),
-                ("\r", "\n", "\r\n")
-               )[self._seennl]
-
-    def _nlflag(self, nlstr):
-        return [None, "\n", "\r", None, "\r\n"].index(nlstr)
+        return self._decoder.newlines if self._decoder else None
 
 class StringIO(TextIOWrapper):
 
index dace6427cdf620d5155f3b9861aa1ae4c4a96278..697f69e5b2efc6dc2f6884b02103166bf1b7d1a7 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -489,6 +489,10 @@ class BufferedRandomTest(unittest.TestCase):
 
 class TextIOWrapperTest(unittest.TestCase):
 
+    def setUp(self):
+        self.testdata = b"AAA\r\nBBB\rCCC\r\nDDD\nEEE\r\n"
+        self.normalized = b"AAA\nBBB\nCCC\nDDD\nEEE\n".decode("ascii")
+
     def tearDown(self):
         test_support.unlink(test_support.TESTFN)
 
@@ -496,14 +500,14 @@ class TextIOWrapperTest(unittest.TestCase):
         testdata = b"AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG"
         normalized = testdata.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
         for newline, expected in [
-            (None, normalized.decode("ASCII").splitlines(True)),
-            ("", testdata.decode("ASCII").splitlines(True)),
+            (None, normalized.decode("ascii").splitlines(True)),
+            ("", testdata.decode("ascii").splitlines(True)),
             ("\n", ["AAA\n", "BBB\n", "CCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
             ("\r\n", ["AAA\nBBB\nCCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
             ("\r",  ["AAA\nBBB\nCCC\r", "DDD\r", "EEE\r", "\nFFF\r", "\nGGG"]),
             ]:
             buf = io.BytesIO(testdata)
-            txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline)
+            txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
             self.assertEquals(txt.readlines(), expected)
             txt.seek(0)
             self.assertEquals(txt.read(), "".join(expected))
@@ -518,7 +522,7 @@ class TextIOWrapperTest(unittest.TestCase):
         tests = [(None, testdict[os.linesep])] + sorted(testdict.items())
         for newline, expected in tests:
             buf = io.BytesIO()
-            txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline)
+            txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
             txt.write("AAA\nB")
             txt.write("BB\nCCC\n")
             txt.write("X\rY\r\nZ")
@@ -568,14 +572,14 @@ class TextIOWrapperTest(unittest.TestCase):
         testdata = b"AAA\nBBB\nCCC\rDDD\rEEE\r\nFFF\r\nGGG"
         normalized = testdata.replace(b"\r\n", b"\n").replace(b"\r", b"\n")
         for newline, expected in [
-            (None, normalized.decode("ASCII").splitlines(True)),
-            ("", testdata.decode("ASCII").splitlines(True)),
+            (None, normalized.decode("ascii").splitlines(True)),
+            ("", testdata.decode("ascii").splitlines(True)),
             ("\n", ["AAA\n", "BBB\n", "CCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
             ("\r\n", ["AAA\nBBB\nCCC\rDDD\rEEE\r\n", "FFF\r\n", "GGG"]),
             ("\r",  ["AAA\nBBB\nCCC\r", "DDD\r", "EEE\r", "\nFFF\r", "\nGGG"]),
             ]:
             buf = io.BytesIO(testdata)
-            txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline)
+            txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
             self.assertEquals(txt.readlines(), expected)
             txt.seek(0)
             self.assertEquals(txt.read(), "".join(expected))
@@ -600,7 +604,7 @@ class TextIOWrapperTest(unittest.TestCase):
                 ("\r\n", "\r\n", data_crlf),
                 ]:
                 buf = io.BytesIO()
-                txt = io.TextIOWrapper(buf, encoding="ASCII", newline=newline)
+                txt = io.TextIOWrapper(buf, encoding="ascii", newline=newline)
                 txt.write(data)
                 txt.close()
                 self.assertEquals(buf.getvalue(), expected)
@@ -745,6 +749,114 @@ class TextIOWrapperTest(unittest.TestCase):
                 print("Reading using readline(): %6.3f seconds" % (t3-t2))
                 print("Using readline()+tell():  %6.3f seconds" % (t4-t3))
 
+    def testReadOneByOne(self):
+        txt = io.TextIOWrapper(io.BytesIO(b"AA\r\nBB"))
+        reads = ""
+        while True:
+            c = txt.read(1)
+            if not c:
+                break
+            reads += c
+        self.assertEquals(reads, "AA\nBB")
+
+    # read in amounts equal to TextIOWrapper._CHUNK_SIZE which is 128.
+    def testReadByChunk(self):
+        # make sure "\r\n" straddles 128 char boundary.
+        txt = io.TextIOWrapper(io.BytesIO(b"A" * 127 + b"\r\nB"))
+        reads = ""
+        while True:
+            c = txt.read(128)
+            if not c:
+                break
+            reads += c
+        self.assertEquals(reads, "A"*127+"\nB")
+
+    def test_issue1395_1(self):
+        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
+
+        # read one char at a time
+        reads = ""
+        while True:
+            c = txt.read(1)
+            if not c:
+                break
+            reads += c
+        self.assertEquals(reads, self.normalized)
+
+    def test_issue1395_2(self):
+        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
+        txt._CHUNK_SIZE = 4
+
+        reads = ""
+        while True:
+            c = txt.read(4)
+            if not c:
+                break
+            reads += c
+        self.assertEquals(reads, self.normalized)
+
+    def test_issue1395_3(self):
+        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
+        txt._CHUNK_SIZE = 4
+
+        reads = txt.read(4)
+        reads += txt.read(4)
+        reads += txt.readline()
+        reads += txt.readline()
+        reads += txt.readline()
+        self.assertEquals(reads, self.normalized)
+
+    def test_issue1395_4(self):
+        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
+        txt._CHUNK_SIZE = 4
+
+        reads = txt.read(4)
+        reads += txt.read()
+        self.assertEquals(reads, self.normalized)
+
+    def test_issue1395_5(self):
+        txt = io.TextIOWrapper(io.BytesIO(self.testdata), encoding="ascii")
+        txt._CHUNK_SIZE = 4
+
+        reads = txt.read(4)
+        pos = txt.tell()
+        txt.seek(0)
+        txt.seek(pos)
+        self.assertEquals(txt.read(4), "BBB\n")
+
+    def test_newline_decoder(self):
+        import codecs
+        decoder = codecs.getincrementaldecoder("utf-8")()
+        decoder = io.IncrementalNewlineDecoder(decoder, translate=True)
+
+        self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888")
+
+        self.assertEquals(decoder.decode(b'\xe8'), "")
+        self.assertEquals(decoder.decode(b'\xa2'), "")
+        self.assertEquals(decoder.decode(b'\x88'), "\u8888")
+
+        self.assertEquals(decoder.decode(b'\xe8'), "")
+        self.assertRaises(UnicodeDecodeError, decoder.decode, b'', final=True)
+
+        decoder.setstate((b'', 0))
+        self.assertEquals(decoder.decode(b'\n'), "\n")
+        self.assertEquals(decoder.decode(b'\r'), "")
+        self.assertEquals(decoder.decode(b'', final=True), "\n")
+        self.assertEquals(decoder.decode(b'\r', final=True), "\n")
+
+        self.assertEquals(decoder.decode(b'\r'), "")
+        self.assertEquals(decoder.decode(b'a'), "\na")
+
+        self.assertEquals(decoder.decode(b'\r\r\n'), "\n\n")
+        self.assertEquals(decoder.decode(b'\r'), "")
+        self.assertEquals(decoder.decode(b'\r'), "\n")
+        self.assertEquals(decoder.decode(b'\na'), "\na")
+
+        self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r\n'), "\u8888\n")
+        self.assertEquals(decoder.decode(b'\xe8\xa2\x88'), "\u8888")
+        self.assertEquals(decoder.decode(b'\n'), "\n")
+        self.assertEquals(decoder.decode(b'\xe8\xa2\x88\r'), "\u8888")
+        self.assertEquals(decoder.decode(b'\n'), "\n")
 
 # XXX Tests for open()
 
index 0fd8b6685716a01382626c02d47356edfdaf2030..68b28d40cd7dccf7f9876ea2e1b3dab86ecdb71e 100644
--- a/Modules/_fileio.c
+++ b/Modules/_fileio.c
@@ -867,7 +867,7 @@ static PyGetSetDef fileio_getsetlist[] = {
 
 PyTypeObject PyFileIO_Type = {
        PyVarObject_HEAD_INIT(&PyType_Type, 0)
-       "FileIO",
+       "_FileIO",
        sizeof(PyFileIOObject),
        0,
        (destructor)fileio_dealloc,             /* tp_dealloc */