Patch #1268314: Cache lines in StreamReader.readlines for performance.

author Martin v. Löwis <martin@v.loewis.de>

Sun, 18 Sep 2005 08:34:39 +0000 (08:34 +0000)

committer Martin v. Löwis <martin@v.loewis.de>

Sun, 18 Sep 2005 08:34:39 +0000 (08:34 +0000)
author Martin v. Löwis <martin@v.loewis.de>
Sun, 18 Sep 2005 08:34:39 +0000 (08:34 +0000)
committer Martin v. Löwis <martin@v.loewis.de>
Sun, 18 Sep 2005 08:34:39 +0000 (08:34 +0000)
diff --git a/Lib/codecs.py b/Lib/codecs.py

index d972a5191fd74ccb1d72c406540634b8a43ab211..9d29acc20ca66acbc5cb8c369aed7e0432dab642 100644 (file)
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -232,6 +232,7 @@ class StreamReader(Codec):
          # For str->str decoding this will stay a str
          # For str->unicode decoding the first read will promote it to unicode
          self.charbuffer = ""
+        self.linebuffer = None
  
      def decode(self, input, errors='strict'):
          raise NotImplementedError
@@ -264,6 +265,11 @@ class StreamReader(Codec):
              optional encoding endings or state markers are available
              on the stream, these should be read too.
          """
+        # If we have lines cached, first merge them back into characters
+        if self.linebuffer:
+            self.charbuffer = "".join(self.linebuffer)
+            self.linebuffer = None
+            
          # read until we get the required number of characters (if available)
          while True:
              # can the request can be satisfied from the character buffer?
@@ -316,6 +322,20 @@ class StreamReader(Codec):
              read() method.
  
          """
+        # If we have lines cached from an earlier read, return
+        # them unconditionally
+        if self.linebuffer:
+            line = self.linebuffer[0]
+            del self.linebuffer[0]
+            if len(self.linebuffer) == 1:
+                # revert to charbuffer mode; we might need more data
+                # next time
+                self.charbuffer = self.linebuffer[0]
+                self.linebuffer = None
+            if not keepends:
+                line = line.splitlines(False)[0]
+            return line
+            
          readsize = size or 72
          line = ""
          # If size is given, we call read() only once
@@ -331,6 +351,22 @@ class StreamReader(Codec):
              line += data
              lines = line.splitlines(True)
              if lines:
+                if len(lines) > 1:
+                    # More than one line result; the first line is a full line
+                    # to return
+                    line = lines[0]
+                    del lines[0]
+                    if len(lines) > 1:
+                        # cache the remaining lines
+                        lines[-1] += self.charbuffer
+                        self.linebuffer = lines
+                        self.charbuffer = None
+                    else:
+                        # only one remaining line, put it back into charbuffer
+                        self.charbuffer = lines[0] + self.charbuffer
+                    if not keepends:
+                        line = line.splitlines(False)[0]
+                    break
                  line0withend = lines[0]
                  line0withoutend = lines[0].splitlines(False)[0]
                  if line0withend != line0withoutend: # We really have a line end
@@ -376,6 +412,7 @@ class StreamReader(Codec):
          """
          self.bytebuffer = ""
          self.charbuffer = u""
+        self.linebuffer = None
  
      def seek(self, offset, whence=0):
          """ Set the input stream's current position.
diff --git a/Misc/NEWS b/Misc/NEWS

index 1ffa307782ae54408394fcb0746e91a892647c03..5eb20ab92636a83da008c1eb24697b48ceed47e0 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -219,6 +219,8 @@ Extension Modules
  Library
  -------
  
+- Patch #1268314: Cache lines in StreamReader.readlines for performance.
+
  - Bug #1290505: Fix clearing the regex cache for time.strptime().
  
  - Bug #1167128: Fix size of a symlink in a tarfile to be 0.
author	Martin v. Löwis <martin@v.loewis.de>
	Sun, 18 Sep 2005 08:34:39 +0000 (08:34 +0000)
committer	Martin v. Löwis <martin@v.loewis.de>
	Sun, 18 Sep 2005 08:34:39 +0000 (08:34 +0000)
Lib/codecs.py		patch | blob | blame | history
Misc/NEWS		patch | blob | blame | history