]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-129005: Align FileIO.readall between _pyio and _io (#129705)
authorCody Maloney <cmaloney@users.noreply.github.com>
Fri, 7 Feb 2025 11:06:11 +0000 (03:06 -0800)
committerGitHub <noreply@github.com>
Fri, 7 Feb 2025 11:06:11 +0000 (12:06 +0100)
Utilize `bytearray.resize()` and `os.readinto()` to reduce copies
and match behavior of `_io.FileIO.readall()`.

There is still an extra copy which means twice the memory required
compared to FileIO because there isn't a zero-copy  path from
`bytearray` -> `bytes` currently.

On my system reading a 2 GB file:
`./python -m test -M8g -uall test_largefile -m test.test_largefile.PyLargeFileTest.test_large_read -v`

Goes from ~2.7 seconds -> ~2.2 seconds

Co-authored-by: Victor Stinner <vstinner@python.org>
Lib/_pyio.py
Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst [new file with mode: 0644]

index b3a8f37d68acdb77c9b0c2f7487eb82bddbce14a..f7370dff19efc859a25174a3ad8828714facb369 100644 (file)
@@ -1454,6 +1454,17 @@ class BufferedRandom(BufferedWriter, BufferedReader):
         return BufferedWriter.write(self, b)
 
 
+def _new_buffersize(bytes_read):
+    # Parallels _io/fileio.c new_buffersize
+    if bytes_read > 65536:
+        addend = bytes_read >> 3
+    else:
+        addend = 256 + bytes_read
+    if addend < DEFAULT_BUFFER_SIZE:
+        addend = DEFAULT_BUFFER_SIZE
+    return bytes_read + addend
+
+
 class FileIO(RawIOBase):
     _fd = -1
     _created = False
@@ -1672,22 +1683,20 @@ class FileIO(RawIOBase):
                 except OSError:
                     pass
 
-        result = bytearray()
-        while True:
-            if len(result) >= bufsize:
-                bufsize = len(result)
-                bufsize += max(bufsize, DEFAULT_BUFFER_SIZE)
-            n = bufsize - len(result)
-            try:
-                chunk = os.read(self._fd, n)
-            except BlockingIOError:
-                if result:
-                    break
+        result = bytearray(bufsize)
+        bytes_read = 0
+        try:
+            while n := os.readinto(self._fd, memoryview(result)[bytes_read:]):
+                bytes_read += n
+                if bytes_read >= len(result):
+                    result.resize(_new_buffersize(bytes_read))
+        except BlockingIOError:
+            if not bytes_read:
                 return None
-            if not chunk: # reached the end of the file
-                break
-            result += chunk
 
+        assert len(result) - bytes_read >= 1, \
+            "os.readinto buffer size 0 will result in erroneous EOF / returns 0"
+        result.resize(bytes_read)
         return bytes(result)
 
     def readinto(self, buffer):
diff --git a/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst b/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst
new file mode 100644 (file)
index 0000000..236d776
--- /dev/null
@@ -0,0 +1,2 @@
+``_pyio.FileIO.readall()`` now allocates, resizes, and fills a data buffer
+using the same algorithm ``_io.FileIO.readall()`` uses.