]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-120754: Refactor I/O modules to stash whole stat result rather than individual...
authorCody Maloney <cmaloney@users.noreply.github.com>
Wed, 18 Sep 2024 15:47:57 +0000 (08:47 -0700)
committerGitHub <noreply@github.com>
Wed, 18 Sep 2024 15:47:57 +0000 (17:47 +0200)
Multiple places in the I/O stack optimize common cases by using the
information from stat. Currently individual members are extracted from
the stat and stored into the fileio struct. Refactor the code to store
the whole stat struct instead.

Parallels the changes to _io. The `stat` Python object doesn't allow
changing members, so rather than modifying estimated_size, just clear
the value.

Lib/_pyio.py
Modules/_io/fileio.c

index 75b5ad1b1a47d24efa29497de8fb1517b1c11740..18849b309b860550bbf5e4346bac19a133cb55c5 100644 (file)
@@ -242,14 +242,7 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None,
             buffering = -1
             line_buffering = True
         if buffering < 0:
-            buffering = DEFAULT_BUFFER_SIZE
-            try:
-                bs = os.fstat(raw.fileno()).st_blksize
-            except (OSError, AttributeError):
-                pass
-            else:
-                if bs > 1:
-                    buffering = bs
+            buffering = raw._blksize
         if buffering < 0:
             raise ValueError("invalid buffering size")
         if buffering == 0:
@@ -1565,19 +1558,15 @@ class FileIO(RawIOBase):
                     os.set_inheritable(fd, False)
 
             self._closefd = closefd
-            fdfstat = os.fstat(fd)
+            self._stat_atopen = os.fstat(fd)
             try:
-                if stat.S_ISDIR(fdfstat.st_mode):
+                if stat.S_ISDIR(self._stat_atopen.st_mode):
                     raise IsADirectoryError(errno.EISDIR,
                                             os.strerror(errno.EISDIR), file)
             except AttributeError:
                 # Ignore the AttributeError if stat.S_ISDIR or errno.EISDIR
                 # don't exist.
                 pass
-            self._blksize = getattr(fdfstat, 'st_blksize', 0)
-            if self._blksize <= 1:
-                self._blksize = DEFAULT_BUFFER_SIZE
-            self._estimated_size = fdfstat.st_size
 
             if _setmode:
                 # don't translate newlines (\r\n <=> \n)
@@ -1623,6 +1612,17 @@ class FileIO(RawIOBase):
             return ('<%s name=%r mode=%r closefd=%r>' %
                     (class_name, name, self.mode, self._closefd))
 
+    @property
+    def _blksize(self):
+        if self._stat_atopen is None:
+            return DEFAULT_BUFFER_SIZE
+
+        blksize = getattr(self._stat_atopen, "st_blksize", 0)
+        # WASI sets blsize to 0
+        if not blksize:
+            return DEFAULT_BUFFER_SIZE
+        return blksize
+
     def _checkReadable(self):
         if not self._readable:
             raise UnsupportedOperation('File not open for reading')
@@ -1655,16 +1655,20 @@ class FileIO(RawIOBase):
         """
         self._checkClosed()
         self._checkReadable()
-        if self._estimated_size <= 0:
+        if self._stat_atopen is None or self._stat_atopen.st_size <= 0:
             bufsize = DEFAULT_BUFFER_SIZE
         else:
-            bufsize = self._estimated_size + 1
+            # In order to detect end of file, need a read() of at least 1
+            # byte which returns size 0. Oversize the buffer by 1 byte so the
+            # I/O can be completed with two read() calls (one for all data, one
+            # for EOF) without needing to resize the buffer.
+            bufsize = self._stat_atopen.st_size + 1
 
-            if self._estimated_size > 65536:
+            if self._stat_atopen.st_size > 65536:
                 try:
                     pos = os.lseek(self._fd, 0, SEEK_CUR)
-                    if self._estimated_size >= pos:
-                        bufsize = self._estimated_size - pos + 1
+                    if self._stat_atopen.st_size >= pos:
+                        bufsize = self._stat_atopen.st_size - pos + 1
                 except OSError:
                     pass
 
@@ -1742,7 +1746,7 @@ class FileIO(RawIOBase):
         if size is None:
             size = self.tell()
         os.ftruncate(self._fd, size)
-        self._estimated_size = size
+        self._stat_atopen = None
         return size
 
     def close(self):
index 5d9d87d6118a751e2f90d5604fbd98d93850cd92..865b0e3634f3b4e9579148aa5aadb955bc2ccf07 100644 (file)
@@ -74,8 +74,13 @@ typedef struct {
     signed int seekable : 2; /* -1 means unknown */
     unsigned int closefd : 1;
     char finalizing;
-    unsigned int blksize;
-    Py_off_t estimated_size;
+    /* Stat result which was grabbed at file open, useful for optimizing common
+       File I/O patterns to be more efficient. This is only guidance / an
+       estimate, as it is subject to Time-Of-Check to Time-Of-Use (TOCTOU)
+       issues / bugs. Both the underlying file descriptor and file may be
+       modified outside of the fileio object / Python (ex. gh-90102, GH-121941,
+       gh-109523). */
+    struct _Py_stat_struct *stat_atopen;
     PyObject *weakreflist;
     PyObject *dict;
 } fileio;
@@ -199,8 +204,7 @@ fileio_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
         self->writable = 0;
         self->appending = 0;
         self->seekable = -1;
-        self->blksize = 0;
-        self->estimated_size = -1;
+        self->stat_atopen = NULL;
         self->closefd = 1;
         self->weakreflist = NULL;
     }
@@ -256,7 +260,6 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
 #elif !defined(MS_WINDOWS)
     int *atomic_flag_works = NULL;
 #endif
-    struct _Py_stat_struct fdfstat;
     int fstat_result;
     int async_err = 0;
 
@@ -454,9 +457,13 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
 #endif
     }
 
-    self->blksize = DEFAULT_BUFFER_SIZE;
+    self->stat_atopen = PyMem_New(struct _Py_stat_struct, 1);
+    if (self->stat_atopen == NULL) {
+        PyErr_NoMemory();
+        goto error;
+    }
     Py_BEGIN_ALLOW_THREADS
-    fstat_result = _Py_fstat_noraise(self->fd, &fdfstat);
+    fstat_result = _Py_fstat_noraise(self->fd, self->stat_atopen);
     Py_END_ALLOW_THREADS
     if (fstat_result < 0) {
         /* Tolerate fstat() errors other than EBADF.  See Issue #25717, where
@@ -471,25 +478,21 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
 #endif
             goto error;
         }
+
+        PyMem_Free(self->stat_atopen);
+        self->stat_atopen = NULL;
     }
     else {
 #if defined(S_ISDIR) && defined(EISDIR)
         /* On Unix, open will succeed for directories.
            In Python, there should be no file objects referring to
            directories, so we need a check.  */
-        if (S_ISDIR(fdfstat.st_mode)) {
+        if (S_ISDIR(self->stat_atopen->st_mode)) {
             errno = EISDIR;
             PyErr_SetFromErrnoWithFilenameObject(PyExc_OSError, nameobj);
             goto error;
         }
 #endif /* defined(S_ISDIR) */
-#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
-        if (fdfstat.st_blksize > 1)
-            self->blksize = fdfstat.st_blksize;
-#endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */
-        if (fdfstat.st_size < PY_SSIZE_T_MAX) {
-            self->estimated_size = (Py_off_t)fdfstat.st_size;
-        }
     }
 
 #if defined(MS_WINDOWS) || defined(__CYGWIN__)
@@ -521,6 +524,10 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
         internal_close(self);
         _PyErr_ChainExceptions1(exc);
     }
+    if (self->stat_atopen != NULL) {
+        PyMem_Free(self->stat_atopen);
+        self->stat_atopen = NULL;
+    }
 
  done:
 #ifdef MS_WINDOWS
@@ -553,6 +560,10 @@ fileio_dealloc(fileio *self)
     if (_PyIOBase_finalize((PyObject *) self) < 0)
         return;
     _PyObject_GC_UNTRACK(self);
+    if (self->stat_atopen != NULL) {
+        PyMem_Free(self->stat_atopen);
+        self->stat_atopen = NULL;
+    }
     if (self->weakreflist != NULL)
         PyObject_ClearWeakRefs((PyObject *) self);
     (void)fileio_clear(self);
@@ -725,20 +736,27 @@ _io_FileIO_readall_impl(fileio *self)
         return err_closed();
     }
 
-    end = self->estimated_size;
+    if (self->stat_atopen != NULL && self->stat_atopen->st_size < _PY_READ_MAX) {
+        end = (Py_off_t)self->stat_atopen->st_size;
+    }
+    else {
+        end = -1;
+    }
     if (end <= 0) {
         /* Use a default size and resize as needed. */
         bufsize = SMALLCHUNK;
     }
     else {
-        /* This is probably a real file, so we try to allocate a
-           buffer one byte larger than the rest of the file.  If the
-           calculation is right then we should get EOF without having
-           to enlarge the buffer. */
+        /* This is probably a real file. */
         if (end > _PY_READ_MAX - 1) {
             bufsize = _PY_READ_MAX;
         }
         else {
+            /* In order to detect end of file, need a read() of at
+               least 1 byte which returns size 0. Oversize the buffer
+               by 1 byte so the I/O can be completed with two read()
+               calls (one for all data, one for EOF) without needing
+               to resize the buffer. */
             bufsize = (size_t)end + 1;
         }
 
@@ -1094,11 +1112,13 @@ _io_FileIO_truncate_impl(fileio *self, PyTypeObject *cls, PyObject *posobj)
         return NULL;
     }
 
-    /* Sometimes a large file is truncated. While estimated_size is used as a
-    estimate, that it is much larger than the actual size can result in a
-    significant over allocation and sometimes a MemoryError / running out of
-    memory. */
-    self->estimated_size = pos;
+    /* Since the file was truncated, its size at open is no longer accurate
+       as an estimate. Clear out the stat result, and rely on dynamic resize
+       code if a readall is requested. */
+    if (self->stat_atopen != NULL) {
+        PyMem_Free(self->stat_atopen);
+        self->stat_atopen = NULL;
+    }
 
     return posobj;
 }
@@ -1229,16 +1249,27 @@ get_mode(fileio *self, void *closure)
     return PyUnicode_FromString(mode_string(self));
 }
 
+static PyObject *
+get_blksize(fileio *self, void *closure)
+{
+#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
+    if (self->stat_atopen != NULL && self->stat_atopen->st_blksize > 1) {
+        return PyLong_FromLong(self->stat_atopen->st_blksize);
+    }
+#endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */
+    return PyLong_FromLong(DEFAULT_BUFFER_SIZE);
+}
+
 static PyGetSetDef fileio_getsetlist[] = {
     {"closed", (getter)get_closed, NULL, "True if the file is closed"},
     {"closefd", (getter)get_closefd, NULL,
         "True if the file descriptor will be closed by close()."},
     {"mode", (getter)get_mode, NULL, "String giving the file mode"},
+    {"_blksize", (getter)get_blksize, NULL, "Stat st_blksize if available"},
     {NULL},
 };
 
 static PyMemberDef fileio_members[] = {
-    {"_blksize", Py_T_UINT, offsetof(fileio, blksize), 0},
     {"_finalizing", Py_T_BOOL, offsetof(fileio, finalizing), 0},
     {"__weaklistoffset__", Py_T_PYSSIZET, offsetof(fileio, weakreflist), Py_READONLY},
     {"__dictoffset__", Py_T_PYSSIZET, offsetof(fileio, dict), Py_READONLY},