]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-102511: Speed up os.path.splitroot() with native helpers (GH-118089)
authorNice Zombies <nineteendo19d0@gmail.com>
Thu, 25 Apr 2024 09:07:38 +0000 (11:07 +0200)
committerGitHub <noreply@github.com>
Thu, 25 Apr 2024 09:07:38 +0000 (10:07 +0100)
Include/internal/pycore_fileutils.h
Lib/ntpath.py
Lib/posixpath.py
Lib/test/test_ntpath.py
Misc/NEWS.d/next/Core and Builtins/2024-04-19-08-50-48.gh-issue-102511.qDEB66.rst [new file with mode: 0644]
Modules/clinic/posixmodule.c.h
Modules/posixmodule.c
Python/fileutils.c

index 5c55282fa39e6f6062c056af220057847416ab5b..bc8100b58e8ea32ae54139e4db78bbfc961f9eb8 100644 (file)
@@ -290,6 +290,8 @@ extern wchar_t *_Py_normpath_and_size(wchar_t *path, Py_ssize_t size, Py_ssize_t
 extern HRESULT PathCchSkipRoot(const wchar_t *pszPath, const wchar_t **ppszRootEnd);
 #endif /* defined(MS_WINDOWS_GAMES) && !defined(MS_WINDOWS_DESKTOP) */
 
+extern void _Py_skiproot(const wchar_t *path, Py_ssize_t size, Py_ssize_t *drvsize, Py_ssize_t *rootsize);
+
 // Macros to protect CRT calls against instant termination when passed an
 // invalid parameter (bpo-23524). IPH stands for Invalid Parameter Handler.
 // Usage:
index aba18bfe407abf931adbdc144f1c66eedbf1ae79..e810b655e5ac851bd6fd4f2793c86791a45aa44b 100644 (file)
@@ -167,56 +167,76 @@ def splitdrive(p):
     return drive, root + tail
 
 
-def splitroot(p):
-    """Split a pathname into drive, root and tail. The drive is defined
-    exactly as in splitdrive(). On Windows, the root may be a single path
-    separator or an empty string. The tail contains anything after the root.
-    For example:
-
-        splitroot('//server/share/') == ('//server/share', '/', '')
-        splitroot('C:/Users/Barney') == ('C:', '/', 'Users/Barney')
-        splitroot('C:///spam///ham') == ('C:', '/', '//spam///ham')
-        splitroot('Windows/notepad') == ('', '', 'Windows/notepad')
-    """
-    p = os.fspath(p)
-    if isinstance(p, bytes):
-        sep = b'\\'
-        altsep = b'/'
-        colon = b':'
-        unc_prefix = b'\\\\?\\UNC\\'
-        empty = b''
-    else:
-        sep = '\\'
-        altsep = '/'
-        colon = ':'
-        unc_prefix = '\\\\?\\UNC\\'
-        empty = ''
-    normp = p.replace(altsep, sep)
-    if normp[:1] == sep:
-        if normp[1:2] == sep:
-            # UNC drives, e.g. \\server\share or \\?\UNC\server\share
-            # Device drives, e.g. \\.\device or \\?\device
-            start = 8 if normp[:8].upper() == unc_prefix else 2
-            index = normp.find(sep, start)
-            if index == -1:
-                return p, empty, empty
-            index2 = normp.find(sep, index + 1)
-            if index2 == -1:
-                return p, empty, empty
-            return p[:index2], p[index2:index2 + 1], p[index2 + 1:]
+try:
+    from nt import _path_splitroot_ex
+except ImportError:
+    def splitroot(p):
+        """Split a pathname into drive, root and tail. The drive is defined
+        exactly as in splitdrive(). On Windows, the root may be a single path
+        separator or an empty string. The tail contains anything after the root.
+        For example:
+
+            splitroot('//server/share/') == ('//server/share', '/', '')
+            splitroot('C:/Users/Barney') == ('C:', '/', 'Users/Barney')
+            splitroot('C:///spam///ham') == ('C:', '/', '//spam///ham')
+            splitroot('Windows/notepad') == ('', '', 'Windows/notepad')
+        """
+        p = os.fspath(p)
+        if isinstance(p, bytes):
+            sep = b'\\'
+            altsep = b'/'
+            colon = b':'
+            unc_prefix = b'\\\\?\\UNC\\'
+            empty = b''
         else:
-            # Relative path with root, e.g. \Windows
-            return empty, p[:1], p[1:]
-    elif normp[1:2] == colon:
-        if normp[2:3] == sep:
-            # Absolute drive-letter path, e.g. X:\Windows
-            return p[:2], p[2:3], p[3:]
+            sep = '\\'
+            altsep = '/'
+            colon = ':'
+            unc_prefix = '\\\\?\\UNC\\'
+            empty = ''
+        normp = p.replace(altsep, sep)
+        if normp[:1] == sep:
+            if normp[1:2] == sep:
+                # UNC drives, e.g. \\server\share or \\?\UNC\server\share
+                # Device drives, e.g. \\.\device or \\?\device
+                start = 8 if normp[:8].upper() == unc_prefix else 2
+                index = normp.find(sep, start)
+                if index == -1:
+                    return p, empty, empty
+                index2 = normp.find(sep, index + 1)
+                if index2 == -1:
+                    return p, empty, empty
+                return p[:index2], p[index2:index2 + 1], p[index2 + 1:]
+            else:
+                # Relative path with root, e.g. \Windows
+                return empty, p[:1], p[1:]
+        elif normp[1:2] == colon:
+            if normp[2:3] == sep:
+                # Absolute drive-letter path, e.g. X:\Windows
+                return p[:2], p[2:3], p[3:]
+            else:
+                # Relative path with drive, e.g. X:Windows
+                return p[:2], empty, p[2:]
         else:
-            # Relative path with drive, e.g. X:Windows
-            return p[:2], empty, p[2:]
-    else:
-        # Relative path, e.g. Windows
-        return empty, empty, p
+            # Relative path, e.g. Windows
+            return empty, empty, p
+else:
+    def splitroot(p):
+        """Split a pathname into drive, root and tail. The drive is defined
+        exactly as in splitdrive(). On Windows, the root may be a single path
+        separator or an empty string. The tail contains anything after the root.
+        For example:
+
+            splitroot('//server/share/') == ('//server/share', '/', '')
+            splitroot('C:/Users/Barney') == ('C:', '/', 'Users/Barney')
+            splitroot('C:///spam///ham') == ('C:', '/', '//spam///ham')
+            splitroot('Windows/notepad') == ('', '', 'Windows/notepad')
+        """
+        p = os.fspath(p)
+        if isinstance(p, bytes):
+            drive, root, tail = _path_splitroot_ex(os.fsdecode(p))
+            return os.fsencode(drive), os.fsencode(root), os.fsencode(tail)
+        return _path_splitroot_ex(p)
 
 
 # Split a path in head (everything up to the last '/') and tail (the
index f1960ddb88e59055d8bf257a5ac6042f64acc8d1..56b7915826daf4b81531110b6b5b43c6838e4ad5 100644 (file)
@@ -134,33 +134,53 @@ def splitdrive(p):
     return p[:0], p
 
 
-def splitroot(p):
-    """Split a pathname into drive, root and tail. On Posix, drive is always
-    empty; the root may be empty, a single slash, or two slashes. The tail
-    contains anything after the root. For example:
-
-        splitroot('foo/bar') == ('', '', 'foo/bar')
-        splitroot('/foo/bar') == ('', '/', 'foo/bar')
-        splitroot('//foo/bar') == ('', '//', 'foo/bar')
-        splitroot('///foo/bar') == ('', '/', '//foo/bar')
-    """
-    p = os.fspath(p)
-    if isinstance(p, bytes):
-        sep = b'/'
-        empty = b''
-    else:
-        sep = '/'
-        empty = ''
-    if p[:1] != sep:
-        # Relative path, e.g.: 'foo'
-        return empty, empty, p
-    elif p[1:2] != sep or p[2:3] == sep:
-        # Absolute path, e.g.: '/foo', '///foo', '////foo', etc.
-        return empty, sep, p[1:]
-    else:
-        # Precisely two leading slashes, e.g.: '//foo'. Implementation defined per POSIX, see
-        # https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_13
-        return empty, p[:2], p[2:]
+try:
+    from posix import _path_splitroot_ex
+except ImportError:
+    def splitroot(p):
+        """Split a pathname into drive, root and tail. On Posix, drive is always
+        empty; the root may be empty, a single slash, or two slashes. The tail
+        contains anything after the root. For example:
+
+            splitroot('foo/bar') == ('', '', 'foo/bar')
+            splitroot('/foo/bar') == ('', '/', 'foo/bar')
+            splitroot('//foo/bar') == ('', '//', 'foo/bar')
+            splitroot('///foo/bar') == ('', '/', '//foo/bar')
+        """
+        p = os.fspath(p)
+        if isinstance(p, bytes):
+            sep = b'/'
+            empty = b''
+        else:
+            sep = '/'
+            empty = ''
+        if p[:1] != sep:
+            # Relative path, e.g.: 'foo'
+            return empty, empty, p
+        elif p[1:2] != sep or p[2:3] == sep:
+            # Absolute path, e.g.: '/foo', '///foo', '////foo', etc.
+            return empty, sep, p[1:]
+        else:
+            # Precisely two leading slashes, e.g.: '//foo'. Implementation defined per POSIX, see
+            # https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_13
+            return empty, p[:2], p[2:]
+else:
+    def splitroot(p):
+        """Split a pathname into drive, root and tail. On Posix, drive is always
+        empty; the root may be empty, a single slash, or two slashes. The tail
+        contains anything after the root. For example:
+
+            splitroot('foo/bar') == ('', '', 'foo/bar')
+            splitroot('/foo/bar') == ('', '/', 'foo/bar')
+            splitroot('//foo/bar') == ('', '//', 'foo/bar')
+            splitroot('///foo/bar') == ('', '/', '//foo/bar')
+        """
+        p = os.fspath(p)
+        if isinstance(p, bytes):
+            # Optimisation: the drive is always empty
+            _, root, tail = _path_splitroot_ex(os.fsdecode(p))
+            return b'', os.fsencode(root), os.fsencode(tail)
+        return _path_splitroot_ex(p)
 
 
 # Return the tail (basename) part of a path, same as split(path)[1].
index 31156130fcc74742cb1d052003bb5960c6f1d1d1..7f91bf1c2b837a4ea43e021c05f784238715cdce 100644 (file)
@@ -374,6 +374,7 @@ class TestNtpath(NtpathTestCase):
         tester("ntpath.normpath('\\\\foo\\')", '\\\\foo\\')
         tester("ntpath.normpath('\\\\foo')", '\\\\foo')
         tester("ntpath.normpath('\\\\')", '\\\\')
+        tester("ntpath.normpath('//?/UNC/server/share/..')", '\\\\?\\UNC\\server\\share\\')
 
     def test_realpath_curdir(self):
         expected = ntpath.normpath(os.getcwd())
diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-04-19-08-50-48.gh-issue-102511.qDEB66.rst b/Misc/NEWS.d/next/Core and Builtins/2024-04-19-08-50-48.gh-issue-102511.qDEB66.rst
new file mode 100644 (file)
index 0000000..dfdf250
--- /dev/null
@@ -0,0 +1 @@
+Speed up :func:`os.path.splitroot` with a native implementation.
index 0398629e3c10ce9d2361f92d6bf874b006e834bb..a0d1f3238a6733bd880b8f8f70c8151653cc2faf 100644 (file)
@@ -2248,6 +2248,64 @@ exit:
 
 #endif /* defined(MS_WINDOWS) */
 
+PyDoc_STRVAR(os__path_splitroot_ex__doc__,
+"_path_splitroot_ex($module, /, path)\n"
+"--\n"
+"\n");
+
+#define OS__PATH_SPLITROOT_EX_METHODDEF    \
+    {"_path_splitroot_ex", _PyCFunction_CAST(os__path_splitroot_ex), METH_FASTCALL|METH_KEYWORDS, os__path_splitroot_ex__doc__},
+
+static PyObject *
+os__path_splitroot_ex_impl(PyObject *module, PyObject *path);
+
+static PyObject *
+os__path_splitroot_ex(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+    PyObject *return_value = NULL;
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+    #define NUM_KEYWORDS 1
+    static struct {
+        PyGC_Head _this_is_not_used;
+        PyObject_VAR_HEAD
+        PyObject *ob_item[NUM_KEYWORDS];
+    } _kwtuple = {
+        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+        .ob_item = { &_Py_ID(path), },
+    };
+    #undef NUM_KEYWORDS
+    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+    #else  // !Py_BUILD_CORE
+    #  define KWTUPLE NULL
+    #endif  // !Py_BUILD_CORE
+
+    static const char * const _keywords[] = {"path", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "_path_splitroot_ex",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[1];
+    PyObject *path;
+
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, 1, 1, 0, argsbuf);
+    if (!args) {
+        goto exit;
+    }
+    if (!PyUnicode_Check(args[0])) {
+        _PyArg_BadArgument("_path_splitroot_ex", "argument 'path'", "str", args[0]);
+        goto exit;
+    }
+    path = args[0];
+    return_value = os__path_splitroot_ex_impl(module, path);
+
+exit:
+    return return_value;
+}
+
 PyDoc_STRVAR(os__path_normpath__doc__,
 "_path_normpath($module, /, path)\n"
 "--\n"
@@ -12602,4 +12660,4 @@ os__supports_virtual_terminal(PyObject *module, PyObject *Py_UNUSED(ignored))
 #ifndef OS__SUPPORTS_VIRTUAL_TERMINAL_METHODDEF
     #define OS__SUPPORTS_VIRTUAL_TERMINAL_METHODDEF
 #endif /* !defined(OS__SUPPORTS_VIRTUAL_TERMINAL_METHODDEF) */
-/*[clinic end generated code: output=511f0788a6b90db0 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=c4698b47007cd6eb input=a9049054013a1b77]*/
index 5e54cf64cd563ecb9d7eb31bf4a2b483c4c4d440..c9d67ccbb8c9089a4e950a17d51ceae6daae8e91 100644 (file)
@@ -5467,6 +5467,49 @@ os__path_islink_impl(PyObject *module, PyObject *path)
 #endif /* MS_WINDOWS */
 
 
+/*[clinic input]
+os._path_splitroot_ex
+
+    path: unicode
+
+[clinic start generated code]*/
+
+static PyObject *
+os__path_splitroot_ex_impl(PyObject *module, PyObject *path)
+/*[clinic end generated code: output=de97403d3dfebc40 input=f1470e12d899f9ac]*/
+{
+    Py_ssize_t len, drvsize, rootsize;
+    PyObject *drv = NULL, *root = NULL, *tail = NULL, *result = NULL;
+
+    wchar_t *buffer = PyUnicode_AsWideCharString(path, &len);
+    if (!buffer) {
+        goto exit;
+    }
+
+    _Py_skiproot(buffer, len, &drvsize, &rootsize);
+    drv = PyUnicode_FromWideChar(buffer, drvsize);
+    if (drv == NULL) {
+        goto exit;
+    }
+    root = PyUnicode_FromWideChar(&buffer[drvsize], rootsize);
+    if (root == NULL) {
+        goto exit;
+    }
+    tail = PyUnicode_FromWideChar(&buffer[drvsize + rootsize],
+                                  len - drvsize - rootsize);
+    if (tail == NULL) {
+        goto exit;
+    }
+    result = Py_BuildValue("(OOO)", drv, root, tail);
+exit:
+    PyMem_Free(buffer);
+    Py_XDECREF(drv);
+    Py_XDECREF(root);
+    Py_XDECREF(tail);
+    return result;
+}
+
+
 /*[clinic input]
 os._path_normpath
 
@@ -16799,6 +16842,7 @@ static PyMethodDef posix_methods[] = {
     OS__FINDFIRSTFILE_METHODDEF
     OS__GETVOLUMEPATHNAME_METHODDEF
     OS__PATH_SPLITROOT_METHODDEF
+    OS__PATH_SPLITROOT_EX_METHODDEF
     OS__PATH_NORMPATH_METHODDEF
     OS_GETLOADAVG_METHODDEF
     OS_URANDOM_METHODDEF
index 882d3299575cf3b44dd4e1c5f92f243a46b992d8..54853ba2f75d9d088990e99c0de27bc44f0c1c3b 100644 (file)
@@ -2295,6 +2295,99 @@ PathCchCombineEx(wchar_t *buffer, size_t bufsize, const wchar_t *dirname,
 
 #endif /* defined(MS_WINDOWS_GAMES) && !defined(MS_WINDOWS_DESKTOP) */
 
+void
+_Py_skiproot(const wchar_t *path, Py_ssize_t size, Py_ssize_t *drvsize,
+             Py_ssize_t *rootsize)
+{
+    assert(drvsize);
+    assert(rootsize);
+#ifndef MS_WINDOWS
+#define IS_SEP(x) (*(x) == SEP)
+    *drvsize = 0;
+    if (!IS_SEP(&path[0])) {
+        // Relative path, e.g.: 'foo'
+        *rootsize = 0;
+    }
+    else if (!IS_SEP(&path[1]) || IS_SEP(&path[2])) {
+        // Absolute path, e.g.: '/foo', '///foo', '////foo', etc.
+        *rootsize = 1;
+    }
+    else {
+        // Precisely two leading slashes, e.g.: '//foo'. Implementation defined per POSIX, see
+        // https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap04.html#tag_04_13
+        *rootsize = 2;
+    }
+#undef IS_SEP
+#else
+    const wchar_t *pEnd = size >= 0 ? &path[size] : NULL;
+#define IS_END(x) (pEnd ? (x) == pEnd : !*(x))
+#define IS_SEP(x) (*(x) == SEP || *(x) == ALTSEP)
+#define SEP_OR_END(x) (IS_SEP(x) || IS_END(x))
+    if (IS_SEP(&path[0])) {
+        if (IS_SEP(&path[1])) {
+            // Device drives, e.g. \\.\device or \\?\device
+            // UNC drives, e.g. \\server\share or \\?\UNC\server\share
+            Py_ssize_t idx;
+            if (path[2] == L'?' && IS_SEP(&path[3]) &&
+                (path[4] == L'U' || path[4] == L'u') &&
+                (path[5] == L'N' || path[5] == L'n') &&
+                (path[6] == L'C' || path[6] == L'c') &&
+                IS_SEP(&path[7]))
+            {
+                idx = 8;
+            }
+            else {
+                idx = 2;
+            }
+            while (!SEP_OR_END(&path[idx])) {
+                idx++;
+            }
+            if (IS_END(&path[idx])) {
+                *drvsize = idx;
+                *rootsize = 0;
+            }
+            else {
+                idx++;
+                while (!SEP_OR_END(&path[idx])) {
+                    idx++;
+                }
+                *drvsize = idx;
+                if (IS_END(&path[idx])) {
+                    *rootsize = 0;
+                }
+                else {
+                    *rootsize = 1;
+                }
+            }
+        }
+        else {
+            // Relative path with root, e.g. \Windows
+            *drvsize = 0;
+            *rootsize = 1;
+        }
+    }
+    else if (!IS_END(&path[0]) && path[1] == L':') {
+        *drvsize = 2;
+        if (IS_SEP(&path[2])) {
+            // Absolute drive-letter path, e.g. X:\Windows
+            *rootsize = 1;
+        }
+        else {
+            // Relative path with drive, e.g. X:Windows
+            *rootsize = 0;
+        }
+    }
+    else {
+        // Relative path, e.g. Windows
+        *drvsize = 0;
+        *rootsize = 0;
+    }
+#undef SEP_OR_END
+#undef IS_SEP
+#undef IS_END
+#endif
+}
+
 // The caller must ensure "buffer" is big enough.
 static int
 join_relfile(wchar_t *buffer, size_t bufsize,
@@ -2411,49 +2504,39 @@ _Py_normpath_and_size(wchar_t *path, Py_ssize_t size, Py_ssize_t *normsize)
 #endif
 #define SEP_OR_END(x) (IS_SEP(x) || IS_END(x))
 
-    // Skip leading '.\'
     if (p1[0] == L'.' && IS_SEP(&p1[1])) {
+        // Skip leading '.\'
         path = &path[2];
-        while (IS_SEP(path) && !IS_END(path)) {
+        while (IS_SEP(path)) {
             path++;
         }
         p1 = p2 = minP2 = path;
         lastC = SEP;
     }
+    else {
+        Py_ssize_t drvsize, rootsize;
+        _Py_skiproot(path, size, &drvsize, &rootsize);
+        if (drvsize || rootsize) {
+            // Skip past root and update minP2
+            p1 = &path[drvsize + rootsize];
+#ifndef ALTSEP
+            p2 = p1;
+#else
+            for (; p2 < p1; ++p2) {
+                if (*p2 == ALTSEP) {
+                    *p2 = SEP;
+                }
+            }
+#endif
+            minP2 = p2 - 1;
+            lastC = *minP2;
 #ifdef MS_WINDOWS
-    // Skip past drive segment and update minP2
-    else if (p1[0] && p1[1] == L':') {
-        *p2++ = *p1++;
-        *p2++ = *p1++;
-        minP2 = p2;
-        lastC = L':';
-    }
-    // Skip past all \\-prefixed paths, including \\?\, \\.\,
-    // and network paths, including the first segment.
-    else if (IS_SEP(&p1[0]) && IS_SEP(&p1[1])) {
-        int sepCount = 2;
-        *p2++ = SEP;
-        *p2++ = SEP;
-        p1 += 2;
-        for (; !IS_END(p1) && sepCount; ++p1) {
-            if (IS_SEP(p1)) {
-                --sepCount;
-                *p2++ = lastC = SEP;
-            } else {
-                *p2++ = lastC = *p1;
+            if (lastC != SEP) {
+                minP2++;
             }
+#endif
         }
-        minP2 = p2 - 1;
-    }
-#else
-    // Skip past two leading SEPs
-    else if (IS_SEP(&p1[0]) && IS_SEP(&p1[1]) && !IS_SEP(&p1[2])) {
-        *p2++ = *p1++;
-        *p2++ = *p1++;
-        minP2 = p2 - 1;  // Absolute path has SEP at minP2
-        lastC = SEP;
     }
-#endif /* MS_WINDOWS */
 
     /* if pEnd is specified, check that. Else, check for null terminator */
     for (; !IS_END(p1); ++p1) {