pid: only take pidmap_lock once on alloc
author    Mateusz Guzik <mjguzik@gmail.com>
          Wed, 3 Dec 2025 09:28:51 +0000 (10:28 +0100)
committer Christian Brauner <brauner@kernel.org>
          Mon, 15 Dec 2025 13:33:38 +0000 (14:33 +0100)
When spawning and killing threads in separate processes in parallel, the
primary bottleneck on the stock kernel is pidmap_lock, largely because
of a back-to-back acquire in the common case. The patch fixes this by
taking the lock only once per allocation.
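
As an illustration, here is a minimal userspace sketch of the locking
pattern before and after, with a pthread mutex standing in for
pidmap_lock (the helpers and the LEVELS constant are hypothetical, not
kernel code):

#include <pthread.h>

#define LEVELS 1        /* common case: no nested pid namespaces */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Before: one acquire per level, then another acquire to publish. */
static void alloc_before(void)
{
        for (int i = 0; i < LEVELS; i++) {
                pthread_mutex_lock(&lock);
                /* allocate an ID for this level */
                pthread_mutex_unlock(&lock);
        }
        /* with LEVELS == 1 this is the back-to-back acquire */
        pthread_mutex_lock(&lock);
        /* publish the result */
        pthread_mutex_unlock(&lock);
}

/* After: validation and memory prep moved out, one acquire covers it all. */
static void alloc_after(void)
{
        /* prep work happens here, outside the lock */
        pthread_mutex_lock(&lock);
        for (int i = 0; i < LEVELS; i++) {
                /* allocate an ID for this level */
        }
        /* publish the result */
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        alloc_before();
        alloc_after();
        return 0;
}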

The performance improvement varies between reboots. When benchmarking
with 20 processes creating and killing threads in a loop, the unpatched
baseline hovers around 465k ops/s, while the patched kernel lands
anywhere between ~510k and ~560k ops/s depending on false sharing (which
I only minimally sanitized). So the win is at least 10% even if you are
unlucky.
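
A minimal sketch of that kind of benchmark (the duration and the op
accounting here are assumptions, not the author's actual harness):

#include <pthread.h>
#include <stdio.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#define NPROC   20
#define SECONDS 10

static void *thread_fn(void *arg) { return arg; }

static void worker(void)
{
        struct timespec start, now;
        unsigned long ops = 0;

        clock_gettime(CLOCK_MONOTONIC, &start);
        do {
                pthread_t t;
                if (pthread_create(&t, NULL, thread_fn, NULL))
                        break;
                pthread_join(t, NULL); /* one create+kill cycle = one op */
                ops++;
                clock_gettime(CLOCK_MONOTONIC, &now);
        } while (now.tv_sec - start.tv_sec < SECONDS);
        printf("%lu ops/s\n", ops / SECONDS);
}

int main(void)
{
        for (int i = 0; i < NPROC; i++) {
                if (fork() == 0) {
                        worker();
                        _exit(0);
                }
        }
        while (wait(NULL) > 0) /* reap all workers */
                ;
        return 0;
}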

The change also facilitated some cosmetic fixes.

It has an unintentional side effect of no longer issuing spurious
idr_preload() around idr_replace().
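
For context: idr_preload() pre-allocates memory for a subsequent
idr_alloc(), but idr_replace() only swaps the pointer stored in an
already-allocated slot and never allocates. The publish loop removed
below nonetheless sat in its own preload section (abridged from the
hunks that follow):

        idr_preload(GFP_KERNEL);
        spin_lock(&pidmap_lock);
        ...
        for ( ; upid >= pid->numbers; --upid) {
                /* Make the PID visible to find_pid_ns. */
                idr_replace(&upid->ns->idr, pid, upid->nr);
                ...
        }
        spin_unlock(&pidmap_lock);
        idr_preload_end();

With a single acquisition the publish step simply runs under the preload
taken for the allocation loop, so the dedicated pair disappears.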

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251203092851.287617-3-mjguzik@gmail.com
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
kernel/pid.c

index a31771bc89c1a3ef7c9b4b3e28467b608975fbc4..f45ae56db7daa8445a3a824a5ea77a5f9c6ecec5 100644
@@ -159,58 +159,86 @@ void free_pids(struct pid **pids)
                        free_pid(pids[tmp]);
 }
 
-struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
-                     size_t set_tid_size)
+struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
+                     size_t arg_set_tid_size)
 {
+       int set_tid[MAX_PID_NS_LEVEL + 1] = {};
+       int pid_max[MAX_PID_NS_LEVEL + 1] = {};
        struct pid *pid;
        enum pid_type type;
        int i, nr;
        struct pid_namespace *tmp;
        struct upid *upid;
        int retval = -ENOMEM;
+       bool retried_preload;
 
        /*
-        * set_tid_size contains the size of the set_tid array. Starting at
+        * arg_set_tid_size contains the size of the arg_set_tid array. Starting at
         * the most nested currently active PID namespace it tells alloc_pid()
         * which PID to set for a process in that most nested PID namespace
-        * up to set_tid_size PID namespaces. It does not have to set the PID
-        * for a process in all nested PID namespaces but set_tid_size must
+        * up to arg_set_tid_size PID namespaces. It does not have to set the PID
+        * for a process in all nested PID namespaces but arg_set_tid_size must
         * never be greater than the current ns->level + 1.
         */
-       if (set_tid_size > ns->level + 1)
+       if (arg_set_tid_size > ns->level + 1)
                return ERR_PTR(-EINVAL);
 
+       /*
+        * Prep before we take locks:
+        *
+        * 1. allocate and fill in pid struct
+        */
        pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
        if (!pid)
                return ERR_PTR(retval);
 
-       tmp = ns;
+       get_pid_ns(ns);
        pid->level = ns->level;
+       refcount_set(&pid->count, 1);
+       spin_lock_init(&pid->lock);
+       for (type = 0; type < PIDTYPE_MAX; ++type)
+               INIT_HLIST_HEAD(&pid->tasks[type]);
+       init_waitqueue_head(&pid->wait_pidfd);
+       INIT_HLIST_HEAD(&pid->inodes);
 
-       for (i = ns->level; i >= 0; i--) {
-               int tid = 0;
-               int pid_max = READ_ONCE(tmp->pid_max);
+       /*
+        * 2. validate set_tid and do the checkpoint_restore_ns_capable() perm check
+        *
+        * This also stores the pid_max found at each level to make sure later
+        * code uses the same value.
+        */
+       for (tmp = ns, i = ns->level; i >= 0; i--) {
+               pid_max[ns->level - i] = READ_ONCE(tmp->pid_max);
 
-               if (set_tid_size) {
-                       tid = set_tid[ns->level - i];
+               if (arg_set_tid_size) {
+                       int tid = set_tid[ns->level - i] = arg_set_tid[ns->level - i];
 
                        retval = -EINVAL;
-                       if (tid < 1 || tid >= pid_max)
-                               goto out_free;
+                       if (tid < 1 || tid >= pid_max[ns->level - i])
+                               goto out_abort;
                        /*
                         * Also fail if a PID != 1 is requested and
                         * no PID 1 exists.
                         */
                        if (tid != 1 && !tmp->child_reaper)
-                               goto out_free;
+                               goto out_abort;
                        retval = -EPERM;
                        if (!checkpoint_restore_ns_capable(tmp->user_ns))
-                               goto out_free;
-                       set_tid_size--;
+                               goto out_abort;
+                       arg_set_tid_size--;
                }
 
-               idr_preload(GFP_KERNEL);
-               spin_lock(&pidmap_lock);
+               tmp = tmp->parent;
+       }
+
+       /*
+        * Prep is done, id allocation goes here:
+        */
+       retried_preload = false;
+       idr_preload(GFP_KERNEL);
+       spin_lock(&pidmap_lock);
+       for (tmp = ns, i = ns->level; i >= 0;) {
+               int tid = set_tid[ns->level - i];
 
                if (tid) {
                        nr = idr_alloc(&tmp->idr, NULL, tid,
@@ -220,6 +248,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
                         * already in use. Return EEXIST in that case.
                         */
                        if (nr == -ENOSPC)
                                nr = -EEXIST;
                } else {
                        int pid_min = 1;
@@ -235,19 +264,42 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
                         * a partially initialized PID (see below).
                         */
                        nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
-                                             pid_max, GFP_ATOMIC);
+                                             pid_max[ns->level - i], GFP_ATOMIC);
+                       if (nr == -ENOSPC)
+                               nr = -EAGAIN;
                }
-               spin_unlock(&pidmap_lock);
-               idr_preload_end();
 
-               if (nr < 0) {
-                       retval = (nr == -ENOSPC) ? -EAGAIN : nr;
+               if (unlikely(nr < 0)) {
+                       /*
+                        * Preload more memory if idr_alloc{,_cyclic} failed with -ENOMEM.
+                        *
+                        * The IDR API only allows us to preload memory for one call, while we may end
+                        * up doing several under pidmap_lock with GFP_ATOMIC. The situation may be
+                        * salvageable with GFP_KERNEL. But make sure to not loop indefinitely if preload
+                        * did not help (the routine unfortunately returns void, so we have no idea
+                        * if it got anywhere).
+                        *
+                        * The lock can be safely dropped and picked up as historically pid allocation
+                        * for different namespaces was *not* atomic -- we try to hold on to it the
+                        * entire time only for performance reasons.
+                        */
+                       if (nr == -ENOMEM && !retried_preload) {
+                               spin_unlock(&pidmap_lock);
+                               idr_preload_end();
+                               retried_preload = true;
+                               idr_preload(GFP_KERNEL);
+                               spin_lock(&pidmap_lock);
+                               continue;
+                       }
+                       retval = nr;
                        goto out_free;
                }
 
                pid->numbers[i].nr = nr;
                pid->numbers[i].ns = tmp;
                tmp = tmp->parent;
+               i--;
+               retried_preload = false;
        }
 
        /*
@@ -257,25 +309,15 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
         * is what we have exposed to userspace for a long time and it is
         * documented behavior for pid namespaces. So we can't easily
         * change it even if there were an error code better suited.
+        *
+        * This can't be done earlier because we need to preserve other
+        * error conditions.
         */
        retval = -ENOMEM;
-
-       get_pid_ns(ns);
-       refcount_set(&pid->count, 1);
-       spin_lock_init(&pid->lock);
-       for (type = 0; type < PIDTYPE_MAX; ++type)
-               INIT_HLIST_HEAD(&pid->tasks[type]);
-
-       init_waitqueue_head(&pid->wait_pidfd);
-       INIT_HLIST_HEAD(&pid->inodes);
-
-       upid = pid->numbers + ns->level;
-       idr_preload(GFP_KERNEL);
-       spin_lock(&pidmap_lock);
-       if (!(ns->pid_allocated & PIDNS_ADDING))
-               goto out_unlock;
+       if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
+               goto out_free;
        pidfs_add_pid(pid);
-       for ( ; upid >= pid->numbers; --upid) {
+       for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
                /* Make the PID visible to find_pid_ns. */
                idr_replace(&upid->ns->idr, pid, upid->nr);
                upid->ns->pid_allocated++;
@@ -286,13 +328,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 
        return pid;
 
-out_unlock:
-       spin_unlock(&pidmap_lock);
-       idr_preload_end();
-       put_pid_ns(ns);
-
 out_free:
-       spin_lock(&pidmap_lock);
        while (++i <= ns->level) {
                upid = pid->numbers + i;
                idr_remove(&upid->ns->idr, upid->nr);
@@ -303,7 +339,10 @@ out_free:
                idr_set_cursor(&ns->idr, 0);
 
        spin_unlock(&pidmap_lock);
+       idr_preload_end();
 
+out_abort:
+       put_pid_ns(ns);
        kmem_cache_free(ns->pid_cachep, pid);
        return ERR_PTR(retval);
 }
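
The -ENOMEM retry in the allocation loop above follows a one-shot "drop
lock, refill, retake, retry" pattern. A userspace analogue (all names
hypothetical; a counter stands in for the idr_preload() per-CPU cache):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int reserve;

/* Fails with -ENOMEM when the reserve is empty (like GFP_ATOMIC failing). */
static int try_alloc_locked(void)
{
        if (reserve == 0)
                return -ENOMEM;
        reserve--;
        return 0;
}

/* May block, so it must be called without the lock held (like GFP_KERNEL). */
static void refill_reserve(void)
{
        reserve = 1;
}

static int alloc_with_retry(void)
{
        bool retried = false;
        int ret;

        pthread_mutex_lock(&lock);
        for (;;) {
                ret = try_alloc_locked();
                if (ret != -ENOMEM || retried)
                        break;
                /* The refill may block: drop the lock and retry exactly once. */
                pthread_mutex_unlock(&lock);
                refill_reserve();
                retried = true;
                pthread_mutex_lock(&lock);
        }
        pthread_mutex_unlock(&lock);
        return ret;
}

int main(void)
{
        return alloc_with_retry();
}

As in the kernel code, dropping the lock mid-sequence is safe; it is held
across all levels only to avoid the repeated acquire in the common case.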