pid: only take pidmap_lock once on alloc
author    Mateusz Guzik <mjguzik@gmail.com>
          Wed, 3 Dec 2025 09:28:51 +0000 (10:28 +0100)
committer Christian Brauner <brauner@kernel.org>
          Mon, 15 Dec 2025 13:33:38 +0000 (14:33 +0100)
When spawning and killing threads in separate processes in parallel, the
primary bottleneck on the stock kernel is pidmap_lock, largely because
of a back-to-back acquire in the common case. The patch fixes this by
taking the lock only once per allocation.
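
As an illustration, here is a minimal userspace sketch of the locking
pattern before and after, with a pthread mutex standing in for
pidmap_lock (the helpers and the LEVELS constant are hypothetical, not
kernel code):

#include <pthread.h>

#define LEVELS 1        /* common case: no nested pid namespaces */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Before: one acquire per level, then another acquire to publish. */
static void alloc_before(void)
{
        for (int i = 0; i < LEVELS; i++) {
                pthread_mutex_lock(&lock);
                /* allocate an ID for this level */
                pthread_mutex_unlock(&lock);
        }
        /* with LEVELS == 1 this is the back-to-back acquire */
        pthread_mutex_lock(&lock);
        /* publish the result */
        pthread_mutex_unlock(&lock);
}

/* After: validation and memory prep moved out, one acquire covers it all. */
static void alloc_after(void)
{
        /* prep work happens here, outside the lock */
        pthread_mutex_lock(&lock);
        for (int i = 0; i < LEVELS; i++) {
                /* allocate an ID for this level */
        }
        /* publish the result */
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        alloc_before();
        alloc_after();
        return 0;
}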

The performance improvement varies between reboots. When benchmarking
with 20 processes creating and killing threads in a loop, the unpatched
baseline hovers around 465k ops/s, while the patched kernel lands
anywhere between ~510k and ~560k ops/s depending on false sharing (which
I only minimally sanitized). So the win is at least 10% even if you are
unlucky.
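
A minimal sketch of that kind of benchmark (the duration and the op
accounting here are assumptions, not the author's actual harness):

#include <pthread.h>
#include <stdio.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#define NPROC   20
#define SECONDS 10

static void *thread_fn(void *arg) { return arg; }

static void worker(void)
{
        struct timespec start, now;
        unsigned long ops = 0;

        clock_gettime(CLOCK_MONOTONIC, &start);
        do {
                pthread_t t;
                if (pthread_create(&t, NULL, thread_fn, NULL))
                        break;
                pthread_join(t, NULL); /* one create+kill cycle = one op */
                ops++;
                clock_gettime(CLOCK_MONOTONIC, &now);
        } while (now.tv_sec - start.tv_sec < SECONDS);
        printf("%lu ops/s\n", ops / SECONDS);
}

int main(void)
{
        for (int i = 0; i < NPROC; i++) {
                if (fork() == 0) {
                        worker();
                        _exit(0);
                }
        }
        while (wait(NULL) > 0) /* reap all workers */
                ;
        return 0;
}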

The change also facilitated some cosmetic fixes.

It has an unintentional side effect of no longer issuing spurious
idr_preload() around idr_replace().
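
For context: idr_preload() pre-allocates memory for a subsequent
idr_alloc(), but idr_replace() only swaps the pointer stored in an
already-allocated slot and never allocates. The publish loop removed
below nonetheless sat in its own preload section (abridged from the
hunks that follow):

        idr_preload(GFP_KERNEL);
        spin_lock(&pidmap_lock);
        ...
        for ( ; upid >= pid->numbers; --upid) {
                /* Make the PID visible to find_pid_ns. */
                idr_replace(&upid->ns->idr, pid, upid->nr);
                ...
        }
        spin_unlock(&pidmap_lock);
        idr_preload_end();

With a single acquisition the publish step simply runs under the preload
taken for the allocation loop, so the dedicated pair disappears.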

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://patch.msgid.link/20251203092851.287617-3-mjguzik@gmail.com
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
kernel/pid.c

index a31771bc89c1a3ef7c9b4b3e28467b608975fbc4..f45ae56db7daa8445a3a824a5ea77a5f9c6ecec5 100644
@@ -159,58 +159,86 @@ void free_pids(struct pid **pids)
                        free_pid(pids[tmp]);
 }
 
-struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
-                     size_t set_tid_size)
+struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
+                     size_t arg_set_tid_size)
 {
+       int set_tid[MAX_PID_NS_LEVEL + 1] = {};
+       int pid_max[MAX_PID_NS_LEVEL + 1] = {};
        struct pid *pid;
        enum pid_type type;
        int i, nr;
        struct pid_namespace *tmp;
        struct upid *upid;
        int retval = -ENOMEM;
+       bool retried_preload;
 
        /*
-        * set_tid_size contains the size of the set_tid array. Starting at
+        * arg_set_tid_size contains the size of the arg_set_tid array. Starting at
         * the most nested currently active PID namespace it tells alloc_pid()
         * which PID to set for a process in that most nested PID namespace
-        * up to set_tid_size PID namespaces. It does not have to set the PID
-        * for a process in all nested PID namespaces but set_tid_size must
+        * up to arg_set_tid_size PID namespaces. It does not have to set the PID
+        * for a process in all nested PID namespaces but arg_set_tid_size must
         * never be greater than the current ns->level + 1.
         */
-       if (set_tid_size > ns->level + 1)
+       if (arg_set_tid_size > ns->level + 1)
                return ERR_PTR(-EINVAL);
 
+       /*
+        * Prep before we take locks:
+        *
+        * 1. allocate and fill in pid struct
+        */
        pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
        if (!pid)
                return ERR_PTR(retval);
 
-       tmp = ns;
+       get_pid_ns(ns);
        pid->level = ns->level;
+       refcount_set(&pid->count, 1);
+       spin_lock_init(&pid->lock);
+       for (type = 0; type < PIDTYPE_MAX; ++type)
+               INIT_HLIST_HEAD(&pid->tasks[type]);
+       init_waitqueue_head(&pid->wait_pidfd);
+       INIT_HLIST_HEAD(&pid->inodes);
 
-       for (i = ns->level; i >= 0; i--) {
-               int tid = 0;
-               int pid_max = READ_ONCE(tmp->pid_max);
+       /*
+        * 2. validate set_tid and do the checkpoint_restore_ns_capable() perm check
+        *
+        * This also stores the pid_max found at each level to make sure later
+        * code uses the same value.
+        */
+       for (tmp = ns, i = ns->level; i >= 0; i--) {
+               pid_max[ns->level - i] = READ_ONCE(tmp->pid_max);
 
-               if (set_tid_size) {
-                       tid = set_tid[ns->level - i];
+               if (arg_set_tid_size) {
+                       int tid = set_tid[ns->level - i] = arg_set_tid[ns->level - i];
 
                        retval = -EINVAL;
-                       if (tid < 1 || tid >= pid_max)
-                               goto out_free;
+                       if (tid < 1 || tid >= pid_max[ns->level - i])
+                               goto out_abort;
                        /*
                         * Also fail if a PID != 1 is requested and
                         * no PID 1 exists.
                         */
                        if (tid != 1 && !tmp->child_reaper)
-                               goto out_free;
+                               goto out_abort;
                        retval = -EPERM;
                        if (!checkpoint_restore_ns_capable(tmp->user_ns))
-                               goto out_free;
-                       set_tid_size--;
+                               goto out_abort;
+                       arg_set_tid_size--;
                }
 
-               idr_preload(GFP_KERNEL);
-               spin_lock(&pidmap_lock);
+               tmp = tmp->parent;
+       }
+
+       /*
+        * Prep is done, id allocation goes here:
+        */
+       retried_preload = false;
+       idr_preload(GFP_KERNEL);
+       spin_lock(&pidmap_lock);
+       for (tmp = ns, i = ns->level; i >= 0;) {
+               int tid = set_tid[ns->level - i];
 
                if (tid) {
                        nr = idr_alloc(&tmp->idr, NULL, tid,
@@ -220,6 +248,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
                         * already in use. Return EEXIST in that case.
                         */
                        if (nr == -ENOSPC)
                                nr = -EEXIST;
                } else {
                        int pid_min = 1;
@@ -235,19 +264,42 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
                         * a partially initialized PID (see below).
                         */
                        nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
-                                             pid_max, GFP_ATOMIC);
+                                             pid_max[ns->level - i], GFP_ATOMIC);
+                       if (nr == -ENOSPC)
+                               nr = -EAGAIN;
                }
-               spin_unlock(&pidmap_lock);
-               idr_preload_end();
 
-               if (nr < 0) {
-                       retval = (nr == -ENOSPC) ? -EAGAIN : nr;
+               if (unlikely(nr < 0)) {
+                       /*
+                        * Preload more memory if idr_alloc{,_cyclic} failed with -ENOMEM.
+                        *
+                        * The IDR API only allows us to preload memory for one call, while we may end
+                        * up doing several under pidmap_lock with GFP_ATOMIC. The situation may be
+                        * salvageable with GFP_KERNEL. But make sure to not loop indefinitely if preload
+                        * did not help (the routine unfortunately returns void, so we have no idea
+                        * if it got anywhere).
+                        *
+                        * The lock can be safely dropped and picked up as historically pid allocation
+                        * for different namespaces was *not* atomic -- we try to hold on to it the
+                        * entire time only for performance reasons.
+                        */
+                       if (nr == -ENOMEM && !retried_preload) {
+                               spin_unlock(&pidmap_lock);
+                               idr_preload_end();
+                               retried_preload = true;
+                               idr_preload(GFP_KERNEL);
+                               spin_lock(&pidmap_lock);
+                               continue;
+                       }
+                       retval = nr;
                        goto out_free;
                }
 
                pid->numbers[i].nr = nr;
                pid->numbers[i].ns = tmp;
                tmp = tmp->parent;
+               i--;
+               retried_preload = false;
        }
 
        /*
@@ -257,25 +309,15 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
         * is what we have exposed to userspace for a long time and it is
         * documented behavior for pid namespaces. So we can't easily
         * change it even if there were an error code better suited.
+        *
+        * This can't be done earlier because we need to preserve other
+        * error conditions.
         */
        retval = -ENOMEM;
-
-       get_pid_ns(ns);
-       refcount_set(&pid->count, 1);
-       spin_lock_init(&pid->lock);
-       for (type = 0; type < PIDTYPE_MAX; ++type)
-               INIT_HLIST_HEAD(&pid->tasks[type]);
-
-       init_waitqueue_head(&pid->wait_pidfd);
-       INIT_HLIST_HEAD(&pid->inodes);
-
-       upid = pid->numbers + ns->level;
-       idr_preload(GFP_KERNEL);
-       spin_lock(&pidmap_lock);
-       if (!(ns->pid_allocated & PIDNS_ADDING))
-               goto out_unlock;
+       if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
+               goto out_free;
        pidfs_add_pid(pid);
-       for ( ; upid >= pid->numbers; --upid) {
+       for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
                /* Make the PID visible to find_pid_ns. */
                idr_replace(&upid->ns->idr, pid, upid->nr);
                upid->ns->pid_allocated++;
@@ -286,13 +328,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 
        return pid;
 
-out_unlock:
-       spin_unlock(&pidmap_lock);
-       idr_preload_end();
-       put_pid_ns(ns);
-
 out_free:
-       spin_lock(&pidmap_lock);
        while (++i <= ns->level) {
                upid = pid->numbers + i;
                idr_remove(&upid->ns->idr, upid->nr);
@@ -303,7 +339,10 @@ out_free:
                idr_set_cursor(&ns->idr, 0);
 
        spin_unlock(&pidmap_lock);
+       idr_preload_end();
 
+out_abort:
+       put_pid_ns(ns);
        kmem_cache_free(ns->pid_cachep, pid);
        return ERR_PTR(retval);
 }
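
The -ENOMEM retry in the allocation loop above follows a one-shot "drop
lock, refill, retake, retry" pattern. A userspace analogue (all names
hypothetical; a counter stands in for the idr_preload() per-CPU cache):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int reserve;

/* Fails with -ENOMEM when the reserve is empty (like GFP_ATOMIC failing). */
static int try_alloc_locked(void)
{
        if (reserve == 0)
                return -ENOMEM;
        reserve--;
        return 0;
}

/* May block, so it must be called without the lock held (like GFP_KERNEL). */
static void refill_reserve(void)
{
        reserve = 1;
}

static int alloc_with_retry(void)
{
        bool retried = false;
        int ret;

        pthread_mutex_lock(&lock);
        for (;;) {
                ret = try_alloc_locked();
                if (ret != -ENOMEM || retried)
                        break;
                /* The refill may block: drop the lock and retry exactly once. */
                pthread_mutex_unlock(&lock);
                refill_reserve();
                retried = true;
                pthread_mutex_lock(&lock);
        }
        pthread_mutex_unlock(&lock);
        return ret;
}

int main(void)
{
        return alloc_with_retry();
}

As in the kernel code, dropping the lock mid-sequence is safe; it is held
across all levels only to avoid the repeated acquire in the common case.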