cgroup: Wait for dying tasks to leave on rmdir

author Tejun Heo <tj@kernel.org>

Tue, 24 Mar 2026 20:21:25 +0000 (10:21 -1000)

committer Tejun Heo <tj@kernel.org>

Tue, 24 Mar 2026 20:21:40 +0000 (10:21 -1000)
author Tejun Heo <tj@kernel.org>
Tue, 24 Mar 2026 20:21:25 +0000 (10:21 -1000)
committer Tejun Heo <tj@kernel.org>
Tue, 24 Mar 2026 20:21:40 +0000 (10:21 -1000)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h

index bb92f5c169ca2dcfc9354623e34968f6449854b1..7f87399938fa2bf8076966510e646e53f7ff2dc3 100644 (file)
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -609,6 +609,9 @@ struct cgroup {
         /* used to wait for offlining of csses */
         wait_queue_head_t offline_waitq;
  
+       /* used by cgroup_rmdir() to wait for dying tasks to leave */
+       wait_queue_head_t dying_populated_waitq;
+
         /* used to schedule release agent */
         struct work_struct release_agent_work;
  
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c

index 01fc2a93f3ef229cfb4d5038df9cd2b42ce57439..2163054e1aa65f47fb7ad7db665bab58d2deaaba 100644 (file)
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2126,6 +2126,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
  #endif
  
         init_waitqueue_head(&cgrp->offline_waitq);
+       init_waitqueue_head(&cgrp->dying_populated_waitq);
         INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
  }
  
@@ -6224,6 +6225,76 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         return 0;
  };
  
+/**
+ * cgroup_drain_dying - wait for dying tasks to leave before rmdir
+ * @cgrp: the cgroup being removed
+ *
+ * The PF_EXITING filter in css_task_iter_advance() hides exiting tasks from
+ * cgroup.procs so that userspace (e.g. systemd) doesn't see tasks that have
+ * already been reaped via waitpid(). However, nr_populated_csets is only
+ * decremented when the task later passes through cgroup_task_dead() in
+ * finish_task_switch(). In that window cgroup.procs appears empty but the
+ * cgroup still counts its tasks, causing rmdir to fail with -EBUSY.
+ *
+ * This function bridges that gap. If @cgrp has tasks but all of them have
+ * PF_EXITING set, wait for cgroup_task_dead() to process them. The window
+ * between PF_EXITING and cgroup_task_dead() is short, so the wait is brief.
+ * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we
+ * retry the full check from scratch.
+ *
+ * Only @cgrp's own tasks are considered, hence cgroup_has_tasks() rather than
+ * cgroup_is_populated(). The latter also counts populated descendants, whose
+ * tasks are neither visible to the iterator below nor kick @cgrp's waitqueue,
+ * so waiting on it would sleep forever on a cgroup with a populated child.
+ *
+ * Must be called with cgroup_mutex held; drops and retakes it while sleeping.
+ *
+ * Return: 0 if @cgrp has no tasks left, -EBUSY if a live task is visible.
+ */
+static int cgroup_drain_dying(struct cgroup *cgrp)
+       __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
+{
+       struct css_task_iter it;
+       struct task_struct *task;
+       DEFINE_WAIT(wait);
+
+       lockdep_assert_held(&cgroup_mutex);
+retry:
+       if (!cgroup_has_tasks(cgrp))
+               return 0;
+
+       /* Same iterator as cgroup.threads - if any task is visible, it's busy */
+       css_task_iter_start(&cgrp->self, 0, &it);
+       task = css_task_iter_next(&it);
+       css_task_iter_end(&it);
+
+       if (task)
+               return -EBUSY;
+
+       /*
+        * All remaining tasks are PF_EXITING and will pass through
+        * cgroup_task_dead() shortly. Wait for a kick and retry.
+        *
+        * cgroup_has_tasks() can't go from false to true while we hold
+        * cgroup_mutex, but goes from true to false under css_set_lock (via
+        * cgroup_task_dead()). Retest and prepare_to_wait() under css_set_lock
+        * so the transition can't slip in between and leave us unwoken.
+        */
+       spin_lock_irq(&css_set_lock);
+       if (!cgroup_has_tasks(cgrp)) {
+               spin_unlock_irq(&css_set_lock);
+               return 0;
+       }
+       prepare_to_wait(&cgrp->dying_populated_waitq, &wait,
+                       TASK_UNINTERRUPTIBLE);
+       spin_unlock_irq(&css_set_lock);
+       mutex_unlock(&cgroup_mutex);
+       schedule();
+       finish_wait(&cgrp->dying_populated_waitq, &wait);
+       mutex_lock(&cgroup_mutex);
+       goto retry;
+}
+
  int cgroup_rmdir(struct kernfs_node *kn)
  {
         struct cgroup *cgrp;
@@ -6233,9 +6304,12 @@ int cgroup_rmdir(struct kernfs_node *kn)
         if (!cgrp)
                 return 0;
  
-       ret = cgroup_destroy_locked(cgrp);
-       if (!ret)
-               TRACE_CGROUP_PATH(rmdir, cgrp);
+       ret = cgroup_drain_dying(cgrp);
+       if (!ret) {
+               ret = cgroup_destroy_locked(cgrp);
+               if (!ret)
+                       TRACE_CGROUP_PATH(rmdir, cgrp);
+       }
  
         cgroup_kn_unlock(kn);
         return ret;
@@ -6995,6 +7069,7 @@ void cgroup_task_exit(struct task_struct *tsk)
  
  static void do_cgroup_task_dead(struct task_struct *tsk)
  {
+       struct cgrp_cset_link *link;
         struct css_set *cset;
         unsigned long flags;
  
@@ -7008,6 +7083,11 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
         if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
                 list_add_tail(&tsk->cg_list, &cset->dying_tasks);
  
+       /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */
+       list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
+               if (waitqueue_active(&link->cgrp->dying_populated_waitq))
+                       wake_up(&link->cgrp->dying_populated_waitq);
+
         if (dl_task(tsk))
                 dec_dl_tasks_cs(tsk);
author	Tejun Heo <tj@kernel.org>
	Tue, 24 Mar 2026 20:21:25 +0000 (10:21 -1000)
committer	Tejun Heo <tj@kernel.org>
	Tue, 24 Mar 2026 20:21:40 +0000 (10:21 -1000)
include/linux/cgroup-defs.h		patch \| blob \| blame \| history
kernel/cgroup/cgroup.c		patch \| blob \| blame \| history