cgroup: Defer css percpu_ref kill on rmdir until cgroup is depopulated

author Tejun Heo <tj@kernel.org>

Fri, 1 May 2026 18:31:22 +0000 (08:31 -1000)

committer Tejun Heo <tj@kernel.org>

Mon, 4 May 2026 18:52:26 +0000 (08:52 -1000)
author Tejun Heo <tj@kernel.org>
Fri, 1 May 2026 18:31:22 +0000 (08:31 -1000)
committer Tejun Heo <tj@kernel.org>
Mon, 4 May 2026 18:52:26 +0000 (08:52 -1000)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h

index f42563739d2e53670da74c4948046f3c5c9a0bc9..50a784da7a81aa3c826fcd7d85257c70e29ea814 100644 (file)
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -611,8 +611,8 @@ struct cgroup {
         /* used to wait for offlining of csses */
         wait_queue_head_t offline_waitq;
  
-       /* used by cgroup_rmdir() to wait for dying tasks to leave */
-       wait_queue_head_t dying_populated_waitq;
+       /* defers killing csses after removal until cgroup is depopulated */
+       struct work_struct finish_destroy_work;
  
         /* used to schedule release agent */
         struct work_struct release_agent_work;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c

index c928dea9dea6bde8efdff09d15af36a5c4540e6a..bd10a7e2f9c550564e50d498dde2b2ff1508004b 100644 (file)
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -264,10 +264,12 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
  static void css_task_iter_skip(struct css_task_iter *it,
                                struct task_struct *task);
  static int cgroup_destroy_locked(struct cgroup *cgrp);
+static void cgroup_finish_destroy(struct cgroup *cgrp);
+static void kill_css_sync(struct cgroup_subsys_state *css);
+static void kill_css_finish(struct cgroup_subsys_state *css);
  static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
                                               struct cgroup_subsys *ss);
  static void css_release(struct percpu_ref *ref);
-static void kill_css(struct cgroup_subsys_state *css);
  static int cgroup_addrm_files(struct cgroup_subsys_state *css,
                               struct cgroup *cgrp, struct cftype cfts[],
                               bool is_add);
@@ -797,6 +799,16 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
                 if (was_populated == cgroup_is_populated(cgrp))
                         break;
  
+               /*
+                * @cgrp is offline and its subtree just emptied. Kick the
+                * deferred destroy (work fn drops the ref). One-shot.
+                */
+               if (was_populated && !css_is_online(&cgrp->self)) {
+                       cgroup_get(cgrp);
+                       WARN_ON_ONCE(!queue_work(cgroup_offline_wq,
+                                                &cgrp->finish_destroy_work));
+               }
+
                 cgroup1_check_for_release(cgrp);
                 TRACE_CGROUP_PATH(notify_populated, cgrp,
                                   cgroup_is_populated(cgrp));
@@ -2039,6 +2051,16 @@ static int cgroup_reconfigure(struct fs_context *fc)
         return 0;
  }
  
+static void cgroup_finish_destroy_work_fn(struct work_struct *work)
+{
+       struct cgroup *cgrp = container_of(work, struct cgroup, finish_destroy_work);
+
+       cgroup_lock();
+       cgroup_finish_destroy(cgrp);
+       cgroup_unlock();
+       cgroup_put(cgrp);
+}
+
  static void init_cgroup_housekeeping(struct cgroup *cgrp)
  {
         struct cgroup_subsys *ss;
@@ -2065,7 +2087,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
  #endif
  
         init_waitqueue_head(&cgrp->offline_waitq);
-       init_waitqueue_head(&cgrp->dying_populated_waitq);
+       INIT_WORK(&cgrp->finish_destroy_work, cgroup_finish_destroy_work_fn);
         INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
  }
  
@@ -3375,7 +3397,8 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp)
  
                         if (css->parent &&
                             !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
-                               kill_css(css);
+                               kill_css_sync(css);
+                               kill_css_finish(css);
                         } else if (!css_visible(css)) {
                                 css_clear_dir(css);
                                 if (ss->css_reset)
@@ -5514,7 +5537,7 @@ static struct cftype cgroup_psi_files[] = {
   * css destruction is four-stage process.
   *
   * 1. Destruction starts.  Killing of the percpu_ref is initiated.
- *    Implemented in kill_css().
+ *    Implemented in kill_css_finish().
   *
   * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
   *    and thus css_tryget_online() is guaranteed to fail, the css can be
@@ -5993,7 +6016,7 @@ out_unlock:
  /*
   * This is called when the refcnt of a css is confirmed to be killed.
   * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
- * initiate destruction and put the css ref from kill_css().
+ * initiate destruction and put the css ref from kill_css_finish().
   */
  static void css_killed_work_fn(struct work_struct *work)
  {
@@ -6025,15 +6048,12 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
  }
  
  /**
- * kill_css - destroy a css
- * @css: css to destroy
+ * kill_css_sync - synchronous half of css teardown
+ * @css: css being killed
   *
- * This function initiates destruction of @css by removing cgroup interface
- * files and putting its base reference.  ->css_offline() will be invoked
- * asynchronously once css_tryget_online() is guaranteed to fail and when
- * the reference count reaches zero, @css will be released.
+ * See cgroup_destroy_locked().
   */
-static void kill_css(struct cgroup_subsys_state *css)
+static void kill_css_sync(struct cgroup_subsys_state *css)
  {
         struct cgroup_subsys *ss = css->ss;
  
@@ -6056,24 +6076,6 @@ static void kill_css(struct cgroup_subsys_state *css)
          */
         css_clear_dir(css);
  
-       /*
-        * Killing would put the base ref, but we need to keep it alive
-        * until after ->css_offline().
-        */
-       css_get(css);
-
-       /*
-        * cgroup core guarantees that, by the time ->css_offline() is
-        * invoked, no new css reference will be given out via
-        * css_tryget_online().  We can't simply call percpu_ref_kill() and
-        * proceed to offlining css's because percpu_ref_kill() doesn't
-        * guarantee that the ref is seen as killed on all CPUs on return.
-        *
-        * Use percpu_ref_kill_and_confirm() to get notifications as each
-        * css is confirmed to be seen as killed on all CPUs.
-        */
-       percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
-
         css->cgroup->nr_dying_subsys[ss->id]++;
         /*
          * Parent css and cgroup cannot be freed until after the freeing
@@ -6086,44 +6088,88 @@ static void kill_css(struct cgroup_subsys_state *css)
  }
  
  /**
- * cgroup_destroy_locked - the first stage of cgroup destruction
+ * kill_css_finish - deferred half of css teardown
+ * @css: css being killed
+ *
+ * See cgroup_destroy_locked().
+ */
+static void kill_css_finish(struct cgroup_subsys_state *css)
+{
+       lockdep_assert_held(&cgroup_mutex);
+
+       /*
+        * Skip on re-entry: cgroup_apply_control_disable() may have killed @css
+        * earlier. cgroup_destroy_locked() can still walk it because
+        * offline_css() (which NULLs cgrp->subsys[ssid]) runs async.
+        */
+       if (percpu_ref_is_dying(&css->refcnt))
+               return;
+
+       /*
+        * Killing would put the base ref, but we need to keep it alive until
+        * after ->css_offline().
+        */
+       css_get(css);
+
+       /*
+        * cgroup core guarantees that, by the time ->css_offline() is invoked,
+        * no new css reference will be given out via css_tryget_online(). We
+        * can't simply call percpu_ref_kill() and proceed to offlining css's
+        * because percpu_ref_kill() doesn't guarantee that the ref is seen as
+        * killed on all CPUs on return.
+        *
+        * Use percpu_ref_kill_and_confirm() to get notifications as each css is
+        * confirmed to be seen as killed on all CPUs.
+        */
+       percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
+}
+
+/**
+ * cgroup_destroy_locked - destroy @cgrp (called on rmdir)
   * @cgrp: cgroup to be destroyed
   *
- * css's make use of percpu refcnts whose killing latency shouldn't be
- * exposed to userland and are RCU protected.  Also, cgroup core needs to
- * guarantee that css_tryget_online() won't succeed by the time
- * ->css_offline() is invoked.  To satisfy all the requirements,
- * destruction is implemented in the following two steps.
- *
- * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
- *     userland visible parts and start killing the percpu refcnts of
- *     css's.  Set up so that the next stage will be kicked off once all
- *     the percpu refcnts are confirmed to be killed.
- *
- * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
- *     rest of destruction.  Once all cgroup references are gone, the
- *     cgroup is RCU-freed.
- *
- * This function implements s1.  After this step, @cgrp is gone as far as
- * the userland is concerned and a new cgroup with the same name may be
- * created.  As cgroup doesn't care about the names internally, this
- * doesn't cause any problem.
+ * Tear down @cgrp on behalf of rmdir. Constraints:
+ *
+ * - Userspace: rmdir must succeed when cgroup.procs and friends are empty.
+ *
+ * - Kernel: subsystem ->css_offline() must not run while any task in @cgrp's
+ *   subtree is still doing kernel work. A task hidden from cgroup.procs (past
+ *   exit_signals() with signal->live cleared) can still schedule, allocate, and
+ *   consume resources until its final context switch. Dying descendants in the
+ *   subtree can host such tasks too.
+ *
+ * - Kernel: css_tryget_online() must fail by the time ->css_offline() runs.
+ *
+ * The destruction runs in three parts:
+ *
+ * - This function: synchronous user-visible state teardown plus kill_css_sync()
+ *   on each subsystem css.
+ *
+ * - cgroup_finish_destroy(): kicks the percpu_ref kill via kill_css_finish() on
+ *   each subsystem css. Fires once @cgrp's subtree is fully drained, either
+ *   inline here or via the work item cgroup_update_populated() queues.
+ *
+ * - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn ->
+ *   ->css_offline() -> release/free.
+ *
+ * Return: 0 on success, -EBUSY if a userspace-visible task or an online child
+ * remains.
   */
  static int cgroup_destroy_locked(struct cgroup *cgrp)
-       __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
  {
         struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
         struct cgroup_subsys_state *css;
         struct cgrp_cset_link *link;
+       struct css_task_iter it;
+       struct task_struct *task;
         int ssid, ret;
  
         lockdep_assert_held(&cgroup_mutex);
  
-       /*
-        * Only migration can raise populated from zero and we're already
-        * holding cgroup_mutex.
-        */
-       if (cgroup_is_populated(cgrp))
+       css_task_iter_start(&cgrp->self, 0, &it);
+       task = css_task_iter_next(&it);
+       css_task_iter_end(&it);
+       if (task)
                 return -EBUSY;
  
         /*
@@ -6147,9 +6193,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
                 link->cset->dead = true;
         spin_unlock_irq(&css_set_lock);
  
-       /* initiate massacre of all css's */
         for_each_css(css, ssid, cgrp)
-               kill_css(css);
+               kill_css_sync(css);
  
         /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
         css_clear_dir(&cgrp->self);
@@ -6180,79 +6225,27 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         /* put the base reference */
         percpu_ref_kill(&cgrp->self.refcnt);
  
+       if (!cgroup_is_populated(cgrp))
+               cgroup_finish_destroy(cgrp);
+
         return 0;
  };
  
  /**
- * cgroup_drain_dying - wait for dying tasks to leave before rmdir
- * @cgrp: the cgroup being removed
- *
- * cgroup.procs and cgroup.threads use css_task_iter which filters out
- * PF_EXITING tasks so that userspace doesn't see tasks that have already been
- * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the
- * cgroup has non-empty css_sets - is only updated when dying tasks pass through
- * cgroup_task_dead() in finish_task_switch(). This creates a window where
- * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir
- * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no
- * tasks.
- *
- * This function aligns cgroup_has_tasks() with what userspace can observe. If
- * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are
- * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the
- * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief.
+ * cgroup_finish_destroy - deferred half of @cgrp destruction
+ * @cgrp: cgroup whose subtree just became empty
   *
- * This function only concerns itself with this cgroup's own dying tasks.
- * Whether the cgroup has children is cgroup_destroy_locked()'s problem.
- *
- * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we
- * retry the full check from scratch.
- *
- * Must be called with cgroup_mutex held.
+ * See cgroup_destroy_locked() for the rationale.
   */
-static int cgroup_drain_dying(struct cgroup *cgrp)
-       __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
+static void cgroup_finish_destroy(struct cgroup *cgrp)
  {
-       struct css_task_iter it;
-       struct task_struct *task;
-       DEFINE_WAIT(wait);
+       struct cgroup_subsys_state *css;
+       int ssid;
  
         lockdep_assert_held(&cgroup_mutex);
-retry:
-       if (!cgroup_has_tasks(cgrp))
-               return 0;
-
-       /* Same iterator as cgroup.threads - if any task is visible, it's busy */
-       css_task_iter_start(&cgrp->self, 0, &it);
-       task = css_task_iter_next(&it);
-       css_task_iter_end(&it);
-
-       if (task)
-               return -EBUSY;
  
-       /*
-        * All remaining tasks are PF_EXITING and will pass through
-        * cgroup_task_dead() shortly. Wait for a kick and retry.
-        *
-        * cgroup_has_tasks() can't transition from false to true while we're
-        * holding cgroup_mutex, but the true to false transition happens
-        * under css_set_lock (via cgroup_task_dead()). We must retest and
-        * prepare_to_wait() under css_set_lock. Otherwise, the transition
-        * can happen between our first test and prepare_to_wait(), and we
-        * sleep with no one to wake us.
-        */
-       spin_lock_irq(&css_set_lock);
-       if (!cgroup_has_tasks(cgrp)) {
-               spin_unlock_irq(&css_set_lock);
-               return 0;
-       }
-       prepare_to_wait(&cgrp->dying_populated_waitq, &wait,
-                       TASK_UNINTERRUPTIBLE);
-       spin_unlock_irq(&css_set_lock);
-       mutex_unlock(&cgroup_mutex);
-       schedule();
-       finish_wait(&cgrp->dying_populated_waitq, &wait);
-       mutex_lock(&cgroup_mutex);
-       goto retry;
+       for_each_css(css, ssid, cgrp)
+               kill_css_finish(css);
  }
  
  int cgroup_rmdir(struct kernfs_node *kn)
@@ -6264,12 +6257,9 @@ int cgroup_rmdir(struct kernfs_node *kn)
         if (!cgrp)
                 return 0;
  
-       ret = cgroup_drain_dying(cgrp);
-       if (!ret) {
-               ret = cgroup_destroy_locked(cgrp);
-               if (!ret)
-                       TRACE_CGROUP_PATH(rmdir, cgrp);
-       }
+       ret = cgroup_destroy_locked(cgrp);
+       if (!ret)
+               TRACE_CGROUP_PATH(rmdir, cgrp);
  
         cgroup_kn_unlock(kn);
         return ret;
@@ -7029,7 +7019,6 @@ void cgroup_task_exit(struct task_struct *tsk)
  
  static void do_cgroup_task_dead(struct task_struct *tsk)
  {
-       struct cgrp_cset_link *link;
         struct css_set *cset;
         unsigned long flags;
  
@@ -7043,11 +7032,6 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
         if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
                 list_add_tail(&tsk->cg_list, &cset->dying_tasks);
  
-       /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */
-       list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
-               if (waitqueue_active(&link->cgrp->dying_populated_waitq))
-                       wake_up(&link->cgrp->dying_populated_waitq);
-
         if (dl_task(tsk))
                 dec_dl_tasks_cs(tsk);
author	Tejun Heo <tj@kernel.org>
	Fri, 1 May 2026 18:31:22 +0000 (08:31 -1000)
committer	Tejun Heo <tj@kernel.org>
	Mon, 4 May 2026 18:52:26 +0000 (08:52 -1000)
include/linux/cgroup-defs.h		patch \| blob \| blame \| history
kernel/cgroup/cgroup.c		patch \| blob \| blame \| history