 * cgroup_threadgroup_rwsem. This forces hot path operations such as
 * fork and exit into the slow path and makes them more expensive.
*
+ * Alleviate the contention between fork, exec and exit operations and
+ * writing to cgroup.procs by taking a per-threadgroup rwsem instead of
+ * the global cgroup_threadgroup_rwsem. Forks and other operations from
+ * threads in different thread groups then no longer contend with
+ * writes to cgroup.procs.
+ *
* The static usage pattern of creating a cgroup, enabling controllers,
* and then seeding it with CLONE_INTO_CGROUP doesn't require write
* locking cgroup_threadgroup_rwsem and thus doesn't benefit from
/* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */
CGRP_ATTACH_LOCK_NONE,
+
+ /* When favordynmods is on, see comments above CGRP_ROOT_FAVOR_DYNMODS */
+ CGRP_ATTACH_LOCK_PER_THREADGROUP,
};
/*
};
extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+extern bool cgroup_enable_per_threadgroup_rwsem;
struct cgroup_of_peak {
unsigned long value;
* @tsk: target task
*
* Allows cgroup operations to synchronize against threadgroup changes
- * using a percpu_rw_semaphore.
+ * using a global percpu_rw_semaphore and a per-threadgroup rw_semaphore
+ * when favordynmods is on. See the comment above the
+ * CGRP_ROOT_FAVOR_DYNMODS definition.
*/
static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
{
percpu_down_read(&cgroup_threadgroup_rwsem);
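+ /*
+ * cgroup_enable_per_threadgroup_rwsem is written only with the global
+ * rwsem write-locked, so reading it under the read lock here is stable.
+ */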
+ if (cgroup_enable_per_threadgroup_rwsem)
+ down_read(&tsk->signal->cgroup_threadgroup_rwsem);
}
/**
*/
static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
{
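+ /* Release in the reverse order of cgroup_threadgroup_change_begin(). */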
+ if (cgroup_enable_per_threadgroup_rwsem)
+ up_read(&tsk->signal->cgroup_threadgroup_rwsem);
percpu_up_read(&cgroup_threadgroup_rwsem);
}
static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
+/*
+ * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem,
+ * read protected by either.
+ *
+ * Can only be turned on, but not turned off.
+ */
+bool cgroup_enable_per_threadgroup_rwsem __read_mostly;
+
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
.ns.count = REFCOUNT_INIT(2),
{
bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
- /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
+ /*
+ * See the comment above the CGRP_ROOT_FAVOR_DYNMODS definition.
+ * favordynmods can flip while a task is between
+ * cgroup_threadgroup_change_begin() and end(), so down_write the global
+ * cgroup_threadgroup_rwsem to synchronize with them.
+ *
+ * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding
+ * cgroup_threadgroup_rwsem doesn't exclude tasks between
+ * cgroup_threadgroup_change_begin() and end() and thus it's unsafe to
+ * turn it off. As the scenario is unlikely, simply disallow disabling
+ * once enabled and print a warning.
+ */
+ percpu_down_write(&cgroup_threadgroup_rwsem);
if (favor && !favoring) {
+ cgroup_enable_per_threadgroup_rwsem = true;
rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
} else if (!favor && favoring) {
+ if (cgroup_enable_per_threadgroup_rwsem)
+ pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n");
rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
}
+ percpu_up_write(&cgroup_threadgroup_rwsem);
}
static int cgroup_init_root_id(struct cgroup_root *root)
/**
* cgroup_attach_lock - Lock for ->attach()
- * @lock_mode: whether to down_write cgroup_threadgroup_rwsem
+ * @lock_mode: which rwsem to write-lock, if any
+ * @tsk: thread group to lock
*
* cgroup migration sometimes needs to stabilize threadgroups against forks and
* exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
* Resolve the situation by always acquiring cpus_read_lock() before optionally
* write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
* CPU hotplug is disabled on entry.
+ *
+ * When favordynmods is enabled, take the per-threadgroup rwsem instead to
+ * reduce the overhead of dynamic cgroup modifications. See the comment
+ * above the CGRP_ROOT_FAVOR_DYNMODS definition.
+ *
+ * @tsk is non-NULL only when writing to cgroup.procs.
*/
-void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode)
+void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk)
{
cpus_read_lock();
case CGRP_ATTACH_LOCK_GLOBAL:
percpu_down_write(&cgroup_threadgroup_rwsem);
break;
+ case CGRP_ATTACH_LOCK_PER_THREADGROUP:
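+ /*
+ * Write-locking only tsk's thread group rwsem blocks fork and exit
+ * in that group alone; unrelated thread groups are not affected.
+ */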
+ down_write(&tsk->signal->cgroup_threadgroup_rwsem);
+ break;
default:
pr_warn("cgroup: Unexpected attach lock mode.");
break;
/**
* cgroup_attach_unlock - Undo cgroup_attach_lock()
- * @lock_mode: whether to up_write cgroup_threadgroup_rwsem
+ * @lock_mode: which rwsem to release, if any
+ * @tsk: thread group to unlock
*/
-void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode)
+void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode,
+ struct task_struct *tsk)
{
switch (lock_mode) {
case CGRP_ATTACH_LOCK_NONE:
case CGRP_ATTACH_LOCK_GLOBAL:
percpu_up_write(&cgroup_threadgroup_rwsem);
break;
+ case CGRP_ATTACH_LOCK_PER_THREADGROUP:
+ up_write(&tsk->signal->cgroup_threadgroup_rwsem);
+ break;
default:
pr_warn("cgroup: Unexpected attach lock mode.");
break;
tsk = ERR_PTR(-EINVAL);
goto out_unlock_rcu;
}
-
get_task_struct(tsk);
rcu_read_unlock();
*/
lockdep_assert_held(&cgroup_mutex);
- if (pid || threadgroup)
- *lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
- else
+ if (pid || threadgroup) {
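+ /*
+ * The target is a single thread group, so when favordynmods is
+ * on, its per-threadgroup rwsem is enough to stabilize it.
+ */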
+ if (cgroup_enable_per_threadgroup_rwsem)
+ *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP;
+ else
+ *lock_mode = CGRP_ATTACH_LOCK_GLOBAL;
+ } else {
*lock_mode = CGRP_ATTACH_LOCK_NONE;
+ }
- cgroup_attach_lock(*lock_mode);
+ cgroup_attach_lock(*lock_mode, tsk);
if (threadgroup) {
if (!thread_group_leader(tsk)) {
* may strip us of our leadership. If this happens,
* throw this task away and try again.
*/
- cgroup_attach_unlock(*lock_mode);
+ cgroup_attach_unlock(*lock_mode, tsk);
put_task_struct(tsk);
goto retry_find_task;
}
void cgroup_procs_write_finish(struct task_struct *task,
enum cgroup_attach_lock_mode lock_mode)
{
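+ /*
+ * Unlock before dropping the task reference: the per-threadgroup
+ * rwsem lives in task->signal, which may go away once the last
+ * reference is put.
+ */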
+ cgroup_attach_unlock(lock_mode, task);
+
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
-
- cgroup_attach_unlock(lock_mode);
}
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
else
lock_mode = CGRP_ATTACH_LOCK_NONE;
- cgroup_attach_lock(lock_mode);
+ cgroup_attach_lock(lock_mode, NULL);
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- cgroup_attach_unlock(lock_mode);
+ cgroup_attach_unlock(lock_mode, NULL);
return ret;
}