git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
cpuset: Defer flushing of the cpuset_migrate_mm_wq to task_work
author: Chuyi Zhou <zhouchuyi@bytedance.com>
Thu, 4 Sep 2025 07:45:04 +0000 (15:45 +0800)
committer: Tejun Heo <tj@kernel.org>
Thu, 4 Sep 2025 17:22:38 +0000 (07:22 -1000)
Currently, cpuset_attach() has to wait synchronously for
flush_workqueue() to complete. The execution time of flushing
cpuset_migrate_mm_wq depends on the amount of mm migration initiated by
cpusets at that time. When the cpuset.mems of a cgroup occupying a large
amount of memory is modified, it may trigger extensive mm migration,
causing cpuset_attach() to block on flush_workqueue for an extended period.
This could be dangerous because cpuset_attach() is within the critical
section of cgroup_mutex, which may ultimately cause all cgroup-related
operations in the system to be blocked.

This patch defers the flush_workqueue() call until the task returns to
userspace by using task_work, as originally proposed by Tejun[1], so that
the flush happens after cgroup_mutex is dropped. That way we preserve the
operation's synchronicity without bothering anyone else.

[1]: https://lore.kernel.org/cgroups/ZgMFPMjZRZCsq9Q-@slm.duckdns.org/T/#m117f606fa24f66f0823a60f211b36f24bd9e1883

Originally-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Reviewed-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
kernel/cgroup/cpuset.c

index 9fc20ef97d7e5bddeea3fefa9342c351a3664ddd..0d41b4993f8cf4d7feda6096c47d0c82e46b375c 100644 (file)
@@ -40,6 +40,7 @@
 #include <linux/sched/isolation.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
+#include <linux/task_work.h>
 
 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
@@ -2619,9 +2620,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
        }
 }
 
-static void cpuset_post_attach(void)
+static void flush_migrate_mm_task_workfn(struct callback_head *head)
 {
        flush_workqueue(cpuset_migrate_mm_wq);
+       kfree(head);
+}
+
+static void schedule_flush_migrate_mm(void)
+{
+       struct callback_head *flush_cb;
+
+       flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL);
+       if (!flush_cb)
+               return;
+
+       init_task_work(flush_cb, flush_migrate_mm_task_workfn);
+
+       if (task_work_add(current, flush_cb, TWA_RESUME))
+               kfree(flush_cb);
 }
 
 /*
@@ -3178,6 +3194,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
        struct cpuset *cs;
        struct cpuset *oldcs = cpuset_attach_old_cs;
        bool cpus_updated, mems_updated;
+       bool queue_task_work = false;
 
        cgroup_taskset_first(tset, &css);
        cs = css_cs(css);
@@ -3228,15 +3245,18 @@ static void cpuset_attach(struct cgroup_taskset *tset)
                         * @old_mems_allowed is the right nodesets that we
                         * migrate mm from.
                         */
-                       if (is_memory_migrate(cs))
+                       if (is_memory_migrate(cs)) {
                                cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
                                                  &cpuset_attach_nodemask_to);
-                       else
+                               queue_task_work = true;
+                       } else
                                mmput(mm);
                }
        }
 
 out:
+       if (queue_task_work)
+               schedule_flush_migrate_mm();
        cs->old_mems_allowed = cpuset_attach_nodemask_to;
 
        if (cs->nr_migrate_dl_tasks) {
@@ -3292,7 +3312,7 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 out_unlock:
        cpuset_full_unlock();
        if (of_cft(of)->private == FILE_MEMLIST)
-               flush_workqueue(cpuset_migrate_mm_wq);
+               schedule_flush_migrate_mm();
        return retval ?: nbytes;
 }
 
@@ -3739,7 +3759,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
        .can_attach     = cpuset_can_attach,
        .cancel_attach  = cpuset_cancel_attach,
        .attach         = cpuset_attach,
-       .post_attach    = cpuset_post_attach,
        .bind           = cpuset_bind,
        .can_fork       = cpuset_can_fork,
        .cancel_fork    = cpuset_cancel_fork,