sched/mmcid: Implement deferred mode change

author Thomas Gleixner <tglx@linutronix.de>

Wed, 19 Nov 2025 17:27:20 +0000 (18:27 +0100)

committer Thomas Gleixner <tglx@linutronix.de>

Tue, 25 Nov 2025 18:45:42 +0000 (19:45 +0100)
author Thomas Gleixner <tglx@linutronix.de>
Wed, 19 Nov 2025 17:27:20 +0000 (18:27 +0100)
committer Thomas Gleixner <tglx@linutronix.de>
Tue, 25 Nov 2025 18:45:42 +0000 (19:45 +0100)
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h

index a3a4f3f10862460758e868cec1fa34658ddc5531..81fbb8885e8d7bdf163c0a1f2f8985a0a12cce6f 100644 (file)
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -2,7 +2,9 @@
  #ifndef _LINUX_RSEQ_TYPES_H
  #define _LINUX_RSEQ_TYPES_H
  
+#include <linux/irq_work_types.h>
  #include <linux/types.h>
+#include <linux/workqueue_types.h>
  
  #ifdef CONFIG_RSEQ
  struct rseq;
@@ -122,6 +124,8 @@ struct mm_cid_pcpu {
   * @percpu:            Set, when CIDs are in per CPU mode
   * @transit:           Set to MM_CID_TRANSIT during a mode change transition phase
   * @max_cids:          The exclusive maximum CID value for allocation and convergence
+ * @irq_work:          irq_work to handle the affinity mode change case
+ * @work:              Regular work to handle the affinity mode change case
   * @lock:              Spinlock to protect against affinity setting which can't take @mutex
   * @mutex:             Mutex to serialize forks and exits related to this mm
   * @nr_cpus_allowed:   The number of CPUs in the per MM allowed CPUs map. The map
@@ -139,6 +143,10 @@ struct mm_mm_cid {
         unsigned int            transit;
         unsigned int            max_cids;
  
+       /* Rarely used. Moves @lock and @mutex into the second cacheline */
+       struct irq_work         irq_work;
+       struct work_struct      work;
+
         raw_spinlock_t          lock;
         struct mutex            mutex;
  
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index eb0d59df8acc32b54b8d5d7239dd18835f362b65..cbb543a6efda1beb3463738c544b5c7b542f4088 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10539,8 +10539,17 @@ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpu
  
         /* Adjust the threshold to the wider set */
         mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+       /* Switch back to per task mode? */
+       if (mc->users >= mc->pcpu_thrs)
+               return;
+
+       /* Don't queue twice */
+       if (mc->update_deferred)
+               return;
  
-       /* Scheduling of deferred mode switch goes here */
+       /* Queue the irq work, which schedules the real work */
+       mc->update_deferred = true;
+       irq_work_queue(&mc->irq_work);
  }
  
  static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
@@ -10553,7 +10562,7 @@ static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_p
         }
  }
  
-static void __maybe_unused mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
+static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
  {
         unsigned int cpu;
  
@@ -10714,14 +10723,47 @@ void sched_mm_cid_after_execve(struct task_struct *t)
         mm_cid_select(t);
  }
  
-void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
+static void mm_cid_work_fn(struct work_struct *work)
  {
-       struct mm_cid_pcpu __percpu *pcpu = mm->mm_cid.pcpu;
-       int cpu;
+       struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
  
-       for_each_possible_cpu(cpu)
-               per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET;
+       /* Bail out unless CONFIG_NEW_MM_CID is enabled: compiles, but not functional yet */
+       if (!IS_ENABLED(CONFIG_NEW_MM_CID))
+               return;
+
+       guard(mutex)(&mm->mm_cid.mutex);
+       /* No users left: the last user task exited already, nothing to do */
+       if (!mm->mm_cid.users)
+               return;
+
+       scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+               /* update_deferred no longer set: fork() or exit() handled it already */
+               if (!mm->mm_cid.update_deferred)
+                       return;
+               /* This clears mm_cid::update_deferred; bail out when it returns false */
+               if (!mm_update_max_cids(mm))
+                       return;
+               /* Affinity changes can only switch back to task mode; warn if still percpu */
+               if (WARN_ON_ONCE(mm->mm_cid.percpu))
+                       return;
+       }
+       mm_cid_fixup_cpus_to_tasks(mm);
+}
+
+static void mm_cid_irq_work(struct irq_work *work)
+{
+       struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work);
  
+       /*
+        * Hand over to the regular work unconditionally. mm_cid::lock cannot
+        * be held while scheduling work: mm_update_cpus_allowed() nests inside
+        * rq::lock and schedule_work() might end up in a wakeup.
+        */
+       schedule_work(&mm->mm_cid.work);
+}
+
+void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
+{
         mm->mm_cid.max_cids = 0;
         mm->mm_cid.percpu = 0;
         mm->mm_cid.transit = 0;
@@ -10731,6 +10773,8 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
         mm->mm_cid.update_deferred = 0;
         raw_spin_lock_init(&mm->mm_cid.lock);
         mutex_init(&mm->mm_cid.mutex);
+       mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
+       INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
         cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
         bitmap_zero(mm_cidmask(mm), num_possible_cpus());
  }
author	Thomas Gleixner <tglx@linutronix.de>
	Wed, 19 Nov 2025 17:27:20 +0000 (18:27 +0100)
committer	Thomas Gleixner <tglx@linutronix.de>
	Tue, 25 Nov 2025 18:45:42 +0000 (19:45 +0100)
include/linux/rseq_types.h		patch \| blob \| blame \| history
kernel/sched/core.c		patch \| blob \| blame \| history