sched/mmcid: Provide new scheduler CID mechanism

author Thomas Gleixner <tglx@linutronix.de>

Wed, 19 Nov 2025 17:27:14 +0000 (18:27 +0100)

committer Thomas Gleixner <tglx@linutronix.de>

Tue, 25 Nov 2025 18:45:41 +0000 (19:45 +0100)
author Thomas Gleixner <tglx@linutronix.de>
Wed, 19 Nov 2025 17:27:14 +0000 (18:27 +0100)
committer Thomas Gleixner <tglx@linutronix.de>
Tue, 25 Nov 2025 18:45:41 +0000 (19:45 +0100)
diff --git a/include/linux/rseq.h b/include/linux/rseq.h

index bf8a6bf315f355838ec38c9dce154e79d5730496..4c0e8bdd2dd9537ad890e11f650f48af43a9d4ad 100644 (file)
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -73,13 +73,13 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t)
  }
  
  /*
- * Invoked from __set_task_cpu() when a task migrates to enforce an IDs
- * update.
+ * Invoked from __set_task_cpu() when a task migrates or from
+ * mm_cid_schedin() when the CID changes to enforce an IDs update.
   *
   * This does not raise TIF_NOTIFY_RESUME as that happens in
   * rseq_sched_switch_event().
   */
-static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu)
+static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
  {
         t->rseq.event.ids_changed = true;
  }
@@ -168,7 +168,7 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
  static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
  static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
  static inline void rseq_sched_switch_event(struct task_struct *t) { }
-static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { }
+static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
  static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
  static inline void rseq_force_update(void) { }
  static inline void rseq_virt_userspace_exit(void) { }
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h

index 87854effe1ad6c4e160508a4b79d433a072ea682..66b1482e114645a0c358aa1def466f3952b20e12 100644 (file)
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -119,23 +119,31 @@ struct mm_cid_pcpu {
  /**
   * struct mm_mm_cid - Storage for per MM CID data
   * @pcpu:              Per CPU storage for CIDs associated to a CPU
+ * @percpu:            Set, when CIDs are in per CPU mode
+ * @transit:           Set to MM_CID_TRANSIT during a mode change transition phase
   * @max_cids:          The exclusive maximum CID value for allocation and convergence
+ * @lock:              Spinlock to protect all fields except @pcpu. It also protects
+ *                     the MM cid cpumask and the MM cidmask bitmap.
+ * @mutex:             Mutex to serialize forks and exits related to this mm
   * @nr_cpus_allowed:   The number of CPUs in the per MM allowed CPUs map. The map
   *                     is growth only.
   * @users:             The number of tasks sharing this MM. Separate from mm::mm_users
   *                     as that is modified by mmget()/mm_put() by other entities which
   *                     do not actually share the MM.
- * @lock:              Spinlock to protect all fields except @pcpu. It also protects
- *                     the MM cid cpumask and the MM cidmask bitmap.
- * @mutex:             Mutex to serialize forks and exits related to this mm
   */
  struct mm_mm_cid {
+       /* Hotpath read mostly members */
         struct mm_cid_pcpu      __percpu *pcpu;
+       unsigned int            percpu;
+       unsigned int            transit;
         unsigned int            max_cids;
-       unsigned int            nr_cpus_allowed;
-       unsigned int            users;
+
         raw_spinlock_t          lock;
         struct mutex            mutex;
+
+       /* Low frequency modified */
+       unsigned int            nr_cpus_allowed;
+       unsigned int            users;
  }____cacheline_aligned_in_smp;
  #else /* CONFIG_SCHED_MM_CID */
  struct mm_mm_cid { };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 55bb9c9ae32c35983882edf1de2493fb616fd270..659ae56b459f1e2bbad45502b2cbdb3b18b8ebc0 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -10495,6 +10495,8 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
                 per_cpu_ptr(pcpu, cpu)->cid = MM_CID_UNSET;
  
         mm->mm_cid.max_cids = 0;
+       mm->mm_cid.percpu = 0;
+       mm->mm_cid.transit = 0;
         mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
         mm->mm_cid.users = 0;
         raw_spin_lock_init(&mm->mm_cid.lock);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index 4b49284504fb0a23af90b2770d91247c6e0d2eeb..82c7978d548ecffa222a82902f338cc6c5becfd5 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2209,7 +2209,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
         smp_wmb();
         WRITE_ONCE(task_thread_info(p)->cpu, cpu);
         p->wake_cpu = cpu;
-       rseq_sched_set_task_cpu(p, cpu);
+       rseq_sched_set_ids_changed(p);
  #endif /* CONFIG_SMP */
  }
  
@@ -3598,6 +3598,153 @@ static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_c
         mm_drop_cid(mm, pcp->cid);
  }
  
+static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids)
+{
+       unsigned int cid = find_first_zero_bit(mm_cidmask(mm), max_cids);
+
+       if (cid >= max_cids)
+               return MM_CID_UNSET;
+       if (test_and_set_bit(cid, mm_cidmask(mm)))
+               return MM_CID_UNSET;
+       return cid;
+}
+
+static inline unsigned int mm_get_cid(struct mm_struct *mm)
+{
+       unsigned int cid = __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids));
+
+       while (cid == MM_CID_UNSET) {
+               cpu_relax();
+               cid = __mm_get_cid(mm, num_possible_cpus());
+       }
+       return cid;
+}
+
+static inline unsigned int mm_cid_converge(struct mm_struct *mm, unsigned int orig_cid,
+                                          unsigned int max_cids)
+{
+       unsigned int new_cid, cid = cpu_cid_to_cid(orig_cid);
+
+       /* Is it in the optimal CID space? */
+       if (likely(cid < max_cids))
+               return orig_cid;
+
+       /* Try to find one in the optimal space. Otherwise keep the provided one. */
+       new_cid = __mm_get_cid(mm, max_cids);
+       if (new_cid != MM_CID_UNSET) {
+               mm_drop_cid(mm, cid);
+               /* Preserve the ONCPU mode of the original CID */
+               return new_cid | (orig_cid & MM_CID_ONCPU);
+       }
+       return orig_cid;
+}
+
+static __always_inline void mm_cid_update_task_cid(struct task_struct *t, unsigned int cid)
+{
+       if (t->mm_cid.cid != cid) {
+               t->mm_cid.cid = cid;
+               rseq_sched_set_ids_changed(t);
+       }
+}
+
+static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigned int cid)
+{
+       __this_cpu_write(mm->mm_cid.pcpu->cid, cid);
+}
+
+static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid)
+{
+       unsigned int max_cids, tcid = t->mm_cid.cid;
+       struct mm_struct *mm = t->mm;
+
+       max_cids = READ_ONCE(mm->mm_cid.max_cids);
+       /* Optimize for the common case where both have the ONCPU bit set */
+       if (likely(cid_on_cpu(cpu_cid & tcid))) {
+               if (likely(cpu_cid_to_cid(cpu_cid) < max_cids)) {
+                       mm_cid_update_task_cid(t, cpu_cid);
+                       return;
+               }
+               /* Try to converge into the optimal CID space */
+               cpu_cid = mm_cid_converge(mm, cpu_cid, max_cids);
+       } else {
+               /* Hand over or drop the task owned CID */
+               if (cid_on_task(tcid)) {
+                       if (cid_on_cpu(cpu_cid))
+                               mm_unset_cid_on_task(t);
+                       else
+                               cpu_cid = cid_to_cpu_cid(tcid);
+               }
+               /* Still nothing, allocate a new one */
+               if (!cid_on_cpu(cpu_cid))
+                       cpu_cid = cid_to_cpu_cid(mm_get_cid(mm));
+       }
+       mm_cid_update_pcpu_cid(mm, cpu_cid);
+       mm_cid_update_task_cid(t, cpu_cid);
+}
+
+static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid)
+{
+       unsigned int max_cids, tcid = t->mm_cid.cid;
+       struct mm_struct *mm = t->mm;
+
+       max_cids = READ_ONCE(mm->mm_cid.max_cids);
+       /* Optimize for the common case, where both have the ONCPU bit clear */
+       if (likely(cid_on_task(tcid | cpu_cid))) {
+               if (likely(tcid < max_cids)) {
+                       mm_cid_update_pcpu_cid(mm, tcid);
+                       return;
+               }
+               /* Try to converge into the optimal CID space */
+               tcid = mm_cid_converge(mm, tcid, max_cids);
+       } else {
+               /* Hand over or drop the CPU owned CID */
+               if (cid_on_cpu(cpu_cid)) {
+                       if (cid_on_task(tcid))
+                               mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
+                       else
+                               tcid = cpu_cid_to_cid(cpu_cid);
+               }
+               /* Still nothing, allocate a new one */
+               if (!cid_on_task(tcid))
+                       tcid = mm_get_cid(mm);
+               /* Set the transition mode flag if required */
+               tcid |= READ_ONCE(mm->mm_cid.transit);
+       }
+       mm_cid_update_pcpu_cid(mm, tcid);
+       mm_cid_update_task_cid(t, tcid);
+}
+
+static __always_inline void mm_cid_schedin(struct task_struct *next)
+{
+       struct mm_struct *mm = next->mm;
+       unsigned int cpu_cid;
+
+       if (!next->mm_cid.active)
+               return;
+
+       cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid);
+       if (likely(!READ_ONCE(mm->mm_cid.percpu)))
+               mm_cid_from_task(next, cpu_cid);
+       else
+               mm_cid_from_cpu(next, cpu_cid);
+}
+
+static __always_inline void mm_cid_schedout(struct task_struct *prev)
+{
+       /* During mode transitions CIDs are temporary and need to be dropped */
+       if (likely(!cid_in_transit(prev->mm_cid.cid)))
+               return;
+
+       mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid));
+       prev->mm_cid.cid = MM_CID_UNSET;
+}
+
+static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next)
+{
+       mm_cid_schedout(prev);
+       mm_cid_schedin(next);
+}
+
  /* Active implementation */
  static inline void init_sched_mm_cid(struct task_struct *t)
  {
@@ -3675,6 +3822,7 @@ static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *n
  #else /* !CONFIG_SCHED_MM_CID: */
  static inline void mm_cid_select(struct task_struct *t) { }
  static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { }
  #endif /* !CONFIG_SCHED_MM_CID */
  
  extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
author	Thomas Gleixner <tglx@linutronix.de>
	Wed, 19 Nov 2025 17:27:14 +0000 (18:27 +0100)
committer	Thomas Gleixner <tglx@linutronix.de>
	Tue, 25 Nov 2025 18:45:41 +0000 (19:45 +0100)
include/linux/rseq.h		patch \| blob \| blame \| history
include/linux/rseq_types.h		patch \| blob \| blame \| history
kernel/sched/core.c		patch \| blob \| blame \| history
kernel/sched/sched.h		patch \| blob \| blame \| history