sched/mmcid: Switch over to the new mechanism
author     Thomas Gleixner <tglx@linutronix.de>
           Wed, 19 Nov 2025 17:27:22 +0000 (18:27 +0100)
committer  Thomas Gleixner <tglx@linutronix.de>
           Tue, 25 Nov 2025 18:45:42 +0000 (19:45 +0100)
Now that all pieces are in place, change the implementations of
sched_mm_cid_fork() and sched_mm_cid_exit() to adhere to the new strict
ownership scheme and switch context_switch() over to use the new
mm_cid_schedin() functionality.

The common case is that there is no mode change required, which makes
fork() and exit() just update the user count and the constraints.

In case a new user would exceed the CID space limit, the fork() context
handles the transition to per CPU mode with mm::mm_cid::mutex held. exit()
handles the transition back to per task mode when the user count drops
below the switch back threshold. fork() might also be forced to handle a
deferred switch back to per task mode when an affinity change increased the
number of allowed CPUs enough.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251119172550.280380631@linutronix.de
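
For illustration only (not part of the patch): a minimal user-space sketch of
the fork()/exit() mode decisions described above. The struct, the helper names
and the switch-back threshold value are made up for the example; the real
bookkeeping lives in mm::mm_cid and is done under mm::mm_cid::mutex, as the
diff below shows.

#include <stdbool.h>
#include <stdio.h>

struct mm_cid_sketch {
	unsigned int users;		/* tasks sharing the MM */
	unsigned int cid_limit;		/* CID space limit */
	unsigned int switchback;	/* hypothetical switch-back threshold */
	bool percpu;			/* true: per CPU mode, false: per task mode */
};

/* fork(): adding a user beyond the CID space limit forces per CPU mode */
static void sketch_fork(struct mm_cid_sketch *mc)
{
	mc->users++;
	if (!mc->percpu && mc->users > mc->cid_limit) {
		mc->percpu = true;
		printf("fork: %u users exceed %u CIDs -> per CPU mode\n",
		       mc->users, mc->cid_limit);
	}
}

/* exit(): dropping below the switch-back threshold returns to per task mode */
static void sketch_exit(struct mm_cid_sketch *mc)
{
	mc->users--;
	if (mc->percpu && mc->users < mc->switchback) {
		mc->percpu = false;
		printf("exit: %u users below threshold %u -> per task mode\n",
		       mc->users, mc->switchback);
	}
}

int main(void)
{
	struct mm_cid_sketch mc = { .users = 4, .cid_limit = 4, .switchback = 3 };

	sketch_fork(&mc);	/* 5th user exceeds the limit -> per CPU mode */
	sketch_exit(&mc);	/* 4 users: stays in per CPU mode */
	sketch_exit(&mc);	/* 3 users: stays in per CPU mode */
	sketch_exit(&mc);	/* 2 users: below threshold -> back to per task mode */
	return 0;
}

In the kernel the mode change additionally transfers the current task's CID
and fixes up the remaining tasks or CPUs, which is what the
mm_cid_fixup_tasks_to_cpus()/mm_cid_fixup_cpus_to_tasks() calls in the diff do.
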
include/linux/rseq.h
include/linux/rseq_types.h
kernel/fork.c
kernel/sched/core.c
kernel/sched/sched.h

include/linux/rseq.h
index 4c0e8bdd2dd9537ad890e11f650f48af43a9d4ad..2266f4dc77b6c5ef9ec33859405e935f72ffb544 100644 (file)
@@ -84,24 +84,6 @@ static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
        t->rseq.event.ids_changed = true;
 }
 
-/*
- * Invoked from switch_mm_cid() in context switch when the task gets a MM
- * CID assigned.
- *
- * This does not raise TIF_NOTIFY_RESUME as that happens in
- * rseq_sched_switch_event().
- */
-static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid)
-{
-       /*
-        * Requires a comparison as the switch_mm_cid() code does not
-        * provide a conditional for it readily. So avoid excessive updates
-        * when nothing changes.
-        */
-       if (t->rseq.ids.mm_cid != cid)
-               t->rseq.event.ids_changed = true;
-}
-
 /* Enforce a full update after RSEQ registration and when execve() failed */
 static inline void rseq_force_update(void)
 {
@@ -169,7 +151,6 @@ static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
 static inline void rseq_sched_switch_event(struct task_struct *t) { }
 static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
-static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
 static inline void rseq_force_update(void) { }
 static inline void rseq_virt_userspace_exit(void) { }
 static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
include/linux/rseq_types.h
index 81fbb8885e8d7bdf163c0a1f2f8985a0a12cce6f..332dc14b81c9770d87a416ae99cedfa48225eb96 100644 (file)
@@ -101,18 +101,18 @@ struct rseq_data { };
 /**
  * struct sched_mm_cid - Storage for per task MM CID data
  * @active:    MM CID is active for the task
- * @cid:       The CID associated to the task
- * @last_cid:  The last CID associated to the task
+ * @cid:       The CID associated to the task either permanently or
+ *             borrowed from the CPU
  */
 struct sched_mm_cid {
        unsigned int            active;
        unsigned int            cid;
-       unsigned int            last_cid;
 };
 
 /**
  * struct mm_cid_pcpu - Storage for per CPU MM_CID data
- * @cid:       The CID associated to the CPU
+ * @cid:       The CID associated to the CPU either permanently or
+ *             while a task with a CID is running
  */
 struct mm_cid_pcpu {
        unsigned int    cid;
kernel/fork.c
index 6c23219e116975657c5f7eb1b164c203c4d86600..8475958e029ba3be7f42e8bac0f05e50bc494e97 100644 (file)
@@ -956,7 +956,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 
 #ifdef CONFIG_SCHED_MM_CID
        tsk->mm_cid.cid = MM_CID_UNSET;
-       tsk->mm_cid.last_cid = MM_CID_UNSET;
        tsk->mm_cid.active = 0;
 #endif
        return tsk;
kernel/sched/core.c
index cbb543a6efda1beb3463738c544b5c7b542f4088..62235f1dc04e37c547b98a6927096da760687f1c 100644 (file)
@@ -5307,7 +5307,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
                }
        }
 
-       switch_mm_cid(prev, next);
+       mm_cid_switch_to(prev, next);
 
        /*
         * Tell rseq that the task was scheduled in. Must be after
@@ -10624,7 +10624,7 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm
        return true;
 }
 
-static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void)
+static void mm_cid_fixup_tasks_to_cpus(void)
 {
        struct mm_struct *mm = current->mm;
        struct task_struct *p, *t;
@@ -10674,25 +10674,81 @@ static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
 void sched_mm_cid_fork(struct task_struct *t)
 {
        struct mm_struct *mm = t->mm;
+       bool percpu;
 
        WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
 
        guard(mutex)(&mm->mm_cid.mutex);
-       scoped_guard(raw_spinlock, &mm->mm_cid.lock) {
-               sched_mm_cid_add_user(t, mm);
-               /* Preset last_cid for mm_cid_select() */
-               t->mm_cid.last_cid = mm->mm_cid.max_cids - 1;
+       scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+               struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu);
+
+               /* First user ? */
+               if (!mm->mm_cid.users) {
+                       sched_mm_cid_add_user(t, mm);
+                       t->mm_cid.cid = mm_get_cid(mm);
+                       /* Required for execve() */
+                       pcp->cid = t->mm_cid.cid;
+                       return;
+               }
+
+               if (!sched_mm_cid_add_user(t, mm)) {
+                       if (!mm->mm_cid.percpu)
+                               t->mm_cid.cid = mm_get_cid(mm);
+                       return;
+               }
+
+               /* Handle the mode change and transfer current's CID */
+               percpu = !!mm->mm_cid.percpu;
+               if (!percpu)
+                       mm_cid_transit_to_task(current, pcp);
+               else
+                       mm_cid_transfer_to_cpu(current, pcp);
+       }
+
+       if (percpu) {
+               mm_cid_fixup_tasks_to_cpus();
+       } else {
+               mm_cid_fixup_cpus_to_tasks(mm);
+               t->mm_cid.cid = mm_get_cid(mm);
        }
 }
 
 static bool sched_mm_cid_remove_user(struct task_struct *t)
 {
        t->mm_cid.active = 0;
-       mm_unset_cid_on_task(t);
+       scoped_guard(preempt) {
+               /* Clear the transition bit */
+               t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
+               mm_unset_cid_on_task(t);
+       }
        t->mm->mm_cid.users--;
        return mm_update_max_cids(t->mm);
 }
 
+static bool __sched_mm_cid_exit(struct task_struct *t)
+{
+       struct mm_struct *mm = t->mm;
+
+       if (!sched_mm_cid_remove_user(t))
+               return false;
+       /*
+        * Contrary to fork() this only deals with a switch back to per
+        * task mode either because the above decreased users or an
+        * affinity change increased the number of allowed CPUs and the
+        * deferred fixup did not run yet.
+        */
+       if (WARN_ON_ONCE(mm->mm_cid.percpu))
+               return false;
+       /*
+        * A failed fork(2) cleanup never gets here, so @current must have
+        * the same MM as @t. That's true for exit() and the failed
+        * pthread_create() cleanup case.
+        */
+       if (WARN_ON_ONCE(current->mm != mm))
+               return false;
+       return true;
+}
+
 /*
  * When a task exits, the MM CID held by the task is no longer required as
  * the task cannot return to user space.
@@ -10703,10 +10759,43 @@ void sched_mm_cid_exit(struct task_struct *t)
 
        if (!mm || !t->mm_cid.active)
                return;
+       /*
+        * Ensure that only one instance is doing MM CID operations within
+        * a MM. The common case is uncontended. The rare fixup case adds
+        * some overhead.
+        */
+       scoped_guard(mutex, &mm->mm_cid.mutex) {
+               /* mm_cid::mutex is sufficient to protect mm_cid::users */
+               if (likely(mm->mm_cid.users > 1)) {
+                       scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+                               if (!__sched_mm_cid_exit(t))
+                                       return;
+                       /* Mode change required. Transfer current's CID */
+                               mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
+                       }
+                       mm_cid_fixup_cpus_to_tasks(mm);
+                       return;
+               }
+               /* Last user */
+               scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+                       /* Required across execve() */
+                       if (t == current)
+                               mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu));
+                       /* Ignore mode change. There is nothing to do. */
+                       sched_mm_cid_remove_user(t);
+               }
+       }
 
-       guard(mutex)(&mm->mm_cid.mutex);
-       scoped_guard(raw_spinlock, &mm->mm_cid.lock)
-               sched_mm_cid_remove_user(t);
+       /*
+        * As this is the last user (execve(), process exit or failed
+        * fork(2)), there is no concurrency anymore.
+        *
+        * Synchronize any pending work to ensure that there are no
+        * dangling references left. @t->mm_cid.users is zero so nothing
+        * can queue this work anymore.
+        */
+       irq_work_sync(&mm->mm_cid.irq_work);
+       cancel_work_sync(&mm->mm_cid.work);
 }
 
 /* Deactivate MM CID allocation across execve() */
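
A note on the locking pattern in sched_mm_cid_exit() above: guard() and
scoped_guard() are the scope-based lock guards from the kernel's
<linux/cleanup.h>, which release the lock automatically when the scope ends.
A simplified sketch of the nesting (kernel context assumed; the bodies are
placeholders, not the real logic):

#include <linux/mm_types.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>

static void mm_cid_locking_sketch(struct mm_struct *mm)
{
	/* Serializes mode changes and mm_cid::users updates within the MM */
	scoped_guard(mutex, &mm->mm_cid.mutex) {
		/* The CID state itself is protected by the raw spinlock */
		scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
			/* ... update mm->mm_cid and the task's CID here ... */
		}
		/* Fixups that may sleep run here with only the mutex held */
	}
	/* Both locks are dropped automatically at the end of their scopes */
}

guard(mutex)(&mm->mm_cid.mutex), as used in sched_mm_cid_fork(), is the
function-scope variant of the same mechanism and holds the mutex until the
function returns.
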
@@ -10719,18 +10808,12 @@ void sched_mm_cid_before_execve(struct task_struct *t)
 void sched_mm_cid_after_execve(struct task_struct *t)
 {
        sched_mm_cid_fork(t);
-       guard(preempt)();
-       mm_cid_select(t);
 }
 
 static void mm_cid_work_fn(struct work_struct *work)
 {
        struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
 
-       /* Make it compile, but not functional yet */
-       if (!IS_ENABLED(CONFIG_NEW_MM_CID))
-               return;
-
        guard(mutex)(&mm->mm_cid.mutex);
        /* Did the last user task exit already? */
        if (!mm->mm_cid.users)
kernel/sched/sched.h
index 82c7978d548ecffa222a82902f338cc6c5becfd5..f9d0515db130645e78fb6e25e6bdb568713b47d2 100644 (file)
@@ -3745,83 +3745,7 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct
        mm_cid_schedin(next);
 }
 
-/* Active implementation */
-static inline void init_sched_mm_cid(struct task_struct *t)
-{
-       struct mm_struct *mm = t->mm;
-       unsigned int max_cid;
-
-       if (!mm)
-               return;
-
-       /* Preset last_mm_cid */
-       max_cid = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users));
-       t->mm_cid.last_cid = max_cid - 1;
-}
-
-static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids)
-{
-       struct mm_struct *mm = t->mm;
-
-       if (cid >= max_cids)
-               return false;
-       if (test_and_set_bit(cid, mm_cidmask(mm)))
-               return false;
-       t->mm_cid.cid = t->mm_cid.last_cid = cid;
-       __this_cpu_write(mm->mm_cid.pcpu->cid, cid);
-       return true;
-}
-
-static inline bool mm_cid_get(struct task_struct *t)
-{
-       struct mm_struct *mm = t->mm;
-       unsigned int max_cids;
-
-       max_cids = READ_ONCE(mm->mm_cid.max_cids);
-
-       /* Try to reuse the last CID of this task */
-       if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids))
-               return true;
-
-       /* Try to reuse the last CID of this mm on this CPU */
-       if (__mm_cid_get(t, __this_cpu_read(mm->mm_cid.pcpu->cid), max_cids))
-               return true;
-
-       /* Try the first zero bit in the cidmask. */
-       return __mm_cid_get(t, find_first_zero_bit(mm_cidmask(mm), num_possible_cpus()), max_cids);
-}
-
-static inline void mm_cid_select(struct task_struct *t)
-{
-       /*
-        * mm_cid_get() can fail when the maximum CID, which is determined
-        * by min(mm->nr_cpus_allowed, mm->mm_users) changes concurrently.
-        * That's a transient failure as there cannot be more tasks
-        * concurrently on a CPU (or about to be scheduled in) than that.
-        */
-       for (;;) {
-               if (mm_cid_get(t))
-                       break;
-       }
-}
-
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
-{
-       if (prev->mm_cid.active) {
-               if (prev->mm_cid.cid != MM_CID_UNSET)
-                       clear_bit(prev->mm_cid.cid, mm_cidmask(prev->mm));
-               prev->mm_cid.cid = MM_CID_UNSET;
-       }
-
-       if (next->mm_cid.active) {
-               mm_cid_select(next);
-               rseq_sched_set_task_mm_cid(next, next->mm_cid.cid);
-       }
-}
-
 #else /* !CONFIG_SCHED_MM_CID: */
-static inline void mm_cid_select(struct task_struct *t) { }
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
 static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { }
 #endif /* !CONFIG_SCHED_MM_CID */