* Serialization rules:
*
* mm::mm_cid::mutex: Serializes fork() and exit() and therefore
- * protects mm::mm_cid::users.
+ * protects mm::mm_cid::users and mode switch
+ * transitions.
*
* mm::mm_cid::lock: Serializes mm_update_max_cids() and
* mm_update_cpus_allowed(). Nests in mm_cid::mutex
*
* A CID is either owned by a task (stored in task_struct::mm_cid.cid) or
* by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
- * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode,
- * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the
- * task needs to drop the CID into the pool when scheduling out. Both bits
- * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
- * actually handed over to user space in the RSEQ memory.
+ * MM_CID_ONCPU bit set.
+ *
+ * During an ownership mode transition the MM_CID_TRANSIT bit is set on
+ * the CIDs. When this bit is set, a task drops its CID back into the
+ * pool when it schedules out.
+ *
+ * Both bits (ONCPU and TRANSIT) are filtered out by task_cid() when the
+ * CID is actually handed over to user space in the RSEQ memory.
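+ *
+ * A minimal sketch of that filtering, assuming both flags are plain bits
+ * in the CID value (the actual task_cid() helper may differ in detail):
+ *
+ *	cid = t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT);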
*
* Mode switching:
*
+ * All transitions of ownership mode happen in two phases:
+ *
+ * 1) mm::mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed onto the
+ * CIDs and denotes that the CID is only temporarily owned by a task. When
+ * the task schedules out it drops the CID back into the pool if this
+ * bit is set (see the sketch after this list).
+ *
+ * 2) The initiating context walks the per CPU space or the tasks to fix up
+ * or drop the CIDs and after completion it clears mm::mm_cid.transit.
+ * After that point the CIDs are strictly task or CPU owned again.
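+ *
+ * A rough sketch of the schedule out behaviour while a transition is in
+ * flight (the pool helper name below is illustrative, not the real one):
+ *
+ *	if (t->mm_cid.cid & MM_CID_TRANSIT)
+ *		mm_cid_drop_to_pool(t);		/* hypothetical helper */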
+ *
+ * This two-phase transition is required to prevent CID space exhaustion
+ * during the transition as a direct transfer of ownership would fail:
+ *
+ * - On task to CPU mode switch if a task is scheduled in on one CPU and
+ * then migrated to another CPU before the fixup freed enough per task
+ * CIDs.
+ *
+ * - On CPU to task mode switch if two tasks are scheduled in on the same
+ * CPU before the fixup freed per CPU CIDs.
+ *
+ * Both scenarios can result in a live lock: sched_in() is invoked with
+ * the runqueue lock held and loops in search of a CID, while the fixup
+ * thread cannot make progress freeing CIDs because it is stuck on the
+ * same runqueue lock.
+ *
+ * While MM_CID_TRANSIT is active during the transition phase the MM_CID
+ * bitmap can be contended, but that contention is temporary and bound to
+ * the transition period. After that everything goes back into steady
+ * state and nothing except fork() and exit() will touch the bitmap. This
+ * is an acceptable tradeoff as it completely avoids complex
+ * serialization, memory barriers and atomic operations for the common
+ * case.
+ *
+ * Apart from that, this mechanism also ensures RT compatibility:
+ *
+ * - The task which runs the fixup is fully preemptible except for the
+ * short sections where the runqueue lock is held.
+ *
+ * - The transient impact of the bitmap contention is only problematic
+ * when there is a thundering herd scenario of tasks scheduling in and
+ * out concurrently. There is not much which can be done about that
+ * except avoiding mode switches through a proper overall system
+ * configuration.
+ *
* Switching to per CPU mode happens when the user count becomes greater
* than the maximum number of CIDs, which is calculated by:
*
*
* At the point of switching to per CPU mode the new user is not yet
* visible in the system, so the task which initiated the fork() runs the
- * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and
- * either transfers each tasks owned CID to the CPU the task runs on or
- * drops it into the CID pool if a task is not on a CPU at that point in
- * time. Tasks which schedule in before the task walk reaches them do the
- * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes
- * it's guaranteed that no task related to that MM owns a CID anymore.
+ * fixup function. mm_cid_fixup_tasks_to_cpus() walks the thread list and
+ * either marks each task owned CID with MM_CID_TRANSIT if the task is
+ * running on a CPU or drops it into the CID pool if the task is not on a
+ * CPU. Tasks which schedule in before the task walk reaches them do the
+ * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus()
+ * completes it is guaranteed that no task related to that MM owns a CID
+ * anymore.
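+ *
+ * Per thread the walk boils down to roughly this check (simplified from
+ * the actual fixup code below):
+ *
+ *	if (cid_on_task(t->mm_cid.cid)) {
+ *		if (task_rq(t)->curr == t)
+ *			mm_cid_transit_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+ *		else
+ *			mm_unset_cid_on_task(t);
+ *	}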
*
* Switching back to task mode happens when the user count goes below the
* threshold which was recorded on the per CPU mode switch:
* run either in the deferred update function in context of a workqueue or
* by a task which forks a new one or by a task which exits. Whatever
* happens first. mm_cid_fixup_cpus_to_task() walks through the possible
- * CPUs and either transfers the CPU owned CIDs to a related task which
- * runs on the CPU or drops it into the pool. Tasks which schedule in on a
- * CPU which the walk did not cover yet do the handover themself.
- *
- * This transition from CPU to per task ownership happens in two phases:
- *
- * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task
- * CID and denotes that the CID is only temporarily owned by the
- * task. When it schedules out the task drops the CID back into the
- * pool if this bit is set.
- *
- * 2) The initiating context walks the per CPU space and after completion
- * clears mm:mm_cid.transit. So after that point the CIDs are strictly
- * task owned again.
- *
- * This two phase transition is required to prevent CID space exhaustion
- * during the transition as a direct transfer of ownership would fail if
- * two tasks are scheduled in on the same CPU before the fixup freed per
- * CPU CIDs.
- *
- * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
- * related to that MM is owned by a CPU anymore.
+ * CPUs and either marks the CPU owned CID with MM_CID_TRANSIT if a
+ * related task is running on the CPU or drops it into the pool. Tasks
+ * which are scheduled in before the fixup reaches them do the handover
+ * themselves. When mm_cid_fixup_cpus_to_tasks() completes it is
+ * guaranteed that no CID related to that MM is owned by a CPU anymore.
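+ *
+ * The per CPU walk is roughly the mirror image of the per task walk
+ * above. A sketch with hypothetical helpers (the real code is not shown
+ * here):
+ *
+ *	for_each_possible_cpu(cpu) {
+ *		pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu);
+ *		t = running_task_of_mm(cpu, mm);	/* hypothetical */
+ *		if (t)
+ *			mm_cid_transit_to_task(t, pcp);
+ *		else
+ *			drop_pcpu_cid(pcp);		/* hypothetical */
+ *	}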
*/
/*
/* Mode change required? */
if (!!mc->percpu == !!mc->pcpu_thrs)
return false;
- /* When switching back to per TASK mode, set the transition flag */
- if (!mc->pcpu_thrs)
- WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
+
+ /* Set the transition flag to bridge the transfer */
+ WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
return true;
}
WRITE_ONCE(mm->mm_cid.transit, 0);
}
-static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
+static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
{
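+	/* Mark a task owned CID as in transit and mirror it into the per CPU slot */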
if (cid_on_task(t->mm_cid.cid)) {
- t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
+ t->mm_cid.cid = cid_to_transit_cid(t->mm_cid.cid);
pcp->cid = t->mm_cid.cid;
}
}
if (!t->mm_cid.active)
return false;
if (cid_on_task(t->mm_cid.cid)) {
- /* If running on the CPU, transfer the CID, otherwise drop it */
+ /* If running on the CPU, put the CID in transit mode, otherwise drop it */
if (task_rq(t)->curr == t)
- mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+ mm_cid_transit_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
else
mm_unset_cid_on_task(t);
}
return true;
}
-static void mm_cid_fixup_tasks_to_cpus(void)
+static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
{
- struct mm_struct *mm = current->mm;
struct task_struct *p, *t;
unsigned int users;
}
}
+static void mm_cid_fixup_tasks_to_cpus(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ mm_cid_do_fixup_tasks_to_cpus(mm);
+ /* Clear the transition bit */
+ WRITE_ONCE(mm->mm_cid.transit, 0);
+}
+
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
{
t->mm_cid.active = 1;
if (!percpu)
mm_cid_transit_to_task(current, pcp);
else
- mm_cid_transfer_to_cpu(current, pcp);
+ mm_cid_transit_to_cpu(current, pcp);
}
if (percpu) {