* Serialization rules:
*
* mm::mm_cid::mutex: Serializes fork() and exit() and therefore
- * protects mm::mm_cid::users.
+ * protects mm::mm_cid::users and mode switch
+ * transitions.
*
* mm::mm_cid::lock: Serializes mm_update_max_cids() and
* mm_update_cpus_allowed(). Nests in mm_cid::mutex
*
* A CID is either owned by a task (stored in task_struct::mm_cid.cid) or
* by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
- * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode,
- * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the
- * task needs to drop the CID into the pool when scheduling out. Both bits
- * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
- * actually handed over to user space in the RSEQ memory.
+ * MM_CID_ONCPU bit set.
+ *
+ * During an ownership mode transition the MM_CID_TRANSIT bit is set on
+ * the CIDs. When this bit is set, a task drops its CID back into the
+ * pool when it schedules out.
+ *
+ * Both bits (ONCPU and TRANSIT) are filtered out by task_cid() when the
+ * CID is actually handed over to user space in the RSEQ memory.
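+ *
+ * A minimal sketch of that filtering, assuming both flags are plain bits
+ * in the CID value (the actual task_cid() helper may differ in detail):
+ *
+ *	cid = t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT);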
*
* Mode switching:
*
+ * All transitions of ownership mode happen in two phases:
+ *
+ * 1) mm::mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed onto the
+ * CIDs and denotes that the CID is only temporarily owned by a task. When
+ * the task schedules out it drops the CID back into the pool if this
+ * bit is set (see the sketch after this list).
+ *
+ * 2) The initiating context walks the per CPU space or the tasks to fix up
+ * or drop the CIDs and after completion it clears mm::mm_cid.transit.
+ * After that point the CIDs are strictly task or CPU owned again.
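+ *
+ * A rough sketch of the schedule out behaviour while a transition is in
+ * flight (the pool helper name below is illustrative, not the real one):
+ *
+ *	if (t->mm_cid.cid & MM_CID_TRANSIT)
+ *		mm_cid_drop_to_pool(t);		/* hypothetical helper */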
+ *
+ * This two-phase transition is required to prevent CID space exhaustion
+ * during the transition as a direct transfer of ownership would fail:
+ *
+ * - On task to CPU mode switch if a task is scheduled in on one CPU and
+ * then migrated to another CPU before the fixup freed enough per task
+ * CIDs.
+ *
+ * - On CPU to task mode switch if two tasks are scheduled in on the same
+ * CPU before the fixup freed per CPU CIDs.
+ *
+ * Both scenarios can result in a live lock: sched_in() is invoked with
+ * the runqueue lock held and loops in search of a CID, while the fixup
+ * thread cannot make progress freeing CIDs because it is stuck on the
+ * same runqueue lock.
+ *
+ * While MM_CID_TRANSIT is active during the transition phase the MM_CID
+ * bitmap can be contended, but that contention is temporary and bound to
+ * the transition period. After that everything goes back into steady
+ * state and nothing except fork() and exit() will touch the bitmap. This
+ * is an acceptable tradeoff as it completely avoids complex
+ * serialization, memory barriers and atomic operations for the common
+ * case.
+ *
+ * Apart from that, this mechanism also ensures RT compatibility:
+ *
+ * - The task which runs the fixup is fully preemptible except for the
+ * short sections where the runqueue lock is held.
+ *
+ * - The transient impact of the bitmap contention is only problematic
+ * when there is a thundering herd scenario of tasks scheduling in and
+ * out concurrently. There is not much which can be done about that
+ * except avoiding mode switches through a proper overall system
+ * configuration.
+ *
* Switching to per CPU mode happens when the user count becomes greater
* than the maximum number of CIDs, which is calculated by:
*
*
* At the point of switching to per CPU mode the new user is not yet
* visible in the system, so the task which initiated the fork() runs the
- * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and
- * either transfers each tasks owned CID to the CPU the task runs on or
- * drops it into the CID pool if a task is not on a CPU at that point in
- * time. Tasks which schedule in before the task walk reaches them do the
- * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes
- * it's guaranteed that no task related to that MM owns a CID anymore.
+ * fixup function. mm_cid_fixup_tasks_to_cpus() walks the thread list and
+ * either marks each task owned CID with MM_CID_TRANSIT if the task is
+ * running on a CPU or drops it into the CID pool if the task is not on a
+ * CPU. Tasks which schedule in before the task walk reaches them do the
+ * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus()
+ * completes it is guaranteed that no task related to that MM owns a CID
+ * anymore.
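+ *
+ * Per thread the walk boils down to roughly this check (simplified from
+ * the actual fixup code below):
+ *
+ *	if (cid_on_task(t->mm_cid.cid)) {
+ *		if (task_rq(t)->curr == t)
+ *			mm_cid_transit_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+ *		else
+ *			mm_unset_cid_on_task(t);
+ *	}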
*
* Switching back to task mode happens when the user count goes below the
* threshold which was recorded on the per CPU mode switch:
* run either in the deferred update function in context of a workqueue or
* by a task which forks a new one or by a task which exits. Whatever
* happens first. mm_cid_fixup_cpus_to_task() walks through the possible
- * CPUs and either transfers the CPU owned CIDs to a related task which
- * runs on the CPU or drops it into the pool. Tasks which schedule in on a
- * CPU which the walk did not cover yet do the handover themself.
- *
- * This transition from CPU to per task ownership happens in two phases:
- *
- * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task
- * CID and denotes that the CID is only temporarily owned by the
- * task. When it schedules out the task drops the CID back into the
- * pool if this bit is set.
- *
- * 2) The initiating context walks the per CPU space and after completion
- * clears mm:mm_cid.transit. So after that point the CIDs are strictly
- * task owned again.
- *
- * This two phase transition is required to prevent CID space exhaustion
- * during the transition as a direct transfer of ownership would fail if
- * two tasks are scheduled in on the same CPU before the fixup freed per
- * CPU CIDs.
- *
- * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
- * related to that MM is owned by a CPU anymore.
+ * CPUs and either marks the CPU owned CID with MM_CID_TRANSIT if a
+ * related task is running on the CPU or drops it into the pool. Tasks
+ * which are scheduled in before the fixup reaches them do the handover
+ * themselves. When mm_cid_fixup_cpus_to_tasks() completes it is
+ * guaranteed that no CID related to that MM is owned by a CPU anymore.
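+ *
+ * The per CPU walk is roughly the mirror image of the per task walk
+ * above. A sketch with hypothetical helpers (the real code is not shown
+ * here):
+ *
+ *	for_each_possible_cpu(cpu) {
+ *		pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu);
+ *		t = running_task_of_mm(cpu, mm);	/* hypothetical */
+ *		if (t)
+ *			mm_cid_transit_to_task(t, pcp);
+ *		else
+ *			drop_pcpu_cid(pcp);		/* hypothetical */
+ *	}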
*/
/*
/* Mode change required? */
if (!!mc->percpu == !!mc->pcpu_thrs)
return false;
- /* When switching back to per TASK mode, set the transition flag */
- if (!mc->pcpu_thrs)
- WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
+
+ /* Set the transition flag to bridge the transfer */
+ WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
return true;
}
WRITE_ONCE(mm->mm_cid.transit, 0);
}
-static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
+static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
{
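+	/* Mark a task owned CID as in transit and mirror it into the per CPU slot */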
if (cid_on_task(t->mm_cid.cid)) {
- t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
+ t->mm_cid.cid = cid_to_transit_cid(t->mm_cid.cid);
pcp->cid = t->mm_cid.cid;
}
}
if (!t->mm_cid.active)
return false;
if (cid_on_task(t->mm_cid.cid)) {
- /* If running on the CPU, transfer the CID, otherwise drop it */
+ /* If running on the CPU, put the CID in transit mode, otherwise drop it */
if (task_rq(t)->curr == t)
- mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+ mm_cid_transit_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
else
mm_unset_cid_on_task(t);
}
return true;
}
-static void mm_cid_fixup_tasks_to_cpus(void)
+static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
{
- struct mm_struct *mm = current->mm;
struct task_struct *p, *t;
unsigned int users;
}
}
+static void mm_cid_fixup_tasks_to_cpus(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ mm_cid_do_fixup_tasks_to_cpus(mm);
+ /* Clear the transition bit */
+ WRITE_ONCE(mm->mm_cid.transit, 0);
+}
+
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
{
t->mm_cid.active = 1;
if (!percpu)
mm_cid_transit_to_task(current, pcp);
else
- mm_cid_transfer_to_cpu(current, pcp);
+ mm_cid_transit_to_cpu(current, pcp);
}
if (percpu) {