From: Linus Torvalds Date: Tue, 2 Dec 2025 16:48:53 +0000 (-0800) Subject: Merge tag 'core-rseq-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git... X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=2b09f480f0a1e68111ae36a7be9aa1c93e067255;p=thirdparty%2Flinux.git Merge tag 'core-rseq-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull rseq updates from Thomas Gleixner: "A large overhaul of the restartable sequences and CID management: The recent enablement of RSEQ in glibc resulted in regressions which are caused by the related overhead. It turned out that the decision to invoke the exit to user work was not really a decision. More or less each context switch caused that. There is a long list of small issues which sums up nicely and results in a 3-4% regression in I/O benchmarks. The other detail which caused issues due to extra work in context switch and task migration is the CID (memory context ID) management. It also requires to use a task work to consolidate the CID space, which is executed in the context of an arbitrary task and results in sporadic uncontrolled exit latencies. The rewrite addresses this by: - Removing deprecated and long unsupported functionality - Moving the related data into dedicated data structures which are optimized for fast path processing. - Caching values so actual decisions can be made - Replacing the current implementation with a optimized inlined variant. - Separating fast and slow path for architectures which use the generic entry code, so that only fault and error handling goes into the TIF_NOTIFY_RESUME handler. - Rewriting the CID management so that it becomes mostly invisible in the context switch path. That moves the work of switching modes into the fork/exit path, which is a reasonable tradeoff. That work is only required when a process creates more threads than the cpuset it is allowed to run on or when enough threads exit after that. An artificial thread pool benchmarks which triggers this did not degrade, it actually improved significantly. The main effect in migration heavy scenarios is that runqueue lock held time and therefore contention goes down significantly" * tag 'core-rseq-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits) sched/mmcid: Switch over to the new mechanism sched/mmcid: Implement deferred mode change irqwork: Move data struct to a types header sched/mmcid: Provide CID ownership mode fixup functions sched/mmcid: Provide new scheduler CID mechanism sched/mmcid: Introduce per task/CPU ownership infrastructure sched/mmcid: Serialize sched_mm_cid_fork()/exit() with a mutex sched/mmcid: Provide precomputed maximal value sched/mmcid: Move initialization out of line signal: Move MMCID exit out of sighand lock sched/mmcid: Convert mm CID mask to a bitmap cpumask: Cache num_possible_cpus() sched/mmcid: Use cpumask_weighted_or() cpumask: Introduce cpumask_weighted_or() sched/mmcid: Prevent pointless work in mm_update_cpus_allowed() sched/mmcid: Move scheduler code out of global header sched: Fixup whitespace damage sched/mmcid: Cacheline align MM CID storage sched/mmcid: Use proper data structures sched/mmcid: Revert the complex CID management ... --- 2b09f480f0a1e68111ae36a7be9aa1c93e067255 diff --cc kernel/fork.c index f1857672426e0,8475958e029ba..83e05d6f23070 --- a/kernel/fork.c +++ b/kernel/fork.c @@@ -2453,9 -2451,10 +2451,10 @@@ bad_fork_cleanup_io if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - exit_task_namespaces(p); + exit_nsproxy_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { + sched_mm_cid_exit(p); mm_clear_owner(p->mm, p); mmput(p->mm); } diff --cc kernel/sched/core.c index 0c4ff93eeb78e,62235f1dc04e3..fc358c1b6ca98 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@@ -2665,12 -2694,48 +2666,10 @@@ void set_cpus_allowed_common(struct tas } static void -__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) +do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) { - scoped_guard (sched_change, p, DEQUEUE_SAVE) { - struct rq *rq = task_rq(p); - bool queued, running; - - /* - * This here violates the locking rules for affinity, since we're only - * supposed to change these variables while holding both rq->lock and - * p->pi_lock. - * - * HOWEVER, it magically works, because ttwu() is the only code that - * accesses these variables under p->pi_lock and only does so after - * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() - * before finish_task(). - * - * XXX do further audits, this smells like something putrid. - */ - if (ctx->flags & SCA_MIGRATE_DISABLE) - WARN_ON_ONCE(!p->on_cpu); - else - lockdep_assert_held(&p->pi_lock); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - - if (queued) { - /* - * Because __kthread_bind() calls this on blocked tasks without - * holding rq->lock. - */ - lockdep_assert_rq_held(rq); - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - } - if (running) - put_prev_task(rq, p); - - p->sched_class->set_cpus_allowed(p, ctx); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); ++ scoped_guard (sched_change, p, DEQUEUE_SAVE) + p->sched_class->set_cpus_allowed(p, ctx); - mm_set_cpus_allowed(p->mm, ctx->new_mask); - } } /* @@@ -10589,251 -10607,289 +10492,318 @@@ static inline void mm_cid_transfer_to_c } } - static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) + static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm) { - struct rq *rq = cpu_rq(cpu); - struct mm_cid *pcpu_cid; - struct task_struct *curr; - u64 rq_clock; + /* Remote access to mm::mm_cid::pcpu requires rq_lock */ + guard(task_rq_lock)(t); + /* If the task is not active it is not in the users count */ + if (!t->mm_cid.active) + return false; + if (cid_on_task(t->mm_cid.cid)) { + /* If running on the CPU, transfer the CID, otherwise drop it */ + if (task_rq(t)->curr == t) + mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t))); + else + mm_unset_cid_on_task(t); + } + return true; + } - /* - * rq->clock load is racy on 32-bit but one spurious clear once in a - * while is irrelevant. - */ - rq_clock = READ_ONCE(rq->clock); - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); + static void mm_cid_fixup_tasks_to_cpus(void) + { + struct mm_struct *mm = current->mm; + struct task_struct *p, *t; + unsigned int users; /* - * In order to take care of infrequently scheduled tasks, bump the time - * snapshot associated with this cid if an active task using the mm is - * observed on this rq. + * This can obviously race with a concurrent affinity change, which + * increases the number of allowed CPUs for this mm, but that does + * not affect the mode and only changes the CID constraints. A + * possible switch back to per task mode happens either in the + * deferred handler function or in the next fork()/exit(). + * + * The caller has already transferred. The newly incoming task is + * already accounted for, but not yet visible. */ - scoped_guard (rcu) { - curr = rcu_dereference(rq->curr); - if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { - WRITE_ONCE(pcpu_cid->time, rq_clock); - return; - } + users = mm->mm_cid.users - 2; + if (!users) + return; + + guard(rcu)(); + for_other_threads(current, t) { + if (mm_cid_fixup_task_to_cpu(t, mm)) + users--; } - if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) + if (!users) return; - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); + + /* Happens only for VM_CLONE processes. */ + for_each_process_thread(p, t) { + if (t == current || t->mm != mm) + continue; + if (mm_cid_fixup_task_to_cpu(t, mm)) { + if (--users == 0) + return; + } + } } - static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu, - int weight) + static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm) { - struct mm_cid *pcpu_cid; - int cid; - - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); - cid = READ_ONCE(pcpu_cid->cid); - if (!mm_cid_is_valid(cid) || cid < weight) - return; - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); + t->mm_cid.active = 1; + mm->mm_cid.users++; + return mm_update_max_cids(mm); } - static void task_mm_cid_work(struct callback_head *work) + void sched_mm_cid_fork(struct task_struct *t) { - unsigned long now = jiffies, old_scan, next_scan; - struct task_struct *t = current; - struct cpumask *cidmask; - struct mm_struct *mm; - int weight, cpu; + struct mm_struct *mm = t->mm; + bool percpu; - WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); + WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET); - work->next = work; /* Prevent double-add */ - if (t->flags & PF_EXITING) - return; - mm = t->mm; - if (!mm) - return; - old_scan = READ_ONCE(mm->mm_cid_next_scan); - next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY); - if (!old_scan) { - unsigned long res; - - res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan); - if (res != old_scan) - old_scan = res; + guard(mutex)(&mm->mm_cid.mutex); + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu); + + /* First user ? */ + if (!mm->mm_cid.users) { + sched_mm_cid_add_user(t, mm); + t->mm_cid.cid = mm_get_cid(mm); + /* Required for execve() */ + pcp->cid = t->mm_cid.cid; + return; + } + + if (!sched_mm_cid_add_user(t, mm)) { + if (!mm->mm_cid.percpu) + t->mm_cid.cid = mm_get_cid(mm); + return; + } + + /* Handle the mode change and transfer current's CID */ + percpu = !!mm->mm_cid.percpu; + if (!percpu) + mm_cid_transit_to_task(current, pcp); else - old_scan = next_scan; + mm_cid_transfer_to_cpu(current, pcp); } - if (time_before(now, old_scan)) - return; - if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan)) - return; - cidmask = mm_cidmask(mm); - /* Clear cids that were not recently used. */ - for_each_possible_cpu(cpu) - sched_mm_cid_remote_clear_old(mm, cpu); - weight = cpumask_weight(cidmask); - /* - * Clear cids that are greater or equal to the cidmask weight to - * recompact it. - */ - for_each_possible_cpu(cpu) - sched_mm_cid_remote_clear_weight(mm, cpu, weight); - } - - void init_sched_mm_cid(struct task_struct *t) - { - struct mm_struct *mm = t->mm; - int mm_users = 0; - if (mm) { - mm_users = atomic_read(&mm->mm_users); - if (mm_users == 1) - mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY); + if (percpu) { + mm_cid_fixup_tasks_to_cpus(); + } else { + mm_cid_fixup_cpus_to_tasks(mm); + t->mm_cid.cid = mm_get_cid(mm); } - t->cid_work.next = &t->cid_work; /* Protect against double add */ - init_task_work(&t->cid_work, task_mm_cid_work); } - void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) + static bool sched_mm_cid_remove_user(struct task_struct *t) { - struct callback_head *work = &curr->cid_work; - unsigned long now = jiffies; - - if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || - work->next != work) - return; - if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) - return; - - /* No page allocation under rq lock */ - task_work_add(curr, work, TWA_RESUME); + t->mm_cid.active = 0; + scoped_guard(preempt) { + /* Clear the transition bit */ + t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid); + mm_unset_cid_on_task(t); + } + t->mm->mm_cid.users--; + return mm_update_max_cids(t->mm); } - void sched_mm_cid_exit_signals(struct task_struct *t) + static bool __sched_mm_cid_exit(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq *rq; - if (!mm) - return; - - preempt_disable(); - rq = this_rq(); - guard(rq_lock_irqsave)(rq); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 0); + if (!sched_mm_cid_remove_user(t)) + return false; + /* + * Contrary to fork() this only deals with a switch back to per + * task mode either because the above decreased users or an + * affinity change increased the number of allowed CPUs and the + * deferred fixup did not run yet. + */ + if (WARN_ON_ONCE(mm->mm_cid.percpu)) + return false; /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). + * A failed fork(2) cleanup never gets here, so @current must have + * the same MM as @t. That's true for exit() and the failed + * pthread_create() cleanup case. */ - smp_mb(); - mm_cid_put(mm); - t->last_mm_cid = t->mm_cid = -1; + if (WARN_ON_ONCE(current->mm != mm)) + return false; + return true; } - void sched_mm_cid_before_execve(struct task_struct *t) + /* + * When a task exits, the MM CID held by the task is not longer required as + * the task cannot return to user space. + */ + void sched_mm_cid_exit(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq *rq; - if (!mm) + if (!mm || !t->mm_cid.active) return; + /* + * Ensure that only one instance is doing MM CID operations within + * a MM. The common case is uncontended. The rare fixup case adds + * some overhead. + */ + scoped_guard(mutex, &mm->mm_cid.mutex) { + /* mm_cid::mutex is sufficient to protect mm_cid::users */ + if (likely(mm->mm_cid.users > 1)) { + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + if (!__sched_mm_cid_exit(t)) + return; + /* Mode change required. Transfer currents CID */ + mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu)); + } + mm_cid_fixup_cpus_to_tasks(mm); + return; + } + /* Last user */ + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + /* Required across execve() */ + if (t == current) + mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu)); + /* Ignore mode change. There is nothing to do. */ + sched_mm_cid_remove_user(t); + } + } - preempt_disable(); - rq = this_rq(); - guard(rq_lock_irqsave)(rq); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 0); /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). + * As this is the last user (execve(), process exit or failed + * fork(2)) there is no concurrency anymore. + * + * Synchronize eventually pending work to ensure that there are no + * dangling references left. @t->mm_cid.users is zero so nothing + * can queue this work anymore. */ - smp_mb(); - mm_cid_put(mm); - t->last_mm_cid = t->mm_cid = -1; + irq_work_sync(&mm->mm_cid.irq_work); + cancel_work_sync(&mm->mm_cid.work); + } + + /* Deactivate MM CID allocation across execve() */ + void sched_mm_cid_before_execve(struct task_struct *t) + { + sched_mm_cid_exit(t); } + /* Reactivate MM CID after successful execve() */ void sched_mm_cid_after_execve(struct task_struct *t) { - struct mm_struct *mm = t->mm; - struct rq *rq; + sched_mm_cid_fork(t); + } + + static void mm_cid_work_fn(struct work_struct *work) + { + struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work); - if (!mm) + guard(mutex)(&mm->mm_cid.mutex); + /* Did the last user task exit already? */ + if (!mm->mm_cid.users) return; - preempt_disable(); - rq = this_rq(); - scoped_guard (rq_lock_irqsave, rq) { - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 1); - /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). - */ - smp_mb(); - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { + /* Have fork() or exit() handled it already? */ + if (!mm->mm_cid.update_deferred) + return; + /* This clears mm_cid::update_deferred */ + if (!mm_update_max_cids(mm)) + return; + /* Affinity changes can only switch back to task mode */ + if (WARN_ON_ONCE(mm->mm_cid.percpu)) + return; } + mm_cid_fixup_cpus_to_tasks(mm); } - void sched_mm_cid_fork(struct task_struct *t) + static void mm_cid_irq_work(struct irq_work *work) + { + struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work); + + /* + * Needs to be unconditional because mm_cid::lock cannot be held + * when scheduling work as mm_update_cpus_allowed() nests inside + * rq::lock and schedule_work() might end up in wakeup... + */ + schedule_work(&mm->mm_cid.work); + } + + void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { - WARN_ON_ONCE(!t->mm || t->mm_cid != -1); - t->mm_cid_active = 1; + mm->mm_cid.max_cids = 0; + mm->mm_cid.percpu = 0; + mm->mm_cid.transit = 0; + mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; + mm->mm_cid.users = 0; + mm->mm_cid.pcpu_thrs = 0; + mm->mm_cid.update_deferred = 0; + raw_spin_lock_init(&mm->mm_cid.lock); + mutex_init(&mm->mm_cid.mutex); + mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work); + INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn); + cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); + bitmap_zero(mm_cidmask(mm), num_possible_cpus()); } - #endif /* CONFIG_SCHED_MM_CID */ + #else /* CONFIG_SCHED_MM_CID */ + static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { } + #endif /* !CONFIG_SCHED_MM_CID */ -#ifdef CONFIG_SCHED_CLASS_EXT -void sched_deq_and_put_task(struct task_struct *p, int queue_flags, - struct sched_enq_and_set_ctx *ctx) +static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx); + +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags) { + struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx); struct rq *rq = task_rq(p); + /* + * Must exclusively use matched flags since this is both dequeue and + * enqueue. + */ + WARN_ON_ONCE(flags & 0xFFFF0000); + lockdep_assert_rq_held(rq); - *ctx = (struct sched_enq_and_set_ctx){ + if (!(flags & DEQUEUE_NOCLOCK)) { + update_rq_clock(rq); + flags |= DEQUEUE_NOCLOCK; + } + + if (flags & DEQUEUE_CLASS) { + if (p->sched_class->switching_from) + p->sched_class->switching_from(rq, p); + } + + *ctx = (struct sched_change_ctx){ .p = p, - .queue_flags = queue_flags, + .flags = flags, .queued = task_on_rq_queued(p), - .running = task_current(rq, p), + .running = task_current_donor(rq, p), }; - update_rq_clock(rq); + if (!(flags & DEQUEUE_CLASS)) { + if (p->sched_class->get_prio) + ctx->prio = p->sched_class->get_prio(rq, p); + else + ctx->prio = p->prio; + } + if (ctx->queued) - dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); + dequeue_task(rq, p, flags); if (ctx->running) put_prev_task(rq, p); + + if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from) + p->sched_class->switched_from(rq, p); + + return ctx; } -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) +void sched_change_end(struct sched_change_ctx *ctx) { - struct rq *rq = task_rq(ctx->p); + struct task_struct *p = ctx->p; + struct rq *rq = task_rq(p); lockdep_assert_rq_held(rq);