Merge tag 'core-rseq-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Linus Torvalds <torvalds@linux-foundation.org>

Tue, 2 Dec 2025 16:48:53 +0000 (08:48 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 2 Dec 2025 16:48:53 +0000 (08:48 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 2 Dec 2025 16:48:53 +0000 (08:48 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 2 Dec 2025 16:48:53 +0000 (08:48 -0800)
diff --cc drivers/hv/mshv_root_main.c
Simple merge
diff --cc fs/exec.c
Simple merge
diff --cc include/linux/cleanup.h
Simple merge
diff --cc include/linux/irq-entry-common.h
Simple merge
diff --cc include/linux/mm.h
Simple merge
diff --cc include/linux/sched.h
Simple merge
diff --cc init/init_task.c
Simple merge
diff --cc kernel/exit.c
Simple merge
diff --cc kernel/fork.c

index f1857672426e0fa938186043b86fdddbe20d912b,8475958e029ba3be7f42e8bac0f05e50bc494e97..83e05d6f230701c19ef335bb7eac69143f425568
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -2453,9 -2451,10 +2451,10 @@@ bad_fork_cleanup_io
         if (p->io_context)
                 exit_io_context(p);
   bad_fork_cleanup_namespaces:
- -      exit_task_namespaces(p);
+ +      exit_nsproxy_namespaces(p);
   bad_fork_cleanup_mm:
         if (p->mm) {
+               sched_mm_cid_exit(p);
                 mm_clear_owner(p->mm, p);
                 mmput(p->mm);
         }
diff --cc kernel/sched/core.c

index 0c4ff93eeb78e80e6dc2481f49e670fa9bfdbe0b,62235f1dc04e37c547b98a6927096da760687f1c..fc358c1b6ca987e66917f99a82f5fc307baeef7e
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -2665,12 -2694,48 +2666,10 @@@ void set_cpus_allowed_common(struct tas
   }
   
   static void
- -__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
+ +do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
   {
-       scoped_guard (sched_change, p, DEQUEUE_SAVE) {
- -      struct rq *rq = task_rq(p);
- -      bool queued, running;
- -
- -      /*
- -       * This here violates the locking rules for affinity, since we're only
- -       * supposed to change these variables while holding both rq->lock and
- -       * p->pi_lock.
- -       *
- -       * HOWEVER, it magically works, because ttwu() is the only code that
- -       * accesses these variables under p->pi_lock and only does so after
- -       * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
- -       * before finish_task().
- -       *
- -       * XXX do further audits, this smells like something putrid.
- -       */
- -      if (ctx->flags & SCA_MIGRATE_DISABLE)
- -              WARN_ON_ONCE(!p->on_cpu);
- -      else
- -              lockdep_assert_held(&p->pi_lock);
- -
- -      queued = task_on_rq_queued(p);
- -      running = task_current_donor(rq, p);
- -
- -      if (queued) {
- -              /*
- -               * Because __kthread_bind() calls this on blocked tasks without
- -               * holding rq->lock.
- -               */
- -              lockdep_assert_rq_held(rq);
- -              dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
- -      }
- -      if (running)
- -              put_prev_task(rq, p);
- -
- -      p->sched_class->set_cpus_allowed(p, ctx);
- -
- -      if (queued)
- -              enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- -      if (running)
- -              set_next_task(rq, p);
++      scoped_guard (sched_change, p, DEQUEUE_SAVE)
+ +              p->sched_class->set_cpus_allowed(p, ctx);
-               mm_set_cpus_allowed(p->mm, ctx->new_mask);
-       }
   }
   
   /*
@@@ -10589,251 -10607,289 +10492,318 @@@ static inline void mm_cid_transfer_to_c
         }
   }
   
- static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
+ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
   {
-       struct rq *rq = cpu_rq(cpu);
-       struct mm_cid *pcpu_cid;
-       struct task_struct *curr;
-       u64 rq_clock;
+       /* Remote access to mm::mm_cid::pcpu requires rq_lock */
+       guard(task_rq_lock)(t);
+       /* If the task is not active it is not in the users count */
+       if (!t->mm_cid.active)
+               return false;
+       if (cid_on_task(t->mm_cid.cid)) {
+               /* If running on the CPU, transfer the CID, otherwise drop it */
+               if (task_rq(t)->curr == t)
+                       mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+               else
+                       mm_unset_cid_on_task(t);
+       }
+       return true;
+ }
   
-       /*
-        * rq->clock load is racy on 32-bit but one spurious clear once in a
-        * while is irrelevant.
-        */
-       rq_clock = READ_ONCE(rq->clock);
-       pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+ static void mm_cid_fixup_tasks_to_cpus(void)
+ {
+       struct mm_struct *mm = current->mm;
+       struct task_struct *p, *t;
+       unsigned int users;
   
         /*
-        * In order to take care of infrequently scheduled tasks, bump the time
-        * snapshot associated with this cid if an active task using the mm is
-        * observed on this rq.
+        * This can obviously race with a concurrent affinity change, which
+        * increases the number of allowed CPUs for this mm, but that does
+        * not affect the mode and only changes the CID constraints. A
+        * possible switch back to per task mode happens either in the
+        * deferred handler function or in the next fork()/exit().
+        *
+        * The caller has already transferred current's CID. The newly
+        * incoming task is already accounted for, but not yet visible.
          */
-       scoped_guard (rcu) {
-               curr = rcu_dereference(rq->curr);
-               if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
-                       WRITE_ONCE(pcpu_cid->time, rq_clock);
-                       return;
-               }
+       users = mm->mm_cid.users - 2;
+       if (!users)
+               return;
+ 
+       guard(rcu)();
+       for_other_threads(current, t) {
+               if (mm_cid_fixup_task_to_cpu(t, mm))
+                       users--;
         }
   
-       if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
+       if (!users)
                 return;
-       sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+ 
+       /* Happens only for CLONE_VM processes. */
+       for_each_process_thread(p, t) {
+               if (t == current || t->mm != mm)
+                       continue;
+               if (mm_cid_fixup_task_to_cpu(t, mm)) {
+                       if (--users == 0)
+                               return;
+               }
+       }
   }
   
- static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
-                                            int weight)
+ static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
   {
-       struct mm_cid *pcpu_cid;
-       int cid;
- 
-       pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
-       cid = READ_ONCE(pcpu_cid->cid);
-       if (!mm_cid_is_valid(cid) || cid < weight)
-               return;
-       sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+       t->mm_cid.active = 1;
+       mm->mm_cid.users++;
+       return mm_update_max_cids(mm);
   }
   
- static void task_mm_cid_work(struct callback_head *work)
+ void sched_mm_cid_fork(struct task_struct *t)
   {
-       unsigned long now = jiffies, old_scan, next_scan;
-       struct task_struct *t = current;
-       struct cpumask *cidmask;
-       struct mm_struct *mm;
-       int weight, cpu;
+       struct mm_struct *mm = t->mm;
+       bool percpu;
   
-       WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
+       WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
   
-       work->next = work;      /* Prevent double-add */
-       if (t->flags & PF_EXITING)
-               return;
-       mm = t->mm;
-       if (!mm)
-               return;
-       old_scan = READ_ONCE(mm->mm_cid_next_scan);
-       next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
-       if (!old_scan) {
-               unsigned long res;
- 
-               res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
-               if (res != old_scan)
-                       old_scan = res;
+       guard(mutex)(&mm->mm_cid.mutex);
+       scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+               struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu);
+ 
+               /* First user ? */
+               if (!mm->mm_cid.users) {
+                       sched_mm_cid_add_user(t, mm);
+                       t->mm_cid.cid = mm_get_cid(mm);
+                       /* Required for execve() */
+                       pcp->cid = t->mm_cid.cid;
+                       return;
+               }
+ 
+               if (!sched_mm_cid_add_user(t, mm)) {
+                       if (!mm->mm_cid.percpu)
+                               t->mm_cid.cid = mm_get_cid(mm);
+                       return;
+               }
+ 
+               /* Handle the mode change and transfer current's CID */
+               percpu = !!mm->mm_cid.percpu;
+               if (!percpu)
+                       mm_cid_transit_to_task(current, pcp);
                 else
-                       old_scan = next_scan;
+                       mm_cid_transfer_to_cpu(current, pcp);
         }
-       if (time_before(now, old_scan))
-               return;
-       if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
-               return;
-       cidmask = mm_cidmask(mm);
-       /* Clear cids that were not recently used. */
-       for_each_possible_cpu(cpu)
-               sched_mm_cid_remote_clear_old(mm, cpu);
-       weight = cpumask_weight(cidmask);
-       /*
-        * Clear cids that are greater or equal to the cidmask weight to
-        * recompact it.
-        */
-       for_each_possible_cpu(cpu)
-               sched_mm_cid_remote_clear_weight(mm, cpu, weight);
- }
- 
- void init_sched_mm_cid(struct task_struct *t)
- {
-       struct mm_struct *mm = t->mm;
-       int mm_users = 0;
   
-       if (mm) {
-               mm_users = atomic_read(&mm->mm_users);
-               if (mm_users == 1)
-                       mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
+       if (percpu) {
+               mm_cid_fixup_tasks_to_cpus();
+       } else {
+               mm_cid_fixup_cpus_to_tasks(mm);
+               t->mm_cid.cid = mm_get_cid(mm);
         }
-       t->cid_work.next = &t->cid_work;        /* Protect against double add */
-       init_task_work(&t->cid_work, task_mm_cid_work);
   }
   
- void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
+ static bool sched_mm_cid_remove_user(struct task_struct *t)
   {
-       struct callback_head *work = &curr->cid_work;
-       unsigned long now = jiffies;
- 
-       if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
-           work->next != work)
-               return;
-       if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
-               return;
- 
-       /* No page allocation under rq lock */
-       task_work_add(curr, work, TWA_RESUME);
+       t->mm_cid.active = 0;
+       scoped_guard(preempt) {
+               /* Clear the transition bit */
+               t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
+               mm_unset_cid_on_task(t);
+       }
+       t->mm->mm_cid.users--;
+       return mm_update_max_cids(t->mm);
   }
   
- void sched_mm_cid_exit_signals(struct task_struct *t)
+ static bool __sched_mm_cid_exit(struct task_struct *t)
   {
         struct mm_struct *mm = t->mm;
-       struct rq *rq;
   
-       if (!mm)
-               return;
- 
-       preempt_disable();
-       rq = this_rq();
-       guard(rq_lock_irqsave)(rq);
-       preempt_enable_no_resched();    /* holding spinlock */
-       WRITE_ONCE(t->mm_cid_active, 0);
+       if (!sched_mm_cid_remove_user(t))
+               return false;
+       /*
+        * Contrary to fork() this only deals with a switch back to per
+        * task mode either because the above decreased users or an
+        * affinity change increased the number of allowed CPUs and the
+        * deferred fixup did not run yet.
+        */
+       if (WARN_ON_ONCE(mm->mm_cid.percpu))
+               return false;
         /*
-        * Store t->mm_cid_active before loading per-mm/cpu cid.
-        * Matches barrier in sched_mm_cid_remote_clear_old().
+        * A failed fork(2) cleanup never gets here, so @current must have
+        * the same MM as @t. That's true for exit() and the failed
+        * pthread_create() cleanup case.
          */
-       smp_mb();
-       mm_cid_put(mm);
-       t->last_mm_cid = t->mm_cid = -1;
+       if (WARN_ON_ONCE(current->mm != mm))
+               return false;
+       return true;
   }
   
- void sched_mm_cid_before_execve(struct task_struct *t)
+ /*
+  * When a task exits, the MM CID held by the task is no longer required as
+  * the task cannot return to user space.
+  */
+ void sched_mm_cid_exit(struct task_struct *t)
   {
         struct mm_struct *mm = t->mm;
-       struct rq *rq;
   
-       if (!mm)
+       if (!mm || !t->mm_cid.active)
                 return;
+       /*
+        * Ensure that only one instance is doing MM CID operations within
+        * a MM. The common case is uncontended. The rare fixup case adds
+        * some overhead.
+        */
+       scoped_guard(mutex, &mm->mm_cid.mutex) {
+               /* mm_cid::mutex is sufficient to protect mm_cid::users */
+               if (likely(mm->mm_cid.users > 1)) {
+                       scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+                               if (!__sched_mm_cid_exit(t))
+                                       return;
+                               /* Mode change required. Transfer current's CID */
+                               mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
+                       }
+                       mm_cid_fixup_cpus_to_tasks(mm);
+                       return;
+               }
+               /* Last user */
+               scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+                       /* Required across execve() */
+                       if (t == current)
+                               mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu));
+                       /* Ignore mode change. There is nothing to do. */
+                       sched_mm_cid_remove_user(t);
+               }
+       }
   
-       preempt_disable();
-       rq = this_rq();
-       guard(rq_lock_irqsave)(rq);
-       preempt_enable_no_resched();    /* holding spinlock */
-       WRITE_ONCE(t->mm_cid_active, 0);
         /*
-        * Store t->mm_cid_active before loading per-mm/cpu cid.
-        * Matches barrier in sched_mm_cid_remote_clear_old().
+        * As this is the last user (execve(), process exit or failed
+        * fork(2)) there is no concurrency anymore.
+        *
+        * Synchronize possibly pending work to ensure that there are no
+        * dangling references left. @mm->mm_cid.users is zero so nothing
+        * can queue this work anymore.
          */
-       smp_mb();
-       mm_cid_put(mm);
-       t->last_mm_cid = t->mm_cid = -1;
+       irq_work_sync(&mm->mm_cid.irq_work);
+       cancel_work_sync(&mm->mm_cid.work);
+ }
+ 
+ /* Deactivate MM CID allocation across execve() */
+ void sched_mm_cid_before_execve(struct task_struct *t)
+ {
+       sched_mm_cid_exit(t);
   }
   
+ /* Reactivate MM CID after successful execve() */
   void sched_mm_cid_after_execve(struct task_struct *t)
   {
-       struct mm_struct *mm = t->mm;
-       struct rq *rq;
+       sched_mm_cid_fork(t);
+ }
+ 
+ static void mm_cid_work_fn(struct work_struct *work)
+ {
+       struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
   
-       if (!mm)
+       guard(mutex)(&mm->mm_cid.mutex);
+       /* Did the last user task exit already? */
+       if (!mm->mm_cid.users)
                 return;
   
-       preempt_disable();
-       rq = this_rq();
-       scoped_guard (rq_lock_irqsave, rq) {
-               preempt_enable_no_resched();    /* holding spinlock */
-               WRITE_ONCE(t->mm_cid_active, 1);
-               /*
-                * Store t->mm_cid_active before loading per-mm/cpu cid.
-                * Matches barrier in sched_mm_cid_remote_clear_old().
-                */
-               smp_mb();
-               t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm);
+       scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+               /* Have fork() or exit() handled it already? */
+               if (!mm->mm_cid.update_deferred)
+                       return;
+               /* This clears mm_cid::update_deferred */
+               if (!mm_update_max_cids(mm))
+                       return;
+               /* Affinity changes can only switch back to task mode */
+               if (WARN_ON_ONCE(mm->mm_cid.percpu))
+                       return;
         }
+       mm_cid_fixup_cpus_to_tasks(mm);
   }
   
- void sched_mm_cid_fork(struct task_struct *t)
+ static void mm_cid_irq_work(struct irq_work *work)
+ {
+       struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work);
+ 
+       /*
+        * Needs to be unconditional because mm_cid::lock cannot be held
+        * when scheduling work as mm_update_cpus_allowed() nests inside
+        * rq::lock and schedule_work() might end up in wakeup...
+        */
+       schedule_work(&mm->mm_cid.work);
+ }
+ 
+ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
   {
-       WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
-       t->mm_cid_active = 1;
+       mm->mm_cid.max_cids = 0;
+       mm->mm_cid.percpu = 0;
+       mm->mm_cid.transit = 0;
+       mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
+       mm->mm_cid.users = 0;
+       mm->mm_cid.pcpu_thrs = 0;
+       mm->mm_cid.update_deferred = 0;
+       raw_spin_lock_init(&mm->mm_cid.lock);
+       mutex_init(&mm->mm_cid.mutex);
+       mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
+       INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
+       cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
+       bitmap_zero(mm_cidmask(mm), num_possible_cpus());
   }
- #endif /* CONFIG_SCHED_MM_CID */
+ #else /* CONFIG_SCHED_MM_CID */
+ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
+ #endif /* !CONFIG_SCHED_MM_CID */
   
- -#ifdef CONFIG_SCHED_CLASS_EXT
- -void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
- -                          struct sched_enq_and_set_ctx *ctx)
+ +static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
+ +
+ +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags)
   {
+ +      struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
         struct rq *rq = task_rq(p);
   
+ +      /*
+ +       * Must exclusively use matched flags since this is both dequeue and
+ +       * enqueue.
+ +       */
+ +      WARN_ON_ONCE(flags & 0xFFFF0000);
+ +
         lockdep_assert_rq_held(rq);
   
- -      *ctx = (struct sched_enq_and_set_ctx){
+ +      if (!(flags & DEQUEUE_NOCLOCK)) {
+ +              update_rq_clock(rq);
+ +              flags |= DEQUEUE_NOCLOCK;
+ +      }
+ +
+ +      if (flags & DEQUEUE_CLASS) {
+ +              if (p->sched_class->switching_from)
+ +                      p->sched_class->switching_from(rq, p);
+ +      }
+ +
+ +      *ctx = (struct sched_change_ctx){
                 .p = p,
- -              .queue_flags = queue_flags,
+ +              .flags = flags,
                 .queued = task_on_rq_queued(p),
- -              .running = task_current(rq, p),
+ +              .running = task_current_donor(rq, p),
         };
   
- -      update_rq_clock(rq);
+ +      if (!(flags & DEQUEUE_CLASS)) {
+ +              if (p->sched_class->get_prio)
+ +                      ctx->prio = p->sched_class->get_prio(rq, p);
+ +              else
+ +                      ctx->prio = p->prio;
+ +      }
+ +
         if (ctx->queued)
- -              dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
+ +              dequeue_task(rq, p, flags);
         if (ctx->running)
                 put_prev_task(rq, p);
+ +
+ +      if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from)
+ +              p->sched_class->switched_from(rq, p);
+ +
+ +      return ctx;
   }
   
- -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
+ +void sched_change_end(struct sched_change_ctx *ctx)
   {
- -      struct rq *rq = task_rq(ctx->p);
+ +      struct task_struct *p = ctx->p;
+ +      struct rq *rq = task_rq(p);
   
         lockdep_assert_rq_held(rq);
   
diff --cc kernel/sched/sched.h
Simple merge
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 2 Dec 2025 16:48:53 +0000 (08:48 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 2 Dec 2025 16:48:53 +0000 (08:48 -0800)
		1	2
drivers/hv/mshv_root_main.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/exec.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/cleanup.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/irq-entry-common.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/mm.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
init/init_task.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/exit.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history