sched: Reorder some fields in struct rq

author Blake Jones <blakejones@google.com>
Tue, 2 Dec 2025 02:37:43 +0000 (18:37 -0800)
committer Peter Zijlstra <peterz@infradead.org>
Thu, 8 Jan 2026 11:43:56 +0000 (12:43 +0100)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3ceaa9dc9a9e8a53aaf1ac4fa3f0fb6d55a99518..58c9d244f12b07d00c8b769e584ebeee3d2cffc8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1115,26 +1115,50 @@ DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
   * acquire operations must be ordered by ascending &runqueue.
   */
  struct rq {
-       /* runqueue lock: */
-       raw_spinlock_t          __lock;
-
+       /*
+        * The following members are loaded together, without holding the
+        * rq->lock, in an extremely hot loop in update_sg_lb_stats()
+        * (called from pick_next_task()). To reduce cache pollution from
+        * this operation, they are placed together on this dedicated cache
+        * line. Even though some of them are frequently modified, they are
+        * loaded much more frequently than they are stored.
+        */
         unsigned int            nr_running;
  #ifdef CONFIG_NUMA_BALANCING
         unsigned int            nr_numa_running;
         unsigned int            nr_preferred_running;
-       unsigned int            numa_migrate_on;
  #endif
+       unsigned int            ttwu_pending;
+       unsigned long           cpu_capacity;
+#ifdef CONFIG_SCHED_PROXY_EXEC
+       struct task_struct __rcu        *donor;  /* Scheduling context */
+       struct task_struct __rcu        *curr;   /* Execution context */
+#else
+       union {
+               struct task_struct __rcu *donor; /* Scheduler context */
+               struct task_struct __rcu *curr;  /* Execution context */
+       };
+#endif
+       struct task_struct      *idle;
+       /* padding left here deliberately */
+
+       /*
+        * The next cacheline holds the (hot) runqueue lock, as well as
+        * some other less performance-critical fields.
+        */
+       u64                     nr_switches     ____cacheline_aligned;
+
+       /* runqueue lock: */
+       raw_spinlock_t          __lock;
+
  #ifdef CONFIG_NO_HZ_COMMON
-       unsigned long           last_blocked_load_update_tick;
-       unsigned int            has_blocked_load;
-       call_single_data_t      nohz_csd;
         unsigned int            nohz_tick_stopped;
         atomic_t                nohz_flags;
+       unsigned int            has_blocked_load;
+       unsigned long           last_blocked_load_update_tick;
+       call_single_data_t      nohz_csd;
  #endif /* CONFIG_NO_HZ_COMMON */
  
-       unsigned int            ttwu_pending;
-       u64                     nr_switches;
-
  #ifdef CONFIG_UCLAMP_TASK
         /* Utilization clamp values based on CPU's RUNNABLE tasks */
         struct uclamp_rq        uclamp[UCLAMP_CNT] ____cacheline_aligned;
@@ -1157,6 +1181,9 @@ struct rq {
         struct list_head        *tmp_alone_branch;
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
+#ifdef CONFIG_NUMA_BALANCING
+       unsigned int            numa_migrate_on;
+#endif
         /*
          * This is part of a global counter where only the total sum
          * over all CPUs matters. A task can increase this counter on
@@ -1165,37 +1192,29 @@ struct rq {
          */
         unsigned long           nr_uninterruptible;
  
-#ifdef CONFIG_SCHED_PROXY_EXEC
-       struct task_struct __rcu        *donor;  /* Scheduling context */
-       struct task_struct __rcu        *curr;   /* Execution context */
-#else
-       union {
-               struct task_struct __rcu *donor; /* Scheduler context */
-               struct task_struct __rcu *curr;  /* Execution context */
-       };
-#endif
         struct sched_dl_entity  *dl_server;
-       struct task_struct      *idle;
         struct task_struct      *stop;
         const struct sched_class *next_class;
         unsigned long           next_balance;
         struct mm_struct        *prev_mm;
  
-       unsigned int            clock_update_flags;
-       u64                     clock;
-       /* Ensure that all clocks are in the same cache line */
+       /*
+        * The following fields of clock data are frequently referenced
+        * and updated together, and should go on their own cache line.
+        */
         u64                     clock_task ____cacheline_aligned;
         u64                     clock_pelt;
+       u64                     clock;
         unsigned long           lost_idle_time;
+       unsigned int            clock_update_flags;
         u64                     clock_pelt_idle;
         u64                     clock_idle;
+
  #ifndef CONFIG_64BIT
         u64                     clock_pelt_idle_copy;
         u64                     clock_idle_copy;
  #endif
  
-       atomic_t                nr_iowait;
-
         u64 last_seen_need_resched_ns;
         int ticks_without_resched;
  
@@ -1206,8 +1225,6 @@ struct rq {
         struct root_domain              *rd;
         struct sched_domain __rcu       *sd;
  
-       unsigned long           cpu_capacity;
-
         struct balance_callback *balance_callback;
  
         unsigned char           nohz_idle_balance;
@@ -1317,7 +1334,9 @@ struct rq {
         call_single_data_t      cfsb_csd;
         struct list_head        cfsb_csd_list;
  #endif
-};
+
+       atomic_t                nr_iowait;
+} __no_randomize_layout;
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
author	Blake Jones <blakejones@google.com>
	Tue, 2 Dec 2025 02:37:43 +0000 (18:37 -0800)
committer	Peter Zijlstra <peterz@infradead.org>
	Thu, 8 Jan 2026 11:43:56 +0000 (12:43 +0100)