Merge branch 'linus' into sched/core, to pick up fixes

author Ingo Molnar <mingo@kernel.org>

Fri, 25 Aug 2017 09:07:13 +0000 (11:07 +0200)

committer Ingo Molnar <mingo@kernel.org>

Fri, 25 Aug 2017 09:07:13 +0000 (11:07 +0200)
author Ingo Molnar <mingo@kernel.org>
Fri, 25 Aug 2017 09:07:13 +0000 (11:07 +0200)
committer Ingo Molnar <mingo@kernel.org>
Fri, 25 Aug 2017 09:07:13 +0000 (11:07 +0200)
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h

index 6358a85e2270b83a8872ef00724feb77f41c0e58..c1d2a9892352742a1512306138da023d77c6050b 100644 (file)
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -75,12 +75,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
  
  extern void setup_node_to_cpumask_map(void);
  
-/*
- * Returns the number of the node containing Node 'node'. This
- * architecture is flat, so it is a pretty simple function!
- */
-#define parent_node(node) (node)
-
  #define pcibus_to_node(bus) __pcibus_to_node(bus)
  
  extern int __node_distance(int, int);
diff --git a/fs/proc/base.c b/fs/proc/base.c

index 719c2e943ea1028f22cd2e400b7b6f9289ca6791..98fd8f6df85122beda8066b734e23aabe3e7b306 100644 (file)
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1408,12 +1408,13 @@ static const struct file_operations proc_fail_nth_operations = {
  static int sched_show(struct seq_file *m, void *v)
  {
         struct inode *inode = m->private;
+       struct pid_namespace *ns = inode->i_sb->s_fs_info;
         struct task_struct *p;
  
         p = get_proc_task(inode);
         if (!p)
                 return -ESRCH;
-       proc_sched_show_task(p, m);
+       proc_sched_show_task(p, ns, m);
  
         put_task_struct(p);
  
diff --git a/include/linux/sched.h b/include/linux/sched.h

index c05ac5f5aa034db128e9abcdd02e176f21904b8b..2a65eebadf63301ca03e09a5b883778b401484f8 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1232,6 +1232,19 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
         return task_pgrp_nr_ns(tsk, &init_pid_ns);
  }
  
+static inline char task_state_to_char(struct task_struct *task)
+{
+       const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+       unsigned long state = task->state;
+
+       state = state ? __ffs(state) + 1 : 0;
+
+       /* Make sure the string lines up properly with the number of task states: */
+       BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
+
+       return state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?';
+}
+
  /**
   * is_global_init - check if a task structure is init. Since init
   * is free to have sub-threads we need to check tgid.
diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h

index e0eaee54c5a4c45347fe198aff578277a1d3bd12..5d58d49e9f87634fa6c14db44a4eb62eb1552fc4 100644 (file)
--- a/include/linux/sched/debug.h
+++ b/include/linux/sched/debug.h
@@ -6,6 +6,7 @@
   */
  
  struct task_struct;
+struct pid_namespace;
  
  extern void dump_cpu_task(int cpu);
  
@@ -34,7 +35,8 @@ extern void sched_show_task(struct task_struct *p);
  
  #ifdef CONFIG_SCHED_DEBUG
  struct seq_file;
-extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
+extern void proc_sched_show_task(struct task_struct *p,
+                                struct pid_namespace *ns, struct seq_file *m);
  extern void proc_sched_set_task(struct task_struct *p);
  #endif
  
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h

index c97e5f09692768d0cf2a3770c45aacf2fdc8baf1..79a2a744648dd704b024cab1b37d85e4b2949ca0 100644 (file)
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -30,7 +30,6 @@ extern int lockdep_tasklist_lock_is_held(void);
  
  extern asmlinkage void schedule_tail(struct task_struct *prev);
  extern void init_idle(struct task_struct *idle, int cpu);
-extern void init_idle_bootup_task(struct task_struct *idle);
  
  extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
  extern void sched_dead(struct task_struct *p);
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h

index 7d065abc7a470d7b7efdadb634788c695d809af5..d7b6dab956ec4a130cd835d255b3a93bab764e54 100644 (file)
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -71,6 +71,14 @@ struct sched_domain_shared {
         atomic_t        ref;
         atomic_t        nr_busy_cpus;
         int             has_idle_cores;
+
+       /*
+        * Some variables from the most recent sd_lb_stats for this domain,
+        * used by wake_affine().
+        */
+       unsigned long   nr_running;
+       unsigned long   load;
+       unsigned long   capacity;
  };
  
  struct sched_domain {
diff --git a/init/main.c b/init/main.c

index 052481fbe3633f64b420c5bbd6deea3be261e6a9..881d62438b1a79643ab6db8e4098569978f3469a 100644 (file)
--- a/init/main.c
+++ b/init/main.c
@@ -430,7 +430,6 @@ static noinline void __ref rest_init(void)
          * The boot idle thread must execute schedule()
          * at least once to get things moving:
          */
-       init_idle_bootup_task(current);
         schedule_preempt_disabled();
         /* Call into cpu_idle with preempt disabled */
         cpu_startup_entry(CPUHP_ONLINE);
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c

index da39489d2d80e9b8f6781610e97c23ec0014f45b..de6d7f4dfcb52ce4f7386b052b5ded37a9f30054 100644 (file)
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -71,7 +71,6 @@ static inline struct autogroup *autogroup_create(void)
                 goto out_fail;
  
         tg = sched_create_group(&root_task_group);
-
         if (IS_ERR(tg))
                 goto out_free;
  
@@ -101,7 +100,7 @@ out_free:
  out_fail:
         if (printk_ratelimit()) {
                 printk(KERN_WARNING "autogroup_create: %s failure.\n",
-                       ag ? "sched_create_group()" : "kmalloc()");
+                       ag ? "sched_create_group()" : "kzalloc()");
         }
  
         return autogroup_kref_get(&autogroup_default);
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c

index 13fc5ae9bf2f6c96ee82ad140d89e3231ade1c88..2950f446820ddc6fa5e34f2e330cef8afffee1b2 100644 (file)
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -47,6 +47,13 @@ EXPORT_SYMBOL(complete);
   *
   * It may be assumed that this function implies a write memory barrier before
   * changing the task state if and only if any tasks are woken up.
+ *
+ * Since complete_all() sets the completion of @x permanently to done
+ * to allow multiple waiters to finish, a call to reinit_completion()
+ * must be used on @x if @x is to be used again. The code must make
+ * sure that all waiters have woken and finished before reinitializing
+ * @x. Also note that the function completion_done() can not be used
+ * to know if there are still waiters after complete_all() has been called.
   */
  void complete_all(struct completion *x)
  {
@@ -297,6 +304,7 @@ EXPORT_SYMBOL(try_wait_for_completion);
   *     Return: 0 if there are waiters (wait_for_completion() in progress)
   *              1 if there are no waiters.
   *
+ *     Note, this will always return true if complete_all() was called on @x.
   */
  bool completion_done(struct completion *x)
  {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 0869b20fba81f6f1a7f5f73bde65fadc483f0fa2..f9f9948e2470f970bef0fb1117bf960966dad2d1 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5103,24 +5103,17 @@ out_unlock:
         return retval;
  }
  
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
  void sched_show_task(struct task_struct *p)
  {
         unsigned long free = 0;
         int ppid;
-       unsigned long state = p->state;
-
-       /* Make sure the string lines up properly with the number of task states: */
-       BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
  
         if (!try_get_task_stack(p))
                 return;
-       if (state)
-               state = __ffs(state) + 1;
-       printk(KERN_INFO "%-15.15s %c", p->comm,
-               state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-       if (state == TASK_RUNNING)
+
+       printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+
+       if (p->state == TASK_RUNNING)
                 printk(KERN_CONT "  running task    ");
  #ifdef CONFIG_DEBUG_STACK_USAGE
         free = stack_not_used(p);
@@ -5177,11 +5170,6 @@ void show_state_filter(unsigned long state_filter)
                 debug_show_all_locks();
  }
  
-void init_idle_bootup_task(struct task_struct *idle)
-{
-       idle->sched_class = &idle_sched_class;
-}
-
  /**
   * init_idle - set up an idle thread for a given CPU
   * @idle: task in question
@@ -5438,7 +5426,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
                  */
                 next = pick_next_task(rq, &fake_task, rf);
                 BUG_ON(!next);
-               next->sched_class->put_prev_task(rq, next);
+               put_prev_task(rq, next);
  
                 /*
                  * Rules for changing task_struct::cpus_allowed are holding
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c

index fba235c7d02679a5881f71971f84d778a4222972..8d9562d890d3041421aaf9d0978563293b322c46 100644 (file)
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -119,29 +119,29 @@ static inline int cpudl_maximum(struct cpudl *cp)
   * @p: the task
   * @later_mask: a mask to fill in with the selected CPUs (or NULL)
   *
- * Returns: int - best CPU (heap maximum if suitable)
+ * Returns: int - 1 if suitable CPUs were found, 0 otherwise
   */
  int cpudl_find(struct cpudl *cp, struct task_struct *p,
                struct cpumask *later_mask)
  {
-       int best_cpu = -1;
         const struct sched_dl_entity *dl_se = &p->dl;
  
         if (later_mask &&
             cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
-               best_cpu = cpumask_any(later_mask);
-               goto out;
-       } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
-                       dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
-               best_cpu = cpudl_maximum(cp);
-               if (later_mask)
-                       cpumask_set_cpu(best_cpu, later_mask);
-       }
+               return 1;
+       } else {
+               int best_cpu = cpudl_maximum(cp);
+               WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
  
-out:
-       WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
+               if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
+                   dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
+                       if (later_mask)
+                               cpumask_set_cpu(best_cpu, later_mask);
  
-       return best_cpu;
+                       return 1;
+               }
+       }
+       return 0;
  }
  
  /*
@@ -246,7 +246,6 @@ int cpudl_init(struct cpudl *cp)
  {
         int i;
  
-       memset(cp, 0, sizeof(*cp));
         raw_spin_lock_init(&cp->lock);
         cp->size = 0;
  
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c

index 981fcd7dc394eb10dd26113c66815dd0aa8e6a46..2511aba36b89e5e6581d3b9f0c33257a3c09a394 100644 (file)
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -209,8 +209,6 @@ int cpupri_init(struct cpupri *cp)
  {
         int i;
  
-       memset(cp, 0, sizeof(*cp));
-
         for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
                 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
  
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c

index 755bd3f1a1a93a8f1daf713dd36578637ce515c5..d05bd9457a408c0f769719c38306eebde3dd6464 100644 (file)
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1594,7 +1594,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
          * let's hope p can move out.
          */
         if (rq->curr->nr_cpus_allowed == 1 ||
-           cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
+           !cpudl_find(&rq->rd->cpudl, rq->curr, NULL))
                 return;
  
         /*
@@ -1602,7 +1602,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
          * see if it is pushed or pulled somewhere else.
          */
         if (p->nr_cpus_allowed != 1 &&
-           cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
+           cpudl_find(&rq->rd->cpudl, p, NULL))
                 return;
  
         resched_curr(rq);
@@ -1655,7 +1655,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
         return rb_entry(left, struct sched_dl_entity, rb_node);
  }
  
-struct task_struct *
+static struct task_struct *
  pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  {
         struct sched_dl_entity *dl_se;
@@ -1798,7 +1798,7 @@ static int find_later_rq(struct task_struct *task)
         struct sched_domain *sd;
         struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
         int this_cpu = smp_processor_id();
-       int best_cpu, cpu = task_cpu(task);
+       int cpu = task_cpu(task);
  
         /* Make sure the mask is initialized first */
         if (unlikely(!later_mask))
@@ -1811,17 +1811,14 @@ static int find_later_rq(struct task_struct *task)
          * We have to consider system topology and task affinity
          * first, then we can look for a suitable cpu.
          */
-       best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
-                       task, later_mask);
-       if (best_cpu == -1)
+       if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
                 return -1;
  
         /*
-        * If we are here, some target has been found,
-        * the most suitable of which is cached in best_cpu.
-        * This is, among the runqueues where the current tasks
-        * have later deadlines than the task's one, the rq
-        * with the latest possible one.
+        * If we are here, some targets have been found, including
+        * the most suitable which is, among the runqueues where the
+        * current tasks have later deadlines than the task's one, the
+        * rq with the latest possible one.
          *
          * Now we check how well this matches with task's
          * affinity and system topology.
@@ -1841,6 +1838,7 @@ static int find_later_rq(struct task_struct *task)
         rcu_read_lock();
         for_each_domain(cpu, sd) {
                 if (sd->flags & SD_WAKE_AFFINE) {
+                       int best_cpu;
  
                         /*
                          * If possible, preempting this_cpu is
@@ -1852,12 +1850,15 @@ static int find_later_rq(struct task_struct *task)
                                 return this_cpu;
                         }
  
+                       best_cpu = cpumask_first_and(later_mask,
+                                                       sched_domain_span(sd));
                         /*
-                        * Last chance: if best_cpu is valid and is
-                        * in the mask, that becomes our choice.
+                        * Last chance: if a cpu being in both later_mask
+                        * and current sd span is valid, that becomes our
+                        * choice. Of course, the latest possible cpu is
+                        * already under consideration through later_mask.
                          */
-                       if (best_cpu < nr_cpu_ids &&
-                           cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
+                       if (best_cpu < nr_cpu_ids) {
                                 rcu_read_unlock();
                                 return best_cpu;
                         }
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c

index 4fa66de52bd6ad8d88ed2c95650a274206e3de1d..cfd84f79e0755f69de11aa560c58b82b1c42d283 100644 (file)
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -421,13 +421,15 @@ static char *task_group_path(struct task_group *tg)
  }
  #endif
  
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+
  static void
  print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
  {
         if (rq->curr == p)
-               SEQ_printf(m, "R");
+               SEQ_printf(m, ">R");
         else
-               SEQ_printf(m, " ");
+               SEQ_printf(m, " %c", task_state_to_char(p));
  
         SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
                 p->comm, task_pid_nr(p),
@@ -456,9 +458,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
  
         SEQ_printf(m,
         "\nrunnable tasks:\n"
-       "            task   PID         tree-key  switches  prio"
+       " S           task   PID         tree-key  switches  prio"
         "     wait-time             sum-exec        sum-sleep\n"
-       "------------------------------------------------------"
+       "-------------------------------------------------------"
         "----------------------------------------------------\n");
  
         rcu_read_lock();
@@ -872,11 +874,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
  #endif
  }
  
-void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
+                                                 struct seq_file *m)
  {
         unsigned long nr_switches;
  
-       SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
+       SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
                                                 get_nr_threads(p));
         SEQ_printf(m,
                 "---------------------------------------------------------"
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index c95880e216f6900ed06e92e93ddc355d20e59092..8d5868771cb307c8dbfe5f2b08e6e16b1018e18b 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -806,7 +806,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
                         /*
                          * For !fair tasks do:
                          *
-                       update_cfs_rq_load_avg(now, cfs_rq, false);
+                       update_cfs_rq_load_avg(now, cfs_rq);
                         attach_entity_load_avg(cfs_rq, se);
                         switched_from_fair(rq, p);
                          *
@@ -1071,6 +1071,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
  /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
  unsigned int sysctl_numa_balancing_scan_delay = 1000;
  
+struct numa_group {
+       atomic_t refcount;
+
+       spinlock_t lock; /* nr_tasks, tasks */
+       int nr_tasks;
+       pid_t gid;
+       int active_nodes;
+
+       struct rcu_head rcu;
+       unsigned long total_faults;
+       unsigned long max_faults_cpu;
+       /*
+        * Faults_cpu is used to decide whether memory should move
+        * towards the CPU. As a consequence, these stats are weighted
+        * more by CPU use than by memory faults.
+        */
+       unsigned long *faults_cpu;
+       unsigned long faults[0];
+};
+
+static inline unsigned long group_faults_priv(struct numa_group *ng);
+static inline unsigned long group_faults_shared(struct numa_group *ng);
+
  static unsigned int task_nr_scan_windows(struct task_struct *p)
  {
         unsigned long rss = 0;
@@ -1107,13 +1130,47 @@ static unsigned int task_scan_min(struct task_struct *p)
         return max_t(unsigned int, floor, scan);
  }
  
+static unsigned int task_scan_start(struct task_struct *p)
+{
+       unsigned long smin = task_scan_min(p);
+       unsigned long period = smin;
+
+       /* Scale the starting scan period with the amount of shared memory. */
+       if (p->numa_group) {
+               struct numa_group *ng = p->numa_group;
+               unsigned long shared = group_faults_shared(ng);
+               unsigned long private = group_faults_priv(ng);
+
+               period *= atomic_read(&ng->refcount);
+               period *= shared + 1;
+               period /= private + shared + 1;
+       }
+
+       return max(smin, period);
+}
+
  static unsigned int task_scan_max(struct task_struct *p)
  {
-       unsigned int smin = task_scan_min(p);
-       unsigned int smax;
+       unsigned long smin = task_scan_min(p);
+       unsigned long smax;
  
         /* Watch for min being lower than max due to floor calculations */
         smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+
+       /* Scale the maximum scan period with the amount of shared memory. */
+       if (p->numa_group) {
+               struct numa_group *ng = p->numa_group;
+               unsigned long shared = group_faults_shared(ng);
+               unsigned long private = group_faults_priv(ng);
+               unsigned long period = smax;
+
+               period *= atomic_read(&ng->refcount);
+               period *= shared + 1;
+               period /= private + shared + 1;
+
+               smax = max(smax, period);
+       }
+
         return max(smin, smax);
  }
  
@@ -1129,26 +1186,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
         rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
  }
  
-struct numa_group {
-       atomic_t refcount;
-
-       spinlock_t lock; /* nr_tasks, tasks */
-       int nr_tasks;
-       pid_t gid;
-       int active_nodes;
-
-       struct rcu_head rcu;
-       unsigned long total_faults;
-       unsigned long max_faults_cpu;
-       /*
-        * Faults_cpu is used to decide whether memory should move
-        * towards the CPU. As a consequence, these stats are weighted
-        * more by CPU use than by memory faults.
-        */
-       unsigned long *faults_cpu;
-       unsigned long faults[0];
-};
-
  /* Shared or private faults. */
  #define NR_NUMA_HINT_FAULT_TYPES 2
  
@@ -1198,6 +1235,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
                 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
  }
  
+static inline unsigned long group_faults_priv(struct numa_group *ng)
+{
+       unsigned long faults = 0;
+       int node;
+
+       for_each_online_node(node) {
+               faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
+       }
+
+       return faults;
+}
+
+static inline unsigned long group_faults_shared(struct numa_group *ng)
+{
+       unsigned long faults = 0;
+       int node;
+
+       for_each_online_node(node) {
+               faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
+       }
+
+       return faults;
+}
+
  /*
   * A node triggering more than 1/3 as many NUMA faults as the maximum is
   * considered part of a numa group's pseudo-interleaving set. Migrations
@@ -1378,7 +1439,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
                group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
  }
  
-static unsigned long weighted_cpuload(const int cpu);
+static unsigned long weighted_cpuload(struct rq *rq);
  static unsigned long source_load(int cpu, int type);
  static unsigned long target_load(int cpu, int type);
  static unsigned long capacity_of(int cpu);
@@ -1409,7 +1470,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
                 struct rq *rq = cpu_rq(cpu);
  
                 ns->nr_running += rq->nr_running;
-               ns->load += weighted_cpuload(cpu);
+               ns->load += weighted_cpuload(rq);
                 ns->compute_capacity += capacity_of(cpu);
  
                 cpus++;
@@ -1808,7 +1869,7 @@ static int task_numa_migrate(struct task_struct *p)
          * Reset the scan period if the task is being rescheduled on an
          * alternative node to recheck if the tasks is now properly placed.
          */
-       p->numa_scan_period = task_scan_min(p);
+       p->numa_scan_period = task_scan_start(p);
  
         if (env.best_task == NULL) {
                 ret = migrate_task_to(p, env.best_cpu);
@@ -1892,7 +1953,7 @@ static void update_task_scan_period(struct task_struct *p,
                         unsigned long shared, unsigned long private)
  {
         unsigned int period_slot;
-       int ratio;
+       int lr_ratio, ps_ratio;
         int diff;
  
         unsigned long remote = p->numa_faults_locality[0];
@@ -1922,25 +1983,36 @@ static void update_task_scan_period(struct task_struct *p,
          *       >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
          */
         period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
-       ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
-       if (ratio >= NUMA_PERIOD_THRESHOLD) {
-               int slot = ratio - NUMA_PERIOD_THRESHOLD;
+       lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+       ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
+
+       if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
+               /*
+                * Most memory accesses are local. There is no need to
+                * do fast NUMA scanning, since memory is already local.
+                */
+               int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
+               if (!slot)
+                       slot = 1;
+               diff = slot * period_slot;
+       } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
+               /*
+                * Most memory accesses are shared with other tasks.
+                * There is no point in continuing fast NUMA scanning,
+                * since other tasks may just move the memory elsewhere.
+                */
+               int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
                 if (!slot)
                         slot = 1;
                 diff = slot * period_slot;
         } else {
-               diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
-
                 /*
-                * Scale scan rate increases based on sharing. There is an
-                * inverse relationship between the degree of sharing and
-                * the adjustment made to the scanning period. Broadly
-                * speaking the intent is that there is little point
-                * scanning faster if shared accesses dominate as it may
-                * simply bounce migrations uselessly
+                * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
+                * yet they are not on the local NUMA node. Speed up
+                * NUMA scanning to get the memory moved over.
                  */
-               ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
-               diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+               int ratio = max(lr_ratio, ps_ratio);
+               diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
         }
  
         p->numa_scan_period = clamp(p->numa_scan_period + diff,
@@ -2448,7 +2520,7 @@ void task_numa_work(struct callback_head *work)
  
         if (p->numa_scan_period == 0) {
                 p->numa_scan_period_max = task_scan_max(p);
-               p->numa_scan_period = task_scan_min(p);
+               p->numa_scan_period = task_scan_start(p);
         }
  
         next_scan = now + msecs_to_jiffies(p->numa_scan_period);
@@ -2576,7 +2648,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
  
         if (now > curr->node_stamp + period) {
                 if (!curr->node_stamp)
-                       curr->numa_scan_period = task_scan_min(curr);
+                       curr->numa_scan_period = task_scan_start(curr);
                 curr->node_stamp += period;
  
                 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
@@ -2586,59 +2658,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
         }
  }
  
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- */
-static inline bool numa_wake_affine(struct sched_domain *sd,
-                                   struct task_struct *p, int this_cpu,
-                                   int prev_cpu, int sync)
-{
-       struct numa_stats prev_load, this_load;
-       s64 this_eff_load, prev_eff_load;
-
-       update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
-       update_numa_stats(&this_load, cpu_to_node(this_cpu));
-
-       /*
-        * If sync wakeup then subtract the (maximum possible)
-        * effect of the currently running task from the load
-        * of the current CPU:
-        */
-       if (sync) {
-               unsigned long current_load = task_h_load(current);
-
-               if (this_load.load > current_load)
-                       this_load.load -= current_load;
-               else
-                       this_load.load = 0;
-       }
-
-       /*
-        * In low-load situations, where this_cpu's node is idle due to the
-        * sync cause above having dropped this_load.load to 0, move the task.
-        * Moving to an idle socket will not create a bad imbalance.
-        *
-        * Otherwise check if the nodes are near enough in load to allow this
-        * task to be woken on this_cpu's node.
-        */
-       if (this_load.load > 0) {
-               unsigned long task_load = task_h_load(p);
-
-               this_eff_load = 100;
-               this_eff_load *= prev_load.compute_capacity;
-
-               prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-               prev_eff_load *= this_load.compute_capacity;
-
-               this_eff_load *= this_load.load + task_load;
-               prev_eff_load *= prev_load.load - task_load;
-
-               return this_eff_load <= prev_eff_load;
-       }
-
-       return true;
-}
  #else
  static void task_tick_numa(struct rq *rq, struct task_struct *curr)
  {
@@ -2652,14 +2671,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
  {
  }
  
-#ifdef CONFIG_SMP
-static inline bool numa_wake_affine(struct sched_domain *sd,
-                                   struct task_struct *p, int this_cpu,
-                                   int prev_cpu, int sync)
-{
-       return true;
-}
-#endif /* !SMP */
  #endif /* CONFIG_NUMA_BALANCING */
  
  static void
@@ -2790,6 +2801,29 @@ static inline void update_cfs_shares(struct sched_entity *se)
  }
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+       if (&this_rq()->cfs == cfs_rq) {
+               /*
+                * There are a few boundary cases this might miss but it should
+                * get called often enough that that should (hopefully) not be
+                * a real problem -- added to that it only calls on the local
+                * CPU, so if we enqueue remotely we'll miss an update, but
+                * the next tick/schedule should update.
+                *
+                * It will not get called when we go idle, because the idle
+                * thread is a different class (!fair), nor will the utilization
+                * number include things like RT tasks.
+                *
+                * As is, the util number is not freq-invariant (we'd have to
+                * implement arch_scale_freq_capacity() for that).
+                *
+                * See cpu_util().
+                */
+               cpufreq_update_util(rq_of(cfs_rq), 0);
+       }
+}
+
  #ifdef CONFIG_SMP
  /*
   * Approximate:
@@ -2967,6 +3001,18 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
  
         sa->last_update_time += delta << 10;
  
+       /*
+        * running is a subset of runnable (weight) so running can't be set if
+        * runnable is clear. But there are some corner cases where the current
+        * se has been already dequeued but cfs_rq->curr still points to it.
+        * This means that weight will be 0 but not running for a sched_entity
+        * but also for a cfs_rq if the latter becomes idle. As an example,
+        * this happens during idle_balance() which calls
+        * update_blocked_averages()
+        */
+       if (!weight)
+               running = 0;
+
         /*
          * Now we know we crossed measurement unit boundaries. The *_avg
          * accrues by two steps:
@@ -3276,29 +3322,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
  
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
-{
-       if (&this_rq()->cfs == cfs_rq) {
-               /*
-                * There are a few boundary cases this might miss but it should
-                * get called often enough that that should (hopefully) not be
-                * a real problem -- added to that it only calls on the local
-                * CPU, so if we enqueue remotely we'll miss an update, but
-                * the next tick/schedule should update.
-                *
-                * It will not get called when we go idle, because the idle
-                * thread is a different class (!fair), nor will the utilization
-                * number include things like RT tasks.
-                *
-                * As is, the util number is not freq-invariant (we'd have to
-                * implement arch_scale_freq_capacity() for that).
-                *
-                * See cpu_util().
-                */
-               cpufreq_update_util(rq_of(cfs_rq), 0);
-       }
-}
-
  /*
   * Unsigned subtract and clamp on underflow.
   *
@@ -3320,7 +3343,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
   * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
   * @now: current time, as per cfs_rq_clock_task()
   * @cfs_rq: cfs_rq to update
- * @update_freq: should we call cfs_rq_util_change() or will the call do so
   *
   * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
   * avg. The immediate corollary is that all (fair) tasks must be attached, see
@@ -3334,7 +3356,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
   * call update_tg_load_avg() when this function returns true.
   */
  static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
  {
         struct sched_avg *sa = &cfs_rq->avg;
         int decayed, removed_load = 0, removed_util = 0;
@@ -3362,7 +3384,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
         cfs_rq->load_last_update_time_copy = sa->last_update_time;
  #endif
  
-       if (update_freq && (decayed || removed_util))
+       if (decayed || removed_util)
                 cfs_rq_util_change(cfs_rq);
  
         return decayed || removed_load;
@@ -3390,7 +3412,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
         if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
                 __update_load_avg_se(now, cpu, cfs_rq, se);
  
-       decayed  = update_cfs_rq_load_avg(now, cfs_rq, true);
+       decayed  = update_cfs_rq_load_avg(now, cfs_rq);
         decayed |= propagate_entity_load_avg(se);
  
         if (decayed && (flags & UPDATE_TG))
@@ -3534,7 +3556,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
  #else /* CONFIG_SMP */
  
  static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
  {
         return 0;
  }
@@ -3544,7 +3566,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
  
  static inline void update_load_avg(struct sched_entity *se, int not_used1)
  {
-       cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
+       cfs_rq_util_change(cfs_rq_of(se));
  }
  
  static inline void
@@ -5125,9 +5147,9 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
  }
  
  /* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long weighted_cpuload(struct rq *rq)
  {
-       return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+       return cfs_rq_runnable_load_avg(&rq->cfs);
  }
  
  #ifdef CONFIG_NO_HZ_COMMON
@@ -5172,7 +5194,7 @@ static void cpu_load_update_idle(struct rq *this_rq)
         /*
          * bail if there's load or we're actually up-to-date.
          */
-       if (weighted_cpuload(cpu_of(this_rq)))
+       if (weighted_cpuload(this_rq))
                 return;
  
         cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -5193,7 +5215,7 @@ void cpu_load_update_nohz_start(void)
          * concurrently we'll exit nohz. And cpu_load write can race with
          * cpu_load_update_idle() but both updater would be writing the same.
          */
-       this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+       this_rq->cpu_load[0] = weighted_cpuload(this_rq);
  }
  
  /*
@@ -5209,7 +5231,7 @@ void cpu_load_update_nohz_stop(void)
         if (curr_jiffies == this_rq->last_load_update_tick)
                 return;
  
-       load = weighted_cpuload(cpu_of(this_rq));
+       load = weighted_cpuload(this_rq);
         rq_lock(this_rq, &rf);
         update_rq_clock(this_rq);
         cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -5235,7 +5257,7 @@ static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
   */
  void cpu_load_update_active(struct rq *this_rq)
  {
-       unsigned long load = weighted_cpuload(cpu_of(this_rq));
+       unsigned long load = weighted_cpuload(this_rq);
  
         if (tick_nohz_tick_stopped())
                 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -5253,7 +5275,7 @@ void cpu_load_update_active(struct rq *this_rq)
  static unsigned long source_load(int cpu, int type)
  {
         struct rq *rq = cpu_rq(cpu);
-       unsigned long total = weighted_cpuload(cpu);
+       unsigned long total = weighted_cpuload(rq);
  
         if (type == 0 || !sched_feat(LB_BIAS))
                 return total;
@@ -5268,7 +5290,7 @@ static unsigned long source_load(int cpu, int type)
  static unsigned long target_load(int cpu, int type)
  {
         struct rq *rq = cpu_rq(cpu);
-       unsigned long total = weighted_cpuload(cpu);
+       unsigned long total = weighted_cpuload(rq);
  
         if (type == 0 || !sched_feat(LB_BIAS))
                 return total;
@@ -5290,7 +5312,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
  {
         struct rq *rq = cpu_rq(cpu);
         unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
-       unsigned long load_avg = weighted_cpuload(cpu);
+       unsigned long load_avg = weighted_cpuload(rq);
  
         if (nr_running)
                 return load_avg / nr_running;
@@ -5345,20 +5367,115 @@ static int wake_wide(struct task_struct *p)
         return 1;
  }
  
+struct llc_stats {
+       unsigned long   nr_running;
+       unsigned long   load;
+       unsigned long   capacity;
+       int             has_capacity;
+};
+
+static bool get_llc_stats(struct llc_stats *stats, int cpu)
+{
+       struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+
+       if (!sds)
+               return false;
+
+       stats->nr_running       = READ_ONCE(sds->nr_running);
+       stats->load             = READ_ONCE(sds->load);
+       stats->capacity         = READ_ONCE(sds->capacity);
+       stats->has_capacity     = stats->nr_running < per_cpu(sd_llc_size, cpu);
+
+       return true;
+}
+
+/*
+ * Can a task be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ *
+ * Since we're running on 'stale' values, we might in fact create an imbalance
+ * but recomputing these values is expensive, as that'd mean iterating 2 cache
+ * domains worth of CPUs.
+ */
+static bool
+wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
+               int this_cpu, int prev_cpu, int sync)
+{
+       struct llc_stats prev_stats, this_stats;
+       s64 this_eff_load, prev_eff_load;
+       unsigned long task_load;
+
+       if (!get_llc_stats(&prev_stats, prev_cpu) ||
+           !get_llc_stats(&this_stats, this_cpu))
+               return false;
+
+       /*
+        * If sync wakeup then subtract the (maximum possible)
+        * effect of the currently running task from the load
+        * of the current LLC.
+        */
+       if (sync) {
+               unsigned long current_load = task_h_load(current);
+
+               /* in this case load hits 0 and this LLC is considered 'idle' */
+               if (current_load > this_stats.load)
+                       return true;
+
+               this_stats.load -= current_load;
+       }
+
+       /*
+        * The has_capacity stuff is not SMT aware, but by trying to balance
+        * the nr_running on both ends we try and fill the domain at equal
+        * rates, thereby first consuming cores before siblings.
+        */
+
+       /* if the old cache has capacity, stay there */
+       if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
+               return false;
+
+       /* if this cache has capacity, come here */
+       if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
+               return true;
+
+       /*
+        * Check to see if we can move the load without causing too much
+        * imbalance.
+        */
+       task_load = task_h_load(p);
+
+       this_eff_load = 100;
+       this_eff_load *= prev_stats.capacity;
+
+       prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+       prev_eff_load *= this_stats.capacity;
+
+       this_eff_load *= this_stats.load + task_load;
+       prev_eff_load *= prev_stats.load - task_load;
+
+       return this_eff_load <= prev_eff_load;
+}
+
  static int wake_affine(struct sched_domain *sd, struct task_struct *p,
                        int prev_cpu, int sync)
  {
         int this_cpu = smp_processor_id();
-       bool affine = false;
+       bool affine;
  
         /*
-        * Common case: CPUs are in the same socket, and select_idle_sibling()
-        * will do its thing regardless of what we return:
+        * Default to no affine wakeups; wake_affine() should not effect a task
+        * placement the load-balancer feels inclined to undo. The conservative
+        * option is therefore to not move tasks when they wake up.
          */
-       if (cpus_share_cache(prev_cpu, this_cpu))
-               affine = true;
-       else
-               affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
+       affine = false;
+
+       /*
+        * If the wakeup is across cache domains, try to evaluate if movement
+        * makes sense, otherwise rely on select_idle_sibling() to do
+        * placement inside the cache domain.
+        */
+       if (!cpus_share_cache(prev_cpu, this_cpu))
+               affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
  
         schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
         if (affine) {
@@ -5550,7 +5667,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
                                 shallowest_idle_cpu = i;
                         }
                 } else if (shallowest_idle_cpu == -1) {
-                       load = weighted_cpuload(i);
+                       load = weighted_cpuload(cpu_rq(i));
                         if (load < min_load || (load == min_load && i == this_cpu)) {
                                 min_load = load;
                                 least_loaded_cpu = i;
@@ -6187,10 +6304,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
         int new_tasks;
  
  again:
-#ifdef CONFIG_FAIR_GROUP_SCHED
         if (!cfs_rq->nr_running)
                 goto idle;
  
+#ifdef CONFIG_FAIR_GROUP_SCHED
         if (prev->sched_class != &fair_sched_class)
                 goto simple;
  
@@ -6220,11 +6337,17 @@ again:
                         /*
                          * This call to check_cfs_rq_runtime() will do the
                          * throttle and dequeue its entity in the parent(s).
-                        * Therefore the 'simple' nr_running test will indeed
+                        * Therefore the nr_running test will indeed
                          * be correct.
                          */
-                       if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+                       if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
+                               cfs_rq = &rq->cfs;
+
+                               if (!cfs_rq->nr_running)
+                                       goto idle;
+
                                 goto simple;
+                       }
                 }
  
                 se = pick_next_entity(cfs_rq, curr);
@@ -6264,12 +6387,8 @@ again:
  
         return p;
  simple:
-       cfs_rq = &rq->cfs;
  #endif
  
-       if (!cfs_rq->nr_running)
-               goto idle;
-
         put_prev_task(rq, prev);
  
         do {
@@ -6917,7 +7036,7 @@ static void update_blocked_averages(int cpu)
                 if (throttled_hierarchy(cfs_rq))
                         continue;
  
-               if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
+               if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
                         update_tg_load_avg(cfs_rq, 0);
  
                 /* Propagate pending load changes to the parent, if any: */
@@ -6990,7 +7109,7 @@ static inline void update_blocked_averages(int cpu)
  
         rq_lock_irqsave(rq, &rf);
         update_rq_clock(rq);
-       update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
+       update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
         rq_unlock_irqrestore(rq, &rf);
  }
  
@@ -7036,6 +7155,7 @@ struct sg_lb_stats {
  struct sd_lb_stats {
         struct sched_group *busiest;    /* Busiest group in this sd */
         struct sched_group *local;      /* Local group in this sd */
+       unsigned long total_running;
         unsigned long total_load;       /* Total load of all groups in sd */
         unsigned long total_capacity;   /* Total capacity of all groups in sd */
         unsigned long avg_load; /* Average load across all groups in sd */
@@ -7055,6 +7175,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
         *sds = (struct sd_lb_stats){
                 .busiest = NULL,
                 .local = NULL,
+               .total_running = 0UL,
                 .total_load = 0UL,
                 .total_capacity = 0UL,
                 .busiest_stat = {
@@ -7363,7 +7484,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                 sgs->nr_numa_running += rq->nr_numa_running;
                 sgs->nr_preferred_running += rq->nr_preferred_running;
  #endif
-               sgs->sum_weighted_load += weighted_cpuload(i);
+               sgs->sum_weighted_load += weighted_cpuload(rq);
                 /*
                  * No need to call idle_cpu() if nr_running is not 0
                  */
@@ -7490,6 +7611,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
   */
  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
  {
+       struct sched_domain_shared *shared = env->sd->shared;
         struct sched_domain *child = env->sd->child;
         struct sched_group *sg = env->sd->groups;
         struct sg_lb_stats *local = &sds->local_stat;
@@ -7546,6 +7668,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
  
  next_group:
                 /* Now, start updating sd_lb_stats */
+               sds->total_running += sgs->sum_nr_running;
                 sds->total_load += sgs->group_load;
                 sds->total_capacity += sgs->group_capacity;
  
@@ -7561,6 +7684,21 @@ next_group:
                         env->dst_rq->rd->overload = overload;
         }
  
+       if (!shared)
+               return;
+
+       /*
+        * Since these are sums over groups they can contain some CPUs
+        * multiple times for the NUMA domains.
+        *
+        * Currently only wake_affine_llc() and find_busiest_group()
+        * use these numbers; only the latter is affected by this problem.
+        *
+        * XXX fix that.
+        */
+       WRITE_ONCE(shared->nr_running,  sds->total_running);
+       WRITE_ONCE(shared->load,        sds->total_load);
+       WRITE_ONCE(shared->capacity,    sds->total_capacity);
  }
  
  /**
@@ -7790,6 +7928,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
         if (!sds.busiest || busiest->sum_nr_running == 0)
                 goto out_balanced;
  
+       /* XXX broken for overlapping NUMA groups */
         sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
                                                 / sds.total_capacity;
  
@@ -7892,7 +8031,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
  
                 capacity = capacity_of(i);
  
-               wl = weighted_cpuload(i);
+               wl = weighted_cpuload(rq);
  
                 /*
                  * When comparing with imbalance, use weighted_cpuload()
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c

index 79895aec281eb5ad198fae3e5e8aed31849ed900..bd8b6d6f538786afe603901b359a12fe609fb284 100644 (file)
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -261,8 +261,6 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
  
  static int init_rootdomain(struct root_domain *rd)
  {
-       memset(rd, 0, sizeof(*rd));
-
         if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
                 goto out;
         if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
@@ -311,7 +309,7 @@ static struct root_domain *alloc_rootdomain(void)
  {
         struct root_domain *rd;
  
-       rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+       rd = kzalloc(sizeof(*rd), GFP_KERNEL);
         if (!rd)
                 return NULL;
  
@@ -1595,7 +1593,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
         }
  }
  
-struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
                 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
                 struct sched_domain *child, int cpu)
  {
author	Ingo Molnar <mingo@kernel.org>
	Fri, 25 Aug 2017 09:07:13 +0000 (11:07 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Fri, 25 Aug 2017 09:07:13 +0000 (11:07 +0200)
arch/x86/include/asm/topology.h		patch \| blob \| blame \| history
fs/proc/base.c		patch \| blob \| blame \| history
include/linux/sched.h		patch \| blob \| blame \| history
include/linux/sched/debug.h		patch \| blob \| blame \| history
include/linux/sched/task.h		patch \| blob \| blame \| history
include/linux/sched/topology.h		patch \| blob \| blame \| history
init/main.c		patch \| blob \| blame \| history
kernel/sched/autogroup.c		patch \| blob \| blame \| history
kernel/sched/completion.c		patch \| blob \| blame \| history
kernel/sched/core.c		patch \| blob \| blame \| history
kernel/sched/cpudeadline.c		patch \| blob \| blame \| history
kernel/sched/cpupri.c		patch \| blob \| blame \| history
kernel/sched/deadline.c		patch \| blob \| blame \| history
kernel/sched/debug.c		patch \| blob \| blame \| history
kernel/sched/fair.c		patch \| blob \| blame \| history
kernel/sched/topology.c		patch \| blob \| blame \| history