--- /dev/null
+From 1ad3aaf3fcd2444406628a19a9b9e0922b95e2d4 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 17 May 2017 12:53:50 +0200
+Subject: sched/core: Implement new approach to scale select_idle_cpu()
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 1ad3aaf3fcd2444406628a19a9b9e0922b95e2d4 upstream.
+
+Hackbench recently suffered a bunch of pain, first by commit:
+
+ 4c77b18cf8b7 ("sched/fair: Make select_idle_cpu() more aggressive")
+
+and then by commit:
+
+ c743f0a5c50f ("sched/fair, cpumask: Export for_each_cpu_wrap()")
+
+which fixed a bug in the initial for_each_cpu_wrap() implementation
+that made select_idle_cpu() even more expensive. The bug was that it
+would skip over CPUs when bits were consecutive in the bitmask.
+
+This, however, gave me an idea for fixing select_idle_cpu(); where the old
+scheme was a cliff-edge throttle on idle scanning, this introduces a
+more gradual approach. Instead of stopping the scan entirely, we limit
+how many CPUs we scan.
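+
+As an illustration of the proportionality only (a standalone model of the
+arithmetic, not the kernel code; the function name is made up and the real
+change is in the select_idle_cpu() hunk below):
+
+  #include <stdint.h>
+
+  /* Scan depth grows with recent idle time relative to the average scan
+   * cost, with a floor of four CPUs. */
+  static int prop_nr_scan(uint64_t rq_avg_idle, uint64_t avg_scan_cost,
+                          unsigned int llc_weight)
+  {
+          uint64_t avg_idle = rq_avg_idle / 512;  /* large fuzz factor */
+          uint64_t avg_cost = avg_scan_cost + 1;  /* avoid division by zero */
+          uint64_t span_avg = (uint64_t)llc_weight * avg_idle;
+
+          if (span_avg > 4 * avg_cost)
+                  return (int)(span_avg / avg_cost);
+          return 4;
+  }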
+
+Initial benchmarks show that it mostly recovers hackbench without
+hurting anything else, except Mason's schbench, and even there the
+regression is not as bad as with the old scheme.
+
+It also appears to recover the tbench high end, which had suffered in
+the same way as hackbench.
+
+Tested-by: Matt Fleming <matt@codeblueprint.co.uk>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Chris Mason <clm@fb.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: hpa@zytor.com
+Cc: kitsunyan <kitsunyan@inbox.ru>
+Cc: linux-kernel@vger.kernel.org
+Cc: lvenanci@redhat.com
+Cc: riel@redhat.com
+Cc: xiaolong.ye@intel.com
+Link: http://lkml.kernel.org/r/20170517105350.hk5m4h4jb6dfr65a@hirez.programming.kicks-ass.net
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c | 21 ++++++++++++++++-----
+ kernel/sched/features.h | 1 +
+ 2 files changed, 17 insertions(+), 5 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -5772,27 +5772,38 @@ static inline int select_idle_smt(struct
+ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+ {
+ struct sched_domain *this_sd;
+- u64 avg_cost, avg_idle = this_rq()->avg_idle;
++ u64 avg_cost, avg_idle;
+ u64 time, cost;
+ s64 delta;
+- int cpu;
++ int cpu, nr = INT_MAX;
+
+ this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ if (!this_sd)
+ return -1;
+
+- avg_cost = this_sd->avg_scan_cost;
+-
+ /*
+ * Due to large variance we need a large fuzz factor; hackbench in
+ * particularly is sensitive here.
+ */
+- if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost)
++ avg_idle = this_rq()->avg_idle / 512;
++ avg_cost = this_sd->avg_scan_cost + 1;
++
++ if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
+ return -1;
+
++ if (sched_feat(SIS_PROP)) {
++ u64 span_avg = sd->span_weight * avg_idle;
++ if (span_avg > 4*avg_cost)
++ nr = div_u64(span_avg, avg_cost);
++ else
++ nr = 4;
++ }
++
+ time = local_clock();
+
+ for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
++ if (!--nr)
++ return -1;
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ continue;
+ if (idle_cpu(cpu))
+--- a/kernel/sched/features.h
++++ b/kernel/sched/features.h
+@@ -55,6 +55,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
+ * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
+ */
+ SCHED_FEAT(SIS_AVG_CPU, false)
++SCHED_FEAT(SIS_PROP, true)
+
+ /*
+ * Issue a WARN when we do multiple update_rq_clock() calls
--- /dev/null
+From c743f0a5c50f2fcbc628526279cfa24f3dabe182 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Fri, 14 Apr 2017 14:20:05 +0200
+Subject: sched/fair, cpumask: Export for_each_cpu_wrap()
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit c743f0a5c50f2fcbc628526279cfa24f3dabe182 upstream.
+
+More users of for_each_cpu_wrap() have appeared. Promote the construct
+to the generic cpumask interface.
+
+The implementation is slightly modified to reduce arguments.
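+
+For reference, the wrap semantics can be modelled on a plain bitmask
+(a standalone sketch, not the cpumask implementation; the helper name is
+illustrative): iteration starts at @start, runs to the end of the mask,
+wraps around to bit 0 and stops before reaching @start again, so every
+set bit is visited exactly once.
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  /* Visit every set bit of 'mask' once, starting at 'start' and wrapping. */
+  static void visit_wrapped(uint64_t mask, unsigned int start, unsigned int nbits)
+  {
+          for (unsigned int i = 0; i < nbits; i++) {
+                  unsigned int cpu = (start + i) % nbits;
+
+                  if (mask & (UINT64_C(1) << cpu))
+                          printf("cpu %u\n", cpu);
+          }
+  }
+
+  int main(void)
+  {
+          visit_wrapped(0x96, 4, 8);      /* bits 1,2,4,7 -> prints 4 7 1 2 */
+          return 0;
+  }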
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Lauro Ramos Venancio <lvenanci@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: lwang@redhat.com
+Link: http://lkml.kernel.org/r/20170414122005.o35me2h5nowqkxbv@hirez.programming.kicks-ass.net
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/cpumask.h | 17 +++++++++++++++++
+ kernel/sched/fair.c | 45 ++++-----------------------------------------
+ lib/cpumask.c | 32 ++++++++++++++++++++++++++++++++
+ 3 files changed, 53 insertions(+), 41 deletions(-)
+
+--- a/include/linux/cpumask.h
++++ b/include/linux/cpumask.h
+@@ -236,6 +236,23 @@ unsigned int cpumask_local_spread(unsign
+ (cpu) = cpumask_next_zero((cpu), (mask)), \
+ (cpu) < nr_cpu_ids;)
+
++extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
++
++/**
++ * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
++ * @cpu: the (optionally unsigned) integer iterator
++ * @mask: the cpumask pointer
++ * @start: the start location
++ *
++ * The implementation does not assume any bit in @mask is set (including @start).
++ *
++ * After the loop, cpu is >= nr_cpu_ids.
++ */
++#define for_each_cpu_wrap(cpu, mask, start) \
++ for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \
++ (cpu) < nr_cpumask_bits; \
++ (cpu) = cpumask_next_wrap((cpu), (mask), (start), true))
++
+ /**
+ * for_each_cpu_and - iterate over every cpu in both masks
+ * @cpu: the (optionally unsigned) integer iterator
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -5640,43 +5640,6 @@ find_idlest_cpu(struct sched_group *grou
+ return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+ }
+
+-/*
+- * Implement a for_each_cpu() variant that starts the scan at a given cpu
+- * (@start), and wraps around.
+- *
+- * This is used to scan for idle CPUs; such that not all CPUs looking for an
+- * idle CPU find the same CPU. The down-side is that tasks tend to cycle
+- * through the LLC domain.
+- *
+- * Especially tbench is found sensitive to this.
+- */
+-
+-static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
+-{
+- int next;
+-
+-again:
+- next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
+-
+- if (*wrapped) {
+- if (next >= start)
+- return nr_cpumask_bits;
+- } else {
+- if (next >= nr_cpumask_bits) {
+- *wrapped = 1;
+- n = -1;
+- goto again;
+- }
+- }
+-
+- return next;
+-}
+-
+-#define for_each_cpu_wrap(cpu, mask, start, wrap) \
+- for ((wrap) = 0, (cpu) = (start)-1; \
+- (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
+- (cpu) < nr_cpumask_bits; )
+-
+ #ifdef CONFIG_SCHED_SMT
+
+ static inline void set_idle_cores(int cpu, int val)
+@@ -5736,7 +5699,7 @@ unlock:
+ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+ {
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+- int core, cpu, wrap;
++ int core, cpu;
+
+ if (!static_branch_likely(&sched_smt_present))
+ return -1;
+@@ -5746,7 +5709,7 @@ static int select_idle_core(struct task_
+
+ cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+
+- for_each_cpu_wrap(core, cpus, target, wrap) {
++ for_each_cpu_wrap(core, cpus, target) {
+ bool idle = true;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+@@ -5812,7 +5775,7 @@ static int select_idle_cpu(struct task_s
+ u64 avg_cost, avg_idle = this_rq()->avg_idle;
+ u64 time, cost;
+ s64 delta;
+- int cpu, wrap;
++ int cpu;
+
+ this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ if (!this_sd)
+@@ -5829,7 +5792,7 @@ static int select_idle_cpu(struct task_s
+
+ time = local_clock();
+
+- for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
++ for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ continue;
+ if (idle_cpu(cpu))
+--- a/lib/cpumask.c
++++ b/lib/cpumask.c
+@@ -43,6 +43,38 @@ int cpumask_any_but(const struct cpumask
+ }
+ EXPORT_SYMBOL(cpumask_any_but);
+
++/**
++ * cpumask_next_wrap - helper to implement for_each_cpu_wrap
++ * @n: the cpu prior to the place to search
++ * @mask: the cpumask pointer
++ * @start: the start point of the iteration
++ * @wrap: assume @n crossing @start terminates the iteration
++ *
++ * Returns >= nr_cpu_ids on completion
++ *
++ * Note: the @wrap argument is required for the start condition when
++ * we cannot assume @start is set in @mask.
++ */
++int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
++{
++ int next;
++
++again:
++ next = cpumask_next(n, mask);
++
++ if (wrap && n < start && next >= start) {
++ return nr_cpumask_bits;
++
++ } else if (next >= nr_cpumask_bits) {
++ wrap = true;
++ n = -1;
++ goto again;
++ }
++
++ return next;
++}
++EXPORT_SYMBOL(cpumask_next_wrap);
++
+ /* These are not inline because of header tangles. */
+ #ifdef CONFIG_CPUMASK_OFFSTACK
+ /**
--- /dev/null
+From 815abf5af45f04f759f12f3172afd15226fd7f71 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@redhat.com>
+Date: Fri, 23 Jun 2017 12:55:30 -0400
+Subject: sched/fair: Remove effective_load()
+
+From: Rik van Riel <riel@redhat.com>
+
+commit 815abf5af45f04f759f12f3172afd15226fd7f71 upstream.
+
+The effective_load() function was only used by the NUMA balancing
+code, and not by the regular load balancing code. Now that the
+NUMA balancing code no longer uses it either, get rid of it.
+
+Signed-off-by: Rik van Riel <riel@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: jhladky@redhat.com
+Cc: linux-kernel@vger.kernel.org
+Link: http://lkml.kernel.org/r/20170623165530.22514-5-riel@redhat.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c | 124 ----------------------------------------------------
+ 1 file changed, 1 insertion(+), 123 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -1381,7 +1381,6 @@ static unsigned long weighted_cpuload(co
+ static unsigned long source_load(int cpu, int type);
+ static unsigned long target_load(int cpu, int type);
+ static unsigned long capacity_of(int cpu);
+-static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
+
+ /* Cached statistics for all CPUs within a node */
+ struct numa_stats {
+@@ -3044,8 +3043,7 @@ __update_load_avg_cfs_rq(u64 now, int cp
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+- * Updating tg's load_avg is necessary before update_cfs_share() (which is
+- * done) and effective_load() (which is not done because it is too costly).
++ * Updating tg's load_avg is necessary before update_cfs_share().
+ */
+ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+ {
+@@ -5277,126 +5275,6 @@ static unsigned long cpu_avg_load_per_ta
+ return 0;
+ }
+
+-#ifdef CONFIG_FAIR_GROUP_SCHED
+-/*
+- * effective_load() calculates the load change as seen from the root_task_group
+- *
+- * Adding load to a group doesn't make a group heavier, but can cause movement
+- * of group shares between cpus. Assuming the shares were perfectly aligned one
+- * can calculate the shift in shares.
+- *
+- * Calculate the effective load difference if @wl is added (subtracted) to @tg
+- * on this @cpu and results in a total addition (subtraction) of @wg to the
+- * total group weight.
+- *
+- * Given a runqueue weight distribution (rw_i) we can compute a shares
+- * distribution (s_i) using:
+- *
+- * s_i = rw_i / \Sum rw_j (1)
+- *
+- * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
+- * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
+- * shares distribution (s_i):
+- *
+- * rw_i = { 2, 4, 1, 0 }
+- * s_i = { 2/7, 4/7, 1/7, 0 }
+- *
+- * As per wake_affine() we're interested in the load of two CPUs (the CPU the
+- * task used to run on and the CPU the waker is running on), we need to
+- * compute the effect of waking a task on either CPU and, in case of a sync
+- * wakeup, compute the effect of the current task going to sleep.
+- *
+- * So for a change of @wl to the local @cpu with an overall group weight change
+- * of @wl we can compute the new shares distribution (s'_i) using:
+- *
+- * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
+- *
+- * Suppose we're interested in CPUs 0 and 1, and want to compute the load
+- * differences in waking a task to CPU 0. The additional task changes the
+- * weight and shares distributions like:
+- *
+- * rw'_i = { 3, 4, 1, 0 }
+- * s'_i = { 3/8, 4/8, 1/8, 0 }
+- *
+- * We can then compute the difference in effective weight by using:
+- *
+- * dw_i = S * (s'_i - s_i) (3)
+- *
+- * Where 'S' is the group weight as seen by its parent.
+- *
+- * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
+- * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
+- * 4/7) times the weight of the group.
+- */
+-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
+-{
+- struct sched_entity *se = tg->se[cpu];
+-
+- if (!tg->parent) /* the trivial, non-cgroup case */
+- return wl;
+-
+- for_each_sched_entity(se) {
+- struct cfs_rq *cfs_rq = se->my_q;
+- long W, w = cfs_rq_load_avg(cfs_rq);
+-
+- tg = cfs_rq->tg;
+-
+- /*
+- * W = @wg + \Sum rw_j
+- */
+- W = wg + atomic_long_read(&tg->load_avg);
+-
+- /* Ensure \Sum rw_j >= rw_i */
+- W -= cfs_rq->tg_load_avg_contrib;
+- W += w;
+-
+- /*
+- * w = rw_i + @wl
+- */
+- w += wl;
+-
+- /*
+- * wl = S * s'_i; see (2)
+- */
+- if (W > 0 && w < W)
+- wl = (w * (long)scale_load_down(tg->shares)) / W;
+- else
+- wl = scale_load_down(tg->shares);
+-
+- /*
+- * Per the above, wl is the new se->load.weight value; since
+- * those are clipped to [MIN_SHARES, ...) do so now. See
+- * calc_cfs_shares().
+- */
+- if (wl < MIN_SHARES)
+- wl = MIN_SHARES;
+-
+- /*
+- * wl = dw_i = S * (s'_i - s_i); see (3)
+- */
+- wl -= se->avg.load_avg;
+-
+- /*
+- * Recursively apply this logic to all parent groups to compute
+- * the final effective load change on the root group. Since
+- * only the @tg group gets extra weight, all parent groups can
+- * only redistribute existing shares. @wl is the shift in shares
+- * resulting from this level per the above.
+- */
+- wg = 0;
+- }
+-
+- return wl;
+-}
+-#else
+-
+-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
+-{
+- return wl;
+-}
+-
+-#endif
+-
+ static void record_wakee(struct task_struct *p)
+ {
+ /*
--- /dev/null
+From 7d894e6e34a5cdd12309c7e4a3f830277ad4b7bf Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@redhat.com>
+Date: Fri, 23 Jun 2017 12:55:28 -0400
+Subject: sched/fair: Simplify wake_affine() for the single socket case
+
+From: Rik van Riel <riel@redhat.com>
+
+commit 7d894e6e34a5cdd12309c7e4a3f830277ad4b7bf upstream.
+
+When 'this_cpu' and 'prev_cpu' are in the same socket, select_idle_sibling()
+will do its thing regardless of the return value of wake_affine().
+
+Just return true and don't look at all the other things.
+
+Signed-off-by: Rik van Riel <riel@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: jhladky@redhat.com
+Cc: linux-kernel@vger.kernel.org
+Link: http://lkml.kernel.org/r/20170623165530.22514-3-riel@redhat.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c | 13 ++++++++++++-
+ 1 file changed, 12 insertions(+), 1 deletion(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -5399,6 +5399,13 @@ static int wake_affine(struct sched_doma
+ this_load = target_load(this_cpu, idx);
+
+ /*
++ * Common case: CPUs are in the same socket, and select_idle_sibling()
++ * will do its thing regardless of what we return:
++ */
++ if (cpus_share_cache(prev_cpu, this_cpu))
++ return true;
++
++ /*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
+@@ -5986,11 +5993,15 @@ select_task_rq_fair(struct task_struct *
+
+ if (affine_sd) {
+ sd = NULL; /* Prefer wake_affine over balance flags */
+- if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
++ if (cpu == prev_cpu)
++ goto pick_cpu;
++
++ if (wake_affine(affine_sd, p, prev_cpu, sync))
+ new_cpu = cpu;
+ }
+
+ if (!sd) {
++ pick_cpu:
+ if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+
--- /dev/null
+From ff801b716effd652f420204eddb36f6e4a716819 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 29 Jun 2017 08:25:52 +0200
+Subject: sched/numa: Hide numa_wake_affine() from UP build
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit ff801b716effd652f420204eddb36f6e4a716819 upstream.
+
+Stephen reported the following build warning in UP:
+
+kernel/sched/fair.c:2657:9: warning: 'struct sched_domain' declared inside
+parameter list
+ ^
+/home/sfr/next/next/kernel/sched/fair.c:2657:9: warning: its scope is only this
+definition or declaration, which is probably not what you want
+
+Hide the numa_wake_affine() inline stub on UP builds to get rid of it.
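+
+For reference, this is GCC's generic diagnostic for a function whose
+parameter type is a struct with no visible declaration; a minimal
+standalone reproduction (unrelated to the scheduler code):
+
+  /* Compiling this file on its own warns:
+   *   'struct sched_domain' declared inside parameter list
+   * because no declaration of the struct is in scope here. */
+  static inline int stub(struct sched_domain *sd)
+  {
+          return sd != 0;
+  }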
+
+Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
+Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -2651,12 +2651,14 @@ static inline void account_numa_dequeue(
+ {
+ }
+
++#ifdef CONFIG_SMP
+ static inline bool numa_wake_affine(struct sched_domain *sd,
+ struct task_struct *p, int this_cpu,
+ int prev_cpu, int sync)
+ {
+ return true;
+ }
++#endif /* !SMP */
+ #endif /* CONFIG_NUMA_BALANCING */
+
+ static void
--- /dev/null
+From 3fed382b46baac83703130fe4cd3d9147f427fb9 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@redhat.com>
+Date: Fri, 23 Jun 2017 12:55:29 -0400
+Subject: sched/numa: Implement NUMA node level wake_affine()
+
+From: Rik van Riel <riel@redhat.com>
+
+commit 3fed382b46baac83703130fe4cd3d9147f427fb9 upstream.
+
+Since select_idle_sibling() can place a task anywhere on a socket,
+comparing loads between individual CPU cores makes no real sense
+for deciding whether to do an affine wakeup across sockets, either.
+
+Instead, compare the load between the sockets in a similar way to how
+the load balancer and the NUMA balancing code do.
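+
+Schematically, the node-level check added below boils down to a
+capacity-scaled load comparison with the usual imbalance_pct slack.
+A standalone model of the arithmetic (the function and parameter names
+here are illustrative; see the numa_wake_affine() hunk for the real code):
+
+  #include <stdbool.h>
+  #include <stdint.h>
+
+  /* Allow the affine wakeup unless it would load this node noticeably
+   * more than leaving the task on its previous node. */
+  static bool node_wake_affine(int64_t this_load, int64_t this_capacity,
+                               int64_t prev_load, int64_t prev_capacity,
+                               int64_t task_load, int imbalance_pct)
+  {
+          int64_t this_eff, prev_eff;
+
+          if (this_load == 0)     /* this node is idle: moving cannot hurt */
+                  return true;
+
+          this_eff = 100 * prev_capacity * (this_load + task_load);
+          prev_eff = (100 + (imbalance_pct - 100) / 2) * this_capacity *
+                     (prev_load - task_load);
+
+          return this_eff <= prev_eff;
+  }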
+
+Signed-off-by: Rik van Riel <riel@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: jhladky@redhat.com
+Cc: linux-kernel@vger.kernel.org
+Link: http://lkml.kernel.org/r/20170623165530.22514-4-riel@redhat.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c | 130 ++++++++++++++++++++++++++++------------------------
+ 1 file changed, 71 insertions(+), 59 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -2585,6 +2585,60 @@ void task_tick_numa(struct rq *rq, struc
+ }
+ }
+ }
++
++/*
++ * Can a task be moved from prev_cpu to this_cpu without causing a load
++ * imbalance that would trigger the load balancer?
++ */
++static inline bool numa_wake_affine(struct sched_domain *sd,
++ struct task_struct *p, int this_cpu,
++ int prev_cpu, int sync)
++{
++ struct numa_stats prev_load, this_load;
++ s64 this_eff_load, prev_eff_load;
++
++ update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
++ update_numa_stats(&this_load, cpu_to_node(this_cpu));
++
++ /*
++ * If sync wakeup then subtract the (maximum possible)
++ * effect of the currently running task from the load
++ * of the current CPU:
++ */
++ if (sync) {
++ unsigned long current_load = task_h_load(current);
++
++ if (this_load.load > current_load)
++ this_load.load -= current_load;
++ else
++ this_load.load = 0;
++ }
++
++ /*
++ * In low-load situations, where this_cpu's node is idle due to the
++ * sync cause above having dropped this_load.load to 0, move the task.
++ * Moving to an idle socket will not create a bad imbalance.
++ *
++ * Otherwise check if the nodes are near enough in load to allow this
++ * task to be woken on this_cpu's node.
++ */
++ if (this_load.load > 0) {
++ unsigned long task_load = task_h_load(p);
++
++ this_eff_load = 100;
++ this_eff_load *= prev_load.compute_capacity;
++
++ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
++ prev_eff_load *= this_load.compute_capacity;
++
++ this_eff_load *= this_load.load + task_load;
++ prev_eff_load *= prev_load.load - task_load;
++
++ return this_eff_load <= prev_eff_load;
++ }
++
++ return true;
++}
+ #else
+ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+ {
+@@ -2597,6 +2651,13 @@ static inline void account_numa_enqueue(
+ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+ {
+ }
++
++static inline bool numa_wake_affine(struct sched_domain *sd,
++ struct task_struct *p, int this_cpu,
++ int prev_cpu, int sync)
++{
++ return true;
++}
+ #endif /* CONFIG_NUMA_BALANCING */
+
+ static void
+@@ -5386,74 +5447,25 @@ static int wake_wide(struct task_struct
+ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+ int prev_cpu, int sync)
+ {
+- s64 this_load, load;
+- s64 this_eff_load, prev_eff_load;
+- int idx, this_cpu;
+- struct task_group *tg;
+- unsigned long weight;
+- int balanced;
+-
+- idx = sd->wake_idx;
+- this_cpu = smp_processor_id();
+- load = source_load(prev_cpu, idx);
+- this_load = target_load(this_cpu, idx);
++ int this_cpu = smp_processor_id();
++ bool affine = false;
+
+ /*
+ * Common case: CPUs are in the same socket, and select_idle_sibling()
+ * will do its thing regardless of what we return:
+ */
+ if (cpus_share_cache(prev_cpu, this_cpu))
+- return true;
+-
+- /*
+- * If sync wakeup then subtract the (maximum possible)
+- * effect of the currently running task from the load
+- * of the current CPU:
+- */
+- if (sync) {
+- tg = task_group(current);
+- weight = current->se.avg.load_avg;
+-
+- this_load += effective_load(tg, this_cpu, -weight, -weight);
+- load += effective_load(tg, prev_cpu, 0, -weight);
+- }
+-
+- tg = task_group(p);
+- weight = p->se.avg.load_avg;
+-
+- /*
+- * In low-load situations, where prev_cpu is idle and this_cpu is idle
+- * due to the sync cause above having dropped this_load to 0, we'll
+- * always have an imbalance, but there's really nothing you can do
+- * about that, so that's good too.
+- *
+- * Otherwise check if either cpus are near enough in load to allow this
+- * task to be woken on this_cpu.
+- */
+- this_eff_load = 100;
+- this_eff_load *= capacity_of(prev_cpu);
+-
+- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+- prev_eff_load *= capacity_of(this_cpu);
+-
+- if (this_load > 0) {
+- this_eff_load *= this_load +
+- effective_load(tg, this_cpu, weight, weight);
+-
+- prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+- }
+-
+- balanced = this_eff_load <= prev_eff_load;
++ affine = true;
++ else
++ affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
+
+ schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
++ if (affine) {
++ schedstat_inc(sd->ttwu_move_affine);
++ schedstat_inc(p->se.statistics.nr_wakeups_affine);
++ }
+
+- if (!balanced)
+- return 0;
+-
+- schedstat_inc(sd->ttwu_move_affine);
+- schedstat_inc(p->se.statistics.nr_wakeups_affine);
+-
+- return 1;
++ return affine;
+ }
+
+ static inline int task_util(struct task_struct *p);
--- /dev/null
+From 739294fb03f590401bbd7faa6d31a507e3ffada5 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@redhat.com>
+Date: Fri, 23 Jun 2017 12:55:27 -0400
+Subject: sched/numa: Override part of migrate_degrades_locality() when idle balancing
+
+From: Rik van Riel <riel@redhat.com>
+
+commit 739294fb03f590401bbd7faa6d31a507e3ffada5 upstream.
+
+Several tests in the NAS benchmark seem to run a lot slower with
+NUMA balancing enabled than with NUMA balancing disabled. The
+slower run time corresponds with increased idle time.
+
+Overriding the final test of migrate_degrades_locality() (while still
+doing the other NUMA tests first) seems to improve the performance of
+those benchmarks.
+
+Reported-by: Jirka Hladky <jhladky@redhat.com>
+Signed-off-by: Rik van Riel <riel@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: linux-kernel@vger.kernel.org
+Link: http://lkml.kernel.org/r/20170623165530.22514-2-riel@redhat.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -6661,6 +6661,10 @@ static int migrate_degrades_locality(str
+ if (dst_nid == p->numa_preferred_nid)
+ return 0;
+
++ /* Leaving a core idle is often worse than degrading locality. */
++ if (env->idle != CPU_NOT_IDLE)
++ return -1;
++
+ if (numa_group) {
+ src_faults = group_faults(p, src_nid);
+ dst_faults = group_faults(p, dst_nid);
--- /dev/null
+From 8655d5497735b288f8a9b458bd22e7d1bf95bb61 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Mon, 15 May 2017 15:13:16 +0200
+Subject: sched/numa: Use down_read_trylock() for the mmap_sem
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit 8655d5497735b288f8a9b458bd22e7d1bf95bb61 upstream.
+
+A customer has reported a soft-lockup when running an intensive
+memory stress test, where the trace on multiple CPUs looks like this:
+
+ RIP: 0010:[<ffffffff810c53fe>]
+ [<ffffffff810c53fe>] native_queued_spin_lock_slowpath+0x10e/0x190
+...
+ Call Trace:
+ [<ffffffff81182d07>] queued_spin_lock_slowpath+0x7/0xa
+ [<ffffffff811bc331>] change_protection_range+0x3b1/0x930
+ [<ffffffff811d4be8>] change_prot_numa+0x18/0x30
+ [<ffffffff810adefe>] task_numa_work+0x1fe/0x310
+ [<ffffffff81098322>] task_work_run+0x72/0x90
+
+Further investigation showed that the lock contention here is pmd_lock().
+
+The task_numa_work() function makes sure that only one thread gets to perform
+the work in a single scan period (via cmpxchg), but if there's a thread with
+mmap_sem locked for writing for several periods, multiple threads in
+task_numa_work() can build up a convoy waiting for mmap_sem for read and then
+all get unblocked at once.
+
+This patch changes the down_read() to the trylock version, which prevents the
+build-up. For a workload experiencing mmap_sem contention, it's probably better
+to postpone the NUMA balancing work anyway. This seems to have fixed the soft
+lockups involving pmd_lock(), which is in line with the convoy theory.
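+
+The pattern is general: periodic background work backs off instead of
+queueing behind a writer, so readers cannot pile up on the lock. A
+userspace analogue using POSIX rwlocks (a sketch, not the kernel code):
+
+  #include <pthread.h>
+  #include <stdbool.h>
+
+  /* Skip this scan period if a writer currently holds the lock. */
+  static bool do_periodic_scan(pthread_rwlock_t *lock)
+  {
+          if (pthread_rwlock_tryrdlock(lock) != 0)
+                  return false;   /* busy; try again next period */
+
+          /* ... walk the shared state under the read lock ... */
+
+          pthread_rwlock_unlock(lock);
+          return true;
+  }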
+
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Acked-by: Rik van Riel <riel@redhat.com>
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/20170515131316.21909-1-vbabka@suse.cz
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -2469,7 +2469,8 @@ void task_numa_work(struct callback_head
+ return;
+
+
+- down_read(&mm->mmap_sem);
++ if (!down_read_trylock(&mm->mmap_sem))
++ return;
+ vma = find_vma(mm, start);
+ if (!vma) {
+ reset_ptenuma_scan(p);
tpm-fix-a-kernel-memory-leak-in-tpm-sysfs.c.patch
powerpc-powernv-fix-cpu_hotplug-n-idle.c-compile-error.patch
x86-uaccess-optimize-copy_user_enhanced_fast_string-for-short-strings.patch
+sched-fair-cpumask-export-for_each_cpu_wrap.patch
+sched-core-implement-new-approach-to-scale-select_idle_cpu.patch
+sched-numa-use-down_read_trylock-for-the-mmap_sem.patch
+sched-numa-override-part-of-migrate_degrades_locality-when-idle-balancing.patch
+sched-fair-simplify-wake_affine-for-the-single-socket-case.patch
+sched-numa-implement-numa-node-level-wake_affine.patch
+sched-fair-remove-effective_load.patch
+sched-numa-hide-numa_wake_affine-from-up-build.patch