4.12-stable patches
author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Mon, 10 Jul 2017 15:24:40 +0000 (17:24 +0200)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Mon, 10 Jul 2017 15:24:40 +0000 (17:24 +0200)
added patches:
sched-core-implement-new-approach-to-scale-select_idle_cpu.patch
sched-fair-cpumask-export-for_each_cpu_wrap.patch
sched-fair-remove-effective_load.patch
sched-fair-simplify-wake_affine-for-the-single-socket-case.patch
sched-numa-hide-numa_wake_affine-from-up-build.patch
sched-numa-implement-numa-node-level-wake_affine.patch
sched-numa-override-part-of-migrate_degrades_locality-when-idle-balancing.patch
sched-numa-use-down_read_trylock-for-the-mmap_sem.patch

queue-4.12/sched-core-implement-new-approach-to-scale-select_idle_cpu.patch [new file with mode: 0644]
queue-4.12/sched-fair-cpumask-export-for_each_cpu_wrap.patch [new file with mode: 0644]
queue-4.12/sched-fair-remove-effective_load.patch [new file with mode: 0644]
queue-4.12/sched-fair-simplify-wake_affine-for-the-single-socket-case.patch [new file with mode: 0644]
queue-4.12/sched-numa-hide-numa_wake_affine-from-up-build.patch [new file with mode: 0644]
queue-4.12/sched-numa-implement-numa-node-level-wake_affine.patch [new file with mode: 0644]
queue-4.12/sched-numa-override-part-of-migrate_degrades_locality-when-idle-balancing.patch [new file with mode: 0644]
queue-4.12/sched-numa-use-down_read_trylock-for-the-mmap_sem.patch [new file with mode: 0644]
queue-4.12/series

diff --git a/queue-4.12/sched-core-implement-new-approach-to-scale-select_idle_cpu.patch b/queue-4.12/sched-core-implement-new-approach-to-scale-select_idle_cpu.patch
new file mode 100644 (file)
index 0000000..38af9df
--- /dev/null
@@ -0,0 +1,112 @@
+From 1ad3aaf3fcd2444406628a19a9b9e0922b95e2d4 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 17 May 2017 12:53:50 +0200
+Subject: sched/core: Implement new approach to scale select_idle_cpu()
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 1ad3aaf3fcd2444406628a19a9b9e0922b95e2d4 upstream.
+
+Hackbench recently suffered a bunch of pain, first by commit:
+
+  4c77b18cf8b7 ("sched/fair: Make select_idle_cpu() more aggressive")
+
+and then by commit:
+
+  c743f0a5c50f ("sched/fair, cpumask: Export for_each_cpu_wrap()")
+
+which fixed a bug in the initial for_each_cpu_wrap() implementation
+that made select_idle_cpu() even more expensive. The bug was that it
+would skip over CPUs when bits were consecutive in the bitmask.
+
+This however gave me an idea to fix select_idle_cpu(); where the old
+scheme was a cliff-edge throttle on idle scanning, this introduces a
+more gradual approach. Instead of stopping the scan entirely, we limit
+how many CPUs we scan.
+
+Initial benchmarks show that it mostly recovers hackbench while not
+hurting anything else, except Mason's schbench, which still regresses,
+though not as badly as under the old scheme.
+
+It also appears to recover the tbench high-end, which also suffered like
+hackbench.
+
+Tested-by: Matt Fleming <matt@codeblueprint.co.uk>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Chris Mason <clm@fb.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: hpa@zytor.com
+Cc: kitsunyan <kitsunyan@inbox.ru>
+Cc: linux-kernel@vger.kernel.org
+Cc: lvenanci@redhat.com
+Cc: riel@redhat.com
+Cc: xiaolong.ye@intel.com
+Link: http://lkml.kernel.org/r/20170517105350.hk5m4h4jb6dfr65a@hirez.programming.kicks-ass.net
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c     |   21 ++++++++++++++++-----
+ kernel/sched/features.h |    1 +
+ 2 files changed, 17 insertions(+), 5 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -5772,27 +5772,38 @@ static inline int select_idle_smt(struct
+ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+ {
+       struct sched_domain *this_sd;
+-      u64 avg_cost, avg_idle = this_rq()->avg_idle;
++      u64 avg_cost, avg_idle;
+       u64 time, cost;
+       s64 delta;
+-      int cpu;
++      int cpu, nr = INT_MAX;
+       this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+       if (!this_sd)
+               return -1;
+-      avg_cost = this_sd->avg_scan_cost;
+-
+       /*
+        * Due to large variance we need a large fuzz factor; hackbench in
+        * particularly is sensitive here.
+        */
+-      if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost)
++      avg_idle = this_rq()->avg_idle / 512;
++      avg_cost = this_sd->avg_scan_cost + 1;
++
++      if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
+               return -1;
++      if (sched_feat(SIS_PROP)) {
++              u64 span_avg = sd->span_weight * avg_idle;
++              if (span_avg > 4*avg_cost)
++                      nr = div_u64(span_avg, avg_cost);
++              else
++                      nr = 4;
++      }
++
+       time = local_clock();
+       for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
++              if (!--nr)
++                      return -1;
+               if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+                       continue;
+               if (idle_cpu(cpu))
+--- a/kernel/sched/features.h
++++ b/kernel/sched/features.h
+@@ -55,6 +55,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
+  * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
+  */
+ SCHED_FEAT(SIS_AVG_CPU, false)
++SCHED_FEAT(SIS_PROP, true)
+ /*
+  * Issue a WARN when we do multiple update_rq_clock() calls
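
For illustration only, not part of the queued patch: a minimal standalone C
sketch of the SIS_PROP limit the hunk above introduces. The scan budget grows
with avg_idle / avg_scan_cost, with a floor of 4 CPUs, instead of the old
all-or-nothing SIS_AVG_CPU cutoff; the inputs in main() are made-up numbers.

#include <stdio.h>
#include <stdint.h>

/* Mirror of the SIS_PROP calculation: how many CPUs may select_idle_cpu() scan? */
static int scan_limit(uint64_t avg_idle_ns, uint64_t avg_scan_cost_ns,
                      unsigned int llc_span_weight)
{
        uint64_t avg_idle = avg_idle_ns / 512;          /* large fuzz factor */
        uint64_t avg_cost = avg_scan_cost_ns + 1;       /* avoid division by zero */
        uint64_t span_avg = (uint64_t)llc_span_weight * avg_idle;

        if (span_avg > 4 * avg_cost)
                return (int)(span_avg / avg_cost);
        return 4;                                       /* never scan fewer than 4 */
}

int main(void)
{
        /* e.g. 200us average idle, 3us average scan cost, 32-CPU LLC domain */
        printf("scan at most %d CPUs\n", scan_limit(200000, 3000, 32));
        return 0;
}
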
diff --git a/queue-4.12/sched-fair-cpumask-export-for_each_cpu_wrap.patch b/queue-4.12/sched-fair-cpumask-export-for_each_cpu_wrap.patch
new file mode 100644 (file)
index 0000000..88f437f
--- /dev/null
@@ -0,0 +1,182 @@
+From c743f0a5c50f2fcbc628526279cfa24f3dabe182 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Fri, 14 Apr 2017 14:20:05 +0200
+Subject: sched/fair, cpumask: Export for_each_cpu_wrap()
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit c743f0a5c50f2fcbc628526279cfa24f3dabe182 upstream.
+
+More users for for_each_cpu_wrap() have appeared. Promote the construct
+to generic cpumask interface.
+
+The implementation is slightly modified to reduce arguments.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Lauro Ramos Venancio <lvenanci@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: lwang@redhat.com
+Link: http://lkml.kernel.org/r/20170414122005.o35me2h5nowqkxbv@hirez.programming.kicks-ass.net
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/cpumask.h |   17 +++++++++++++++++
+ kernel/sched/fair.c     |   45 ++++-----------------------------------------
+ lib/cpumask.c           |   32 ++++++++++++++++++++++++++++++++
+ 3 files changed, 53 insertions(+), 41 deletions(-)
+
+--- a/include/linux/cpumask.h
++++ b/include/linux/cpumask.h
+@@ -236,6 +236,23 @@ unsigned int cpumask_local_spread(unsign
+               (cpu) = cpumask_next_zero((cpu), (mask)),       \
+               (cpu) < nr_cpu_ids;)
++extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
++
++/**
++ * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
++ * @cpu: the (optionally unsigned) integer iterator
++ * @mask: the cpumask poiter
++ * @start: the start location
++ *
++ * The implementation does not assume any bit in @mask is set (including @start).
++ *
++ * After the loop, cpu is >= nr_cpu_ids.
++ */
++#define for_each_cpu_wrap(cpu, mask, start)                                   \
++      for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false);      \
++           (cpu) < nr_cpumask_bits;                                           \
++           (cpu) = cpumask_next_wrap((cpu), (mask), (start), true))
++
+ /**
+  * for_each_cpu_and - iterate over every cpu in both masks
+  * @cpu: the (optionally unsigned) integer iterator
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -5640,43 +5640,6 @@ find_idlest_cpu(struct sched_group *grou
+       return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+ }
+-/*
+- * Implement a for_each_cpu() variant that starts the scan at a given cpu
+- * (@start), and wraps around.
+- *
+- * This is used to scan for idle CPUs; such that not all CPUs looking for an
+- * idle CPU find the same CPU. The down-side is that tasks tend to cycle
+- * through the LLC domain.
+- *
+- * Especially tbench is found sensitive to this.
+- */
+-
+-static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
+-{
+-      int next;
+-
+-again:
+-      next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
+-
+-      if (*wrapped) {
+-              if (next >= start)
+-                      return nr_cpumask_bits;
+-      } else {
+-              if (next >= nr_cpumask_bits) {
+-                      *wrapped = 1;
+-                      n = -1;
+-                      goto again;
+-              }
+-      }
+-
+-      return next;
+-}
+-
+-#define for_each_cpu_wrap(cpu, mask, start, wrap)                             \
+-      for ((wrap) = 0, (cpu) = (start)-1;                                     \
+-              (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)),     \
+-              (cpu) < nr_cpumask_bits; )
+-
+ #ifdef CONFIG_SCHED_SMT
+ static inline void set_idle_cores(int cpu, int val)
+@@ -5736,7 +5699,7 @@ unlock:
+ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+ {
+       struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+-      int core, cpu, wrap;
++      int core, cpu;
+       if (!static_branch_likely(&sched_smt_present))
+               return -1;
+@@ -5746,7 +5709,7 @@ static int select_idle_core(struct task_
+       cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+-      for_each_cpu_wrap(core, cpus, target, wrap) {
++      for_each_cpu_wrap(core, cpus, target) {
+               bool idle = true;
+               for_each_cpu(cpu, cpu_smt_mask(core)) {
+@@ -5812,7 +5775,7 @@ static int select_idle_cpu(struct task_s
+       u64 avg_cost, avg_idle = this_rq()->avg_idle;
+       u64 time, cost;
+       s64 delta;
+-      int cpu, wrap;
++      int cpu;
+       this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+       if (!this_sd)
+@@ -5829,7 +5792,7 @@ static int select_idle_cpu(struct task_s
+       time = local_clock();
+-      for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
++      for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+               if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+                       continue;
+               if (idle_cpu(cpu))
+--- a/lib/cpumask.c
++++ b/lib/cpumask.c
+@@ -43,6 +43,38 @@ int cpumask_any_but(const struct cpumask
+ }
+ EXPORT_SYMBOL(cpumask_any_but);
++/**
++ * cpumask_next_wrap - helper to implement for_each_cpu_wrap
++ * @n: the cpu prior to the place to search
++ * @mask: the cpumask pointer
++ * @start: the start point of the iteration
++ * @wrap: assume @n crossing @start terminates the iteration
++ *
++ * Returns >= nr_cpu_ids on completion
++ *
++ * Note: the @wrap argument is required for the start condition when
++ * we cannot assume @start is set in @mask.
++ */
++int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
++{
++      int next;
++
++again:
++      next = cpumask_next(n, mask);
++
++      if (wrap && n < start && next >= start) {
++              return nr_cpumask_bits;
++
++      } else if (next >= nr_cpumask_bits) {
++              wrap = true;
++              n = -1;
++              goto again;
++      }
++
++      return next;
++}
++EXPORT_SYMBOL(cpumask_next_wrap);
++
+ /* These are not inline because of header tangles. */
+ #ifdef CONFIG_CPUMASK_OFFSTACK
+ /**
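
For illustration only, not part of the patch: a userspace sketch of the
wrap-around iteration that the promoted for_each_cpu_wrap()/cpumask_next_wrap()
pair implements, with a plain uint64_t standing in for a cpumask. Every set bit
is visited exactly once, starting at @start and wrapping past the end.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define NR_CPUS 64

/* first set bit strictly above n, or NR_CPUS -- stands in for cpumask_next() */
static int next_set(uint64_t mask, int n)
{
        for (int cpu = n + 1; cpu < NR_CPUS; cpu++)
                if (mask & (1ULL << cpu))
                        return cpu;
        return NR_CPUS;
}

/* mirrors the new bool-taking cpumask_next_wrap() */
static int next_wrap(uint64_t mask, int n, int start, bool wrap)
{
        int next;
again:
        next = next_set(mask, n);

        if (wrap && n < start && next >= start) {
                return NR_CPUS;         /* wrapped back past @start: done */
        } else if (next >= NR_CPUS) {
                wrap = true;            /* fell off the end: restart at bit 0 */
                n = -1;
                goto again;
        }
        return next;
}

#define for_each_cpu_wrap(cpu, mask, start)                             \
        for ((cpu) = next_wrap((mask), (start) - 1, (start), false);    \
             (cpu) < NR_CPUS;                                           \
             (cpu) = next_wrap((mask), (cpu), (start), true))

int main(void)
{
        uint64_t mask = 0x5a;   /* CPUs 1, 3, 4 and 6 */
        int cpu;

        for_each_cpu_wrap(cpu, mask, 4)
                printf("cpu %d\n", cpu);        /* prints 4, 6, 1, 3 */
        return 0;
}
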
diff --git a/queue-4.12/sched-fair-remove-effective_load.patch b/queue-4.12/sched-fair-remove-effective_load.patch
new file mode 100644 (file)
index 0000000..dc272f9
--- /dev/null
@@ -0,0 +1,177 @@
+From 815abf5af45f04f759f12f3172afd15226fd7f71 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@redhat.com>
+Date: Fri, 23 Jun 2017 12:55:30 -0400
+Subject: sched/fair: Remove effective_load()
+
+From: Rik van Riel <riel@redhat.com>
+
+commit 815abf5af45f04f759f12f3172afd15226fd7f71 upstream.
+
+The effective_load() function was only used by the NUMA balancing
+code, and not by the regular load balancing code. Now that the
+NUMA balancing code no longer uses it either, get rid of it.
+
+Signed-off-by: Rik van Riel <riel@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: jhladky@redhat.com
+Cc: linux-kernel@vger.kernel.org
+Link: http://lkml.kernel.org/r/20170623165530.22514-5-riel@redhat.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c |  124 ----------------------------------------------------
+ 1 file changed, 1 insertion(+), 123 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -1381,7 +1381,6 @@ static unsigned long weighted_cpuload(co
+ static unsigned long source_load(int cpu, int type);
+ static unsigned long target_load(int cpu, int type);
+ static unsigned long capacity_of(int cpu);
+-static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
+ /* Cached statistics for all CPUs within a node */
+ struct numa_stats {
+@@ -3044,8 +3043,7 @@ __update_load_avg_cfs_rq(u64 now, int cp
+  * differential update where we store the last value we propagated. This in
+  * turn allows skipping updates if the differential is 'small'.
+  *
+- * Updating tg's load_avg is necessary before update_cfs_share() (which is
+- * done) and effective_load() (which is not done because it is too costly).
++ * Updating tg's load_avg is necessary before update_cfs_share().
+  */
+ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+ {
+@@ -5277,126 +5275,6 @@ static unsigned long cpu_avg_load_per_ta
+       return 0;
+ }
+-#ifdef CONFIG_FAIR_GROUP_SCHED
+-/*
+- * effective_load() calculates the load change as seen from the root_task_group
+- *
+- * Adding load to a group doesn't make a group heavier, but can cause movement
+- * of group shares between cpus. Assuming the shares were perfectly aligned one
+- * can calculate the shift in shares.
+- *
+- * Calculate the effective load difference if @wl is added (subtracted) to @tg
+- * on this @cpu and results in a total addition (subtraction) of @wg to the
+- * total group weight.
+- *
+- * Given a runqueue weight distribution (rw_i) we can compute a shares
+- * distribution (s_i) using:
+- *
+- *   s_i = rw_i / \Sum rw_j                                           (1)
+- *
+- * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
+- * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
+- * shares distribution (s_i):
+- *
+- *   rw_i = {   2,   4,   1,   0 }
+- *   s_i  = { 2/7, 4/7, 1/7,   0 }
+- *
+- * As per wake_affine() we're interested in the load of two CPUs (the CPU the
+- * task used to run on and the CPU the waker is running on), we need to
+- * compute the effect of waking a task on either CPU and, in case of a sync
+- * wakeup, compute the effect of the current task going to sleep.
+- *
+- * So for a change of @wl to the local @cpu with an overall group weight change
+- * of @wl we can compute the new shares distribution (s'_i) using:
+- *
+- *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)                          (2)
+- *
+- * Suppose we're interested in CPUs 0 and 1, and want to compute the load
+- * differences in waking a task to CPU 0. The additional task changes the
+- * weight and shares distributions like:
+- *
+- *   rw'_i = {   3,   4,   1,   0 }
+- *   s'_i  = { 3/8, 4/8, 1/8,   0 }
+- *
+- * We can then compute the difference in effective weight by using:
+- *
+- *   dw_i = S * (s'_i - s_i)                                          (3)
+- *
+- * Where 'S' is the group weight as seen by its parent.
+- *
+- * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
+- * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
+- * 4/7) times the weight of the group.
+- */
+-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
+-{
+-      struct sched_entity *se = tg->se[cpu];
+-
+-      if (!tg->parent)        /* the trivial, non-cgroup case */
+-              return wl;
+-
+-      for_each_sched_entity(se) {
+-              struct cfs_rq *cfs_rq = se->my_q;
+-              long W, w = cfs_rq_load_avg(cfs_rq);
+-
+-              tg = cfs_rq->tg;
+-
+-              /*
+-               * W = @wg + \Sum rw_j
+-               */
+-              W = wg + atomic_long_read(&tg->load_avg);
+-
+-              /* Ensure \Sum rw_j >= rw_i */
+-              W -= cfs_rq->tg_load_avg_contrib;
+-              W += w;
+-
+-              /*
+-               * w = rw_i + @wl
+-               */
+-              w += wl;
+-
+-              /*
+-               * wl = S * s'_i; see (2)
+-               */
+-              if (W > 0 && w < W)
+-                      wl = (w * (long)scale_load_down(tg->shares)) / W;
+-              else
+-                      wl = scale_load_down(tg->shares);
+-
+-              /*
+-               * Per the above, wl is the new se->load.weight value; since
+-               * those are clipped to [MIN_SHARES, ...) do so now. See
+-               * calc_cfs_shares().
+-               */
+-              if (wl < MIN_SHARES)
+-                      wl = MIN_SHARES;
+-
+-              /*
+-               * wl = dw_i = S * (s'_i - s_i); see (3)
+-               */
+-              wl -= se->avg.load_avg;
+-
+-              /*
+-               * Recursively apply this logic to all parent groups to compute
+-               * the final effective load change on the root group. Since
+-               * only the @tg group gets extra weight, all parent groups can
+-               * only redistribute existing shares. @wl is the shift in shares
+-               * resulting from this level per the above.
+-               */
+-              wg = 0;
+-      }
+-
+-      return wl;
+-}
+-#else
+-
+-static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
+-{
+-      return wl;
+-}
+-
+-#endif
+-
+ static void record_wakee(struct task_struct *p)
+ {
+       /*
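
For illustration only: a quick standalone check of the worked example in the
effective_load() comment being removed above (rw_i = {2, 4, 1, 0}, one extra
task of weight 1 on CPU 0), confirming the quoted 5/56 and -4/56 shifts in
group shares.

#include <stdio.h>

int main(void)
{
        double rw[4] = { 2, 4, 1, 0 };
        double sum  = rw[0] + rw[1] + rw[2] + rw[3];    /* \Sum rw_j = 7 */

        double s0  = rw[0] / sum;                       /* s_0  = 2/7 */
        double s1  = rw[1] / sum;                       /* s_1  = 4/7 */
        double s0p = (rw[0] + 1) / (sum + 1);           /* s'_0 = 3/8 */
        double s1p = rw[1] / (sum + 1);                 /* s'_1 = 4/8 */

        /* dw_i = S * (s'_i - s_i), expressed here in units of the group weight S */
        printf("dw_0 = %+f (expected +5/56 = %+f)\n", s0p - s0,  5.0 / 56);
        printf("dw_1 = %+f (expected -4/56 = %+f)\n", s1p - s1, -4.0 / 56);
        return 0;
}
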
diff --git a/queue-4.12/sched-fair-simplify-wake_affine-for-the-single-socket-case.patch b/queue-4.12/sched-fair-simplify-wake_affine-for-the-single-socket-case.patch
new file mode 100644 (file)
index 0000000..4d06404
--- /dev/null
@@ -0,0 +1,64 @@
+From 7d894e6e34a5cdd12309c7e4a3f830277ad4b7bf Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@redhat.com>
+Date: Fri, 23 Jun 2017 12:55:28 -0400
+Subject: sched/fair: Simplify wake_affine() for the single socket case
+
+From: Rik van Riel <riel@redhat.com>
+
+commit 7d894e6e34a5cdd12309c7e4a3f830277ad4b7bf upstream.
+
+When 'this_cpu' and 'prev_cpu' are in the same socket, select_idle_sibling()
+will do its thing regardless of the return value of wake_affine().
+
+Just return true and don't look at all the other things.
+
+Signed-off-by: Rik van Riel <riel@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: jhladky@redhat.com
+Cc: linux-kernel@vger.kernel.org
+Link: http://lkml.kernel.org/r/20170623165530.22514-3-riel@redhat.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c |   13 ++++++++++++-
+ 1 file changed, 12 insertions(+), 1 deletion(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -5399,6 +5399,13 @@ static int wake_affine(struct sched_doma
+       this_load = target_load(this_cpu, idx);
+       /*
++       * Common case: CPUs are in the same socket, and select_idle_sibling()
++       * will do its thing regardless of what we return:
++       */
++      if (cpus_share_cache(prev_cpu, this_cpu))
++              return true;
++
++      /*
+        * If sync wakeup then subtract the (maximum possible)
+        * effect of the currently running task from the load
+        * of the current CPU:
+@@ -5986,11 +5993,15 @@ select_task_rq_fair(struct task_struct *
+       if (affine_sd) {
+               sd = NULL; /* Prefer wake_affine over balance flags */
+-              if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
++              if (cpu == prev_cpu)
++                      goto pick_cpu;
++
++              if (wake_affine(affine_sd, p, prev_cpu, sync))
+                       new_cpu = cpu;
+       }
+       if (!sd) {
++ pick_cpu:
+               if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+                       new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
diff --git a/queue-4.12/sched-numa-hide-numa_wake_affine-from-up-build.patch b/queue-4.12/sched-numa-hide-numa_wake_affine-from-up-build.patch
new file mode 100644 (file)
index 0000000..d81ed10
--- /dev/null
@@ -0,0 +1,48 @@
+From ff801b716effd652f420204eddb36f6e4a716819 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 29 Jun 2017 08:25:52 +0200
+Subject: sched/numa: Hide numa_wake_affine() from UP build
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit ff801b716effd652f420204eddb36f6e4a716819 upstream.
+
+Stephen reported the following build warning in UP:
+
+kernel/sched/fair.c:2657:9: warning: 'struct sched_domain' declared inside
+parameter list
+         ^
+/home/sfr/next/next/kernel/sched/fair.c:2657:9: warning: its scope is only this
+definition or declaration, which is probably not what you want
+
+Hide the numa_wake_affine() inline stub on UP builds to get rid of it.
+
+Fixes: 3fed382b46ba ("sched/numa: Implement NUMA node level wake_affine()")
+Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -2651,12 +2651,14 @@ static inline void account_numa_dequeue(
+ {
+ }
++#ifdef CONFIG_SMP
+ static inline bool numa_wake_affine(struct sched_domain *sd,
+                                   struct task_struct *p, int this_cpu,
+                                   int prev_cpu, int sync)
+ {
+       return true;
+ }
++#endif /* !SMP */
+ #endif /* CONFIG_NUMA_BALANCING */
+ static void
diff --git a/queue-4.12/sched-numa-implement-numa-node-level-wake_affine.patch b/queue-4.12/sched-numa-implement-numa-node-level-wake_affine.patch
new file mode 100644 (file)
index 0000000..3fcc443
--- /dev/null
@@ -0,0 +1,195 @@
+From 3fed382b46baac83703130fe4cd3d9147f427fb9 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@redhat.com>
+Date: Fri, 23 Jun 2017 12:55:29 -0400
+Subject: sched/numa: Implement NUMA node level wake_affine()
+
+From: Rik van Riel <riel@redhat.com>
+
+commit 3fed382b46baac83703130fe4cd3d9147f427fb9 upstream.
+
+Since select_idle_sibling() can place a task anywhere on a socket,
+comparing loads between individual CPU cores makes no real sense
+for deciding whether to do an affine wakeup across sockets, either.
+
+Instead, compare the load between the sockets in a similar way to how
+the load balancer and the NUMA balancing code do.
+
+Signed-off-by: Rik van Riel <riel@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: jhladky@redhat.com
+Cc: linux-kernel@vger.kernel.org
+Link: http://lkml.kernel.org/r/20170623165530.22514-4-riel@redhat.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c |  130 ++++++++++++++++++++++++++++------------------------
+ 1 file changed, 71 insertions(+), 59 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -2585,6 +2585,60 @@ void task_tick_numa(struct rq *rq, struc
+               }
+       }
+ }
++
++/*
++ * Can a task be moved from prev_cpu to this_cpu without causing a load
++ * imbalance that would trigger the load balancer?
++ */
++static inline bool numa_wake_affine(struct sched_domain *sd,
++                                  struct task_struct *p, int this_cpu,
++                                  int prev_cpu, int sync)
++{
++      struct numa_stats prev_load, this_load;
++      s64 this_eff_load, prev_eff_load;
++
++      update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
++      update_numa_stats(&this_load, cpu_to_node(this_cpu));
++
++      /*
++       * If sync wakeup then subtract the (maximum possible)
++       * effect of the currently running task from the load
++       * of the current CPU:
++       */
++      if (sync) {
++              unsigned long current_load = task_h_load(current);
++
++              if (this_load.load > current_load)
++                      this_load.load -= current_load;
++              else
++                      this_load.load = 0;
++      }
++
++      /*
++       * In low-load situations, where this_cpu's node is idle due to the
++       * sync cause above having dropped this_load.load to 0, move the task.
++       * Moving to an idle socket will not create a bad imbalance.
++       *
++       * Otherwise check if the nodes are near enough in load to allow this
++       * task to be woken on this_cpu's node.
++       */
++      if (this_load.load > 0) {
++              unsigned long task_load = task_h_load(p);
++
++              this_eff_load = 100;
++              this_eff_load *= prev_load.compute_capacity;
++
++              prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
++              prev_eff_load *= this_load.compute_capacity;
++
++              this_eff_load *= this_load.load + task_load;
++              prev_eff_load *= prev_load.load - task_load;
++
++              return this_eff_load <= prev_eff_load;
++      }
++
++      return true;
++}
+ #else
+ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+ {
+@@ -2597,6 +2651,13 @@ static inline void account_numa_enqueue(
+ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+ {
+ }
++
++static inline bool numa_wake_affine(struct sched_domain *sd,
++                                  struct task_struct *p, int this_cpu,
++                                  int prev_cpu, int sync)
++{
++      return true;
++}
+ #endif /* CONFIG_NUMA_BALANCING */
+ static void
+@@ -5386,74 +5447,25 @@ static int wake_wide(struct task_struct
+ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+                      int prev_cpu, int sync)
+ {
+-      s64 this_load, load;
+-      s64 this_eff_load, prev_eff_load;
+-      int idx, this_cpu;
+-      struct task_group *tg;
+-      unsigned long weight;
+-      int balanced;
+-
+-      idx       = sd->wake_idx;
+-      this_cpu  = smp_processor_id();
+-      load      = source_load(prev_cpu, idx);
+-      this_load = target_load(this_cpu, idx);
++      int this_cpu = smp_processor_id();
++      bool affine = false;
+       /*
+        * Common case: CPUs are in the same socket, and select_idle_sibling()
+        * will do its thing regardless of what we return:
+        */
+       if (cpus_share_cache(prev_cpu, this_cpu))
+-              return true;
+-
+-      /*
+-       * If sync wakeup then subtract the (maximum possible)
+-       * effect of the currently running task from the load
+-       * of the current CPU:
+-       */
+-      if (sync) {
+-              tg = task_group(current);
+-              weight = current->se.avg.load_avg;
+-
+-              this_load += effective_load(tg, this_cpu, -weight, -weight);
+-              load += effective_load(tg, prev_cpu, 0, -weight);
+-      }
+-
+-      tg = task_group(p);
+-      weight = p->se.avg.load_avg;
+-
+-      /*
+-       * In low-load situations, where prev_cpu is idle and this_cpu is idle
+-       * due to the sync cause above having dropped this_load to 0, we'll
+-       * always have an imbalance, but there's really nothing you can do
+-       * about that, so that's good too.
+-       *
+-       * Otherwise check if either cpus are near enough in load to allow this
+-       * task to be woken on this_cpu.
+-       */
+-      this_eff_load = 100;
+-      this_eff_load *= capacity_of(prev_cpu);
+-
+-      prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+-      prev_eff_load *= capacity_of(this_cpu);
+-
+-      if (this_load > 0) {
+-              this_eff_load *= this_load +
+-                      effective_load(tg, this_cpu, weight, weight);
+-
+-              prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+-      }
+-
+-      balanced = this_eff_load <= prev_eff_load;
++              affine = true;
++      else
++              affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
+       schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
++      if (affine) {
++              schedstat_inc(sd->ttwu_move_affine);
++              schedstat_inc(p->se.statistics.nr_wakeups_affine);
++      }
+-      if (!balanced)
+-              return 0;
+-
+-      schedstat_inc(sd->ttwu_move_affine);
+-      schedstat_inc(p->se.statistics.nr_wakeups_affine);
+-
+-      return 1;
++      return affine;
+ }
+ static inline int task_util(struct task_struct *p);
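
For illustration only, not part of the patch: a standalone sketch of the
node-level comparison added as numa_wake_affine(). The wakeup is treated as
affine when loading this_cpu's node with the task, scaled by capacity and
biased by imbalance_pct, is no worse than leaving the task on prev_cpu's node;
all numbers in main() are made up.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct node_stats {
        int64_t load;                   /* summed runnable load of the node */
        int64_t compute_capacity;       /* summed CPU capacity of the node */
};

static bool numa_wake_affine(struct node_stats this_node,
                             struct node_stats prev_node,
                             int64_t task_load, int64_t current_load,
                             int imbalance_pct, bool sync)
{
        int64_t this_eff_load, prev_eff_load;

        /* sync wakeup: the waker is about to sleep, so discount its load */
        if (sync) {
                if (this_node.load > current_load)
                        this_node.load -= current_load;
                else
                        this_node.load = 0;
        }

        /* moving to an idle node cannot create a bad imbalance */
        if (this_node.load == 0)
                return true;

        this_eff_load = 100 * prev_node.compute_capacity
                        * (this_node.load + task_load);
        prev_eff_load = (100 + (imbalance_pct - 100) / 2)
                        * this_node.compute_capacity
                        * (prev_node.load - task_load);

        return this_eff_load <= prev_eff_load;
}

int main(void)
{
        struct node_stats this_node = { .load = 2048, .compute_capacity = 4096 };
        struct node_stats prev_node = { .load = 3072, .compute_capacity = 4096 };

        /* task_load 512, waker load 256, typical imbalance_pct 125, non-sync */
        printf("affine wakeup: %s\n",
               numa_wake_affine(this_node, prev_node, 512, 256, 125, false) ?
               "yes" : "no");
        return 0;
}
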
diff --git a/queue-4.12/sched-numa-override-part-of-migrate_degrades_locality-when-idle-balancing.patch b/queue-4.12/sched-numa-override-part-of-migrate_degrades_locality-when-idle-balancing.patch
new file mode 100644 (file)
index 0000000..a563c60
--- /dev/null
@@ -0,0 +1,47 @@
+From 739294fb03f590401bbd7faa6d31a507e3ffada5 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@redhat.com>
+Date: Fri, 23 Jun 2017 12:55:27 -0400
+Subject: sched/numa: Override part of migrate_degrades_locality() when idle balancing
+
+From: Rik van Riel <riel@redhat.com>
+
+commit 739294fb03f590401bbd7faa6d31a507e3ffada5 upstream.
+
+Several tests in the NAS benchmark seem to run a lot slower with
+NUMA balancing enabled than with NUMA balancing disabled. The
+slower run time corresponds with increased idle time.
+
+Overriding the final test of migrate_degrades_locality (but still
+doing the other NUMA tests first) seems to improve performance
+of those benchmarks.
+
+Reported-by: Jirka Hladky <jhladky@redhat.com>
+Signed-off-by: Rik van Riel <riel@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: linux-kernel@vger.kernel.org
+Link: http://lkml.kernel.org/r/20170623165530.22514-2-riel@redhat.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -6661,6 +6661,10 @@ static int migrate_degrades_locality(str
+       if (dst_nid == p->numa_preferred_nid)
+               return 0;
++      /* Leaving a core idle is often worse than degrading locality. */
++      if (env->idle != CPU_NOT_IDLE)
++              return -1;
++
+       if (numa_group) {
+               src_faults = group_faults(p, src_nid);
+               dst_faults = group_faults(p, dst_nid);
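
For illustration only: a simplified standalone sketch of the decision order in
migrate_degrades_locality() after this patch. The preferred-node checks still
run first; the new early return then says "don't care" (-1) whenever the
destination CPU is idle, and only otherwise falls through to the fault-count
comparison.

#include <stdio.h>
#include <stdbool.h>

/* 1 = migration degrades locality, 0 = improves it, -1 = don't care */
static int migrate_degrades_locality(int src_nid, int dst_nid, int preferred_nid,
                                     bool dst_cpu_idle,
                                     long src_faults, long dst_faults)
{
        if (src_nid == dst_nid)
                return -1;
        if (src_nid == preferred_nid)           /* moving away from the preferred node */
                return 1;
        if (dst_nid == preferred_nid)           /* moving towards the preferred node */
                return 0;
        if (dst_cpu_idle)                       /* new: leaving a core idle is worse */
                return -1;
        return dst_faults < src_faults;         /* fall back to NUMA fault counts */
}

int main(void)
{
        /* idle destination: locality is ignored even though dst has fewer faults */
        printf("%d\n", migrate_degrades_locality(0, 1, 2, true, 100, 50));
        return 0;
}
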
diff --git a/queue-4.12/sched-numa-use-down_read_trylock-for-the-mmap_sem.patch b/queue-4.12/sched-numa-use-down_read_trylock-for-the-mmap_sem.patch
new file mode 100644 (file)
index 0000000..901da92
--- /dev/null
@@ -0,0 +1,63 @@
+From 8655d5497735b288f8a9b458bd22e7d1bf95bb61 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Mon, 15 May 2017 15:13:16 +0200
+Subject: sched/numa: Use down_read_trylock() for the mmap_sem
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit 8655d5497735b288f8a9b458bd22e7d1bf95bb61 upstream.
+
+A customer has reported a soft-lockup when running an intensive
+memory stress test, where the trace on multiple CPUs looks like this:
+
+ RIP: 0010:[<ffffffff810c53fe>]
+  [<ffffffff810c53fe>] native_queued_spin_lock_slowpath+0x10e/0x190
+...
+ Call Trace:
+  [<ffffffff81182d07>] queued_spin_lock_slowpath+0x7/0xa
+  [<ffffffff811bc331>] change_protection_range+0x3b1/0x930
+  [<ffffffff811d4be8>] change_prot_numa+0x18/0x30
+  [<ffffffff810adefe>] task_numa_work+0x1fe/0x310
+  [<ffffffff81098322>] task_work_run+0x72/0x90
+
+Further investigation showed that the lock contention here is pmd_lock().
+
+The task_numa_work() function makes sure that only one thread is allowed to
+perform the work in a single scan period (via cmpxchg), but if there's a thread with
+mmap_sem locked for writing for several periods, multiple threads in
+task_numa_work() can build up a convoy waiting for mmap_sem for read and then
+all get unblocked at once.
+
+This patch changes the down_read() to the trylock version, which prevents the
+build up. For a workload experiencing mmap_sem contention, it's probably better
+to postpone the NUMA balancing work anyway. This seems to have fixed the soft
+lockups involving pmd_lock(), which is in line with the convoy theory.
+
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Acked-by: Rik van Riel <riel@redhat.com>
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/20170515131316.21909-1-vbabka@suse.cz
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -2469,7 +2469,8 @@ void task_numa_work(struct callback_head
+               return;
+-      down_read(&mm->mmap_sem);
++      if (!down_read_trylock(&mm->mmap_sem))
++              return;
+       vma = find_vma(mm, start);
+       if (!vma) {
+               reset_ptenuma_scan(p);
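
For illustration only, not part of the patch: a pthreads sketch of the pattern
task_numa_work() switches to above. Periodic, best-effort work takes the reader
side with a trylock and simply postpones itself when a writer holds the lock,
instead of queueing behind it and forming the convoy described in the changelog.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

/* returns true if the scan ran, false if it was postponed to the next period */
static bool numa_scan_work(void)
{
        if (pthread_rwlock_tryrdlock(&mmap_sem) != 0)
                return false;   /* a writer is active: back off, do not convoy */

        /* ... walk VMAs and mark a range for NUMA hinting faults ... */

        pthread_rwlock_unlock(&mmap_sem);
        return true;
}

int main(void)
{
        printf("scan %s\n", numa_scan_work() ? "ran" : "postponed");
        return 0;
}
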
diff --git a/queue-4.12/series b/queue-4.12/series
index e99ca45d4bceb38025870f5a54ef14ca1fc4de37..991517d0cc8608a0cb209c4614d99efafad8bd47 100644 (file)
@@ -15,3 +15,11 @@ tpm-issue-a-tpm2_shutdown-for-tpm2-devices.patch
 tpm-fix-a-kernel-memory-leak-in-tpm-sysfs.c.patch
 powerpc-powernv-fix-cpu_hotplug-n-idle.c-compile-error.patch
 x86-uaccess-optimize-copy_user_enhanced_fast_string-for-short-strings.patch
+sched-fair-cpumask-export-for_each_cpu_wrap.patch
+sched-core-implement-new-approach-to-scale-select_idle_cpu.patch
+sched-numa-use-down_read_trylock-for-the-mmap_sem.patch
+sched-numa-override-part-of-migrate_degrades_locality-when-idle-balancing.patch
+sched-fair-simplify-wake_affine-for-the-single-socket-case.patch
+sched-numa-implement-numa-node-level-wake_affine.patch
+sched-fair-remove-effective_load.patch
+sched-numa-hide-numa_wake_affine-from-up-build.patch