--- /dev/null
+From c6508a39640b9a27fc2bc10cb708152672c82045 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Fri, 14 Apr 2017 14:20:05 +0200
+Subject: sched/fair, cpumask: Export for_each_cpu_wrap()
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit c743f0a5c50f2fcbc628526279cfa24f3dabe182 upstream.
+
+More users for for_each_cpu_wrap() have appeared. Promote the construct
+to the generic cpumask interface.
+
+The implementation is slightly modified to reduce arguments.
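+
+For illustration only, not part of the patch: a minimal userspace model
+of the wrap semantics, with an unsigned long standing in for the cpumask
+and NBITS for nr_cpu_ids. The helper mirrors the new cpumask_next_wrap()
+and the loop mirrors the new three-argument for_each_cpu_wrap(); the old
+fair.c-private macro additionally required the caller to supply a scratch
+'wrap' flag.
+
+	#include <stdbool.h>
+	#include <stdio.h>
+
+	#define NBITS 8			/* stand-in for nr_cpu_ids */
+
+	/* simplified cpumask_next_wrap(): unsigned long instead of a cpumask */
+	static int next_wrap(int n, unsigned long mask, int start, bool wrap)
+	{
+		int next;
+	again:
+		/* find the next set bit strictly after n */
+		for (next = n + 1; next < NBITS; next++)
+			if (mask & (1UL << next))
+				break;
+
+		if (wrap && n < start && next >= start)
+			return NBITS;	/* wrapped past the start: done */
+		if (next >= NBITS) {
+			wrap = true;	/* fell off the end: restart at bit 0 */
+			n = -1;
+			goto again;
+		}
+		return next;
+	}
+
+	int main(void)
+	{
+		unsigned long mask = 0xb1;	/* bits 0, 4, 5, 7 set */
+		int start = 5, cpu;
+
+		/* mirrors for_each_cpu_wrap(cpu, mask, start): prints 5 7 0 4 */
+		for (cpu = next_wrap(start - 1, mask, start, false);
+		     cpu < NBITS;
+		     cpu = next_wrap(cpu, mask, start, true))
+			printf("%d\n", cpu);
+		return 0;
+	}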
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Lauro Ramos Venancio <lvenanci@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: lwang@redhat.com
+Link: http://lkml.kernel.org/r/20170414122005.o35me2h5nowqkxbv@hirez.programming.kicks-ass.net
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/cpumask.h | 17 +++++++++++++++++
+ kernel/sched/fair.c | 45 ++++-----------------------------------------
+ lib/cpumask.c | 32 ++++++++++++++++++++++++++++++++
+ 3 files changed, 53 insertions(+), 41 deletions(-)
+
+--- a/include/linux/cpumask.h
++++ b/include/linux/cpumask.h
+@@ -236,6 +236,23 @@ unsigned int cpumask_local_spread(unsign
+ (cpu) = cpumask_next_zero((cpu), (mask)), \
+ (cpu) < nr_cpu_ids;)
+
++extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
++
++/**
++ * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
++ * @cpu: the (optionally unsigned) integer iterator
++ * @mask: the cpumask pointer
++ * @start: the start location
++ *
++ * The implementation does not assume any bit in @mask is set (including @start).
++ *
++ * After the loop, cpu is >= nr_cpu_ids.
++ */
++#define for_each_cpu_wrap(cpu, mask, start) \
++ for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \
++ (cpu) < nr_cpumask_bits; \
++ (cpu) = cpumask_next_wrap((cpu), (mask), (start), true))
++
+ /**
+ * for_each_cpu_and - iterate over every cpu in both masks
+ * @cpu: the (optionally unsigned) integer iterator
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -5310,43 +5310,6 @@ find_idlest_cpu(struct sched_group *grou
+ return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+ }
+
+-/*
+- * Implement a for_each_cpu() variant that starts the scan at a given cpu
+- * (@start), and wraps around.
+- *
+- * This is used to scan for idle CPUs; such that not all CPUs looking for an
+- * idle CPU find the same CPU. The down-side is that tasks tend to cycle
+- * through the LLC domain.
+- *
+- * Especially tbench is found sensitive to this.
+- */
+-
+-static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
+-{
+- int next;
+-
+-again:
+- next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
+-
+- if (*wrapped) {
+- if (next >= start)
+- return nr_cpumask_bits;
+- } else {
+- if (next >= nr_cpumask_bits) {
+- *wrapped = 1;
+- n = -1;
+- goto again;
+- }
+- }
+-
+- return next;
+-}
+-
+-#define for_each_cpu_wrap(cpu, mask, start, wrap) \
+- for ((wrap) = 0, (cpu) = (start)-1; \
+- (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
+- (cpu) < nr_cpumask_bits; )
+-
+ #ifdef CONFIG_SCHED_SMT
+
+ static inline void set_idle_cores(int cpu, int val)
+@@ -5406,14 +5369,14 @@ unlock:
+ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+ {
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+- int core, cpu, wrap;
++ int core, cpu;
+
+ if (!test_idle_cores(target, false))
+ return -1;
+
+ cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+
+- for_each_cpu_wrap(core, cpus, target, wrap) {
++ for_each_cpu_wrap(core, cpus, target) {
+ bool idle = true;
+
+ for_each_cpu(cpu, cpu_smt_mask(core)) {
+@@ -5476,7 +5439,7 @@ static int select_idle_cpu(struct task_s
+ u64 avg_cost, avg_idle = this_rq()->avg_idle;
+ u64 time, cost;
+ s64 delta;
+- int cpu, wrap;
++ int cpu;
+
+ this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+ if (!this_sd)
+@@ -5493,7 +5456,7 @@ static int select_idle_cpu(struct task_s
+
+ time = local_clock();
+
+- for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
++ for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(cpu))
+--- a/lib/cpumask.c
++++ b/lib/cpumask.c
+@@ -43,6 +43,38 @@ int cpumask_any_but(const struct cpumask
+ }
+ EXPORT_SYMBOL(cpumask_any_but);
+
++/**
++ * cpumask_next_wrap - helper to implement for_each_cpu_wrap
++ * @n: the cpu prior to the place to search
++ * @mask: the cpumask pointer
++ * @start: the start point of the iteration
++ * @wrap: assume @n crossing @start terminates the iteration
++ *
++ * Returns >= nr_cpu_ids on completion
++ *
++ * Note: the @wrap argument is required for the start condition when
++ * we cannot assume @start is set in @mask.
++ */
++int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
++{
++ int next;
++
++again:
++ next = cpumask_next(n, mask);
++
++ if (wrap && n < start && next >= start) {
++ return nr_cpumask_bits;
++
++ } else if (next >= nr_cpumask_bits) {
++ wrap = true;
++ n = -1;
++ goto again;
++ }
++
++ return next;
++}
++EXPORT_SYMBOL(cpumask_next_wrap);
++
+ /* These are not inline because of header tangles. */
+ #ifdef CONFIG_CPUMASK_OFFSTACK
+ /**
--- /dev/null
+From f32d782e31bf079f600dcec126ed117b0577e85c Mon Sep 17 00:00:00 2001
+From: Lauro Ramos Venancio <lvenanci@redhat.com>
+Date: Thu, 20 Apr 2017 16:51:40 -0300
+Subject: sched/topology: Optimize build_group_mask()
+
+From: Lauro Ramos Venancio <lvenanci@redhat.com>
+
+commit f32d782e31bf079f600dcec126ed117b0577e85c upstream.
+
+The group mask is always used in intersection with the group CPUs. So,
+when building the group mask, we don't have to care about CPUs that are
+not part of the group.
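+
+For illustration only, not part of the patch: a toy model of why this is
+safe, under the changelog's premise that the group mask is only ever
+consumed intersected with the group's CPUs. FULL_SPAN, GROUP_SPAN and
+cpu_qualifies() are hypothetical stand-ins for sched_domain_span(sd),
+sched_group_cpus(sg) and the sibling-span test.
+
+	#include <assert.h>
+	#include <stdio.h>
+
+	#define NBITS		8
+	#define FULL_SPAN	0xffUL	/* stand-in for sched_domain_span(sd) */
+	#define GROUP_SPAN	0x0fUL	/* stand-in for sched_group_cpus(sg)  */
+
+	/* arbitrary per-cpu predicate standing in for the sibling test */
+	static int cpu_qualifies(int cpu)
+	{
+		return cpu % 3 != 0;
+	}
+
+	static unsigned long build_mask(unsigned long span)
+	{
+		unsigned long mask = 0;
+		int i;
+
+		for (i = 0; i < NBITS; i++)
+			if ((span & (1UL << i)) && cpu_qualifies(i))
+				mask |= 1UL << i;
+		return mask;
+	}
+
+	int main(void)
+	{
+		unsigned long old_mask = build_mask(FULL_SPAN);	/* walk the domain span */
+		unsigned long new_mask = build_mask(GROUP_SPAN);	/* walk only the group  */
+
+		/*
+		 * Every consumer intersects with the group span; within
+		 * that span the two results are identical.
+		 */
+		assert((old_mask & GROUP_SPAN) == (new_mask & GROUP_SPAN));
+		printf("old=%#lx new=%#lx\n", old_mask, new_mask);
+		return 0;
+	}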
+
+Signed-off-by: Lauro Ramos Venancio <lvenanci@redhat.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: lwang@redhat.com
+Cc: riel@redhat.com
+Link: http://lkml.kernel.org/r/1492717903-5195-2-git-send-email-lvenanci@redhat.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/core.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -6113,12 +6113,12 @@ enum s_alloc {
+ */
+ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+ {
+- const struct cpumask *span = sched_domain_span(sd);
++ const struct cpumask *sg_span = sched_group_cpus(sg);
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
+
+- for_each_cpu(i, span) {
++ for_each_cpu(i, sg_span) {
+ sibling = *per_cpu_ptr(sdd->sd, i);
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+ continue;