From: Greg Kroah-Hartman
Date: Tue, 18 Feb 2014 22:37:04 +0000 (-0800)
Subject: 3.4-stable patches
X-Git-Tag: v3.4.81~3
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=ff08ad4408bf17367ded4d225d463518f8ae2d42;p=thirdparty%2Fkernel%2Fstable-queue.git

3.4-stable patches

added patches:
    ib-qib-convert-qib_user_sdma_pin_pages-to-use-get_user_pages_fast.patch
    sched-nohz-fix-rq-cpu_load-calculations-some-more.patch
    sched-nohz-fix-rq-cpu_load-calculations.patch
---
diff --git a/queue-3.4/ib-qib-convert-qib_user_sdma_pin_pages-to-use-get_user_pages_fast.patch b/queue-3.4/ib-qib-convert-qib_user_sdma_pin_pages-to-use-get_user_pages_fast.patch
new file mode 100644
index 00000000000..15cf9410524
--- /dev/null
+++ b/queue-3.4/ib-qib-convert-qib_user_sdma_pin_pages-to-use-get_user_pages_fast.patch
@@ -0,0 +1,60 @@
+From 603e7729920e42b3c2f4dbfab9eef4878cb6e8fa Mon Sep 17 00:00:00 2001
+From: Jan Kara
+Date: Fri, 4 Oct 2013 09:29:12 -0400
+Subject: IB/qib: Convert qib_user_sdma_pin_pages() to use get_user_pages_fast()
+
+From: Jan Kara
+
+commit 603e7729920e42b3c2f4dbfab9eef4878cb6e8fa upstream.
+
+qib_user_sdma_queue_pkts() gets called with mmap_sem held for
+writing. Except for get_user_pages() deep down in
+qib_user_sdma_pin_pages() we don't seem to need mmap_sem at all. Even
+more interestingly the function qib_user_sdma_queue_pkts() (and also
+qib_user_sdma_coalesce() called somewhat later) call copy_from_user()
+which can hit a page fault and we deadlock on trying to get mmap_sem
+when handling that fault.
+
+So just make qib_user_sdma_pin_pages() use get_user_pages_fast() and
+leave mmap_sem locking for mm.
+
+This deadlock has actually been observed in the wild when the node
+is under memory pressure.
+
+Reviewed-by: Mike Marciniszyn
+Signed-off-by: Jan Kara
+Signed-off-by: Roland Dreier
+[Backported to 3.4: (Thank to Ben Hutchings)
+ - Adjust context
+ - Adjust indentation and nr_pages argument in qib_user_sdma_pin_pages()]
+Signed-off-by: Ben Hutchings
+Signed-off-by: Mike Marciniszyn
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/infiniband/hw/qib/qib_user_sdma.c |    6 +-----
+ 1 file changed, 1 insertion(+), 5 deletions(-)
+
+--- a/drivers/infiniband/hw/qib/qib_user_sdma.c
++++ b/drivers/infiniband/hw/qib/qib_user_sdma.c
+@@ -284,8 +284,7 @@ static int qib_user_sdma_pin_pages(const
+         int j;
+         int ret;
+ 
+-        ret = get_user_pages(current, current->mm, addr,
+-                             npages, 0, 1, pages, NULL);
++        ret = get_user_pages_fast(addr, npages, 0, pages);
+ 
+         if (ret != npages) {
+                 int i;
+@@ -830,10 +829,7 @@ int qib_user_sdma_writev(struct qib_ctxt
+         while (dim) {
+                 const int mxp = 8;
+ 
+-                down_write(&current->mm->mmap_sem);
+                 ret = qib_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp);
+-                up_write(&current->mm->mmap_sem);
+-
+                 if (ret <= 0)
+                         goto done_unlock;
+                 else {
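
The deadlock closed by the patch above is worth spelling out:
qib_user_sdma_writev() held mmap_sem for writing across
qib_user_sdma_queue_pkts(), which calls copy_from_user(); if that copy
faults, the fault handler tries to take mmap_sem again and the task
deadlocks against itself. The sketch below condenses the before/after
calling conventions using the 3.4-era signatures; it is illustrative
only, not code from the driver.

/* Before: the caller serializes with mmap_sem itself. */
down_write(&current->mm->mmap_sem);
ret = get_user_pages(current, current->mm, addr, npages,
                     0 /* write */, 1 /* force */, pages, NULL);
/*
 * Any copy_from_user() fault taken while the lock is held re-enters
 * the mm fault path, which blocks on mmap_sem: self-deadlock.
 */
up_write(&current->mm->mmap_sem);

/* After: no caller-side locking. */
ret = get_user_pages_fast(addr, npages, 0 /* write */, pages);

get_user_pages_fast() first walks the page tables locklessly and only
falls back to taking mmap_sem itself on the slow path, so the caller
no longer holds the lock across code that may fault.
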
diff --git a/queue-3.4/sched-nohz-fix-rq-cpu_load-calculations-some-more.patch b/queue-3.4/sched-nohz-fix-rq-cpu_load-calculations-some-more.patch
new file mode 100644
index 00000000000..5cb51f101ae
--- /dev/null
+++ b/queue-3.4/sched-nohz-fix-rq-cpu_load-calculations-some-more.patch
@@ -0,0 +1,141 @@
+From 5aaa0b7a2ed5b12692c9ffb5222182bd558d3146 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra
+Date: Thu, 17 May 2012 17:15:29 +0200
+Subject: sched/nohz: Fix rq->cpu_load calculations some more
+
+From: Peter Zijlstra
+
+commit 5aaa0b7a2ed5b12692c9ffb5222182bd558d3146 upstream.
+
+Follow up on commit 556061b00 ("sched/nohz: Fix rq->cpu_load[]
+calculations") since while that fixed the busy case it regressed the
+mostly idle case.
+
+Add a callback from the nohz exit to also age the rq->cpu_load[]
+array. This closes the hole where either there was no nohz load
+balance pass during the nohz, or there was a 'significant' amount of
+idle time between the last nohz balance and the nohz exit.
+
+So we'll update unconditionally from the tick to not insert any
+accidental 0 load periods while busy, and we try and catch up from
+nohz idle balance and nohz exit. Both these are still prone to missing
+a jiffy, but that has always been the case.
+
+Signed-off-by: Peter Zijlstra
+Cc: pjt@google.com
+Cc: Venkatesh Pallipadi
+Link: http://lkml.kernel.org/n/tip-kt0trz0apodbf84ucjfdbr1a@git.kernel.org
+Signed-off-by: Ingo Molnar
+Cc: Li Zefan
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ include/linux/sched.h    |    1 
+ kernel/sched/core.c      |   53 ++++++++++++++++++++++++++++++++++++++---------
+ kernel/time/tick-sched.c |    1 
+ 3 files changed, 45 insertions(+), 10 deletions(-)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -144,6 +144,7 @@ extern unsigned long this_cpu_load(void)
+ 
+ 
+ extern void calc_global_load(unsigned long ticks);
++extern void update_cpu_load_nohz(void);
+ 
+ extern unsigned long get_parent_ip(unsigned long addr);
+ 
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -2649,25 +2649,32 @@ static void __update_cpu_load(struct rq
+         sched_avg_update(this_rq);
+ }
+ 
++#ifdef CONFIG_NO_HZ
++/*
++ * There is no sane way to deal with nohz on smp when using jiffies because the
++ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
++ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
++ *
++ * Therefore we cannot use the delta approach from the regular tick since that
++ * would seriously skew the load calculation. However we'll make do for those
++ * updates happening while idle (nohz_idle_balance) or coming out of idle
++ * (tick_nohz_idle_exit).
++ *
++ * This means we might still be one tick off for nohz periods.
++ */
++
+ /*
+  * Called from nohz_idle_balance() to update the load ratings before doing the
+  * idle balance.
+  */
+ void update_idle_cpu_load(struct rq *this_rq)
+ {
+-        unsigned long curr_jiffies = jiffies;
++        unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+         unsigned long load = this_rq->load.weight;
+         unsigned long pending_updates;
+ 
+         /*
+-         * Bloody broken means of dealing with nohz, but better than nothing..
+-         * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
+-         * update and see 0 difference the one time and 2 the next, even though
+-         * we ticked at roughtly the same rate.
+-         *
+-         * Hence we only use this from nohz_idle_balance() and skip this
+-         * nonsense when called from the scheduler_tick() since that's
+-         * guaranteed a stable rate.
++         * bail if there's load or we're actually up-to-date.
+          */
+         if (load || curr_jiffies == this_rq->last_load_update_tick)
+                 return;
+@@ -2679,12 +2686,38 @@ void update_idle_cpu_load(struct rq *thi
+ }
+ 
+ /*
++ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
++ */
++void update_cpu_load_nohz(void)
++{
++        struct rq *this_rq = this_rq();
++        unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
++        unsigned long pending_updates;
++
++        if (curr_jiffies == this_rq->last_load_update_tick)
++                return;
++
++        raw_spin_lock(&this_rq->lock);
++        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
++        if (pending_updates) {
++                this_rq->last_load_update_tick = curr_jiffies;
++                /*
++                 * We were idle, this means load 0, the current load might be
++                 * !0 due to remote wakeups and the sort.
++                 */
++                __update_cpu_load(this_rq, 0, pending_updates);
++        }
++        raw_spin_unlock(&this_rq->lock);
++}
++#endif /* CONFIG_NO_HZ */
++
++/*
+  * Called from scheduler_tick()
+  */
+ static void update_cpu_load_active(struct rq *this_rq)
+ {
+         /*
+-         * See the mess in update_idle_cpu_load().
++         * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+          */
+         this_rq->last_load_update_tick = jiffies;
+         __update_cpu_load(this_rq, this_rq->load.weight, 1);
+--- a/kernel/time/tick-sched.c
++++ b/kernel/time/tick-sched.c
+@@ -582,6 +582,7 @@ void tick_nohz_idle_exit(void)
+         /* Update jiffies first */
+         select_nohz_load_balancer(0);
+         tick_do_update_jiffies64(now);
++        update_cpu_load_nohz();
+ 
+ #ifndef CONFIG_VIRT_CPU_ACCOUNTING
+         /*
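
For context on what "age the rq->cpu_load[] array" means: index i of
that array tracks an exponential moving average which decays by a
factor of (2^i - 1)/2^i per tick, and ticks missed while idle are
replayed as load 0. The stand-alone model below shows the catch-up
arithmetic; it is a deliberate simplification (the kernel's
__update_cpu_load() goes through decay_load_missed() with precomputed
fixed-point tables and an extra rounding term, not this loop).

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

static unsigned long cpu_load[CPU_LOAD_IDX_MAX];

/*
 * Replay 'pending_updates' ticks, the last of which observed
 * 'this_load'; the missed ticks in between count as idle (load 0).
 */
static void update_cpu_load_model(unsigned long this_load,
                                  unsigned long pending_updates)
{
        unsigned long scale = 1;
        int i;

        cpu_load[0] = this_load;        /* index 0 is the raw load */
        for (i = 1; i < CPU_LOAD_IDX_MAX; i++) {
                unsigned long old_load = cpu_load[i], j;

                scale += scale;         /* scale = 2^i */
                for (j = 1; j < pending_updates; j++)
                        old_load = old_load * (scale - 1) / scale;
                cpu_load[i] = (old_load * (scale - 1) + this_load) / scale;
        }
}

int main(void)
{
        int i;

        for (i = 1; i < CPU_LOAD_IDX_MAX; i++)
                cpu_load[i] = 1024;     /* busy before going idle */
        update_cpu_load_model(0, 3);    /* nohz exit, 3 ticks pending */
        for (i = 0; i < CPU_LOAD_IDX_MAX; i++)
                printf("cpu_load[%d] = %lu\n", i, cpu_load[i]);
        return 0;
}

Without the update_cpu_load_nohz() hook added above, a CPU that woke
without a nohz balance pass never replayed those pending ticks, so the
higher indices came out of idle still carrying their pre-idle load.
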
diff --git a/queue-3.4/sched-nohz-fix-rq-cpu_load-calculations.patch b/queue-3.4/sched-nohz-fix-rq-cpu_load-calculations.patch
new file mode 100644
index 00000000000..9259e800bc2
--- /dev/null
+++ b/queue-3.4/sched-nohz-fix-rq-cpu_load-calculations.patch
@@ -0,0 +1,145 @@
+From 556061b00c9f2fd6a5524b6bde823ef12f299ecf Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra
+Date: Fri, 11 May 2012 17:31:26 +0200
+Subject: sched/nohz: Fix rq->cpu_load[] calculations
+
+From: Peter Zijlstra
+
+commit 556061b00c9f2fd6a5524b6bde823ef12f299ecf upstream.
+
+While investigating why the load-balancer did funny I found that the
+rq->cpu_load[] tables were completely screwy.. a bit more digging
+revealed that the updates that got through were missing ticks followed
+by a catchup of 2 ticks.
+
+The catchup assumes the cpu was idle during that time (since only nohz
+can cause missed ticks and the machine is idle etc..) this means that
+esp. the higher indices were significantly lower than they ought to
+be.
+
+The reason for this is that its not correct to compare against jiffies
+on every jiffy on any other cpu than the cpu that updates jiffies.
+
+This patch cludges around it by only doing the catch-up stuff from
+nohz_idle_balance() and doing the regular stuff unconditionally from
+the tick.
+
+Signed-off-by: Peter Zijlstra
+Cc: pjt@google.com
+Cc: Venkatesh Pallipadi
+Link: http://lkml.kernel.org/n/tip-tp4kj18xdd5aj4vvj0qg55s2@git.kernel.org
+Signed-off-by: Ingo Molnar
+Cc: Li Zefan
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ kernel/sched/core.c  |   53 +++++++++++++++++++++++++++++++++++++--------------
+ kernel/sched/fair.c  |    2 -
+ kernel/sched/sched.h |    2 -
+ 3 files changed, 41 insertions(+), 16 deletions(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *
+ }
+ #endif
+ 
+-void update_cpu_load(struct rq *this_rq);
+-
+ static void set_load_weight(struct task_struct *p)
+ {
+         int prio = p->static_prio - MAX_RT_PRIO;
+@@ -2620,22 +2618,13 @@ decay_load_missed(unsigned long load, un
+  * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+  * every tick. We fix it up based on jiffies.
+  */
+-void update_cpu_load(struct rq *this_rq)
++static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
++                              unsigned long pending_updates)
+ {
+-        unsigned long this_load = this_rq->load.weight;
+-        unsigned long curr_jiffies = jiffies;
+-        unsigned long pending_updates;
+         int i, scale;
+ 
+         this_rq->nr_load_updates++;
+ 
+-        /* Avoid repeated calls on same jiffy, when moving in and out of idle */
+-        if (curr_jiffies == this_rq->last_load_update_tick)
+-                return;
+-
+-        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+-        this_rq->last_load_update_tick = curr_jiffies;
+-
+         /* Update our load: */
+         this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+         for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+@@ -2660,9 +2649,45 @@ void update_cpu_load(struct rq *this_rq)
+         sched_avg_update(this_rq);
+ }
+ 
++/*
++ * Called from nohz_idle_balance() to update the load ratings before doing the
++ * idle balance.
++ */
++void update_idle_cpu_load(struct rq *this_rq)
++{
++        unsigned long curr_jiffies = jiffies;
++        unsigned long load = this_rq->load.weight;
++        unsigned long pending_updates;
++
++        /*
++         * Bloody broken means of dealing with nohz, but better than nothing..
++         * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
++         * update and see 0 difference the one time and 2 the next, even though
++         * we ticked at roughtly the same rate.
++         *
++         * Hence we only use this from nohz_idle_balance() and skip this
++         * nonsense when called from the scheduler_tick() since that's
++         * guaranteed a stable rate.
++         */
++        if (load || curr_jiffies == this_rq->last_load_update_tick)
++                return;
++
++        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
++        this_rq->last_load_update_tick = curr_jiffies;
++
++        __update_cpu_load(this_rq, load, pending_updates);
++}
++
++/*
++ * Called from scheduler_tick()
++ */
+ static void update_cpu_load_active(struct rq *this_rq)
+ {
+-        update_cpu_load(this_rq);
++        /*
++         * See the mess in update_idle_cpu_load().
++         */
++        this_rq->last_load_update_tick = jiffies;
++        __update_cpu_load(this_rq, this_rq->load.weight, 1);
+ 
+         calc_load_account_active(this_rq);
+ }
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -5042,7 +5042,7 @@ static void nohz_idle_balance(int this_c
+ 
+                 raw_spin_lock_irq(&this_rq->lock);
+                 update_rq_clock(this_rq);
+-                update_cpu_load(this_rq);
++                update_idle_cpu_load(this_rq);
+                 raw_spin_unlock_irq(&this_rq->lock);
+ 
+                 rebalance_domains(balance_cpu, CPU_IDLE);
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -873,7 +873,7 @@ extern void resched_cpu(int cpu);
+ extern struct rt_bandwidth def_rt_bandwidth;
+ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+ 
+-extern void update_cpu_load(struct rq *this_rq);
++extern void update_idle_cpu_load(struct rq *this_rq);
+ 
+ #ifdef CONFIG_CGROUP_CPUACCT
+ #include <linux/cgroup.h>
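
The "{0,2} instead of {1,1}" drift described in both changelogs above
is purely a sampling artifact: jiffies is advanced by one CPU, and a
remote CPU whose tick drifts across an update boundary reads the same
jiffies value twice and then sees it jump by two. A toy illustration
with made-up numbers (1000 ms per jiffy):

#include <stdio.h>

int main(void)
{
        /*
         * The updater CPU increments jiffies every 1000 ms. The remote
         * CPU ticks at the same rate, but its samples drift across an
         * update boundary: 5, 1005, 1995, 3005 ms.
         */
        const long sample_ms[] = { 5, 1005, 1995, 3005 };
        long prev = -1;
        int i;

        for (i = 0; i < 4; i++) {
                long j = sample_ms[i] / 1000;   /* jiffies value read */

                if (prev >= 0)
                        printf("tick %d: delta = %ld\n", i, j - prev);
                prev = j;
        }
        return 0;       /* prints deltas 1, 0, 2 rather than 1, 1, 1 */
}

Feeding a spurious delta of 2 into the catch-up path decays the
averages for an idle tick that never happened, which is why this patch
confines the jiffies-delta catch-up to nohz_idle_balance() and has the
regular tick always advance the load window by exactly one.
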
diff --git a/queue-3.4/series b/queue-3.4/series
index a3671465e00..b49f32ea2b6 100644
--- a/queue-3.4/series
+++ b/queue-3.4/series
@@ -11,3 +11,6 @@ dm-sysfs-fix-a-module-unload-race.patch
 ftrace-synchronize-setting-function_trace_op-with-ftrace_trace_function.patch
 ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch
 ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch
+sched-nohz-fix-rq-cpu_load-calculations.patch
+sched-nohz-fix-rq-cpu_load-calculations-some-more.patch
+ib-qib-convert-qib_user_sdma_pin_pages-to-use-get_user_pages_fast.patch