From: Greg Kroah-Hartman Date: Mon, 13 Jan 2014 18:28:27 +0000 (-0800) Subject: 3.4-stable patches X-Git-Tag: v3.4.77~1 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=721818c98ad3df5d9cb6aae0a059fa691e77b424;p=thirdparty%2Fkernel%2Fstable-queue.git 3.4-stable patches added patches: sched-fix-cfs_bandwidth-misuse-of-hrtimer_expires_remaining.patch sched-fix-hrtimer_cancel-rq-lock-deadlock.patch sched-fix-race-on-toggling-cfs_bandwidth_used.patch sched-guarantee-new-group-entities-always-have-weight.patch --- diff --git a/queue-3.4/sched-fix-cfs_bandwidth-misuse-of-hrtimer_expires_remaining.patch b/queue-3.4/sched-fix-cfs_bandwidth-misuse-of-hrtimer_expires_remaining.patch new file mode 100644 index 00000000000..a5af1b91bc4 --- /dev/null +++ b/queue-3.4/sched-fix-cfs_bandwidth-misuse-of-hrtimer_expires_remaining.patch @@ -0,0 +1,57 @@ +From db06e78cc13d70f10877e0557becc88ab3ad2be8 Mon Sep 17 00:00:00 2001 +From: Ben Segall +Date: Wed, 16 Oct 2013 11:16:17 -0700 +Subject: sched: Fix cfs_bandwidth misuse of hrtimer_expires_remaining + +From: Ben Segall + +commit db06e78cc13d70f10877e0557becc88ab3ad2be8 upstream. + +hrtimer_expires_remaining does not take internal hrtimer locks and thus +must be guarded against concurrent __hrtimer_start_range_ns (but +returning HRTIMER_RESTART is safe). Use cfs_b->lock to make it safe. + +Signed-off-by: Ben Segall +Signed-off-by: Peter Zijlstra +Cc: pjt@google.com +Link: http://lkml.kernel.org/r/20131016181617.22647.73829.stgit@sword-of-the-dawn.mtv.corp.google.com +Signed-off-by: Ingo Molnar +Cc: Chris J Arges +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sched/fair.c | 14 +++++++++++--- + 1 file changed, 11 insertions(+), 3 deletions(-) + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1831,7 +1831,13 @@ static const u64 min_bandwidth_expiratio + /* how long we wait to gather additional slack before distributing */ + static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; + +-/* are we near the end of the current quota period? */ ++/* ++ * Are we near the end of the current quota period? ++ * ++ * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the ++ * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of ++ * migrate_hrtimers, base is never cleared, so we are fine. ++ */ + static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) + { + struct hrtimer *refresh_timer = &cfs_b->period_timer; +@@ -1907,10 +1913,12 @@ static void do_sched_cfs_slack_timer(str + u64 expires; + + /* confirm we're still not at a refresh boundary */ +- if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) ++ raw_spin_lock(&cfs_b->lock); ++ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { ++ raw_spin_unlock(&cfs_b->lock); + return; ++ } + +- raw_spin_lock(&cfs_b->lock); + if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { + runtime = cfs_b->runtime; + cfs_b->runtime = 0; diff --git a/queue-3.4/sched-fix-hrtimer_cancel-rq-lock-deadlock.patch b/queue-3.4/sched-fix-hrtimer_cancel-rq-lock-deadlock.patch new file mode 100644 index 00000000000..d7cbf97073f --- /dev/null +++ b/queue-3.4/sched-fix-hrtimer_cancel-rq-lock-deadlock.patch @@ -0,0 +1,63 @@ +From 927b54fccbf04207ec92f669dce6806848cbec7d Mon Sep 17 00:00:00 2001 +From: Ben Segall +Date: Wed, 16 Oct 2013 11:16:22 -0700 +Subject: sched: Fix hrtimer_cancel()/rq->lock deadlock + +From: Ben Segall + +commit 927b54fccbf04207ec92f669dce6806848cbec7d upstream. 
+ +__start_cfs_bandwidth calls hrtimer_cancel while holding rq->lock, +waiting for the hrtimer to finish. However, if sched_cfs_period_timer +runs for another loop iteration, the hrtimer can attempt to take +rq->lock, resulting in deadlock. + +Fix this by ensuring that cfs_b->timer_active is cleared only if the +_latest_ call to do_sched_cfs_period_timer is returning as idle. Then +__start_cfs_bandwidth can just call hrtimer_try_to_cancel and wait for +that to succeed or timer_active == 1. + +Signed-off-by: Ben Segall +Signed-off-by: Peter Zijlstra +Cc: pjt@google.com +Link: http://lkml.kernel.org/r/20131016181622.22647.16643.stgit@sword-of-the-dawn.mtv.corp.google.com +Signed-off-by: Ingo Molnar +Cc: Chris J Arges +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sched/fair.c | 15 +++++++++++---- + 1 file changed, 11 insertions(+), 4 deletions(-) + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1771,6 +1771,13 @@ static int do_sched_cfs_period_timer(str + if (idle) + goto out_unlock; + ++ /* ++ * if we have relooped after returning idle once, we need to update our ++ * status as actually running, so that other cpus doing ++ * __start_cfs_bandwidth will stop trying to cancel us. ++ */ ++ cfs_b->timer_active = 1; ++ + __refill_cfs_bandwidth_runtime(cfs_b); + + if (!throttled) { +@@ -2043,11 +2050,11 @@ void __start_cfs_bandwidth(struct cfs_ba + * (timer_active==0 becomes visible before the hrtimer call-back + * terminates). In either case we ensure that it's re-programmed + */ +- while (unlikely(hrtimer_active(&cfs_b->period_timer))) { ++ while (unlikely(hrtimer_active(&cfs_b->period_timer)) && ++ hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { ++ /* bounce the lock to allow do_sched_cfs_period_timer to run */ + raw_spin_unlock(&cfs_b->lock); +- /* ensure cfs_b->lock is available while we wait */ +- hrtimer_cancel(&cfs_b->period_timer); +- ++ cpu_relax(); + raw_spin_lock(&cfs_b->lock); + /* if someone else restarted the timer then we're done */ + if (cfs_b->timer_active) diff --git a/queue-3.4/sched-fix-race-on-toggling-cfs_bandwidth_used.patch b/queue-3.4/sched-fix-race-on-toggling-cfs_bandwidth_used.patch new file mode 100644 index 00000000000..0fc6cab4ed9 --- /dev/null +++ b/queue-3.4/sched-fix-race-on-toggling-cfs_bandwidth_used.patch @@ -0,0 +1,105 @@ +From 1ee14e6c8cddeeb8a490d7b54cd9016e4bb900b4 Mon Sep 17 00:00:00 2001 +From: Ben Segall +Date: Wed, 16 Oct 2013 11:16:12 -0700 +Subject: sched: Fix race on toggling cfs_bandwidth_used + +From: Ben Segall + +commit 1ee14e6c8cddeeb8a490d7b54cd9016e4bb900b4 upstream. + +When we transition cfs_bandwidth_used to false, any currently +throttled groups will incorrectly return false from cfs_rq_throttled. +While tg_set_cfs_bandwidth will unthrottle them eventually, currently +running code (including at least dequeue_task_fair and +distribute_cfs_runtime) will cause errors. + +Fix this by turning off cfs_bandwidth_used only after unthrottling all +cfs_rqs. + +Tested: toggle bandwidth back and forth on a loaded cgroup. Caused +crashes in minutes without the patch, hasn't crashed with it. 
+ +Signed-off-by: Ben Segall +Signed-off-by: Peter Zijlstra +Cc: pjt@google.com +Link: http://lkml.kernel.org/r/20131016181611.22647.80365.stgit@sword-of-the-dawn.mtv.corp.google.com +Signed-off-by: Ingo Molnar +Cc: Chris J Arges +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sched/core.c | 9 ++++++++- + kernel/sched/fair.c | 16 +++++++++------- + kernel/sched/sched.h | 3 ++- + 3 files changed, 19 insertions(+), 9 deletions(-) + +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -7906,7 +7906,12 @@ static int tg_set_cfs_bandwidth(struct t + + runtime_enabled = quota != RUNTIME_INF; + runtime_was_enabled = cfs_b->quota != RUNTIME_INF; +- account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); ++ /* ++ * If we need to toggle cfs_bandwidth_used, off->on must occur ++ * before making related changes, and on->off must occur afterwards ++ */ ++ if (runtime_enabled && !runtime_was_enabled) ++ cfs_bandwidth_usage_inc(); + raw_spin_lock_irq(&cfs_b->lock); + cfs_b->period = ns_to_ktime(period); + cfs_b->quota = quota; +@@ -7932,6 +7937,8 @@ static int tg_set_cfs_bandwidth(struct t + unthrottle_cfs_rq(cfs_rq); + raw_spin_unlock_irq(&rq->lock); + } ++ if (runtime_was_enabled && !runtime_enabled) ++ cfs_bandwidth_usage_dec(); + out_unlock: + mutex_unlock(&cfs_constraints_mutex); + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1393,13 +1393,14 @@ static inline bool cfs_bandwidth_used(vo + return static_key_false(&__cfs_bandwidth_used); + } + +-void account_cfs_bandwidth_used(int enabled, int was_enabled) ++void cfs_bandwidth_usage_inc(void) + { +- /* only need to count groups transitioning between enabled/!enabled */ +- if (enabled && !was_enabled) +- static_key_slow_inc(&__cfs_bandwidth_used); +- else if (!enabled && was_enabled) +- static_key_slow_dec(&__cfs_bandwidth_used); ++ static_key_slow_inc(&__cfs_bandwidth_used); ++} ++ ++void cfs_bandwidth_usage_dec(void) ++{ ++ static_key_slow_dec(&__cfs_bandwidth_used); + } + #else /* HAVE_JUMP_LABEL */ + static bool cfs_bandwidth_used(void) +@@ -1407,7 +1408,8 @@ static bool cfs_bandwidth_used(void) + return true; + } + +-void account_cfs_bandwidth_used(int enabled, int was_enabled) {} ++void cfs_bandwidth_usage_inc(void) {} ++void cfs_bandwidth_usage_dec(void) {} + #endif /* HAVE_JUMP_LABEL */ + + /* +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1140,7 +1140,8 @@ extern void init_cfs_rq(struct cfs_rq *c + extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); + extern void unthrottle_offline_cfs_rqs(struct rq *rq); + +-extern void account_cfs_bandwidth_used(int enabled, int was_enabled); ++extern void cfs_bandwidth_usage_inc(void); ++extern void cfs_bandwidth_usage_dec(void); + + #ifdef CONFIG_NO_HZ + enum rq_nohz_flag_bits { diff --git a/queue-3.4/sched-guarantee-new-group-entities-always-have-weight.patch b/queue-3.4/sched-guarantee-new-group-entities-always-have-weight.patch new file mode 100644 index 00000000000..301035a405c --- /dev/null +++ b/queue-3.4/sched-guarantee-new-group-entities-always-have-weight.patch @@ -0,0 +1,61 @@ +From 0ac9b1c21874d2490331233b3242085f8151e166 Mon Sep 17 00:00:00 2001 +From: Paul Turner +Date: Wed, 16 Oct 2013 11:16:27 -0700 +Subject: sched: Guarantee new group-entities always have weight + +From: Paul Turner + +commit 0ac9b1c21874d2490331233b3242085f8151e166 upstream. + +Currently, group entity load-weights are initialized to zero. This +admits some races with respect to the first time they are re-weighted in +earlty use. 
( Let g[x] denote the se for "g" on cpu "x". ) + +Suppose that we have root->a and that a enters a throttled state, +immediately followed by a[0]->t1 (the only task running on cpu[0]) +blocking: + + put_prev_task(group_cfs_rq(a[0]), t1) + put_prev_entity(..., t1) + check_cfs_rq_runtime(group_cfs_rq(a[0])) + throttle_cfs_rq(group_cfs_rq(a[0])) + +Then, before unthrottling occurs, let a[0]->b[0]->t2 wake for the first +time: + + enqueue_task_fair(rq[0], t2) + enqueue_entity(group_cfs_rq(b[0]), t2) + enqueue_entity_load_avg(group_cfs_rq(b[0]), t2) + account_entity_enqueue(group_cfs_ra(b[0]), t2) + update_cfs_shares(group_cfs_rq(b[0])) + < skipped because b is part of a throttled hierarchy > + enqueue_entity(group_cfs_rq(a[0]), b[0]) + ... + +We now have b[0] enqueued, yet group_cfs_rq(a[0])->load.weight == 0 +which violates invariants in several code-paths. Eliminate the +possibility of this by initializing group entity weight. + +Signed-off-by: Paul Turner +Signed-off-by: Peter Zijlstra +Link: http://lkml.kernel.org/r/20131016181627.22647.47543.stgit@sword-of-the-dawn.mtv.corp.google.com +Signed-off-by: Ingo Molnar +Cc: Chris J Arges +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sched/fair.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -5470,7 +5470,8 @@ void init_tg_cfs_entry(struct task_group + se->cfs_rq = parent->my_q; + + se->my_q = cfs_rq; +- update_load_set(&se->load, 0); ++ /* guarantee group entities always have weight */ ++ update_load_set(&se->load, NICE_0_LOAD); + se->parent = parent; + } + diff --git a/queue-3.4/series b/queue-3.4/series index c859ae59d94..a39bf52d053 100644 --- a/queue-3.4/series +++ b/queue-3.4/series @@ -21,3 +21,7 @@ bridge-use-spin_lock_bh-in-br_multicast_set_hash_max.patch arm-fix-bad-mode-in-...-handler-message-for-undefined-instructions.patch arm-shmobile-mackerel-fix-coherent-dma-mask.patch x86-fpu-amd-clear-exceptions-in-amd-fxsave-workaround.patch +sched-fix-race-on-toggling-cfs_bandwidth_used.patch +sched-fix-cfs_bandwidth-misuse-of-hrtimer_expires_remaining.patch +sched-fix-hrtimer_cancel-rq-lock-deadlock.patch +sched-guarantee-new-group-entities-always-have-weight.patch
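
As a rough userspace illustration of the lock-bounce pattern used by sched-fix-hrtimer_cancel-rq-lock-deadlock.patch above: the sketch below is not kernel code and every name in it (worker, stop_worker_locked, worker_running, stop_requested) is invented for illustration. It only mimics the shape of the fix, where __start_cfs_bandwidth now requests cancellation via hrtimer_try_to_cancel() and drops/retakes cfs_b->lock with cpu_relax() instead of blocking in hrtimer_cancel() while holding the lock; the kernel's re-check of timer_active after reacquiring the lock is omitted here.

/*
 * Hypothetical userspace analogue (pthreads, not kernel APIs) of the
 * __start_cfs_bandwidth() change.
 *
 * A worker thread periodically takes `lock` (like the period timer taking
 * rq->lock).  A caller that already holds `lock` must stop the worker, but
 * blocking until the worker exits would deadlock, so it requests the stop
 * (the analogue of hrtimer_try_to_cancel()) and bounces the lock while it
 * waits, the way the patch bounces cfs_b->lock and calls cpu_relax().
 *
 * Build with: cc -pthread example.c
 */
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_bool stop_requested = false;	/* cancel request           */
static atomic_bool worker_running = true;	/* analogue of timer_active */

static void *worker(void *arg)
{
	(void)arg;
	while (!atomic_load(&stop_requested)) {
		pthread_mutex_lock(&lock);	/* timer callback taking the lock */
		/* ... periodic work under the lock ... */
		pthread_mutex_unlock(&lock);
		usleep(1000);
	}
	atomic_store(&worker_running, false);
	return NULL;
}

/* Called with `lock` held: never block on the worker, poll and bounce. */
static void stop_worker_locked(void)
{
	atomic_store(&stop_requested, true);
	while (atomic_load(&worker_running)) {
		pthread_mutex_unlock(&lock);	/* let the worker finish its pass */
		sched_yield();			/* stand-in for cpu_relax()       */
		pthread_mutex_lock(&lock);
	}
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	usleep(10000);

	pthread_mutex_lock(&lock);
	stop_worker_locked();
	pthread_mutex_unlock(&lock);

	return pthread_join(t, NULL);
}

The point mirrored here is the one the patch makes: the context that holds the lock never waits synchronously for the concurrent callback to complete; it only requests cancellation and releases the lock on each poll so the callback can make progress and terminate on its own.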