--- /dev/null
+From db06e78cc13d70f10877e0557becc88ab3ad2be8 Mon Sep 17 00:00:00 2001
+From: Ben Segall <bsegall@google.com>
+Date: Wed, 16 Oct 2013 11:16:17 -0700
+Subject: sched: Fix cfs_bandwidth misuse of hrtimer_expires_remaining
+
+From: Ben Segall <bsegall@google.com>
+
+commit db06e78cc13d70f10877e0557becc88ab3ad2be8 upstream.
+
+hrtimer_expires_remaining does not take internal hrtimer locks and thus
+must be guarded against concurrent __hrtimer_start_range_ns (but
+returning HRTIMER_RESTART is safe). Use cfs_b->lock to make it safe.
+
+Signed-off-by: Ben Segall <bsegall@google.com>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Cc: pjt@google.com
+Link: http://lkml.kernel.org/r/20131016181617.22647.73829.stgit@sword-of-the-dawn.mtv.corp.google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Cc: Chris J Arges <chris.j.arges@canonical.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c | 14 +++++++++++---
+ 1 file changed, 11 insertions(+), 3 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -2519,7 +2519,13 @@ static const u64 min_bandwidth_expiratio
+ /* how long we wait to gather additional slack before distributing */
+ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
+
+-/* are we near the end of the current quota period? */
++/*
++ * Are we near the end of the current quota period?
++ *
++ * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
++ * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
++ * migrate_hrtimers, base is never cleared, so we are fine.
++ */
+ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
+ {
+ struct hrtimer *refresh_timer = &cfs_b->period_timer;
+@@ -2595,10 +2601,12 @@ static void do_sched_cfs_slack_timer(str
+ u64 expires;
+
+ /* confirm we're still not at a refresh boundary */
+- if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
++ raw_spin_lock(&cfs_b->lock);
++ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
++ raw_spin_unlock(&cfs_b->lock);
+ return;
++ }
+
+- raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+ runtime = cfs_b->runtime;
+ cfs_b->runtime = 0;
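The do_sched_cfs_slack_timer() change above reduces to one locking rule: the
early-return guard must run under the same lock that serializes re-arming of
the timer, so the lock is taken before the check and released on both exit
paths. Below is a minimal user-space sketch of that shape, assuming pthread
primitives and made-up field names (expires_ns, runtime_ns) rather than the
kernel's own types:

#include <pthread.h>
#include <stdbool.h>

struct bandwidth {                      /* stand-in for struct cfs_bandwidth */
	pthread_mutex_t lock;           /* plays the role of cfs_b->lock */
	long long expires_ns;           /* deadline rewritten by the re-arm path */
	long long runtime_ns;
};

/* Like runtime_refresh_within(): only meaningful while bw->lock is held. */
static bool refresh_within(struct bandwidth *bw, long long min_expire_ns)
{
	return bw->expires_ns < min_expire_ns;
}

static void slack_timer(struct bandwidth *bw, long long min_expire_ns)
{
	pthread_mutex_lock(&bw->lock);           /* lock taken before the guard */
	if (refresh_within(bw, min_expire_ns)) {
		pthread_mutex_unlock(&bw->lock); /* early return must also unlock */
		return;
	}
	/* ... hand out the remaining runtime, still under the lock ... */
	bw->runtime_ns = 0;
	pthread_mutex_unlock(&bw->lock);
}

int main(void)
{
	struct bandwidth bw = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.expires_ns = 7000000LL,
		.runtime_ns = 1000000LL,
	};
	slack_timer(&bw, 2000000LL);    /* not near a refresh: runtime is handed out */
	return (int)bw.runtime_ns;
}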
--- /dev/null
+From 927b54fccbf04207ec92f669dce6806848cbec7d Mon Sep 17 00:00:00 2001
+From: Ben Segall <bsegall@google.com>
+Date: Wed, 16 Oct 2013 11:16:22 -0700
+Subject: sched: Fix hrtimer_cancel()/rq->lock deadlock
+
+From: Ben Segall <bsegall@google.com>
+
+commit 927b54fccbf04207ec92f669dce6806848cbec7d upstream.
+
+__start_cfs_bandwidth calls hrtimer_cancel while holding rq->lock,
+waiting for the hrtimer to finish. However, if sched_cfs_period_timer
+runs for another loop iteration, the hrtimer can attempt to take
+rq->lock, resulting in deadlock.
+
+Fix this by ensuring that cfs_b->timer_active is cleared only if the
+_latest_ call to do_sched_cfs_period_timer is returning as idle. Then
+__start_cfs_bandwidth can just call hrtimer_try_to_cancel and wait for
+that to succeed or timer_active == 1.
+
+Signed-off-by: Ben Segall <bsegall@google.com>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Cc: pjt@google.com
+Link: http://lkml.kernel.org/r/20131016181622.22647.16643.stgit@sword-of-the-dawn.mtv.corp.google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Cc: Chris J Arges <chris.j.arges@canonical.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c | 15 +++++++++++----
+ 1 file changed, 11 insertions(+), 4 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -2459,6 +2459,13 @@ static int do_sched_cfs_period_timer(str
+ if (idle)
+ goto out_unlock;
+
++ /*
++ * if we have relooped after returning idle once, we need to update our
++ * status as actually running, so that other cpus doing
++ * __start_cfs_bandwidth will stop trying to cancel us.
++ */
++ cfs_b->timer_active = 1;
++
+ __refill_cfs_bandwidth_runtime(cfs_b);
+
+ if (!throttled) {
+@@ -2727,11 +2734,11 @@ void __start_cfs_bandwidth(struct cfs_ba
+ * (timer_active==0 becomes visible before the hrtimer call-back
+ * terminates). In either case we ensure that it's re-programmed
+ */
+- while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
++ while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
++ hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
++ /* bounce the lock to allow do_sched_cfs_period_timer to run */
+ raw_spin_unlock(&cfs_b->lock);
+- /* ensure cfs_b->lock is available while we wait */
+- hrtimer_cancel(&cfs_b->period_timer);
+-
++ cpu_relax();
+ raw_spin_lock(&cfs_b->lock);
+ /* if someone else restarted the timer then we're done */
+ if (cfs_b->timer_active)
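The loop above is the whole fix: never block in hrtimer_cancel() while holding
a lock the timer callback itself needs; instead try to cancel, and if the
callback is mid-flight, drop the lock so it can finish, then retry. A rough
user-space sketch of that pattern, with pthread stand-ins for the locking and
a hypothetical try_to_cancel() helper in place of hrtimer_try_to_cancel():

#include <pthread.h>
#include <sched.h>      /* sched_yield() stands in for cpu_relax() */
#include <stdbool.h>

struct period_timer {
	pthread_mutex_t lock;   /* plays the role of cfs_b->lock */
	bool timer_active;      /* like cfs_b->timer_active */
	bool callback_running;  /* "the timer callback is executing right now" */
};

/* Fails, like hrtimer_try_to_cancel() < 0, while the callback is running. */
static bool try_to_cancel(struct period_timer *t)
{
	return !t->callback_running;
}

static void restart_period_timer(struct period_timer *t)
{
	pthread_mutex_lock(&t->lock);
	while (!try_to_cancel(t)) {
		/* bounce the lock so the callback can take it and finish */
		pthread_mutex_unlock(&t->lock);
		sched_yield();
		pthread_mutex_lock(&t->lock);
		if (t->timer_active)    /* the callback re-armed itself: done */
			goto out;
	}
	t->timer_active = true;
	/* ... (re)arm the one-shot period timer here ... */
out:
	pthread_mutex_unlock(&t->lock);
}

int main(void)
{
	struct period_timer t = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.timer_active = false,
		.callback_running = false,
	};
	restart_period_timer(&t);
	return t.timer_active ? 0 : 1;
}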
--- /dev/null
+From 1ee14e6c8cddeeb8a490d7b54cd9016e4bb900b4 Mon Sep 17 00:00:00 2001
+From: Ben Segall <bsegall@google.com>
+Date: Wed, 16 Oct 2013 11:16:12 -0700
+Subject: sched: Fix race on toggling cfs_bandwidth_used
+
+From: Ben Segall <bsegall@google.com>
+
+commit 1ee14e6c8cddeeb8a490d7b54cd9016e4bb900b4 upstream.
+
+When we transition cfs_bandwidth_used to false, any currently
+throttled groups will incorrectly return false from cfs_rq_throttled.
+While tg_set_cfs_bandwidth will unthrottle them eventually, currently
+running code (including at least dequeue_task_fair and
+distribute_cfs_runtime) will cause errors.
+
+Fix this by turning off cfs_bandwidth_used only after unthrottling all
+cfs_rqs.
+
+Tested: toggle bandwidth back and forth on a loaded cgroup. Caused
+crashes in minutes without the patch, hasn't crashed with it.
+
+Signed-off-by: Ben Segall <bsegall@google.com>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Cc: pjt@google.com
+Link: http://lkml.kernel.org/r/20131016181611.22647.80365.stgit@sword-of-the-dawn.mtv.corp.google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Cc: Chris J Arges <chris.j.arges@canonical.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/core.c | 9 ++++++++-
+ kernel/sched/fair.c | 16 +++++++++-------
+ kernel/sched/sched.h | 3 ++-
+ 3 files changed, 19 insertions(+), 9 deletions(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -7277,7 +7277,12 @@ static int tg_set_cfs_bandwidth(struct t
+
+ runtime_enabled = quota != RUNTIME_INF;
+ runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
+- account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
++ /*
++ * If we need to toggle cfs_bandwidth_used, off->on must occur
++ * before making related changes, and on->off must occur afterwards
++ */
++ if (runtime_enabled && !runtime_was_enabled)
++ cfs_bandwidth_usage_inc();
+ raw_spin_lock_irq(&cfs_b->lock);
+ cfs_b->period = ns_to_ktime(period);
+ cfs_b->quota = quota;
+@@ -7303,6 +7308,8 @@ static int tg_set_cfs_bandwidth(struct t
+ unthrottle_cfs_rq(cfs_rq);
+ raw_spin_unlock_irq(&rq->lock);
+ }
++ if (runtime_was_enabled && !runtime_enabled)
++ cfs_bandwidth_usage_dec();
+ out_unlock:
+ mutex_unlock(&cfs_constraints_mutex);
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -2077,13 +2077,14 @@ static inline bool cfs_bandwidth_used(vo
+ return static_key_false(&__cfs_bandwidth_used);
+ }
+
+-void account_cfs_bandwidth_used(int enabled, int was_enabled)
++void cfs_bandwidth_usage_inc(void)
+ {
+- /* only need to count groups transitioning between enabled/!enabled */
+- if (enabled && !was_enabled)
+- static_key_slow_inc(&__cfs_bandwidth_used);
+- else if (!enabled && was_enabled)
+- static_key_slow_dec(&__cfs_bandwidth_used);
++ static_key_slow_inc(&__cfs_bandwidth_used);
++}
++
++void cfs_bandwidth_usage_dec(void)
++{
++ static_key_slow_dec(&__cfs_bandwidth_used);
+ }
+ #else /* HAVE_JUMP_LABEL */
+ static bool cfs_bandwidth_used(void)
+@@ -2091,7 +2092,8 @@ static bool cfs_bandwidth_used(void)
+ return true;
+ }
+
+-void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
++void cfs_bandwidth_usage_inc(void) {}
++void cfs_bandwidth_usage_dec(void) {}
+ #endif /* HAVE_JUMP_LABEL */
+
+ /*
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -1305,7 +1305,8 @@ extern void print_rt_stats(struct seq_fi
+ extern void init_cfs_rq(struct cfs_rq *cfs_rq);
+ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
+
+-extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
++extern void cfs_bandwidth_usage_inc(void);
++extern void cfs_bandwidth_usage_dec(void);
+
+ #ifdef CONFIG_NO_HZ_COMMON
+ enum rq_nohz_flag_bits {
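The ordering the patch above enforces is: raise the usage key before the
bandwidth state it guards comes into existence, and lower it only after every
cfs_rq has been unthrottled, so readers never see throttled state with the key
off. A simplified sketch of that ordering, with the jump label reduced to a
plain C11 atomic flag and hypothetical helper names modelled on the patch:

#include <stdatomic.h>
#include <stdbool.h>

/* Simplified stand-in for the __cfs_bandwidth_used static key. */
static atomic_bool bandwidth_used;

static void bandwidth_usage_inc(void) { atomic_store(&bandwidth_used, true); }
static void bandwidth_usage_dec(void) { atomic_store(&bandwidth_used, false); }

/* Readers such as cfs_rq_throttled() would key off this check. */
static bool bandwidth_in_use(void) { return atomic_load(&bandwidth_used); }

static void set_bandwidth(bool was_enabled, bool enabled)
{
	/*
	 * off -> on: flip the flag first, so the quota/throttle state about
	 * to appear is never visible while the flag is still clear.
	 */
	if (enabled && !was_enabled)
		bandwidth_usage_inc();

	/* ... install the new quota and unthrottle every runqueue ... */

	/*
	 * on -> off: flip the flag last, only after all throttled state is
	 * gone, so no reader sees a throttled cfs_rq with the flag off.
	 */
	if (was_enabled && !enabled)
		bandwidth_usage_dec();
}

int main(void)
{
	set_bandwidth(false, true);     /* enable: flag goes up before the state */
	set_bandwidth(true, false);     /* disable: flag goes down after cleanup */
	return bandwidth_in_use() ? 1 : 0;
}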
--- /dev/null
+From 0ac9b1c21874d2490331233b3242085f8151e166 Mon Sep 17 00:00:00 2001
+From: Paul Turner <pjt@google.com>
+Date: Wed, 16 Oct 2013 11:16:27 -0700
+Subject: sched: Guarantee new group-entities always have weight
+
+From: Paul Turner <pjt@google.com>
+
+commit 0ac9b1c21874d2490331233b3242085f8151e166 upstream.
+
+Currently, group entity load-weights are initialized to zero. This
+admits some races with respect to the first time they are re-weighted in
+early use. ( Let g[x] denote the se for "g" on cpu "x". )
+
+Suppose that we have root->a and that a enters a throttled state,
+immediately followed by a[0]->t1 (the only task running on cpu[0])
+blocking:
+
+ put_prev_task(group_cfs_rq(a[0]), t1)
+ put_prev_entity(..., t1)
+ check_cfs_rq_runtime(group_cfs_rq(a[0]))
+ throttle_cfs_rq(group_cfs_rq(a[0]))
+
+Then, before unthrottling occurs, let a[0]->b[0]->t2 wake for the first
+time:
+
+ enqueue_task_fair(rq[0], t2)
+ enqueue_entity(group_cfs_rq(b[0]), t2)
+ enqueue_entity_load_avg(group_cfs_rq(b[0]), t2)
+   account_entity_enqueue(group_cfs_rq(b[0]), t2)
+ update_cfs_shares(group_cfs_rq(b[0]))
+ < skipped because b is part of a throttled hierarchy >
+ enqueue_entity(group_cfs_rq(a[0]), b[0])
+ ...
+
+We now have b[0] enqueued, yet group_cfs_rq(a[0])->load.weight == 0
+which violates invariants in several code-paths. Eliminate the
+possibility of this by initializing group entity weight.
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Link: http://lkml.kernel.org/r/20131016181627.22647.47543.stgit@sword-of-the-dawn.mtv.corp.google.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Cc: Chris J Arges <chris.j.arges@canonical.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -6240,7 +6240,8 @@ void init_tg_cfs_entry(struct task_group
+ se->cfs_rq = parent->my_q;
+
+ se->my_q = cfs_rq;
+- update_load_set(&se->load, 0);
++ /* guarantee group entities always have weight */
++ update_load_set(&se->load, NICE_0_LOAD);
+ se->parent = parent;
+ }
+
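The one-line change above closes the zero-weight window by giving every new
group entity a default weight at creation. A trivial, self-contained
illustration of the restored invariant, using a made-up struct and 1024 as a
stand-in for NICE_0_LOAD (the real value depends on the kernel's
load-resolution configuration):

#include <assert.h>

#define DEFAULT_WEIGHT 1024UL   /* stand-in for NICE_0_LOAD */

struct group_entity {           /* illustrative, not the kernel's sched_entity */
	unsigned long weight;
};

static void init_group_entity(struct group_entity *se)
{
	/*
	 * Was effectively "weight = 0", which left a window in which the
	 * entity could be enqueued before its first reweight, violating the
	 * load.weight > 0 assumption in the enqueue paths.
	 */
	se->weight = DEFAULT_WEIGHT;
}

int main(void)
{
	struct group_entity se;
	init_group_entity(&se);
	assert(se.weight > 0);  /* invariant the enqueue code relies on */
	return 0;
}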
netfilter-fix-wrong-byte-order-in-nf_ct_seqadj_set-internal-information.patch
netfilter-nf_nat-fix-access-to-uninitialized-buffer-in-irc-nat-helper.patch
x86-fpu-amd-clear-exceptions-in-amd-fxsave-workaround.patch
+sched-fix-race-on-toggling-cfs_bandwidth_used.patch
+sched-fix-cfs_bandwidth-misuse-of-hrtimer_expires_remaining.patch
+sched-fix-hrtimer_cancel-rq-lock-deadlock.patch
+sched-guarantee-new-group-entities-always-have-weight.patch