--- /dev/null
+From oleg@redhat.com Fri Sep 17 18:17:33 2010
+From: Oleg Nesterov <oleg@redhat.com>
+Date: Mon, 15 Mar 2010 10:10:23 +0100
+Subject: sched: _cpu_down(): Don't play with current->cpus_allowed
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <6ee57a0da8d81973a62d3c1ce12c5c96e2634b04.1283514307.git.efault@gmx.de>
+
+From: Oleg Nesterov <oleg@redhat.com>
+
+commit 6a1bdc1b577ebcb65f6603c57f8347309bc4ab13 upstream
+
+_cpu_down() changes the current task's affinity and then restores it at
+the end. The problems are well known: we can't restore old_allowed if it
+was bound to the now-dead cpu, and we can race with userspace, which can
+change the cpu affinity during unplug.
+
+_cpu_down() should not play with current->cpus_allowed at all. Instead,
+take_cpu_down() can migrate the caller of _cpu_down() after __cpu_disable()
+removes the dying cpu from cpu_online_mask.
+
+Signed-off-by: Oleg Nesterov <oleg@redhat.com>
+Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <20100315091023.GA9148@redhat.com>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ include/linux/sched.h | 1 +
+ kernel/cpu.c | 18 ++++++------------
+ kernel/sched.c | 2 +-
+ 3 files changed, 8 insertions(+), 13 deletions(-)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1887,6 +1887,7 @@ extern void sched_clock_idle_sleep_event
+ extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+
+ #ifdef CONFIG_HOTPLUG_CPU
++extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
+ extern void idle_task_exit(void);
+ #else
+ static inline void idle_task_exit(void) {}
+--- a/kernel/cpu.c
++++ b/kernel/cpu.c
+@@ -163,6 +163,7 @@ static inline void check_for_tasks(int c
+ }
+
+ struct take_cpu_down_param {
++ struct task_struct *caller;
+ unsigned long mod;
+ void *hcpu;
+ };
+@@ -171,6 +172,7 @@ struct take_cpu_down_param {
+ static int __ref take_cpu_down(void *_param)
+ {
+ struct take_cpu_down_param *param = _param;
++ unsigned int cpu = (unsigned long)param->hcpu;
+ int err;
+
+ /* Ensure this CPU doesn't handle any more interrupts. */
+@@ -181,6 +183,8 @@ static int __ref take_cpu_down(void *_pa
+ raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
+ param->hcpu);
+
++ if (task_cpu(param->caller) == cpu)
++ move_task_off_dead_cpu(cpu, param->caller);
+ /* Force idle task to run as soon as we yield: it should
+ immediately notice cpu is offline and die quickly. */
+ sched_idle_next();
+@@ -191,10 +195,10 @@ static int __ref take_cpu_down(void *_pa
+ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
+ {
+ int err, nr_calls = 0;
+- cpumask_var_t old_allowed;
+ void *hcpu = (void *)(long)cpu;
+ unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
+ struct take_cpu_down_param tcd_param = {
++ .caller = current,
+ .mod = mod,
+ .hcpu = hcpu,
+ };
+@@ -205,9 +209,6 @@ static int __ref _cpu_down(unsigned int
+ if (!cpu_online(cpu))
+ return -EINVAL;
+
+- if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
+- return -ENOMEM;
+-
+ cpu_hotplug_begin();
+ set_cpu_active(cpu, false);
+ err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
+@@ -224,10 +225,6 @@ static int __ref _cpu_down(unsigned int
+ goto out_release;
+ }
+
+- /* Ensure that we are not runnable on dying cpu */
+-	cpumask_copy(old_allowed, &current->cpus_allowed);
+- set_cpus_allowed_ptr(current, cpu_active_mask);
+-
+ err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
+ if (err) {
+ set_cpu_active(cpu, true);
+@@ -236,7 +233,7 @@ static int __ref _cpu_down(unsigned int
+ hcpu) == NOTIFY_BAD)
+ BUG();
+
+- goto out_allowed;
++ goto out_release;
+ }
+ BUG_ON(cpu_online(cpu));
+
+@@ -254,8 +251,6 @@ static int __ref _cpu_down(unsigned int
+
+ check_for_tasks(cpu);
+
+-out_allowed:
+- set_cpus_allowed_ptr(current, old_allowed);
+ out_release:
+ cpu_hotplug_done();
+ if (!err) {
+@@ -263,7 +258,6 @@ out_release:
+ hcpu) == NOTIFY_BAD)
+ BUG();
+ }
+- free_cpumask_var(old_allowed);
+ return err;
+ }
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -7393,7 +7393,7 @@ static int migration_thread(void *data)
+ /*
+ * Figure out where task on dead CPU should go, use force if necessary.
+ */
+-static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
++void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+ {
+ struct rq *rq = cpu_rq(dead_cpu);
+ int needs_cpu, uninitialized_var(dest_cpu);
--- /dev/null
+From daniel.blueman@gmail.com Fri Sep 17 18:19:12 2010
+From: Daniel J Blueman <daniel.blueman@gmail.com>
+Date: Tue, 1 Jun 2010 14:06:13 +0100
+Subject: sched: apply RCU protection to wake_affine()
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <a0298aef3a2239c68e4874964036fa4f2dd3a79c.1283514307.git.efault@gmx.de>
+
+From: Daniel J Blueman <daniel.blueman@gmail.com>
+
+commit f3b577dec1f2ce32d2db6d2ca6badff7002512af upstream
+
+The task_group() function returns a pointer that must be protected
+by either RCU, the ->alloc_lock, or the cgroup lock (see the
+rcu_dereference_check() in task_subsys_state(), which is invoked by
+task_group()). The wake_affine() function currently does none of these,
+which means that a concurrent update would be within its rights to free
+the structure returned by task_group(). Because wake_affine() uses this
+structure only to compute load-balancing heuristics, there is no reason
+to acquire either of the two locks.
+
+Therefore, this commit introduces an RCU read-side critical section that
+starts before the first call to task_group() and ends after the last use
+of the "tg" pointer returned from task_group(). Thanks to Li Zefan for
+pointing out the need to extend the RCU read-side critical section from
+that proposed by the original patch.
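+
+To illustrate the pattern outside the kernel, here is a minimal
+userspace sketch (illustration only, not part of this patch; it assumes
+liburcu's classic urcu.h flavour and all struct/variable names are
+invented). The rule it shows is the one applied here: every dereference
+of an RCU-protected pointer, and every use of data reached through it,
+must sit inside a read-side critical section:
+
+  #include <stdio.h>
+  #include <urcu.h>               /* userspace RCU; build with -lurcu */
+
+  struct group { long weight; };
+
+  /* Updated elsewhere with rcu_assign_pointer(); the old object may
+   * only be freed after synchronize_rcu() has returned. */
+  static struct group *current_group;
+
+  static long read_weight(void)
+  {
+          long w;
+
+          rcu_read_lock();        /* begin read-side critical section */
+          struct group *g = rcu_dereference(current_group);
+          w = g ? g->weight : 0;  /* last use of data reached via g   */
+          rcu_read_unlock();      /* only now may the updater free it */
+          return w;
+  }
+
+  int main(void)
+  {
+          static struct group g = { .weight = 1024 };
+
+          rcu_register_thread();  /* required by this urcu flavour */
+          rcu_assign_pointer(current_group, &g);
+          printf("weight = %ld\n", read_weight());
+          rcu_unregister_thread();
+          return 0;
+  }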
+
+Signed-off-by: Daniel J Blueman <daniel.blueman@gmail.com>
+Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/sched_fair.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -1250,6 +1250,7 @@ static int wake_affine(struct sched_doma
+ * effect of the currently running task from the load
+ * of the current CPU:
+ */
++ rcu_read_lock();
+ if (sync) {
+ tg = task_group(current);
+ weight = current->se.load.weight;
+@@ -1275,6 +1276,7 @@ static int wake_affine(struct sched_doma
+ balanced = !this_load ||
+ 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
+ imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
++ rcu_read_unlock();
+
+ /*
+ * If the currently running task will sleep within
--- /dev/null
+From a.p.zijlstra@chello.nl Fri Sep 17 18:19:30 2010
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Date: Thu, 12 Nov 2009 15:55:28 +0100
+Subject: sched: Cleanup select_task_rq_fair()
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <80263dd5bd5a2069a3907f0408ab2f73377f0b8a.1283514307.git.efault@gmx.de>
+
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+
+commit a50bde5130f65733142b32975616427d0ea50856 upstream
+
+Clean up the new affine to idle sibling bits while trying to
+grok them. Should not have any functional differences.
+
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Cc: Mike Galbraith <efault@gmx.de>
+LKML-Reference: <20091112145610.832503781@chello.nl>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/sched_fair.c | 73 ++++++++++++++++++++++++++++++++++++----------------
+ 1 file changed, 51 insertions(+), 22 deletions(-)
+
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -1384,6 +1384,41 @@ find_idlest_cpu(struct sched_group *grou
+ }
+
+ /*
++ * Try and locate an idle CPU in the sched_domain.
++ */
++static int
++select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
++{
++ int cpu = smp_processor_id();
++ int prev_cpu = task_cpu(p);
++ int i;
++
++ /*
++ * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
++ * test in select_task_rq_fair) and the prev_cpu is idle then that's
++ * always a better target than the current cpu.
++ */
++ if (target == cpu) {
++ if (!cpu_rq(prev_cpu)->cfs.nr_running)
++ target = prev_cpu;
++ }
++
++ /*
++ * Otherwise, iterate the domain and find an elegible idle cpu.
++ */
++ if (target == -1 || target == cpu) {
++ for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
++ if (!cpu_rq(i)->cfs.nr_running) {
++ target = i;
++ break;
++ }
++ }
++ }
++
++ return target;
++}
++
++/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+@@ -1441,36 +1476,30 @@ select_task_rq_fair(struct rq *rq, struc
+ }
+
+ if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) {
+- int candidate = -1, i;
++ int target = -1;
+
++ /*
++ * If both cpu and prev_cpu are part of this domain,
++ * cpu is a valid SD_WAKE_AFFINE target.
++ */
+ if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
+- candidate = cpu;
++ target = cpu;
+
+ /*
+- * Check for an idle shared cache.
++ * If there's an idle sibling in this domain, make that
++ * the wake_affine target instead of the current cpu.
++ *
++ * XXX: should we possibly do this outside of
++ * WAKE_AFFINE, in case the shared cache domain is
++ * smaller than the WAKE_AFFINE domain?
+ */
+- if (tmp->flags & SD_PREFER_SIBLING) {
+- if (candidate == cpu) {
+- if (!cpu_rq(prev_cpu)->cfs.nr_running)
+- candidate = prev_cpu;
+- }
+-
+- if (candidate == -1 || candidate == cpu) {
+- for_each_cpu(i, sched_domain_span(tmp)) {
+- if (!cpumask_test_cpu(i, &p->cpus_allowed))
+- continue;
+- if (!cpu_rq(i)->cfs.nr_running) {
+- candidate = i;
+- break;
+- }
+- }
+- }
+- }
++ if (tmp->flags & SD_PREFER_SIBLING)
++ target = select_idle_sibling(p, tmp, target);
+
+- if (candidate >= 0) {
++ if (target >= 0) {
+ affine_sd = tmp;
+ want_affine = 0;
+- cpu = candidate;
++ cpu = target;
+ }
+ }
+
--- /dev/null
+From anton@samba.org Fri Sep 17 18:20:49 2010
+From: Anton Blanchard <anton@samba.org>
+Date: Tue, 2 Feb 2010 14:46:13 -0800
+Subject: sched: cpuacct: Use bigger percpu counter batch values for stats counters
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <096b1867bf2f9b6a3fc6c4ed114a02c181d3d77e.1283514307.git.efault@gmx.de>
+
+From: Anton Blanchard <anton@samba.org>
+
+commit fa535a77bd3fa32b9215ba375d6a202fe73e1dd6 upstream
+
+When CONFIG_VIRT_CPU_ACCOUNTING and CONFIG_CGROUP_CPUACCT are
+enabled we can call cpuacct_update_stats with values much larger
+than percpu_counter_batch. This means the call to
+percpu_counter_add will always add to the global count which is
+protected by a spinlock and we end up with a global spinlock in
+the scheduler.
+
+Based on an idea by KOSAKI Motohiro, this patch scales the batch
+value by cputime_one_jiffy such that we have the same batch
+limit as we would if CONFIG_VIRT_CPU_ACCOUNTING was disabled.
+His patch did this once at boot, but that initialisation happened
+too early on PowerPC (before time_init()) and the value was never
+updated at runtime after a cpu hotplug add/remove.
+
+This patch instead scales percpu_counter_batch by
+cputime_one_jiffy at runtime, which keeps the batch correct even
+after cpu hotplug operations. We cap it at INT_MAX in case of
+overflow.
+
+For architectures that do not support
+CONFIG_VIRT_CPU_ACCOUNTING, cputime_one_jiffy is the constant 1
+and gcc is smart enough to optimise min(s32
+percpu_counter_batch, INT_MAX) to just percpu_counter_batch at
+least on x86 and PowerPC. So there is no need to add an #ifdef.
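+
+As an illustration only (this is not kernel code, and the constants are
+invented), the standalone sketch below mimics the batching done by
+__percpu_counter_add(): updates stay in a per-cpu delta and only fold
+into the lock-protected global count once the delta exceeds the batch,
+so a batch scaled by cputime_one_jiffy (and capped at INT_MAX) takes
+the lock about as rarely as the unscaled batch does without
+CONFIG_VIRT_CPU_ACCOUNTING:
+
+  #include <limits.h>
+  #include <stdio.h>
+  #include <stdlib.h>
+
+  /* Hypothetical values: batch of 32, one jiffy = 10000000 cputime units. */
+  static const long percpu_counter_batch = 32;
+  static const long cputime_one_jiffy = 10000000;
+
+  static long global_count;       /* stands in for the spinlock-protected count */
+  static long percpu_delta;       /* stands in for this cpu's counter slot      */
+  static long lock_acquisitions;
+
+  static void counter_add(long amount, long batch)
+  {
+          percpu_delta += amount;
+          if (labs(percpu_delta) >= batch) {
+                  lock_acquisitions++;            /* would take fbc->lock here */
+                  global_count += percpu_delta;
+                  percpu_delta = 0;
+          }
+  }
+
+  int main(void)
+  {
+          long batch = percpu_counter_batch * cputime_one_jiffy;
+
+          if (batch > INT_MAX)    /* the min_t(..., INT_MAX) cap */
+                  batch = INT_MAX;
+
+          /* Charge one jiffy of cputime 1000 times, as cpuacct would. */
+          for (int i = 0; i < 1000; i++)
+                  counter_add(cputime_one_jiffy, batch);
+
+          printf("scaled batch %ld, lock acquisitions %ld (of 1000 updates)\n",
+                 batch, lock_acquisitions);
+          return 0;
+  }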
+
+On a 64 thread PowerPC box with CONFIG_VIRT_CPU_ACCOUNTING and
+CONFIG_CGROUP_CPUACCT enabled, a context switch microbenchmark
+is 234x faster and almost matches a CONFIG_CGROUP_CPUACCT
+disabled kernel:
+
+ CONFIG_CGROUP_CPUACCT disabled: 16906698 ctx switches/sec
+ CONFIG_CGROUP_CPUACCT enabled: 61720 ctx switches/sec
+ CONFIG_CGROUP_CPUACCT + patch: 16663217 ctx switches/sec
+
+Tested with:
+
+ wget http://ozlabs.org/~anton/junkcode/context_switch.c
+ make context_switch
+ for i in `seq 0 63`; do taskset -c $i ./context_switch & done
+ vmstat 1
+
+Signed-off-by: Anton Blanchard <anton@samba.org>
+Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
+Tested-by: Balbir Singh <balbir@linux.vnet.ibm.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
+Cc: "Luck, Tony" <tony.luck@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/sched.c | 20 +++++++++++++++++++-
+ 1 file changed, 19 insertions(+), 1 deletion(-)
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -10934,12 +10934,30 @@ static void cpuacct_charge(struct task_s
+ }
+
+ /*
++ * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
++ * in cputime_t units. As a result, cpuacct_update_stats calls
++ * percpu_counter_add with values large enough to always overflow the
++ * per cpu batch limit causing bad SMP scalability.
++ *
++ * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
++ * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
++ * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
++ */
++#ifdef CONFIG_SMP
++#define CPUACCT_BATCH \
++ min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
++#else
++#define CPUACCT_BATCH 0
++#endif
++
++/*
+ * Charge the system/user time to the task's accounting group.
+ */
+ static void cpuacct_update_stats(struct task_struct *tsk,
+ enum cpuacct_stat_index idx, cputime_t val)
+ {
+ struct cpuacct *ca;
++ int batch = CPUACCT_BATCH;
+
+ if (unlikely(!cpuacct_subsys.active))
+ return;
+@@ -10948,7 +10966,7 @@ static void cpuacct_update_stats(struct
+ ca = task_ca(tsk);
+
+ do {
+- percpu_counter_add(&ca->cpustat[idx], val);
++ __percpu_counter_add(&ca->cpustat[idx], val, batch);
+ ca = ca->parent;
+ } while (ca);
+ rcu_read_unlock();
--- /dev/null
+From tglx@linutronix.de Fri Sep 17 18:13:56 2010
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed, 20 Jan 2010 20:58:57 +0000
+Subject: sched: Extend enqueue_task to allow head queueing
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <e3b3be0a0a3a5c31d5e9f4243f9170302b0de6e5.1283514307.git.efault@gmx.de>
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit ea87bb7853168434f4a82426dd1ea8421f9e604d upstream
+
+The ability to enqueue a task at the head of a SCHED_FIFO priority
+list is required to fix some violations of the POSIX scheduling policy.
+
+Extend the related functions with a "head" argument.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Peter Zijlstra <peterz@infradead.org>
+Tested-by: Carsten Emde <cbe@osadl.org>
+Tested-by: Mathias Weber <mathias.weber.mw1@roche.com>
+LKML-Reference: <20100120171629.734886007@linutronix.de>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ include/linux/sched.h | 3 ++-
+ kernel/sched.c | 13 +++++++------
+ kernel/sched_fair.c | 3 ++-
+ kernel/sched_rt.c | 3 ++-
+ 4 files changed, 13 insertions(+), 9 deletions(-)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1071,7 +1071,8 @@ struct sched_domain;
+ struct sched_class {
+ const struct sched_class *next;
+
+- void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
++ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup,
++ bool head);
+ void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
+ void (*yield_task) (struct rq *rq);
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -1903,13 +1903,14 @@ static void update_avg(u64 *avg, u64 sam
+ *avg += diff >> 3;
+ }
+
+-static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
++static void
++enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+ {
+ if (wakeup)
+ p->se.start_runtime = p->se.sum_exec_runtime;
+
+ sched_info_queued(p);
+- p->sched_class->enqueue_task(rq, p, wakeup);
++ p->sched_class->enqueue_task(rq, p, wakeup, head);
+ p->se.on_rq = 1;
+ }
+
+@@ -1985,7 +1986,7 @@ static void activate_task(struct rq *rq,
+ if (task_contributes_to_load(p))
+ rq->nr_uninterruptible--;
+
+- enqueue_task(rq, p, wakeup);
++ enqueue_task(rq, p, wakeup, false);
+ inc_nr_running(rq);
+ }
+
+@@ -6183,7 +6184,7 @@ void rt_mutex_setprio(struct task_struct
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ if (on_rq) {
+- enqueue_task(rq, p, 0);
++ enqueue_task(rq, p, 0, false);
+
+ check_class_changed(rq, p, prev_class, oldprio, running);
+ }
+@@ -6227,7 +6228,7 @@ void set_user_nice(struct task_struct *p
+ delta = p->prio - old_prio;
+
+ if (on_rq) {
+- enqueue_task(rq, p, 0);
++ enqueue_task(rq, p, 0, false);
+ /*
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
+@@ -10180,7 +10181,7 @@ void sched_move_task(struct task_struct
+ if (unlikely(running))
+ tsk->sched_class->set_curr_task(rq);
+ if (on_rq)
+- enqueue_task(rq, tsk, 0);
++ enqueue_task(rq, tsk, 0, false);
+
+ task_rq_unlock(rq, &flags);
+ }
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -1031,7 +1031,8 @@ static inline void hrtick_update(struct
+ * increased. Here we update the fair scheduling stats and
+ * then put the task into the rbtree:
+ */
+-static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
++static void
++enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+ {
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se;
+--- a/kernel/sched_rt.c
++++ b/kernel/sched_rt.c
+@@ -878,7 +878,8 @@ static void dequeue_rt_entity(struct sch
+ /*
+ * Adding/removing a task to/from a priority array:
+ */
+-static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
++static void
++enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
+ {
+ struct sched_rt_entity *rt_se = &p->rt;
+
--- /dev/null
+From peterz@infradead.org Fri Sep 17 18:13:26 2010
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Thu, 21 Jan 2010 16:34:27 +0100
+Subject: sched: Fix incorrect sanity check
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <550df2da0c2d00162a463923644fd024de95b890.1283514307.git.efault@gmx.de>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 11854247e2c851e7ff9ce138e501c6cffc5a4217 upstream
+
+We moved to migrate on wakeup, which means that sleeping tasks could
+still be present on offline cpus. Amend the check to only test running
+tasks.
+
+Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/cpu.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/cpu.c
++++ b/kernel/cpu.c
+@@ -151,7 +151,7 @@ static inline void check_for_tasks(int c
+
+ write_lock_irq(&tasklist_lock);
+ for_each_process(p) {
+- if (task_cpu(p) == cpu &&
++ if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
+ (!cputime_eq(p->utime, cputime_zero) ||
+ !cputime_eq(p->stime, cputime_zero)))
+ printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
--- /dev/null
+From a.p.zijlstra@chello.nl Fri Sep 17 18:18:32 2010
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Date: Fri, 26 Mar 2010 12:22:14 +0100
+Subject: sched: Fix nr_uninterruptible count
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <c1b37a706324879a325f2ec268f2dc1b9958060c.1283514307.git.efault@gmx.de>
+
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+
+commit cc87f76a601d2d256118f7bab15e35254356ae21 upstream
+
+The cpuload calculation in calc_load_account_active() assumes
+rq->nr_uninterruptible will not change on an offline cpu after
+migrate_nr_uninterruptible(). However, the recent migrate-on-wakeup
+changes broke that and would result in decrementing the offline cpu's
+rq->nr_uninterruptible.
+
+Fix this by accounting the nr_uninterruptible on the waking cpu.
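+
+A toy model of the accounting (the cpu count and values below are
+invented) shows why charging the decrement to the waking cpu keeps the
+global sum correct, even though the matching increment happened on a
+cpu that has since gone offline:
+
+  #include <stdio.h>
+
+  #define NR_CPUS 4
+
+  static long nr_uninterruptible[NR_CPUS];
+  static const int online[NR_CPUS] = { 1, 1, 1, 0 };  /* cpu 3 unplugged */
+
+  /* Like calc_load_account_active(): only online cpus contribute. */
+  static long global_uninterruptible(void)
+  {
+          long sum = 0;
+
+          for (int i = 0; i < NR_CPUS; i++)
+                  if (online[i])
+                          sum += nr_uninterruptible[i];
+          return sum;
+  }
+
+  int main(void)
+  {
+          /* migrate_nr_uninterruptible() moved the sleeper's count from
+           * dying cpu 3 over to cpu 0 at unplug time. */
+          nr_uninterruptible[0] = 1;
+          printf("before wakeup:           %ld\n", global_uninterruptible());
+
+          nr_uninterruptible[3]--;  /* broken: offline orig cpu, sum stays 1 */
+          printf("decrement offline cpu 3: %ld\n", global_uninterruptible());
+          nr_uninterruptible[3]++;  /* undo, then do it the fixed way */
+
+          nr_uninterruptible[1]--;  /* fixed: account on the waking cpu */
+          printf("decrement waking cpu 1:  %ld\n", global_uninterruptible());
+          return 0;
+  }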
+
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <new-submission>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/sched.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -2435,8 +2435,12 @@ static int try_to_wake_up(struct task_st
+ *
+ * First fix up the nr_uninterruptible count:
+ */
+- if (task_contributes_to_load(p))
+- rq->nr_uninterruptible--;
++ if (task_contributes_to_load(p)) {
++ if (likely(cpu_online(orig_cpu)))
++ rq->nr_uninterruptible--;
++ else
++ this_rq()->nr_uninterruptible--;
++ }
+ p->state = TASK_WAKING;
+
+ if (p->sched_class->task_waking)
--- /dev/null
+From a.p.zijlstra@chello.nl Fri Sep 17 18:13:39 2010
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Date: Mon, 15 Feb 2010 14:45:54 +0100
+Subject: sched: Fix race between ttwu() and task_rq_lock()
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <80faa6f269f4bd7825aec22056bbca743b5bd100.1283514307.git.efault@gmx.de>
+
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+
+commit 0970d2992dfd7d5ec2c787417cf464f01eeaf42a upstream
+
+Thomas found that due to ttwu() changing a task's cpu without holding
+the rq->lock, task_rq_lock() might end up locking the wrong rq.
+
+Avoid this by serializing against TASK_WAKING.
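+
+The resulting locking pattern is easier to see in a small userspace
+model (illustration only; the struct and function names are invented,
+the 'waking' flag stands in for TASK_WAKING, and sched_yield() stands
+in for cpu_relax()). Lock the queue the task appears to be on, then
+recheck, and do not trust task->rq while the waking flag is set:
+
+  #include <pthread.h>            /* build with -pthread */
+  #include <sched.h>
+  #include <stdatomic.h>
+  #include <stdbool.h>
+  #include <stdio.h>
+
+  struct rq { pthread_mutex_t lock; int nr_running; };
+
+  struct task {
+          _Atomic(struct rq *) rq;
+          atomic_bool waking;
+  };
+
+  static struct rq *task_rq_lock(struct task *t)
+  {
+          for (;;) {
+                  while (atomic_load(&t->waking))  /* serialize against waker */
+                          sched_yield();
+
+                  struct rq *rq = atomic_load(&t->rq);
+                  pthread_mutex_lock(&rq->lock);
+                  if (rq == atomic_load(&t->rq) && !atomic_load(&t->waking))
+                          return rq;               /* locked the right queue  */
+                  pthread_mutex_unlock(&rq->lock); /* raced with migration    */
+          }
+  }
+
+  int main(void)
+  {
+          static struct rq rq0 = { .lock = PTHREAD_MUTEX_INITIALIZER };
+          struct task t = { .rq = &rq0, .waking = false };
+
+          struct rq *rq = task_rq_lock(&t);
+          rq->nr_running++;
+          pthread_mutex_unlock(&rq->lock);
+          printf("nr_running = %d\n", rq0.nr_running);
+          return 0;
+  }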
+
+Reported-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <1266241712.15770.420.camel@laptop>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/sched.c | 71 ++++++++++++++++++++++++++++++++++++---------------------
+ 1 file changed, 45 insertions(+), 26 deletions(-)
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -942,16 +942,33 @@ static inline void finish_lock_switch(st
+ #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
+
+ /*
++ * Check whether the task is waking, we use this to synchronize against
++ * ttwu() so that task_cpu() reports a stable number.
++ *
++ * We need to make an exception for PF_STARTING tasks because the fork
++ * path might require task_rq_lock() to work, eg. it can call
++ * set_cpus_allowed_ptr() from the cpuset clone_ns code.
++ */
++static inline int task_is_waking(struct task_struct *p)
++{
++ return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
++}
++
++/*
+ * __task_rq_lock - lock the runqueue a given task resides on.
+ * Must be called interrupts disabled.
+ */
+ static inline struct rq *__task_rq_lock(struct task_struct *p)
+ __acquires(rq->lock)
+ {
++ struct rq *rq;
++
+ for (;;) {
+- struct rq *rq = task_rq(p);
++ while (task_is_waking(p))
++ cpu_relax();
++ rq = task_rq(p);
+ spin_lock(&rq->lock);
+- if (likely(rq == task_rq(p)))
++ if (likely(rq == task_rq(p) && !task_is_waking(p)))
+ return rq;
+ spin_unlock(&rq->lock);
+ }
+@@ -968,10 +985,12 @@ static struct rq *task_rq_lock(struct ta
+ struct rq *rq;
+
+ for (;;) {
++ while (task_is_waking(p))
++ cpu_relax();
+ local_irq_save(*flags);
+ rq = task_rq(p);
+ spin_lock(&rq->lock);
+- if (likely(rq == task_rq(p)))
++ if (likely(rq == task_rq(p) && !task_is_waking(p)))
+ return rq;
+ spin_unlock_irqrestore(&rq->lock, *flags);
+ }
+@@ -2439,14 +2458,27 @@ static int try_to_wake_up(struct task_st
+ __task_rq_unlock(rq);
+
+ cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+- if (cpu != orig_cpu)
++ if (cpu != orig_cpu) {
++ /*
++ * Since we migrate the task without holding any rq->lock,
++ * we need to be careful with task_rq_lock(), since that
++ * might end up locking an invalid rq.
++ */
+ set_task_cpu(p, cpu);
++ }
+
+- rq = __task_rq_lock(p);
++ rq = cpu_rq(cpu);
++ spin_lock(&rq->lock);
+ update_rq_clock(rq);
+
++ /*
++ * We migrated the task without holding either rq->lock, however
++ * since the task is not on the task list itself, nobody else
++ * will try and migrate the task, hence the rq should match the
++ * cpu we just moved it to.
++ */
++ WARN_ON(task_cpu(p) != cpu);
+ WARN_ON(p->state != TASK_WAKING);
+- cpu = task_cpu(p);
+
+ #ifdef CONFIG_SCHEDSTATS
+ schedstat_inc(rq, ttwu_count);
+@@ -2695,7 +2727,13 @@ void wake_up_new_task(struct task_struct
+ set_task_cpu(p, cpu);
+ #endif
+
+- rq = task_rq_lock(p, &flags);
++ /*
++ * Since the task is not on the rq and we still have TASK_WAKING set
++ * nobody else will migrate this task.
++ */
++ rq = cpu_rq(cpu);
++ spin_lock_irqsave(&rq->lock, flags);
++
+ BUG_ON(p->state != TASK_WAKING);
+ p->state = TASK_RUNNING;
+ update_rq_clock(rq);
+@@ -7204,27 +7242,8 @@ int set_cpus_allowed_ptr(struct task_str
+ struct rq *rq;
+ int ret = 0;
+
+- /*
+- * Since we rely on wake-ups to migrate sleeping tasks, don't change
+- * the ->cpus_allowed mask from under waking tasks, which would be
+- * possible when we change rq->lock in ttwu(), so synchronize against
+- * TASK_WAKING to avoid that.
+- *
+- * Make an exception for freshly cloned tasks, since cpuset namespaces
+- * might move the task about, we have to validate the target in
+- * wake_up_new_task() anyway since the cpu might have gone away.
+- */
+-again:
+- while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
+- cpu_relax();
+-
+ rq = task_rq_lock(p, &flags);
+
+- if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
+- task_rq_unlock(rq, &flags);
+- goto again;
+- }
+-
+ if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+ ret = -EINVAL;
+ goto out;
--- /dev/null
+From peterz@infradead.org Fri Sep 17 18:18:47 2010
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Thu, 19 Aug 2010 13:31:43 +0200
+Subject: sched: Fix rq->clock synchronization when migrating tasks
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <748cfa7664c3c3092de1cf8c86f96474f840bed6.1283514307.git.efault@gmx.de>
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 861d034ee814917a83bd5de4b26e3b8336ddeeb8 upstream
+
+sched_fork() -- we do task placement in ->task_fork_fair(), so ensure we
+ update_rq_clock() there and work with the current time. We leave the
+ vruntime in relative state, so the time delay until wake_up_new_task()
+ doesn't matter.
+
+wake_up_new_task() -- since task_fork_fair() left p->vruntime in
+ relative state, we can safely migrate; the activate_task() on the
+ remote rq will call update_rq_clock() and cause the clock to be
+ synced (enough).
+
+Tested-by: Jack Daniel <wanders.thirst@gmail.com>
+Tested-by: Philby John <pjohn@mvista.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <1281002322.1923.1708.camel@laptop>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/sched_fair.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -1963,6 +1963,8 @@ static void task_fork_fair(struct task_s
+
+ spin_lock_irqsave(&rq->lock, flags);
+
++ update_rq_clock(rq);
++
+ if (unlikely(task_cpu(p) != this_cpu))
+ __set_task_cpu(p, this_cpu);
+
--- /dev/null
+From suresh.b.siddha@intel.com Fri Sep 17 18:20:36 2010
+From: Suresh Siddha <suresh.b.siddha@intel.com>
+Date: Wed, 31 Mar 2010 16:47:45 -0700
+Subject: sched: Fix select_idle_sibling() logic in select_task_rq_fair()
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <7c9917f68be3e57e65b938ff15cc6a2b1cc0da16.1283514307.git.efault@gmx.de>
+
+From: Suresh Siddha <suresh.b.siddha@intel.com>
+
+commit 99bd5e2f245d8cd17d040c82d40becdb3efd9b69 upstream
+
+Issues in the current select_idle_sibling() logic in select_task_rq_fair()
+in the context of a task wake-up:
+
+a) Once we select the idle sibling, we use that domain (spanning the cpu
+   the task is being woken up on and the idle sibling we found) in our
+   wake_affine() decisions. This domain is completely different from the
+   domain we are supposed to use: the one spanning the cpu the task is
+   being woken up on and the cpu where the task previously ran.
+
+b) We do the select_idle_sibling() check only for the cpu the task is
+   being woken up on. If select_task_rq_fair() selects the previously-run
+   cpu for waking the task, doing a select_idle_sibling() check for that
+   cpu would also help, and we don't do this currently.
+
+c) In scenarios where the cpu the task is being woken up on is busy but
+   its HT siblings are idle, we select the idle HT sibling for the wakeup
+   instead of the core the task previously ran on, which is currently
+   completely idle. That is, we are not taking decisions based on
+   wake_affine() but directly selecting an idle sibling, which can cause
+   an imbalance at the SMT/MC level that is only corrected later by the
+   periodic load balancer.
+
+Fix this by first going through the load-imbalance calculation in
+wake_affine(), and only once we have decided between the waking cpu and
+the previously-run cpu, choose a possible idle sibling of that cpu to
+wake the task up on.
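+
+A toy model of the reordered decision (the cpu numbers, the idle[] map
+and the xor-based sibling relation are all invented, and wake_affine()
+is reduced to a boolean) may help to illustrate it: decide between the
+waking cpu and the previous cpu first, then look for an idle sibling of
+whichever cpu won:
+
+  #include <stdbool.h>
+  #include <stdio.h>
+
+  #define NR_CPUS 8
+
+  static bool idle[NR_CPUS];
+
+  /* Pretend each cpu shares a cache only with cpu^1 (its HT sibling). */
+  static int select_idle_sibling(int target)
+  {
+          if (idle[target])
+                  return target;
+          if (idle[target ^ 1])
+                  return target ^ 1;
+          return target;
+  }
+
+  static int select_target(int cpu, int prev_cpu, bool wake_affine_wins)
+  {
+          int target = wake_affine_wins ? cpu : prev_cpu;
+
+          return select_idle_sibling(target);
+  }
+
+  int main(void)
+  {
+          idle[3] = true;         /* HT sibling of waking cpu 2 is idle */
+          idle[5] = true;         /* prev_cpu 5 is completely idle      */
+
+          /* Only if wake_affine() prefers the waking cpu do we consider
+           * its idle sibling; otherwise we stay near the previous cpu. */
+          printf("affine wakeup     -> cpu %d\n", select_target(2, 5, true));
+          printf("non-affine wakeup -> cpu %d\n", select_target(2, 5, false));
+          return 0;
+  }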
+
+Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <1270079265.7835.8.camel@sbs-t61.sc.intel.com>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/sched_fair.c | 82 +++++++++++++++++++++++++---------------------------
+ 1 file changed, 40 insertions(+), 42 deletions(-)
+
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -1386,29 +1386,48 @@ find_idlest_cpu(struct sched_group *grou
+ /*
+ * Try and locate an idle CPU in the sched_domain.
+ */
+-static int
+-select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
++static int select_idle_sibling(struct task_struct *p, int target)
+ {
+ int cpu = smp_processor_id();
+ int prev_cpu = task_cpu(p);
++ struct sched_domain *sd;
+ int i;
+
+ /*
+- * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE
+- * test in select_task_rq_fair) and the prev_cpu is idle then that's
+- * always a better target than the current cpu.
++ * If the task is going to be woken-up on this cpu and if it is
++ * already idle, then it is the right target.
++ */
++ if (target == cpu && idle_cpu(cpu))
++ return cpu;
++
++ /*
++ * If the task is going to be woken-up on the cpu where it previously
++ * ran and if it is currently idle, then it the right target.
+ */
+- if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
++ if (target == prev_cpu && idle_cpu(prev_cpu))
+ return prev_cpu;
+
+ /*
+- * Otherwise, iterate the domain and find an elegible idle cpu.
++ * Otherwise, iterate the domains and find an elegible idle cpu.
+ */
+- for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+- if (!cpu_rq(i)->cfs.nr_running) {
+- target = i;
++ for_each_domain(target, sd) {
++ if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
+ break;
++
++ for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
++ if (idle_cpu(i)) {
++ target = i;
++ break;
++ }
+ }
++
++ /*
++ * Lets stop looking for an idle sibling when we reached
++ * the domain that spans the current cpu and prev_cpu.
++ */
++ if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
++ cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
++ break;
+ }
+
+ return target;
+@@ -1432,7 +1451,7 @@ select_task_rq_fair(struct rq *rq, struc
+ int cpu = smp_processor_id();
+ int prev_cpu = task_cpu(p);
+ int new_cpu = cpu;
+- int want_affine = 0, cpu_idle = !current->pid;
++ int want_affine = 0;
+ int want_sd = 1;
+ int sync = wake_flags & WF_SYNC;
+
+@@ -1472,36 +1491,13 @@ select_task_rq_fair(struct rq *rq, struc
+ }
+
+ /*
+- * While iterating the domains looking for a spanning
+- * WAKE_AFFINE domain, adjust the affine target to any idle cpu
+- * in cache sharing domains along the way.
++ * If both cpu and prev_cpu are part of this domain,
++ * cpu is a valid SD_WAKE_AFFINE target.
+ */
+- if (want_affine) {
+- int target = -1;
+-
+- /*
+- * If both cpu and prev_cpu are part of this domain,
+- * cpu is a valid SD_WAKE_AFFINE target.
+- */
+- if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
+- target = cpu;
+-
+- /*
+- * If there's an idle sibling in this domain, make that
+- * the wake_affine target instead of the current cpu.
+- */
+- if (!cpu_idle && tmp->flags & SD_SHARE_PKG_RESOURCES)
+- target = select_idle_sibling(p, tmp, target);
+-
+- if (target >= 0) {
+- if (tmp->flags & SD_WAKE_AFFINE) {
+- affine_sd = tmp;
+- want_affine = 0;
+- if (target != cpu)
+- cpu_idle = 1;
+- }
+- cpu = target;
+- }
++ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
++ cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
++ affine_sd = tmp;
++ want_affine = 0;
+ }
+
+ if (!want_sd && !want_affine)
+@@ -1532,8 +1528,10 @@ select_task_rq_fair(struct rq *rq, struc
+ #endif
+
+ if (affine_sd) {
+- if (cpu_idle || cpu == prev_cpu || wake_affine(affine_sd, p, sync))
+- return cpu;
++ if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
++ return select_idle_sibling(p, cpu);
++ else
++ return select_idle_sibling(p, prev_cpu);
+ }
+
+ while (sd) {
--- /dev/null
+From efault@gmx.de Fri Sep 17 18:20:11 2010
+From: Mike Galbraith <efault@gmx.de>
+Date: Thu, 11 Mar 2010 17:17:16 +0100
+Subject: sched: Fix select_idle_sibling()
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <2dc48f18ab671dc1c87c87dba674ff4b755d17ff.1283514307.git.efault@gmx.de>
+
+From: Mike Galbraith <efault@gmx.de>
+
+commit 8b911acdf08477c059d1c36c21113ab1696c612b upstream
+
+Don't bother with selection when the current cpu is idle. Recent load
+balancing changes also make it no longer necessary to check wake_affine()
+success before returning the selected sibling, so we now always use it.
+
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <1268301369.6785.36.camel@marge.simson.net>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/sched_fair.c | 14 ++++++++++----
+ 1 file changed, 10 insertions(+), 4 deletions(-)
+
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -1432,7 +1432,7 @@ select_task_rq_fair(struct rq *rq, struc
+ int cpu = smp_processor_id();
+ int prev_cpu = task_cpu(p);
+ int new_cpu = cpu;
+- int want_affine = 0;
++ int want_affine = 0, cpu_idle = !current->pid;
+ int want_sd = 1;
+ int sync = wake_flags & WF_SYNC;
+
+@@ -1490,13 +1490,15 @@ select_task_rq_fair(struct rq *rq, struc
+ * If there's an idle sibling in this domain, make that
+ * the wake_affine target instead of the current cpu.
+ */
+- if (tmp->flags & SD_SHARE_PKG_RESOURCES)
++ if (!cpu_idle && tmp->flags & SD_SHARE_PKG_RESOURCES)
+ target = select_idle_sibling(p, tmp, target);
+
+ if (target >= 0) {
+ if (tmp->flags & SD_WAKE_AFFINE) {
+ affine_sd = tmp;
+ want_affine = 0;
++ if (target != cpu)
++ cpu_idle = 1;
+ }
+ cpu = target;
+ }
+@@ -1512,6 +1514,7 @@ select_task_rq_fair(struct rq *rq, struc
+ sd = tmp;
+ }
+
++#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (sched_feat(LB_SHARES_UPDATE)) {
+ /*
+ * Pick the largest domain to update shares over
+@@ -1528,9 +1531,12 @@ select_task_rq_fair(struct rq *rq, struc
+ spin_lock(&rq->lock);
+ }
+ }
++#endif
+
+- if (affine_sd && wake_affine(affine_sd, p, sync))
+- return cpu;
++ if (affine_sd) {
++ if (cpu_idle || cpu == prev_cpu || wake_affine(affine_sd, p, sync))
++ return cpu;
++ }
+
+ while (sd) {
+ int load_idx = sd->forkexec_idx;
--- /dev/null
+From a.p.zijlstra@chello.nl Fri Sep 17 18:18:02 2010
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Date: Wed, 24 Mar 2010 18:34:10 +0100
+Subject: sched: Fix TASK_WAKING vs fork deadlock
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <1620f28b03b31be9190132c280a85fc1d08141a8.1283514307.git.efault@gmx.de>
+
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+
+commit 0017d735092844118bef006696a750a0e4ef6ebd upstream
+
+Oleg noticed a few races with the TASK_WAKING usage on fork.
+
+ - since TASK_WAKING is basically a spinlock, it should be IRQ safe
+ - since we set TASK_WAKING (*) without holding rq->lock, it could be
+   that there is still an rq->lock holder, so we are not actually
+   providing full serialization.
+
+(*) in fact we clear PF_STARTING, which in effect enables TASK_WAKING.
+
+Cure the second issue by not setting TASK_WAKING in sched_fork(), but
+only temporarily in wake_up_new_task() while calling select_task_rq().
+
+Cure the first by holding rq->lock around the select_task_rq() call;
+this will disable IRQs. This, however, requires that we push the
+rq->lock release down into select_task_rq_fair()'s cgroup stuff.
+
+Because select_task_rq_fair() still needs to drop the rq->lock we
+cannot fully get rid of TASK_WAKING.
+
+Reported-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <new-submission>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ include/linux/sched.h | 3 +-
+ kernel/sched.c | 65 +++++++++++++++++-------------------------------
+ kernel/sched_fair.c | 8 ++++-
+ kernel/sched_idletask.c | 3 +-
+ kernel/sched_rt.c | 5 +--
+ 5 files changed, 36 insertions(+), 48 deletions(-)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1082,7 +1082,8 @@ struct sched_class {
+ void (*put_prev_task) (struct rq *rq, struct task_struct *p);
+
+ #ifdef CONFIG_SMP
+- int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
++ int (*select_task_rq)(struct rq *rq, struct task_struct *p,
++ int sd_flag, int flags);
+
+ unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
+ struct rq *busiest, unsigned long max_load_move,
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -944,14 +944,10 @@ static inline void finish_lock_switch(st
+ /*
+ * Check whether the task is waking, we use this to synchronize against
+ * ttwu() so that task_cpu() reports a stable number.
+- *
+- * We need to make an exception for PF_STARTING tasks because the fork
+- * path might require task_rq_lock() to work, eg. it can call
+- * set_cpus_allowed_ptr() from the cpuset clone_ns code.
+ */
+ static inline int task_is_waking(struct task_struct *p)
+ {
+- return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
++ return unlikely(p->state == TASK_WAKING);
+ }
+
+ /*
+@@ -2373,9 +2369,9 @@ static int select_fallback_rq(int cpu, s
+ * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ */
+ static inline
+-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
++int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+ {
+- int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
++ int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+
+ /*
+ * In order not to call set_task_cpu() on a blocking task we need
+@@ -2450,17 +2446,10 @@ static int try_to_wake_up(struct task_st
+ if (p->sched_class->task_waking)
+ p->sched_class->task_waking(rq, p);
+
+- __task_rq_unlock(rq);
+-
+- cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+- if (cpu != orig_cpu) {
+- /*
+- * Since we migrate the task without holding any rq->lock,
+- * we need to be careful with task_rq_lock(), since that
+- * might end up locking an invalid rq.
+- */
++ cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
++ if (cpu != orig_cpu)
+ set_task_cpu(p, cpu);
+- }
++ __task_rq_unlock(rq);
+
+ rq = cpu_rq(cpu);
+ spin_lock(&rq->lock);
+@@ -2638,11 +2627,11 @@ void sched_fork(struct task_struct *p, i
+
+ __sched_fork(p);
+ /*
+- * We mark the process as waking here. This guarantees that
++ * We mark the process as running here. This guarantees that
+ * nobody will actually run it, and a signal or other external
+ * event cannot wake it up and insert it on the runqueue either.
+ */
+- p->state = TASK_WAKING;
++ p->state = TASK_RUNNING;
+
+ /*
+ * Revert to default priority/policy on fork if requested.
+@@ -2709,28 +2698,25 @@ void wake_up_new_task(struct task_struct
+ int cpu = get_cpu();
+
+ #ifdef CONFIG_SMP
++ rq = task_rq_lock(p, &flags);
++ p->state = TASK_WAKING;
++
+ /*
+ * Fork balancing, do it here and not earlier because:
+ * - cpus_allowed can change in the fork path
+ * - any previously selected cpu might disappear through hotplug
+ *
+- * We still have TASK_WAKING but PF_STARTING is gone now, meaning
+- * ->cpus_allowed is stable, we have preemption disabled, meaning
+- * cpu_online_mask is stable.
++ * We set TASK_WAKING so that select_task_rq() can drop rq->lock
++ * without people poking at ->cpus_allowed.
+ */
+- cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
++ cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
+ set_task_cpu(p, cpu);
+-#endif
+-
+- /*
+- * Since the task is not on the rq and we still have TASK_WAKING set
+- * nobody else will migrate this task.
+- */
+- rq = cpu_rq(cpu);
+- spin_lock_irqsave(&rq->lock, flags);
+
+- BUG_ON(p->state != TASK_WAKING);
+ p->state = TASK_RUNNING;
++ task_rq_unlock(rq, &flags);
++#endif
++
++ rq = task_rq_lock(p, &flags);
+ update_rq_clock(rq);
+ activate_task(rq, p, 0);
+ trace_sched_wakeup_new(rq, p, 1);
+@@ -3215,19 +3201,15 @@ void sched_exec(void)
+ {
+ struct task_struct *p = current;
+ struct migration_req req;
+- int dest_cpu, this_cpu;
+ unsigned long flags;
+ struct rq *rq;
+-
+- this_cpu = get_cpu();
+- dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
+- if (dest_cpu == this_cpu) {
+- put_cpu();
+- return;
+- }
++ int dest_cpu;
+
+ rq = task_rq_lock(p, &flags);
+- put_cpu();
++ dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
++ if (dest_cpu == smp_processor_id())
++ goto unlock;
++
+ /*
+ * select_task_rq() can race against ->cpus_allowed
+ */
+@@ -3245,6 +3227,7 @@ void sched_exec(void)
+
+ return;
+ }
++unlock:
+ task_rq_unlock(rq, &flags);
+ }
+
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -1392,7 +1392,8 @@ find_idlest_cpu(struct sched_group *grou
+ *
+ * preempt must be disabled.
+ */
+-static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
++static int
++select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
+ {
+ struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+ int cpu = smp_processor_id();
+@@ -1492,8 +1493,11 @@ static int select_task_rq_fair(struct ta
+ cpumask_weight(sched_domain_span(sd))))
+ tmp = affine_sd;
+
+- if (tmp)
++ if (tmp) {
++ spin_unlock(&rq->lock);
+ update_shares(tmp);
++ spin_lock(&rq->lock);
++ }
+ }
+
+ if (affine_sd && wake_affine(affine_sd, p, sync)) {
+--- a/kernel/sched_idletask.c
++++ b/kernel/sched_idletask.c
+@@ -6,7 +6,8 @@
+ */
+
+ #ifdef CONFIG_SMP
+-static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
++static int
++select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
+ {
+ return task_cpu(p); /* IDLE tasks as never migrated */
+ }
+--- a/kernel/sched_rt.c
++++ b/kernel/sched_rt.c
+@@ -942,10 +942,9 @@ static void yield_task_rt(struct rq *rq)
+ #ifdef CONFIG_SMP
+ static int find_lowest_rq(struct task_struct *task);
+
+-static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
++static int
++select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
+ {
+- struct rq *rq = task_rq(p);
+-
+ if (sd_flag != SD_BALANCE_WAKE)
+ return smp_processor_id();
+
--- /dev/null
+From efault@gmx.de Fri Sep 17 18:19:56 2010
+From: Mike Galbraith <efault@gmx.de>
+Date: Mon, 4 Jan 2010 14:44:56 +0100
+Subject: sched: Fix vmark regression on big machines
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <c0a4bd155e864c31aa575d64ae6330d563ed03fb.1283514307.git.efault@gmx.de>
+
+From: Mike Galbraith <efault@gmx.de>
+
+commit 50b926e439620c469565e8be0f28be78f5fca1ce upstream
+
+SD_PREFER_SIBLING is set at the CPU domain level if power saving isn't
+enabled, leading to many cache misses on large machines as we traverse
+looking for an idle shared cache to wake to. Change the enabler of
+select_idle_sibling() to SD_SHARE_PKG_RESOURCES, and enable same at the
+sibling domain level.
+
+Reported-by: Lin Ming <ming.m.lin@intel.com>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <1262612696.15495.15.camel@marge.simson.net>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ include/linux/topology.h | 2 +-
+ kernel/sched_fair.c | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+--- a/include/linux/topology.h
++++ b/include/linux/topology.h
+@@ -99,7 +99,7 @@ int arch_update_cpu_topology(void);
+ | 1*SD_WAKE_AFFINE \
+ | 1*SD_SHARE_CPUPOWER \
+ | 0*SD_POWERSAVINGS_BALANCE \
+- | 0*SD_SHARE_PKG_RESOURCES \
++ | 1*SD_SHARE_PKG_RESOURCES \
+ | 0*SD_SERIALIZE \
+ | 0*SD_PREFER_SIBLING \
+ , \
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -1490,7 +1490,7 @@ select_task_rq_fair(struct rq *rq, struc
+ * If there's an idle sibling in this domain, make that
+ * the wake_affine target instead of the current cpu.
+ */
+- if (tmp->flags & SD_PREFER_SIBLING)
++ if (tmp->flags & SD_SHARE_PKG_RESOURCES)
+ target = select_idle_sibling(p, tmp, target);
+
+ if (target >= 0) {
--- /dev/null
+From tglx@linutronix.de Fri Sep 17 18:14:11 2010
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed, 20 Jan 2010 20:59:01 +0000
+Subject: sched: Implement head queueing for sched_rt
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <06654220e9d17d06d30535777dfbcdf5ab2d7e57.1283514307.git.efault@gmx.de>
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 37dad3fce97f01e5149d69de0833d8452c0e862e upstream
+
+The ability to enqueue a task at the head of a SCHED_FIFO priority
+list is required to fix some violations of the POSIX scheduling policy.
+
+Implement the functionality in sched_rt.
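+
+The effect of the head/tail choice can be seen with a small userspace
+FIFO list (illustration only; it uses the BSD TAILQ macros from
+sys/queue.h instead of the kernel's list_add()/list_add_tail(), and the
+task names are invented):
+
+  #include <stdbool.h>
+  #include <stdio.h>
+  #include <sys/queue.h>
+
+  struct task {
+          const char *name;
+          TAILQ_ENTRY(task) run_list;
+  };
+
+  TAILQ_HEAD(run_queue, task);
+
+  /* Ordinary wakeups queue at the tail; a task that must run next again
+   * (as POSIX requires in some situations) is requeued at the head. */
+  static void enqueue(struct run_queue *q, struct task *t, bool head)
+  {
+          if (head)
+                  TAILQ_INSERT_HEAD(q, t, run_list);
+          else
+                  TAILQ_INSERT_TAIL(q, t, run_list);
+  }
+
+  int main(void)
+  {
+          struct run_queue q = TAILQ_HEAD_INITIALIZER(q);
+          struct task a = { .name = "A" }, b = { .name = "B" }, c = { .name = "C" };
+          struct task *t;
+
+          enqueue(&q, &a, false);
+          enqueue(&q, &b, false);
+          enqueue(&q, &c, true);          /* head insertion */
+
+          TAILQ_FOREACH(t, &q, run_list)
+                  printf("%s ", t->name); /* prints: C A B */
+          printf("\n");
+          return 0;
+  }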
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Peter Zijlstra <peterz@infradead.org>
+Tested-by: Carsten Emde <cbe@osadl.org>
+Tested-by: Mathias Weber <mathias.weber.mw1@roche.com>
+LKML-Reference: <20100120171629.772169931@linutronix.de>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/sched_rt.c | 19 +++++++++++--------
+ 1 file changed, 11 insertions(+), 8 deletions(-)
+
+--- a/kernel/sched_rt.c
++++ b/kernel/sched_rt.c
+@@ -194,7 +194,7 @@ static inline struct rt_rq *group_rt_rq(
+ return rt_se->my_q;
+ }
+
+-static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
++static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
+ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
+
+ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
+@@ -204,7 +204,7 @@ static void sched_rt_rq_enqueue(struct r
+
+ if (rt_rq->rt_nr_running) {
+ if (rt_se && !on_rt_rq(rt_se))
+- enqueue_rt_entity(rt_se);
++ enqueue_rt_entity(rt_se, false);
+ if (rt_rq->highest_prio.curr < curr->prio)
+ resched_task(curr);
+ }
+@@ -803,7 +803,7 @@ void dec_rt_tasks(struct sched_rt_entity
+ dec_rt_group(rt_se, rt_rq);
+ }
+
+-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
++static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+ {
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ struct rt_prio_array *array = &rt_rq->active;
+@@ -819,7 +819,10 @@ static void __enqueue_rt_entity(struct s
+ if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
+ return;
+
+- list_add_tail(&rt_se->run_list, queue);
++ if (head)
++ list_add(&rt_se->run_list, queue);
++ else
++ list_add_tail(&rt_se->run_list, queue);
+ __set_bit(rt_se_prio(rt_se), array->bitmap);
+
+ inc_rt_tasks(rt_se, rt_rq);
+@@ -856,11 +859,11 @@ static void dequeue_rt_stack(struct sche
+ }
+ }
+
+-static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
++static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+ {
+ dequeue_rt_stack(rt_se);
+ for_each_sched_rt_entity(rt_se)
+- __enqueue_rt_entity(rt_se);
++ __enqueue_rt_entity(rt_se, head);
+ }
+
+ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+@@ -871,7 +874,7 @@ static void dequeue_rt_entity(struct sch
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+ if (rt_rq && rt_rq->rt_nr_running)
+- __enqueue_rt_entity(rt_se);
++ __enqueue_rt_entity(rt_se, false);
+ }
+ }
+
+@@ -886,7 +889,7 @@ enqueue_task_rt(struct rq *rq, struct ta
+ if (wakeup)
+ rt_se->timeout = 0;
+
+- enqueue_rt_entity(rt_se);
++ enqueue_rt_entity(rt_se, head);
+
+ if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+ enqueue_pushable_task(rq, p);
--- /dev/null
+From oleg@redhat.com Fri Sep 17 18:14:53 2010
+From: Oleg Nesterov <oleg@redhat.com>
+Date: Mon, 15 Mar 2010 10:10:03 +0100
+Subject: sched: Kill the broken and deadlockable cpuset_lock/cpuset_cpus_allowed_locked code
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <2ed3dbb00c3052ccb7ffda1e7a1d112e3d3f53f1.1283514307.git.efault@gmx.de>
+
+From: Oleg Nesterov <oleg@redhat.com>
+
+commit 897f0b3c3ff40b443c84e271bef19bd6ae885195 upstream
+
+This patch just states the fact the cpusets/cpuhotplug interaction is
+broken and removes the deadlockable code which only pretends to work.
+
+- cpuset_lock() doesn't really work. It is needed for
+  cpuset_cpus_allowed_locked(), but we can't take this lock in the
+  try_to_wake_up()->select_fallback_rq() path.
+
+- cpuset_lock() is deadlockable. Suppose that a task T, bound to a CPU,
+  takes callback_mutex. If cpu_down() of that CPU happens before T drops
+  callback_mutex, stop_machine() preempts T, and migration_call(CPU_DEAD)
+  then tries to take cpuset_lock() and hangs forever, because the CPU is
+  already dead and thus T can't be scheduled.
+
+- cpuset_cpus_allowed_locked() is deadlockable too. It takes task_lock(),
+  which is not irq-safe, but try_to_wake_up() can be called from irq
+  context.
+
+Kill them, and change select_fallback_rq() to use cpu_possible_mask, like
+we currently do without CONFIG_CPUSETS.
+
+Also, with or without this patch, with or without CONFIG_CPUSETS, the
+callers of select_fallback_rq() can race with each other or with
+set_cpus_allowed() paths.
+
+The subsequent patches try to fix these problems.
+
+Signed-off-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <20100315091003.GA9123@redhat.com>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ include/linux/cpuset.h | 13 -------------
+ kernel/cpuset.c | 27 +--------------------------
+ kernel/sched.c | 10 +++-------
+ 3 files changed, 4 insertions(+), 46 deletions(-)
+
+--- a/include/linux/cpuset.h
++++ b/include/linux/cpuset.h
+@@ -21,8 +21,6 @@ extern int number_of_cpusets; /* How man
+ extern int cpuset_init(void);
+ extern void cpuset_init_smp(void);
+ extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
+-extern void cpuset_cpus_allowed_locked(struct task_struct *p,
+- struct cpumask *mask);
+ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
+ #define cpuset_current_mems_allowed (current->mems_allowed)
+ void cpuset_init_current_mems_allowed(void);
+@@ -69,9 +67,6 @@ struct seq_file;
+ extern void cpuset_task_status_allowed(struct seq_file *m,
+ struct task_struct *task);
+
+-extern void cpuset_lock(void);
+-extern void cpuset_unlock(void);
+-
+ extern int cpuset_mem_spread_node(void);
+
+ static inline int cpuset_do_page_mem_spread(void)
+@@ -105,11 +100,6 @@ static inline void cpuset_cpus_allowed(s
+ {
+ cpumask_copy(mask, cpu_possible_mask);
+ }
+-static inline void cpuset_cpus_allowed_locked(struct task_struct *p,
+- struct cpumask *mask)
+-{
+- cpumask_copy(mask, cpu_possible_mask);
+-}
+
+ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
+ {
+@@ -157,9 +147,6 @@ static inline void cpuset_task_status_al
+ {
+ }
+
+-static inline void cpuset_lock(void) {}
+-static inline void cpuset_unlock(void) {}
+-
+ static inline int cpuset_mem_spread_node(void)
+ {
+ return 0;
+--- a/kernel/cpuset.c
++++ b/kernel/cpuset.c
+@@ -2145,19 +2145,10 @@ void __init cpuset_init_smp(void)
+ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
+ {
+ mutex_lock(&callback_mutex);
+- cpuset_cpus_allowed_locked(tsk, pmask);
+- mutex_unlock(&callback_mutex);
+-}
+-
+-/**
+- * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
+- * Must be called with callback_mutex held.
+- **/
+-void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+-{
+ task_lock(tsk);
+ guarantee_online_cpus(task_cs(tsk), pmask);
+ task_unlock(tsk);
++ mutex_unlock(&callback_mutex);
+ }
+
+ void cpuset_init_current_mems_allowed(void)
+@@ -2346,22 +2337,6 @@ int __cpuset_node_allowed_hardwall(int n
+ }
+
+ /**
+- * cpuset_lock - lock out any changes to cpuset structures
+- *
+- * The out of memory (oom) code needs to mutex_lock cpusets
+- * from being changed while it scans the tasklist looking for a
+- * task in an overlapping cpuset. Expose callback_mutex via this
+- * cpuset_lock() routine, so the oom code can lock it, before
+- * locking the task list. The tasklist_lock is a spinlock, so
+- * must be taken inside callback_mutex.
+- */
+-
+-void cpuset_lock(void)
+-{
+- mutex_lock(&callback_mutex);
+-}
+-
+-/**
+ * cpuset_unlock - release lock on cpuset changes
+ *
+ * Undo the lock taken in a previous cpuset_lock() call.
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -2349,11 +2349,9 @@ static int select_fallback_rq(int cpu, s
+ return dest_cpu;
+
+ /* No more Mr. Nice Guy. */
+- if (dest_cpu >= nr_cpu_ids) {
+- rcu_read_lock();
+- cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
+- rcu_read_unlock();
+- dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
++ if (unlikely(dest_cpu >= nr_cpu_ids)) {
++ cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
++ dest_cpu = cpumask_any(cpu_active_mask);
+
+ /*
+ * Don't tell them about moving exiting tasks or
+@@ -7833,7 +7831,6 @@ migration_call(struct notifier_block *nf
+
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+- cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
+ migrate_live_tasks(cpu);
+ rq = cpu_rq(cpu);
+ /* Idle task back to normal (off runqueue, low prio) */
+@@ -7844,7 +7841,6 @@ migration_call(struct notifier_block *nf
+ rq->idle->sched_class = &idle_sched_class;
+ migrate_dead_tasks(cpu);
+ spin_unlock_irq(&rq->lock);
+- cpuset_unlock();
+ migrate_nr_uninterruptible(rq);
+ BUG_ON(rq->nr_running != 0);
+ calc_global_load_remove(rq);
--- /dev/null
+From oleg@redhat.com Fri Sep 17 18:17:45 2010
+From: Oleg Nesterov <oleg@redhat.com>
+Date: Mon, 15 Mar 2010 10:10:27 +0100
+Subject: sched: Make select_fallback_rq() cpuset friendly
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <cfcf4b5d923ac7e65cf0725c08e5ab233634719a.1283514307.git.efault@gmx.de>
+
+From: Oleg Nesterov <oleg@redhat.com>
+
+commit 9084bb8246ea935b98320554229e2f371f7f52fa upstream
+
+Introduce cpuset_cpus_allowed_fallback() helper to fix the cpuset problems
+with select_fallback_rq(). It can be called from any context and can't use
+any cpuset locks including task_lock(). It is called when the task doesn't
+have online cpus in ->cpus_allowed but ttwu/etc must be able to find a
+suitable cpu.
+
+I am not proud of this patch. Everything which needs such a fat comment
+can't be good even if correct. But I'd prefer to not change the locking
+rules in the code I hardly understand, and in any case I believe this
+simple change makes the code much more correct compared to the deadlocks we
+currently have.
+
+Signed-off-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <20100315091027.GA9155@redhat.com>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ include/linux/cpuset.h | 7 +++++++
+ kernel/cpuset.c | 42 ++++++++++++++++++++++++++++++++++++++++++
+ kernel/sched.c | 4 +---
+ 3 files changed, 50 insertions(+), 3 deletions(-)
+
+--- a/include/linux/cpuset.h
++++ b/include/linux/cpuset.h
+@@ -21,6 +21,7 @@ extern int number_of_cpusets; /* How man
+ extern int cpuset_init(void);
+ extern void cpuset_init_smp(void);
+ extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
++extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
+ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
+ #define cpuset_current_mems_allowed (current->mems_allowed)
+ void cpuset_init_current_mems_allowed(void);
+@@ -101,6 +102,12 @@ static inline void cpuset_cpus_allowed(s
+ cpumask_copy(mask, cpu_possible_mask);
+ }
+
++static inline int cpuset_cpus_allowed_fallback(struct task_struct *p)
++{
++ cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
++ return cpumask_any(cpu_active_mask);
++}
++
+ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
+ {
+ return node_possible_map;
+--- a/kernel/cpuset.c
++++ b/kernel/cpuset.c
+@@ -2151,6 +2151,48 @@ void cpuset_cpus_allowed(struct task_str
+ mutex_unlock(&callback_mutex);
+ }
+
++int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
++{
++ const struct cpuset *cs;
++ int cpu;
++
++ rcu_read_lock();
++ cs = task_cs(tsk);
++ if (cs)
++ cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
++ rcu_read_unlock();
++
++ /*
++ * We own tsk->cpus_allowed, nobody can change it under us.
++ *
++ * But we used cs && cs->cpus_allowed lockless and thus can
++ * race with cgroup_attach_task() or update_cpumask() and get
++ * the wrong tsk->cpus_allowed. However, both cases imply the
++ * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
++ * which takes task_rq_lock().
++ *
++ * If we are called after it dropped the lock we must see all
++ * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
++ * set any mask even if it is not right from task_cs() pov,
++ * the pending set_cpus_allowed_ptr() will fix things.
++ */
++
++ cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
++ if (cpu >= nr_cpu_ids) {
++ /*
++ * Either tsk->cpus_allowed is wrong (see above) or it
++ * is actually empty. The latter case is only possible
++ * if we are racing with remove_tasks_in_empty_cpuset().
++ * Like above we can temporary set any mask and rely on
++ * set_cpus_allowed_ptr() as synchronization point.
++ */
++ cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
++ cpu = cpumask_any(cpu_active_mask);
++ }
++
++ return cpu;
++}
++
+ void cpuset_init_current_mems_allowed(void)
+ {
+ nodes_setall(current->mems_allowed);
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -2353,9 +2353,7 @@ static int select_fallback_rq(int cpu, s
+
+ /* No more Mr. Nice Guy. */
+ if (unlikely(dest_cpu >= nr_cpu_ids)) {
+- cpumask_copy(&p->cpus_allowed, cpu_possible_mask);
+- dest_cpu = cpumask_any(cpu_active_mask);
+-
++ dest_cpu = cpuset_cpus_allowed_fallback(p);
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
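
The helper introduced above is easier to follow when reduced to its mask
arithmetic. Below is a minimal userspace sketch, not kernel code: a 64-bit
word stands in for a cpumask, and first_cpu(), fallback_cpu() and the values
in main() are illustrative assumptions rather than the kernel's API. It shows
the two stages the commit describes: first try allowed-and-active, and only
if that is empty widen the allowed mask and pick any active CPU, relying on a
later set_cpus_allowed_ptr() to repair the mask.

#include <stdint.h>
#include <stdio.h>

static int first_cpu(uint64_t mask)
{
    return mask ? __builtin_ctzll(mask) : -1;   /* -1 plays the nr_cpu_ids role */
}

static int fallback_cpu(uint64_t *allowed, uint64_t active, uint64_t possible)
{
    int cpu = first_cpu(*allowed & active);

    if (cpu < 0) {
        /* No online CPU left in the allowed mask (e.g. its cpuset went
         * empty): temporarily widen it; a pending affinity update is
         * expected to fix things up afterwards. */
        *allowed = possible;
        cpu = first_cpu(active);
    }
    return cpu;
}

int main(void)
{
    uint64_t allowed  = 1ull << 5;  /* bound to CPU 5 only */
    uint64_t active   = 0x0Full;    /* CPUs 0-3 online     */
    uint64_t possible = 0xFFull;    /* CPUs 0-7 exist      */

    printf("fallback cpu: %d\n", fallback_cpu(&allowed, active, possible));
    return 0;
}
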
--- /dev/null
+From a.p.zijlstra@chello.nl Fri Sep 17 18:19:43 2010
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Date: Thu, 12 Nov 2009 15:55:29 +0100
+Subject: sched: More generic WAKE_AFFINE vs select_idle_sibling()
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <4fe736bd5f08977bf198f67dd272162a061c1a02.1283514307.git.efault@gmx.de>
+
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+
+commit fe3bcfe1f6c1fc4ea7706ac2d05e579fd9092682 upstream
+
+Instead of only considering SD_WAKE_AFFINE | SD_PREFER_SIBLING
+domains, also allow all SD_PREFER_SIBLING domains below a
+SD_WAKE_AFFINE domain to change the affinity target.
+
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Cc: Mike Galbraith <efault@gmx.de>
+LKML-Reference: <20091112145610.909723612@chello.nl>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/sched_fair.c | 33 ++++++++++++++++-----------------
+ 1 file changed, 16 insertions(+), 17 deletions(-)
+
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -1398,20 +1398,16 @@ select_idle_sibling(struct task_struct *
+ * test in select_task_rq_fair) and the prev_cpu is idle then that's
+ * always a better target than the current cpu.
+ */
+- if (target == cpu) {
+- if (!cpu_rq(prev_cpu)->cfs.nr_running)
+- target = prev_cpu;
+- }
++ if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running)
++ return prev_cpu;
+
+ /*
+ * Otherwise, iterate the domain and find an elegible idle cpu.
+ */
+- if (target == -1 || target == cpu) {
+- for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
+- if (!cpu_rq(i)->cfs.nr_running) {
+- target = i;
+- break;
+- }
++ for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
++ if (!cpu_rq(i)->cfs.nr_running) {
++ target = i;
++ break;
+ }
+ }
+
+@@ -1475,7 +1471,12 @@ select_task_rq_fair(struct rq *rq, struc
+ want_sd = 0;
+ }
+
+- if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) {
++ /*
++ * While iterating the domains looking for a spanning
++ * WAKE_AFFINE domain, adjust the affine target to any idle cpu
++ * in cache sharing domains along the way.
++ */
++ if (want_affine) {
+ int target = -1;
+
+ /*
+@@ -1488,17 +1489,15 @@ select_task_rq_fair(struct rq *rq, struc
+ /*
+ * If there's an idle sibling in this domain, make that
+ * the wake_affine target instead of the current cpu.
+- *
+- * XXX: should we possibly do this outside of
+- * WAKE_AFFINE, in case the shared cache domain is
+- * smaller than the WAKE_AFFINE domain?
+ */
+ if (tmp->flags & SD_PREFER_SIBLING)
+ target = select_idle_sibling(p, tmp, target);
+
+ if (target >= 0) {
+- affine_sd = tmp;
+- want_affine = 0;
++ if (tmp->flags & SD_WAKE_AFFINE) {
++ affine_sd = tmp;
++ want_affine = 0;
++ }
+ cpu = target;
+ }
+ }
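
The change above amounts to a different commit rule for the domain walk: any
cache-sharing (SD_PREFER_SIBLING) level may propose an idle CPU as the affine
target, but the target is only committed once a level that is also
SD_WAKE_AFFINE is reached. Here is a toy model of that walk; the struct, flag
names and find-idle logic are invented for illustration, assuming a simple
parent-linked chain of domains, and are not the scheduler's data structures.

#include <stdio.h>

#define FLAG_PREFER_SIBLING 0x1
#define FLAG_WAKE_AFFINE    0x2

struct domain {
    const char *name;
    int flags;
    int idle_cpu;               /* -1 if no idle CPU in this span */
    struct domain *parent;
};

static int pick_affine_target(struct domain *d, int cur_cpu)
{
    int target = -1;

    for (; d; d = d->parent) {
        if ((d->flags & FLAG_PREFER_SIBLING) && d->idle_cpu >= 0)
            target = d->idle_cpu;             /* tentative target */
        if (target >= 0 && (d->flags & FLAG_WAKE_AFFINE))
            return target;                    /* commit at the spanning level */
    }
    return cur_cpu;                           /* fall back to the current CPU */
}

int main(void)
{
    struct domain pkg = { "package", FLAG_WAKE_AFFINE, -1, NULL };
    struct domain smt = { "smt", FLAG_PREFER_SIBLING, 3, &pkg };

    printf("affine target: %d\n", pick_affine_target(&smt, 0));
    return 0;
}
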
--- /dev/null
+From oleg@redhat.com Fri Sep 17 18:15:27 2010
+From: Oleg Nesterov <oleg@redhat.com>
+Date: Mon, 15 Mar 2010 10:10:14 +0100
+Subject: sched: move_task_off_dead_cpu(): Remove retry logic
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <d61f978b6a63cf12e26234bf81629a001c2221d0.1283514307.git.efault@gmx.de>
+
+From: Oleg Nesterov <oleg@redhat.com>
+
+commit c1804d547dc098363443667609c272d1e4d15ee8 upstream
+
+The previous patch preserved the retry logic, but it looks unneeded.
+
+__migrate_task() can only fail if we raced with migration after we dropped
+the lock, but in this case the caller of set_cpus_allowed/etc must initiate
+migration itself if ->on_rq == T.
+
+We already fixed p->cpus_allowed, and the changes in active/online masks must
+be visible to the racer, which should migrate the task to an online cpu
+correctly.
+
+Signed-off-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <20100315091014.GA9138@redhat.com>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/sched.c | 13 ++++++-------
+ 1 file changed, 6 insertions(+), 7 deletions(-)
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -7407,7 +7407,7 @@ static void move_task_off_dead_cpu(int d
+ struct rq *rq = cpu_rq(dead_cpu);
+ int needs_cpu, uninitialized_var(dest_cpu);
+ unsigned long flags;
+-again:
++
+ local_irq_save(flags);
+
+ spin_lock(&rq->lock);
+@@ -7415,14 +7415,13 @@ again:
+ if (needs_cpu)
+ dest_cpu = select_fallback_rq(dead_cpu, p);
+ spin_unlock(&rq->lock);
+-
+- /* It can have affinity changed while we were choosing. */
++ /*
++ * It can only fail if we race with set_cpus_allowed(),
++ * in the racer should migrate the task anyway.
++ */
+ if (needs_cpu)
+- needs_cpu = !__migrate_task(p, dead_cpu, dest_cpu);
++ __migrate_task(p, dead_cpu, dest_cpu);
+ local_irq_restore(flags);
+-
+- if (unlikely(needs_cpu))
+- goto again;
+ }
+
+ /*
--- /dev/null
+From oleg@redhat.com Fri Sep 17 18:15:12 2010
+From: Oleg Nesterov <oleg@redhat.com>
+Date: Mon, 15 Mar 2010 10:10:10 +0100
+Subject: sched: move_task_off_dead_cpu(): Take rq->lock around select_fallback_rq()
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <f0c871a27f468c7e4c8cbe43a79f506dc323b9b6.1283514307.git.efault@gmx.de>
+
+From: Oleg Nesterov <oleg@redhat.com>
+
+commit 1445c08d06c5594895b4fae952ef8a457e89c390 upstream
+
+move_task_off_dead_cpu()->select_fallback_rq() reads/updates ->cpus_allowed
+lockless. We can race with set_cpus_allowed() running in parallel.
+
+Change it to take rq->lock around select_fallback_rq(). Note that it is not
+trivial to move this spin_lock() into select_fallback_rq(): we must recheck
+that the task was not migrated after we take the lock, and other callers do
+not need this lock.
+
+To avoid the races with other callers of select_fallback_rq() which rely on
+TASK_WAKING, we also check p->state != TASK_WAKING and do nothing otherwise.
+The owner of TASK_WAKING must update ->cpus_allowed and choose the correct
+CPU anyway, and the subsequent __migrate_task() is just meaningless because
+p->se.on_rq must be false.
+
+Alternatively, we could change select_task_rq() to take rq->lock right
+after it calls sched_class->select_task_rq(), but this looks a bit ugly.
+
+Also, change it to not assume irqs are disabled and absorb __migrate_task_irq().
+
+Signed-off-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <20100315091010.GA9131@redhat.com>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/sched.c | 30 +++++++++++++++---------------
+ 1 file changed, 15 insertions(+), 15 deletions(-)
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -7399,29 +7399,29 @@ static int migration_thread(void *data)
+ }
+
+ #ifdef CONFIG_HOTPLUG_CPU
+-
+-static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
+-{
+- int ret;
+-
+- local_irq_disable();
+- ret = __migrate_task(p, src_cpu, dest_cpu);
+- local_irq_enable();
+- return ret;
+-}
+-
+ /*
+ * Figure out where task on dead CPU should go, use force if necessary.
+ */
+ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+ {
+- int dest_cpu;
+-
++ struct rq *rq = cpu_rq(dead_cpu);
++ int needs_cpu, uninitialized_var(dest_cpu);
++ unsigned long flags;
+ again:
+- dest_cpu = select_fallback_rq(dead_cpu, p);
++ local_irq_save(flags);
++
++ spin_lock(&rq->lock);
++ needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
++ if (needs_cpu)
++ dest_cpu = select_fallback_rq(dead_cpu, p);
++ spin_unlock(&rq->lock);
+
+ /* It can have affinity changed while we were choosing. */
+- if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
++ if (needs_cpu)
++ needs_cpu = !__migrate_task(p, dead_cpu, dest_cpu);
++ local_irq_restore(flags);
++
++ if (unlikely(needs_cpu))
+ goto again;
+ }
+
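
The fixed function has a common shape: sample the condition and compute the
decision while holding the queue lock, then perform the heavier action after
dropping it, and only if the sampled condition held. A hedged pthread sketch
of just that shape follows; struct task, move_off_dead_cpu(), the waking flag
and the plain final assignment are deliberate simplifications, not the
kernel's code.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct task {
    int cpu;
    bool waking;                  /* stands in for p->state == TASK_WAKING */
};

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

static void move_off_dead_cpu(struct task *t, int dead_cpu, int fallback)
{
    bool needs_cpu;
    int dest = -1;

    pthread_mutex_lock(&rq_lock);
    needs_cpu = (t->cpu == dead_cpu) && !t->waking;
    if (needs_cpu)
        dest = fallback;          /* destination chosen while the lock is held */
    pthread_mutex_unlock(&rq_lock);

    if (needs_cpu)
        t->cpu = dest;            /* the "migrate" step, outside the lock */
}

int main(void)
{
    struct task t = { .cpu = 2, .waking = false };

    move_off_dead_cpu(&t, 2, 0);
    printf("task now on cpu %d\n", t.cpu);
    return 0;
}
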
--- /dev/null
+From a.p.zijlstra@chello.nl Fri Sep 17 18:18:19 2010
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Date: Thu, 25 Mar 2010 21:05:16 +0100
+Subject: sched: Optimize task_rq_lock()
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <abb25d422e8ff6033d0feee1ae9a47377ed5df8e.1283514307.git.efault@gmx.de>
+
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+
+commit 65cc8e4859ff29a9ddc989c88557d6059834c2a2 upstream
+
+Now that we hold the rq->lock over set_task_cpu() again, we can do
+away with most of the TASK_WAKING checks and reduce them again to
+set_cpus_allowed_ptr().
+
+Removes some conditionals from scheduling hot-paths.
+
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Cc: Oleg Nesterov <oleg@redhat.com>
+LKML-Reference: <new-submission>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/sched.c | 23 +++++++++++++++--------
+ 1 file changed, 15 insertions(+), 8 deletions(-)
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -942,8 +942,8 @@ static inline void finish_lock_switch(st
+ #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
+
+ /*
+- * Check whether the task is waking, we use this to synchronize against
+- * ttwu() so that task_cpu() reports a stable number.
++ * Check whether the task is waking, we use this to synchronize ->cpus_allowed
++ * against ttwu().
+ */
+ static inline int task_is_waking(struct task_struct *p)
+ {
+@@ -960,11 +960,9 @@ static inline struct rq *__task_rq_lock(
+ struct rq *rq;
+
+ for (;;) {
+- while (task_is_waking(p))
+- cpu_relax();
+ rq = task_rq(p);
+ spin_lock(&rq->lock);
+- if (likely(rq == task_rq(p) && !task_is_waking(p)))
++ if (likely(rq == task_rq(p)))
+ return rq;
+ spin_unlock(&rq->lock);
+ }
+@@ -981,12 +979,10 @@ static struct rq *task_rq_lock(struct ta
+ struct rq *rq;
+
+ for (;;) {
+- while (task_is_waking(p))
+- cpu_relax();
+ local_irq_save(*flags);
+ rq = task_rq(p);
+ spin_lock(&rq->lock);
+- if (likely(rq == task_rq(p) && !task_is_waking(p)))
++ if (likely(rq == task_rq(p)))
+ return rq;
+ spin_unlock_irqrestore(&rq->lock, *flags);
+ }
+@@ -7213,7 +7209,18 @@ int set_cpus_allowed_ptr(struct task_str
+ struct rq *rq;
+ int ret = 0;
+
++ /*
++ * Serialize against TASK_WAKING so that ttwu() and wunt() can
++ * drop the rq->lock and still rely on ->cpus_allowed.
++ */
++again:
++ while (task_is_waking(p))
++ cpu_relax();
+ rq = task_rq_lock(p, &flags);
++ if (task_is_waking(p)) {
++ task_rq_unlock(rq, &flags);
++ goto again;
++ }
+
+ if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+ ret = -EINVAL;
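
With the waking checks moved out, __task_rq_lock()/task_rq_lock() are back to
the classic "lock whatever queue the task points at, then re-check it still
points there" loop. A small userspace illustration of that revalidation
pattern, assuming a toy fixed array of queues and a qidx field in place of
task_rq(); none of these names are the kernel's.

#include <pthread.h>
#include <stdio.h>

#define NR_QUEUES 2

struct queue {
    pthread_mutex_t lock;
};

struct task {
    int qidx;                           /* which queue the task is on */
};

static struct queue queues[NR_QUEUES] = {
    { PTHREAD_MUTEX_INITIALIZER },
    { PTHREAD_MUTEX_INITIALIZER },
};

/* Returns with the task's current queue locked. */
static struct queue *task_queue_lock(struct task *t)
{
    for (;;) {
        struct queue *q = &queues[t->qidx];

        pthread_mutex_lock(&q->lock);
        if (q == &queues[t->qidx])      /* still the right queue? */
            return q;
        pthread_mutex_unlock(&q->lock); /* it moved underneath us: retry */
    }
}

int main(void)
{
    struct task t = { .qidx = 1 };
    struct queue *q = task_queue_lock(&t);

    printf("locked queue %ld\n", (long)(q - queues));
    pthread_mutex_unlock(&q->lock);
    return 0;
}
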
--- /dev/null
+From a.p.zijlstra@chello.nl Fri Sep 17 18:20:23 2010
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Date: Fri, 16 Apr 2010 14:59:29 +0200
+Subject: sched: Pre-compute cpumask_weight(sched_domain_span(sd))
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <0c6c762bcaa163e06a13da32043ad968d1473188.1283514307.git.efault@gmx.de>
+
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+
+commit 669c55e9f99b90e46eaa0f98a67ec53d46dc969a upstream
+
+Dave reported that his large SPARC machines spend lots of time in
+hweight64(); try to optimize some of those needless cpumask_weight()
+invocations (especially with the large offstack cpumasks, these are very
+expensive indeed).
+
+Reported-by: David Miller <davem@davemloft.net>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <new-submission>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ include/linux/sched.h | 1 +
+ kernel/sched.c | 7 +++++--
+ kernel/sched_fair.c | 8 +++-----
+ 3 files changed, 9 insertions(+), 7 deletions(-)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1000,6 +1000,7 @@ struct sched_domain {
+ char *name;
+ #endif
+
++ unsigned int span_weight;
+ /*
+ * Span of all CPUs in this domain.
+ *
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -3678,7 +3678,7 @@ unsigned long __weak arch_scale_freq_pow
+
+ unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+ {
+- unsigned long weight = cpumask_weight(sched_domain_span(sd));
++ unsigned long weight = sd->span_weight;
+ unsigned long smt_gain = sd->smt_gain;
+
+ smt_gain /= weight;
+@@ -3711,7 +3711,7 @@ unsigned long scale_rt_power(int cpu)
+
+ static void update_cpu_power(struct sched_domain *sd, int cpu)
+ {
+- unsigned long weight = cpumask_weight(sched_domain_span(sd));
++ unsigned long weight = sd->span_weight;
+ unsigned long power = SCHED_LOAD_SCALE;
+ struct sched_group *sdg = sd->groups;
+
+@@ -8166,6 +8166,9 @@ cpu_attach_domain(struct sched_domain *s
+ struct rq *rq = cpu_rq(cpu);
+ struct sched_domain *tmp;
+
++ for (tmp = sd; tmp; tmp = tmp->parent)
++ tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
++
+ /* Remove the sched domains which do not contribute to scheduling. */
+ for (tmp = sd; tmp; ) {
+ struct sched_domain *parent = tmp->parent;
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -1520,9 +1520,7 @@ select_task_rq_fair(struct rq *rq, struc
+ * Pick the largest domain to update shares over
+ */
+ tmp = sd;
+- if (affine_sd && (!tmp ||
+- cpumask_weight(sched_domain_span(affine_sd)) >
+- cpumask_weight(sched_domain_span(sd))))
++ if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
+ tmp = affine_sd;
+
+ if (tmp) {
+@@ -1566,10 +1564,10 @@ select_task_rq_fair(struct rq *rq, struc
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = new_cpu;
+- weight = cpumask_weight(sched_domain_span(sd));
++ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+- if (weight <= cpumask_weight(sched_domain_span(tmp)))
++ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
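
The optimization is simply "compute the population count once, when the
domain tree is (re)built, and read a cached integer on every hot-path use".
A minimal sketch under the assumption of a 64-CPU toy mask; attach_domain()
and the struct here mirror the idea, not the kernel's sched_domain.

#include <stdint.h>
#include <stdio.h>

struct domain {
    uint64_t span;                /* toy 64-CPU mask                     */
    unsigned int span_weight;     /* cached popcount, set at attach time */
};

static void attach_domain(struct domain *d, uint64_t span)
{
    d->span = span;
    d->span_weight = (unsigned int)__builtin_popcountll(span);
}

int main(void)
{
    struct domain d;

    attach_domain(&d, 0x00FFull);             /* CPUs 0-7 */
    /* hot path: just read the cached field, no hweight64() per call */
    printf("span weight = %u\n", d.span_weight);
    return 0;
}
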
--- /dev/null
+From tglx@linutronix.de Fri Sep 17 18:14:25 2010
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed, 20 Jan 2010 20:59:06 +0000
+Subject: sched: Queue a deboosted task to the head of the RT prio queue
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <55050ebe52e5ca5834a6f847d19809cba5dc10a0.1283514307.git.efault@gmx.de>
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 60db48cacb9b253d5607a5ff206112a59cd09e34 upstream
+
+rt_mutex_setprio() is used to implement priority inheritance for
+futexes. When a task is deboosted it gets enqueued at the tail of its
+RT priority list. This is violating the POSIX scheduling semantics:
+
+rt priority list X contains two runnable tasks A and B
+
+task A runs with priority X and holds mutex M
+task C preempts A and is blocked on mutex M
+ -> task A is boosted to priority of task C (Y)
+task A unlocks the mutex M and deboosts itself
+ -> A is dequeued from rt priority list Y
+ -> A is enqueued to the tail of rt priority list X
+task C schedules away
+task B runs
+
+This is wrong as task A did not schedule away and therefore violates
+the POSIX scheduling semantics.
+
+Enqueue the task to the head of the priority list instead.
+
+Reported-by: Mathias Weber <mathias.weber.mw1@roche.com>
+Reported-by: Carsten Emde <cbe@osadl.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Peter Zijlstra <peterz@infradead.org>
+Tested-by: Carsten Emde <cbe@osadl.org>
+Tested-by: Mathias Weber <mathias.weber.mw1@roche.com>
+LKML-Reference: <20100120171629.809074113@linutronix.de>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/sched.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -6184,7 +6184,7 @@ void rt_mutex_setprio(struct task_struct
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ if (on_rq) {
+- enqueue_task(rq, p, 0, false);
++ enqueue_task(rq, p, 0, oldprio < prio);
+
+ check_class_changed(rq, p, prev_class, oldprio, running);
+ }
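
The one-character change reads better next to a toy priority list:
head-queueing on deboost keeps task A ahead of task B, exactly as in the
scenario above, whereas tail-queueing would let B run first. The enqueue()
below and its head flag are illustrative stand-ins (lower numeric value
meaning higher priority, as in the kernel), not the scheduler's enqueue path.

#include <stdio.h>
#include <string.h>

#define MAXQ 8

struct prio_list {
    const char *task[MAXQ];
    int n;
};

static void enqueue(struct prio_list *q, const char *t, int head)
{
    if (q->n == MAXQ)
        return;
    if (head) {
        memmove(&q->task[1], &q->task[0], q->n * sizeof(q->task[0]));
        q->task[0] = t;
    } else {
        q->task[q->n] = t;
    }
    q->n++;
}

int main(void)
{
    struct prio_list q = { .n = 0 };
    int oldprio = 10, prio = 20;            /* lower value = higher priority */

    enqueue(&q, "B", 0);
    /* A was boosted (oldprio 10) and now drops back to prio 20:
     * oldprio < prio, so it is queued at the head, still ahead of B. */
    enqueue(&q, "A", oldprio < prio);

    printf("next to run: %s\n", q.task[0]); /* prints "A" */
    return 0;
}
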
--- /dev/null
+From a.p.zijlstra@chello.nl Fri Sep 17 18:19:01 2010
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Date: Tue, 1 Dec 2009 12:21:47 +0100
+Subject: sched: Remove unnecessary RCU exclusion
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <96e351935dd8b98a2e436bf3e254fa3d91f4bd2d.1283514307.git.efault@gmx.de>
+
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+
+commit fb58bac5c75bfff8bbf7d02071a10a62f32fe28b upstream
+
+As Nick pointed out, and as I realized myself when doing:
+ sched: Fix balance vs hotplug race
+the patch:
+ sched: for_each_domain() vs RCU
+
+is wrong: sched_domains are freed after synchronize_sched(), which
+means disabling preemption is enough.
+
+Reported-by: Nick Piggin <npiggin@suse.de>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <new-submission>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/sched_fair.c | 9 ++-------
+ 1 file changed, 2 insertions(+), 7 deletions(-)
+
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -1410,7 +1410,6 @@ select_task_rq_fair(struct rq *rq, struc
+ new_cpu = prev_cpu;
+ }
+
+- rcu_read_lock();
+ for_each_domain(cpu, tmp) {
+ if (!(tmp->flags & SD_LOAD_BALANCE))
+ continue;
+@@ -1500,10 +1499,8 @@ select_task_rq_fair(struct rq *rq, struc
+ }
+ }
+
+- if (affine_sd && wake_affine(affine_sd, p, sync)) {
+- new_cpu = cpu;
+- goto out;
+- }
++ if (affine_sd && wake_affine(affine_sd, p, sync))
++ return cpu;
+
+ while (sd) {
+ int load_idx = sd->forkexec_idx;
+@@ -1544,8 +1541,6 @@ select_task_rq_fair(struct rq *rq, struc
+ /* while loop will break here if sd == NULL */
+ }
+
+-out:
+- rcu_read_unlock();
+ return new_cpu;
+ }
+ #endif /* CONFIG_SMP */
--- /dev/null
+From 30da688ef6b76e01969b00608202fff1eed2accc Mon Sep 17 00:00:00 2001
+From: Oleg Nesterov <oleg@redhat.com>
+Date: Mon, 15 Mar 2010 10:10:19 +0100
+Subject: sched: sched_exec(): Remove the select_fallback_rq() logic
+
+From: Oleg Nesterov <oleg@redhat.com>
+
+commit 30da688ef6b76e01969b00608202fff1eed2accc upstream.
+
+sched_exec()->select_task_rq() reads/updates ->cpus_allowed lockless.
+This can race with other CPUs updating our ->cpus_allowed, and this
+looks meaningless to me.
+
+The task is current and running, so it must have online cpus in ->cpus_allowed
+and the fallback mode is bogus. And if ->sched_class returns the "wrong" cpu,
+this likely means we raced with set_cpus_allowed(), which was called
+for a reason; why should sched_exec() retry and call ->select_task_rq()
+again?
+
+Change the code to call sched_class->select_task_rq() directly and do
+nothing if the returned cpu is wrong after re-checking under rq->lock.
+
+From now on, task_struct->cpus_allowed is always stable under TASK_WAKING:
+select_fallback_rq() is always called either under rq->lock or by a caller
+that owns TASK_WAKING (select_task_rq).
+
+Signed-off-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <20100315091019.GA9141@redhat.com>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ kernel/sched.c | 25 ++++++++-----------------
+ 1 file changed, 8 insertions(+), 17 deletions(-)
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -2333,6 +2333,9 @@ void task_oncpu_function_call(struct tas
+ }
+
+ #ifdef CONFIG_SMP
++/*
++ * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
++ */
+ static int select_fallback_rq(int cpu, struct task_struct *p)
+ {
+ int dest_cpu;
+@@ -2369,12 +2372,7 @@ static int select_fallback_rq(int cpu, s
+ }
+
+ /*
+- * Gets called from 3 sites (exec, fork, wakeup), since it is called without
+- * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
+- * by:
+- *
+- * exec: is unstable, retry loop
+- * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
++ * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ */
+ static inline
+ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+@@ -3223,9 +3221,8 @@ void sched_exec(void)
+ unsigned long flags;
+ struct rq *rq;
+
+-again:
+ this_cpu = get_cpu();
+- dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
++ dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
+ if (dest_cpu == this_cpu) {
+ put_cpu();
+ return;
+@@ -3233,18 +3230,12 @@ again:
+
+ rq = task_rq_lock(p, &flags);
+ put_cpu();
+-
+ /*
+ * select_task_rq() can race against ->cpus_allowed
+ */
+- if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
+- || unlikely(!cpu_active(dest_cpu))) {
+- task_rq_unlock(rq, &flags);
+- goto again;
+- }
+-
+- /* force the process onto the specified CPU */
+- if (migrate_task(p, dest_cpu, &req)) {
++ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
++ likely(cpu_active(dest_cpu)) &&
++ migrate_task(p, dest_cpu, &req)) {
+ /* Need to wait for migration thread (might exit: take ref). */
+ struct task_struct *mt = rq->migration_thread;
+
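
The rewritten sched_exec() follows a "compute a hint locklessly, validate it
under the lock, and silently do nothing if the hint went stale" pattern
instead of retrying. Here is a hedged userspace sketch of that pattern; the
invented try_balance_exec() and the 64-bit mask standing in for cpumasks are
assumptions for illustration only.

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

struct task {
    uint64_t allowed;               /* may change concurrently */
    int cpu;
};

static bool try_balance_exec(struct task *t, int dest, uint64_t active)
{
    bool moved = false;

    pthread_mutex_lock(&rq_lock);
    if ((t->allowed & (1ull << dest)) && (active & (1ull << dest))) {
        t->cpu = dest;              /* the migrate step */
        moved = true;
    }
    pthread_mutex_unlock(&rq_lock);
    return moved;                   /* false: we raced, the racer handles placement */
}

int main(void)
{
    struct task t = { .allowed = 0x0Full, .cpu = 3 };
    int dest = 1;                   /* chosen locklessly, e.g. least-loaded CPU */
    bool moved = try_balance_exec(&t, dest, 0x0Full);

    printf("moved: %d, cpu: %d\n", moved, t.cpu);
    return 0;
}
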
--- /dev/null
+From oleg@redhat.com Fri Sep 17 18:14:40 2010
+From: Oleg Nesterov <oleg@redhat.com>
+Date: Tue, 30 Mar 2010 18:58:29 +0200
+Subject: sched: set_cpus_allowed_ptr(): Don't use rq->migration_thread after unlock
+To: stable <stable@kernel.org>
+Cc: Ingo Molnar <mingo@elte.hu>, Peter Zijlstra <a.p.zijlstra@chello.nl>, Greg KH <greg@kroah.com>
+Message-ID: <bb11665c972dd1d8ad681538e851ed2d9cc6741d.1283514307.git.efault@gmx.de>
+
+From: Oleg Nesterov <oleg@redhat.com>
+
+commit 47a70985e5c093ae03d8ccf633c70a93761d86f2 upstream
+
+Trivial typo fix. rq->migration_thread can be NULL after
+task_rq_unlock(); this is why we have "mt", which should be
+used instead.
+
+Signed-off-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <20100330165829.GA18284@redhat.com>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ kernel/sched.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -7273,7 +7273,7 @@ int set_cpus_allowed_ptr(struct task_str
+
+ get_task_struct(mt);
+ task_rq_unlock(rq, &flags);
+- wake_up_process(rq->migration_thread);
++ wake_up_process(mt);
+ put_task_struct(mt);
+ wait_for_completion(&req.done);
+ tlb_migrate_finish(p->mm);
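
The rule the fix restores: anything that can be torn down once the lock is
dropped must be copied (and, in the kernel, pinned with get_task_struct())
into a local while the lock is still held, and only the local may be used
afterwards. A small illustrative sketch of that discipline, with made-up
names and a plain pointer copy in place of reference counting.

#include <pthread.h>
#include <stdio.h>

struct runqueue {
    pthread_mutex_t lock;
    const char *migration_thread;   /* may become NULL once unlocked */
};

static void kick_migration(struct runqueue *rq)
{
    const char *mt;

    pthread_mutex_lock(&rq->lock);
    mt = rq->migration_thread;      /* take our own copy/reference here */
    pthread_mutex_unlock(&rq->lock);

    if (mt)
        printf("waking %s\n", mt);  /* use the local copy, never rq->... */
}

int main(void)
{
    struct runqueue rq = { PTHREAD_MUTEX_INITIALIZER, "migration/0" };

    kick_migration(&rq);
    return 0;
}
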
sched-remove-the-cfs_rq-dependency-from-set_task_cpu.patch
sched-fix-hotplug-hang.patch
sched-fix-fork-vs-hotplug-vs-cpuset-namespaces.patch
+sched-fix-incorrect-sanity-check.patch
+sched-fix-race-between-ttwu-and-task_rq_lock.patch
+sched-extend-enqueue_task-to-allow-head-queueing.patch
+sched-implement-head-queueing-for-sched_rt.patch
+sched-queue-a-deboosted-task-to-the-head-of-the-rt-prio-queue.patch
+sched-set_cpus_allowed_ptr-don-t-use-rq-migration_thread-after-unlock.patch
+sched-kill-the-broken-and-deadlockable-cpuset_lock-cpuset_cpus_allowed_locked-code.patch
+sched-move_task_off_dead_cpu-take-rq-lock-around-select_fallback_rq.patch
+sched-move_task_off_dead_cpu-remove-retry-logic.patch
+sched-sched_exec-remove-the-select_fallback_rq-logic.patch
+sched-_cpu_down-don-t-play-with-current-cpus_allowed.patch
+sched-make-select_fallback_rq-cpuset-friendly.patch
+sched-fix-task_waking-vs-fork-deadlock.patch
+sched-optimize-task_rq_lock.patch
+sched-fix-nr_uninterruptible-count.patch
+sched-fix-rq-clock-synchronization-when-migrating-tasks.patch
+sched-remove-unnecessary-rcu-exclusion.patch
+sched-apply-rcu-protection-to-wake_affine.patch
+sched-cleanup-select_task_rq_fair.patch
+sched-more-generic-wake_affine-vs-select_idle_sibling.patch
+sched-fix-vmark-regression-on-big-machines.patch
+sched-fix-select_idle_sibling.patch
+sched-pre-compute-cpumask_weight-sched_domain_span-sd.patch
+sched-fix-select_idle_sibling-logic-in-select_task_rq_fair.patch
+sched-cpuacct-use-bigger-percpu-counter-batch-values-for-stats-counters.patch