From: Greg Kroah-Hartman Date: Sat, 18 Sep 2010 01:21:53 +0000 (-0700) Subject: .32 patches X-Git-Tag: v2.6.27.54~14 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=6d5af50e8b296218fe17ac59b450b98839b2a780;p=thirdparty%2Fkernel%2Fstable-queue.git .32 patches --- diff --git a/queue-2.6.32/sched-_cpu_down-don-t-play-with-current-cpus_allowed.patch b/queue-2.6.32/sched-_cpu_down-don-t-play-with-current-cpus_allowed.patch new file mode 100644 index 00000000000..5563af3e14a --- /dev/null +++ b/queue-2.6.32/sched-_cpu_down-don-t-play-with-current-cpus_allowed.patch @@ -0,0 +1,141 @@ +From oleg@redhat.com Fri Sep 17 18:17:33 2010 +From: Oleg Nesterov +Date: Mon, 15 Mar 2010 10:10:23 +0100 +Subject: sched: _cpu_down(): Don't play with current->cpus_allowed +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: <6ee57a0da8d81973a62d3c1ce12c5c96e2634b04.1283514307.git.efault@gmx.de> + +From: Oleg Nesterov + +commit 6a1bdc1b577ebcb65f6603c57f8347309bc4ab13 upstream + +_cpu_down() changes the current task's affinity and then recovers it at +the end. The problems are well known: we can't restore old_allowed if it +was bound to the now-dead-cpu, and we can race with the userspace which +can change cpu-affinity during unplug. + +_cpu_down() should not play with current->cpus_allowed at all. Instead, +take_cpu_down() can migrate the caller of _cpu_down() after __cpu_disable() +removes the dying cpu from cpu_online_mask. + +Signed-off-by: Oleg Nesterov +Acked-by: Rafael J. Wysocki +Signed-off-by: Peter Zijlstra +LKML-Reference: <20100315091023.GA9148@redhat.com> +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/sched.h | 1 + + kernel/cpu.c | 18 ++++++------------ + kernel/sched.c | 2 +- + 3 files changed, 8 insertions(+), 13 deletions(-) + +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1887,6 +1887,7 @@ extern void sched_clock_idle_sleep_event + extern void sched_clock_idle_wakeup_event(u64 delta_ns); + + #ifdef CONFIG_HOTPLUG_CPU ++extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p); + extern void idle_task_exit(void); + #else + static inline void idle_task_exit(void) {} +--- a/kernel/cpu.c ++++ b/kernel/cpu.c +@@ -163,6 +163,7 @@ static inline void check_for_tasks(int c + } + + struct take_cpu_down_param { ++ struct task_struct *caller; + unsigned long mod; + void *hcpu; + }; +@@ -171,6 +172,7 @@ struct take_cpu_down_param { + static int __ref take_cpu_down(void *_param) + { + struct take_cpu_down_param *param = _param; ++ unsigned int cpu = (unsigned long)param->hcpu; + int err; + + /* Ensure this CPU doesn't handle any more interrupts. */ +@@ -181,6 +183,8 @@ static int __ref take_cpu_down(void *_pa + raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, + param->hcpu); + ++ if (task_cpu(param->caller) == cpu) ++ move_task_off_dead_cpu(cpu, param->caller); + /* Force idle task to run as soon as we yield: it should + immediately notice cpu is offline and die quickly. */ + sched_idle_next(); +@@ -191,10 +195,10 @@ static int __ref take_cpu_down(void *_pa + static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) + { + int err, nr_calls = 0; +- cpumask_var_t old_allowed; + void *hcpu = (void *)(long)cpu; + unsigned long mod = tasks_frozen ? 
CPU_TASKS_FROZEN : 0; + struct take_cpu_down_param tcd_param = { ++ .caller = current, + .mod = mod, + .hcpu = hcpu, + }; +@@ -205,9 +209,6 @@ static int __ref _cpu_down(unsigned int + if (!cpu_online(cpu)) + return -EINVAL; + +- if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL)) +- return -ENOMEM; +- + cpu_hotplug_begin(); + set_cpu_active(cpu, false); + err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, +@@ -224,10 +225,6 @@ static int __ref _cpu_down(unsigned int + goto out_release; + } + +- /* Ensure that we are not runnable on dying cpu */ +- cpumask_copy(old_allowed, ¤t->cpus_allowed); +- set_cpus_allowed_ptr(current, cpu_active_mask); +- + err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); + if (err) { + set_cpu_active(cpu, true); +@@ -236,7 +233,7 @@ static int __ref _cpu_down(unsigned int + hcpu) == NOTIFY_BAD) + BUG(); + +- goto out_allowed; ++ goto out_release; + } + BUG_ON(cpu_online(cpu)); + +@@ -254,8 +251,6 @@ static int __ref _cpu_down(unsigned int + + check_for_tasks(cpu); + +-out_allowed: +- set_cpus_allowed_ptr(current, old_allowed); + out_release: + cpu_hotplug_done(); + if (!err) { +@@ -263,7 +258,6 @@ out_release: + hcpu) == NOTIFY_BAD) + BUG(); + } +- free_cpumask_var(old_allowed); + return err; + } + +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -7393,7 +7393,7 @@ static int migration_thread(void *data) + /* + * Figure out where task on dead CPU should go, use force if necessary. + */ +-static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) ++void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) + { + struct rq *rq = cpu_rq(dead_cpu); + int needs_cpu, uninitialized_var(dest_cpu); diff --git a/queue-2.6.32/sched-apply-rcu-protection-to-wake_affine.patch b/queue-2.6.32/sched-apply-rcu-protection-to-wake_affine.patch new file mode 100644 index 00000000000..01e24df4888 --- /dev/null +++ b/queue-2.6.32/sched-apply-rcu-protection-to-wake_affine.patch @@ -0,0 +1,53 @@ +From daniel.blueman@gmail.com Fri Sep 17 18:19:12 2010 +From: Daniel J Blueman +Date: Tue, 1 Jun 2010 14:06:13 +0100 +Subject: sched: apply RCU protection to wake_affine() +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: + +From: Daniel J Blueman + +commit f3b577dec1f2ce32d2db6d2ca6badff7002512af upstream + +The task_group() function returns a pointer that must be protected +by either RCU, the ->alloc_lock, or the cgroup lock (see the +rcu_dereference_check() in task_subsys_state(), which is invoked by +task_group()). The wake_affine() function currently does none of these, +which means that a concurrent update would be within its rights to free +the structure returned by task_group(). Because wake_affine() uses this +structure only to compute load-balancing heuristics, there is no reason +to acquire either of the two locks. + +Therefore, this commit introduces an RCU read-side critical section that +starts before the first call to task_group() and ends after the last use +of the "tg" pointer returned from task_group(). Thanks to Li Zefan for +pointing out the need to extend the RCU read-side critical section from +that proposed by the original patch. + +Signed-off-by: Daniel J Blueman +Signed-off-by: Paul E. 
McKenney +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched_fair.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -1250,6 +1250,7 @@ static int wake_affine(struct sched_doma + * effect of the currently running task from the load + * of the current CPU: + */ ++ rcu_read_lock(); + if (sync) { + tg = task_group(current); + weight = current->se.load.weight; +@@ -1275,6 +1276,7 @@ static int wake_affine(struct sched_doma + balanced = !this_load || + 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= + imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); ++ rcu_read_unlock(); + + /* + * If the currently running task will sleep within diff --git a/queue-2.6.32/sched-cleanup-select_task_rq_fair.patch b/queue-2.6.32/sched-cleanup-select_task_rq_fair.patch new file mode 100644 index 00000000000..077246a7d8c --- /dev/null +++ b/queue-2.6.32/sched-cleanup-select_task_rq_fair.patch @@ -0,0 +1,122 @@ +From a.p.zijlstra@chello.nl Fri Sep 17 18:19:30 2010 +From: Peter Zijlstra +Date: Thu, 12 Nov 2009 15:55:28 +0100 +Subject: sched: Cleanup select_task_rq_fair() +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: <80263dd5bd5a2069a3907f0408ab2f73377f0b8a.1283514307.git.efault@gmx.de> + +From: Peter Zijlstra + +commit a50bde5130f65733142b32975616427d0ea50856 upstream + +Clean up the new affine to idle sibling bits while trying to +grok them. Should not have any function differences. + +Signed-off-by: Peter Zijlstra +Cc: Mike Galbraith +LKML-Reference: <20091112145610.832503781@chello.nl> +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched_fair.c | 73 ++++++++++++++++++++++++++++++++++++---------------- + 1 file changed, 51 insertions(+), 22 deletions(-) + +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -1384,6 +1384,41 @@ find_idlest_cpu(struct sched_group *grou + } + + /* ++ * Try and locate an idle CPU in the sched_domain. ++ */ ++static int ++select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) ++{ ++ int cpu = smp_processor_id(); ++ int prev_cpu = task_cpu(p); ++ int i; ++ ++ /* ++ * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE ++ * test in select_task_rq_fair) and the prev_cpu is idle then that's ++ * always a better target than the current cpu. ++ */ ++ if (target == cpu) { ++ if (!cpu_rq(prev_cpu)->cfs.nr_running) ++ target = prev_cpu; ++ } ++ ++ /* ++ * Otherwise, iterate the domain and find an elegible idle cpu. ++ */ ++ if (target == -1 || target == cpu) { ++ for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { ++ if (!cpu_rq(i)->cfs.nr_running) { ++ target = i; ++ break; ++ } ++ } ++ } ++ ++ return target; ++} ++ ++/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. +@@ -1441,36 +1476,30 @@ select_task_rq_fair(struct rq *rq, struc + } + + if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) { +- int candidate = -1, i; ++ int target = -1; + ++ /* ++ * If both cpu and prev_cpu are part of this domain, ++ * cpu is a valid SD_WAKE_AFFINE target. ++ */ + if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) +- candidate = cpu; ++ target = cpu; + + /* +- * Check for an idle shared cache. ++ * If there's an idle sibling in this domain, make that ++ * the wake_affine target instead of the current cpu. 
++ * ++ * XXX: should we possibly do this outside of ++ * WAKE_AFFINE, in case the shared cache domain is ++ * smaller than the WAKE_AFFINE domain? + */ +- if (tmp->flags & SD_PREFER_SIBLING) { +- if (candidate == cpu) { +- if (!cpu_rq(prev_cpu)->cfs.nr_running) +- candidate = prev_cpu; +- } +- +- if (candidate == -1 || candidate == cpu) { +- for_each_cpu(i, sched_domain_span(tmp)) { +- if (!cpumask_test_cpu(i, &p->cpus_allowed)) +- continue; +- if (!cpu_rq(i)->cfs.nr_running) { +- candidate = i; +- break; +- } +- } +- } +- } ++ if (tmp->flags & SD_PREFER_SIBLING) ++ target = select_idle_sibling(p, tmp, target); + +- if (candidate >= 0) { ++ if (target >= 0) { + affine_sd = tmp; + want_affine = 0; +- cpu = candidate; ++ cpu = target; + } + } + diff --git a/queue-2.6.32/sched-cpuacct-use-bigger-percpu-counter-batch-values-for-stats-counters.patch b/queue-2.6.32/sched-cpuacct-use-bigger-percpu-counter-batch-values-for-stats-counters.patch new file mode 100644 index 00000000000..097a9d35d1f --- /dev/null +++ b/queue-2.6.32/sched-cpuacct-use-bigger-percpu-counter-batch-values-for-stats-counters.patch @@ -0,0 +1,110 @@ +From anton@samba.org Fri Sep 17 18:20:49 2010 +From: Anton Blanchard +Date: Tue, 2 Feb 2010 14:46:13 -0800 +Subject: sched: cpuacct: Use bigger percpu counter batch values for stats counters +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: <096b1867bf2f9b6a3fc6c4ed114a02c181d3d77e.1283514307.git.efault@gmx.de> + +From: Anton Blanchard + +commit fa535a77bd3fa32b9215ba375d6a202fe73e1dd6 upstream + +When CONFIG_VIRT_CPU_ACCOUNTING and CONFIG_CGROUP_CPUACCT are +enabled we can call cpuacct_update_stats with values much larger +than percpu_counter_batch. This means the call to +percpu_counter_add will always add to the global count which is +protected by a spinlock and we end up with a global spinlock in +the scheduler. + +Based on an idea by KOSAKI Motohiro, this patch scales the batch +value by cputime_one_jiffy such that we have the same batch +limit as we would if CONFIG_VIRT_CPU_ACCOUNTING was disabled. +His patch did this once at boot but that initialisation happened +too early on PowerPC (before time_init) and it was never updated +at runtime as a result of a hotplug cpu add/remove. + +This patch instead scales percpu_counter_batch by +cputime_one_jiffy at runtime, which keeps the batch correct even +after cpu hotplug operations. We cap it at INT_MAX in case of +overflow. + +For architectures that do not support +CONFIG_VIRT_CPU_ACCOUNTING, cputime_one_jiffy is the constant 1 +and gcc is smart enough to optimise min(s32 +percpu_counter_batch, INT_MAX) to just percpu_counter_batch at +least on x86 and PowerPC. So there is no need to add an #ifdef. 
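As a rough illustration of the batching behaviour this relies on (a simplified userspace model with made-up numbers, not lib/percpu_counter.c itself): each CPU accumulates deltas in a small local counter and only folds them into the shared, lock-protected global count once the local value reaches the batch threshold, so scaling the batch by cputime_one_jiffy keeps the shared lock off the hot path.

/*
 * Simplified model of __percpu_counter_add() batching -- illustration
 * only, values and names below are invented for the example.
 */
#include <stdio.h>

#define NR_CPUS 4

struct pcpu_counter {
	long long count;	/* global count; lock-protected in the kernel */
	long local[NR_CPUS];	/* per-cpu deltas (s32 in the kernel) */
	long folds;		/* how often the shared-lock path was taken */
};

static void pcpu_add(struct pcpu_counter *c, int cpu, long amount, long batch)
{
	long v = c->local[cpu] + amount;

	if (v >= batch || v <= -batch) {
		/* kernel: spin_lock(&fbc->lock), fold, zero the per-cpu slot */
		c->count += v;
		c->local[cpu] = 0;
		c->folds++;
	} else {
		c->local[cpu] = v;
	}
}

int main(void)
{
	struct pcpu_counter c = { 0 };
	long cputime_one_jiffy = 10000;	/* large when VIRT_CPU_ACCOUNTING=y */
	long batch = 32;		/* stand-in for percpu_counter_batch */
	int i;

	/* default batch: every large delta overflows it and takes the lock */
	for (i = 0; i < 1000; i++)
		pcpu_add(&c, i % NR_CPUS, cputime_one_jiffy, batch);
	printf("batch=%ld: %ld lock acquisitions\n", batch, c.folds);

	/* batch scaled by cputime_one_jiffy, as the patch does */
	c.folds = 0;
	for (i = 0; i < 1000; i++)
		pcpu_add(&c, i % NR_CPUS, cputime_one_jiffy,
			 batch * cputime_one_jiffy);
	printf("batch=%ld: %ld lock acquisitions\n",
	       batch * cputime_one_jiffy, c.folds);
	return 0;
}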
+ +On a 64 thread PowerPC box with CONFIG_VIRT_CPU_ACCOUNTING and +CONFIG_CGROUP_CPUACCT enabled, a context switch microbenchmark +is 234x faster and almost matches a CONFIG_CGROUP_CPUACCT +disabled kernel: + + CONFIG_CGROUP_CPUACCT disabled: 16906698 ctx switches/sec + CONFIG_CGROUP_CPUACCT enabled: 61720 ctx switches/sec + CONFIG_CGROUP_CPUACCT + patch: 16663217 ctx switches/sec + +Tested with: + + wget http://ozlabs.org/~anton/junkcode/context_switch.c + make context_switch + for i in `seq 0 63`; do taskset -c $i ./context_switch & done + vmstat 1 + +Signed-off-by: Anton Blanchard +Reviewed-by: KOSAKI Motohiro +Acked-by: Balbir Singh +Tested-by: Balbir Singh +Cc: Peter Zijlstra +Cc: Martin Schwidefsky +Cc: "Luck, Tony" +Signed-off-by: Andrew Morton +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched.c | 20 +++++++++++++++++++- + 1 file changed, 19 insertions(+), 1 deletion(-) + +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -10934,12 +10934,30 @@ static void cpuacct_charge(struct task_s + } + + /* ++ * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large ++ * in cputime_t units. As a result, cpuacct_update_stats calls ++ * percpu_counter_add with values large enough to always overflow the ++ * per cpu batch limit causing bad SMP scalability. ++ * ++ * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we ++ * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled ++ * and enabled. We cap it at INT_MAX which is the largest allowed batch value. ++ */ ++#ifdef CONFIG_SMP ++#define CPUACCT_BATCH \ ++ min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) ++#else ++#define CPUACCT_BATCH 0 ++#endif ++ ++/* + * Charge the system/user time to the task's accounting group. + */ + static void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val) + { + struct cpuacct *ca; ++ int batch = CPUACCT_BATCH; + + if (unlikely(!cpuacct_subsys.active)) + return; +@@ -10948,7 +10966,7 @@ static void cpuacct_update_stats(struct + ca = task_ca(tsk); + + do { +- percpu_counter_add(&ca->cpustat[idx], val); ++ __percpu_counter_add(&ca->cpustat[idx], val, batch); + ca = ca->parent; + } while (ca); + rcu_read_unlock(); diff --git a/queue-2.6.32/sched-extend-enqueue_task-to-allow-head-queueing.patch b/queue-2.6.32/sched-extend-enqueue_task-to-allow-head-queueing.patch new file mode 100644 index 00000000000..b3e8945ee47 --- /dev/null +++ b/queue-2.6.32/sched-extend-enqueue_task-to-allow-head-queueing.patch @@ -0,0 +1,122 @@ +From tglx@linutronix.de Fri Sep 17 18:13:56 2010 +From: Thomas Gleixner +Date: Wed, 20 Jan 2010 20:58:57 +0000 +Subject: sched: Extend enqueue_task to allow head queueing +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: + +From: Thomas Gleixner + +commit ea87bb7853168434f4a82426dd1ea8421f9e604d upstream + +The ability of enqueueing a task to the head of a SCHED_FIFO priority +list is required to fix some violations of POSIX scheduling policy. + +Extend the related functions with a "head" argument. 
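For context on the POSIX requirement mentioned above (a toy userspace model; the task names and the array-based list are invented for illustration): SCHED_FIFO keeps a FIFO list per priority, and POSIX requires a task preempted by a higher-priority task to resume at the head of its list, while a task that yields or newly becomes runnable goes to the tail. The kernel side of this distinction is the list_add()/list_add_tail() choice added by the sched_rt head-queueing patch later in this series.

/*
 * Toy model of a single SCHED_FIFO priority list, showing what the new
 * "head" argument is for.  Illustration only -- not scheduler code.
 */
#include <stdio.h>
#include <string.h>

#define MAX_TASKS 8

static const char *prio_list[MAX_TASKS];
static int nr_tasks;

/* head=1 behaves like list_add(), head=0 like list_add_tail() */
static void enqueue(const char *name, int head)
{
	if (nr_tasks == MAX_TASKS)
		return;
	if (head) {
		memmove(&prio_list[1], &prio_list[0],
			nr_tasks * sizeof(prio_list[0]));
		prio_list[0] = name;
	} else {
		prio_list[nr_tasks] = name;
	}
	nr_tasks++;
}

static void show(const char *when)
{
	int i;

	printf("%-20s:", when);
	for (i = 0; i < nr_tasks; i++)
		printf(" %s", prio_list[i]);
	printf("\n");
}

int main(void)
{
	/* B and C are runnable at this priority; A is currently running. */
	enqueue("B", 0);
	enqueue("C", 0);
	show("runnable");		/* B C */

	/*
	 * A is preempted by a higher-priority task.  POSIX says A must
	 * resume before B and C, so it is queued with head=1.  Had A
	 * yielded or woken up instead, it would go to the tail (head=0).
	 */
	enqueue("A", 1);
	show("A preempted, head=1");	/* A B C */
	return 0;
}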
+ +Signed-off-by: Thomas Gleixner +Acked-by: Peter Zijlstra +Tested-by: Carsten Emde +Tested-by: Mathias Weber +LKML-Reference: <20100120171629.734886007@linutronix.de> +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/sched.h | 3 ++- + kernel/sched.c | 13 +++++++------ + kernel/sched_fair.c | 3 ++- + kernel/sched_rt.c | 3 ++- + 4 files changed, 13 insertions(+), 9 deletions(-) + +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1071,7 +1071,8 @@ struct sched_domain; + struct sched_class { + const struct sched_class *next; + +- void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup); ++ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup, ++ bool head); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); + void (*yield_task) (struct rq *rq); + +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -1903,13 +1903,14 @@ static void update_avg(u64 *avg, u64 sam + *avg += diff >> 3; + } + +-static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) ++static void ++enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) + { + if (wakeup) + p->se.start_runtime = p->se.sum_exec_runtime; + + sched_info_queued(p); +- p->sched_class->enqueue_task(rq, p, wakeup); ++ p->sched_class->enqueue_task(rq, p, wakeup, head); + p->se.on_rq = 1; + } + +@@ -1985,7 +1986,7 @@ static void activate_task(struct rq *rq, + if (task_contributes_to_load(p)) + rq->nr_uninterruptible--; + +- enqueue_task(rq, p, wakeup); ++ enqueue_task(rq, p, wakeup, false); + inc_nr_running(rq); + } + +@@ -6183,7 +6184,7 @@ void rt_mutex_setprio(struct task_struct + if (running) + p->sched_class->set_curr_task(rq); + if (on_rq) { +- enqueue_task(rq, p, 0); ++ enqueue_task(rq, p, 0, false); + + check_class_changed(rq, p, prev_class, oldprio, running); + } +@@ -6227,7 +6228,7 @@ void set_user_nice(struct task_struct *p + delta = p->prio - old_prio; + + if (on_rq) { +- enqueue_task(rq, p, 0); ++ enqueue_task(rq, p, 0, false); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: +@@ -10180,7 +10181,7 @@ void sched_move_task(struct task_struct + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); + if (on_rq) +- enqueue_task(rq, tsk, 0); ++ enqueue_task(rq, tsk, 0, false); + + task_rq_unlock(rq, &flags); + } +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -1031,7 +1031,8 @@ static inline void hrtick_update(struct + * increased. 
Here we update the fair scheduling stats and + * then put the task into the rbtree: + */ +-static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) ++static void ++enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) + { + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; +--- a/kernel/sched_rt.c ++++ b/kernel/sched_rt.c +@@ -878,7 +878,8 @@ static void dequeue_rt_entity(struct sch + /* + * Adding/removing a task to/from a priority array: + */ +-static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) ++static void ++enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) + { + struct sched_rt_entity *rt_se = &p->rt; + diff --git a/queue-2.6.32/sched-fix-incorrect-sanity-check.patch b/queue-2.6.32/sched-fix-incorrect-sanity-check.patch new file mode 100644 index 00000000000..201e69cb766 --- /dev/null +++ b/queue-2.6.32/sched-fix-incorrect-sanity-check.patch @@ -0,0 +1,36 @@ +From peterz@infradead.org Fri Sep 17 18:13:26 2010 +From: Peter Zijlstra +Date: Thu, 21 Jan 2010 16:34:27 +0100 +Subject: sched: Fix incorrect sanity check +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: <550df2da0c2d00162a463923644fd024de95b890.1283514307.git.efault@gmx.de> + +From: Peter Zijlstra + +commit 11854247e2c851e7ff9ce138e501c6cffc5a4217 upstream + +We moved to migrate on wakeup, which means that sleeping tasks could +still be present on offline cpus. Amend the check to only test running +tasks. + +Reported-by: Heiko Carstens +Signed-off-by: Peter Zijlstra +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + kernel/cpu.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/cpu.c ++++ b/kernel/cpu.c +@@ -151,7 +151,7 @@ static inline void check_for_tasks(int c + + write_lock_irq(&tasklist_lock); + for_each_process(p) { +- if (task_cpu(p) == cpu && ++ if (task_cpu(p) == cpu && p->state == TASK_RUNNING && + (!cputime_eq(p->utime, cputime_zero) || + !cputime_eq(p->stime, cputime_zero))) + printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ diff --git a/queue-2.6.32/sched-fix-nr_uninterruptible-count.patch b/queue-2.6.32/sched-fix-nr_uninterruptible-count.patch new file mode 100644 index 00000000000..41d9c1fe56b --- /dev/null +++ b/queue-2.6.32/sched-fix-nr_uninterruptible-count.patch @@ -0,0 +1,46 @@ +From a.p.zijlstra@chello.nl Fri Sep 17 18:18:32 2010 +From: Peter Zijlstra +Date: Fri, 26 Mar 2010 12:22:14 +0100 +Subject: sched: Fix nr_uninterruptible count +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: + +From: Peter Zijlstra + +commit cc87f76a601d2d256118f7bab15e35254356ae21 upstream + +The cpuload calculation in calc_load_account_active() assumes +rq->nr_uninterruptible will not change on an offline cpu after +migrate_nr_uninterruptible(). However the recent migrate on wakeup +changes broke that and would result in decrementing the offline cpu's +rq->nr_uninterruptible. + +Fix this by accounting the nr_uninterruptible on the waking cpu. 
+ +Signed-off-by: Peter Zijlstra +LKML-Reference: +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -2435,8 +2435,12 @@ static int try_to_wake_up(struct task_st + * + * First fix up the nr_uninterruptible count: + */ +- if (task_contributes_to_load(p)) +- rq->nr_uninterruptible--; ++ if (task_contributes_to_load(p)) { ++ if (likely(cpu_online(orig_cpu))) ++ rq->nr_uninterruptible--; ++ else ++ this_rq()->nr_uninterruptible--; ++ } + p->state = TASK_WAKING; + + if (p->sched_class->task_waking) diff --git a/queue-2.6.32/sched-fix-race-between-ttwu-and-task_rq_lock.patch b/queue-2.6.32/sched-fix-race-between-ttwu-and-task_rq_lock.patch new file mode 100644 index 00000000000..ddb89fde825 --- /dev/null +++ b/queue-2.6.32/sched-fix-race-between-ttwu-and-task_rq_lock.patch @@ -0,0 +1,153 @@ +From a.p.zijlstra@chello.nl Fri Sep 17 18:13:39 2010 +From: Peter Zijlstra +Date: Mon, 15 Feb 2010 14:45:54 +0100 +Subject: sched: Fix race between ttwu() and task_rq_lock() +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: <80faa6f269f4bd7825aec22056bbca743b5bd100.1283514307.git.efault@gmx.de> + +From: Peter Zijlstra + +commit 0970d2992dfd7d5ec2c787417cf464f01eeaf42a upstream + +Thomas found that due to ttwu() changing a task's cpu without holding +the rq->lock, task_rq_lock() might end up locking the wrong rq. + +Avoid this by serializing against TASK_WAKING. + +Reported-by: Thomas Gleixner +Signed-off-by: Peter Zijlstra +LKML-Reference: <1266241712.15770.420.camel@laptop> +Signed-off-by: Thomas Gleixner +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched.c | 71 ++++++++++++++++++++++++++++++++++++--------------------- + 1 file changed, 45 insertions(+), 26 deletions(-) + +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -942,16 +942,33 @@ static inline void finish_lock_switch(st + #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ + + /* ++ * Check whether the task is waking, we use this to synchronize against ++ * ttwu() so that task_cpu() reports a stable number. ++ * ++ * We need to make an exception for PF_STARTING tasks because the fork ++ * path might require task_rq_lock() to work, eg. it can call ++ * set_cpus_allowed_ptr() from the cpuset clone_ns code. ++ */ ++static inline int task_is_waking(struct task_struct *p) ++{ ++ return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); ++} ++ ++/* + * __task_rq_lock - lock the runqueue a given task resides on. + * Must be called interrupts disabled. 
+ */ + static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) + { ++ struct rq *rq; ++ + for (;;) { +- struct rq *rq = task_rq(p); ++ while (task_is_waking(p)) ++ cpu_relax(); ++ rq = task_rq(p); + spin_lock(&rq->lock); +- if (likely(rq == task_rq(p))) ++ if (likely(rq == task_rq(p) && !task_is_waking(p))) + return rq; + spin_unlock(&rq->lock); + } +@@ -968,10 +985,12 @@ static struct rq *task_rq_lock(struct ta + struct rq *rq; + + for (;;) { ++ while (task_is_waking(p)) ++ cpu_relax(); + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); +- if (likely(rq == task_rq(p))) ++ if (likely(rq == task_rq(p) && !task_is_waking(p))) + return rq; + spin_unlock_irqrestore(&rq->lock, *flags); + } +@@ -2439,14 +2458,27 @@ static int try_to_wake_up(struct task_st + __task_rq_unlock(rq); + + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); +- if (cpu != orig_cpu) ++ if (cpu != orig_cpu) { ++ /* ++ * Since we migrate the task without holding any rq->lock, ++ * we need to be careful with task_rq_lock(), since that ++ * might end up locking an invalid rq. ++ */ + set_task_cpu(p, cpu); ++ } + +- rq = __task_rq_lock(p); ++ rq = cpu_rq(cpu); ++ spin_lock(&rq->lock); + update_rq_clock(rq); + ++ /* ++ * We migrated the task without holding either rq->lock, however ++ * since the task is not on the task list itself, nobody else ++ * will try and migrate the task, hence the rq should match the ++ * cpu we just moved it to. ++ */ ++ WARN_ON(task_cpu(p) != cpu); + WARN_ON(p->state != TASK_WAKING); +- cpu = task_cpu(p); + + #ifdef CONFIG_SCHEDSTATS + schedstat_inc(rq, ttwu_count); +@@ -2695,7 +2727,13 @@ void wake_up_new_task(struct task_struct + set_task_cpu(p, cpu); + #endif + +- rq = task_rq_lock(p, &flags); ++ /* ++ * Since the task is not on the rq and we still have TASK_WAKING set ++ * nobody else will migrate this task. ++ */ ++ rq = cpu_rq(cpu); ++ spin_lock_irqsave(&rq->lock, flags); ++ + BUG_ON(p->state != TASK_WAKING); + p->state = TASK_RUNNING; + update_rq_clock(rq); +@@ -7204,27 +7242,8 @@ int set_cpus_allowed_ptr(struct task_str + struct rq *rq; + int ret = 0; + +- /* +- * Since we rely on wake-ups to migrate sleeping tasks, don't change +- * the ->cpus_allowed mask from under waking tasks, which would be +- * possible when we change rq->lock in ttwu(), so synchronize against +- * TASK_WAKING to avoid that. +- * +- * Make an exception for freshly cloned tasks, since cpuset namespaces +- * might move the task about, we have to validate the target in +- * wake_up_new_task() anyway since the cpu might have gone away. 
+- */ +-again: +- while (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) +- cpu_relax(); +- + rq = task_rq_lock(p, &flags); + +- if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) { +- task_rq_unlock(rq, &flags); +- goto again; +- } +- + if (!cpumask_intersects(new_mask, cpu_active_mask)) { + ret = -EINVAL; + goto out; diff --git a/queue-2.6.32/sched-fix-rq-clock-synchronization-when-migrating-tasks.patch b/queue-2.6.32/sched-fix-rq-clock-synchronization-when-migrating-tasks.patch new file mode 100644 index 00000000000..e21b05de792 --- /dev/null +++ b/queue-2.6.32/sched-fix-rq-clock-synchronization-when-migrating-tasks.patch @@ -0,0 +1,44 @@ +From peterz@infradead.org Fri Sep 17 18:18:47 2010 +From: Peter Zijlstra +Date: Thu, 19 Aug 2010 13:31:43 +0200 +Subject: sched: Fix rq->clock synchronization when migrating tasks +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: <748cfa7664c3c3092de1cf8c86f96474f840bed6.1283514307.git.efault@gmx.de> + +From: Peter Zijlstra + +commit 861d034ee814917a83bd5de4b26e3b8336ddeeb8 upstream + +sched_fork() -- we do task placement in ->task_fork_fair() ensure we + update_rq_clock() so we work with current time. We leave the vruntime + in relative state, so the time delay until wake_up_new_task() doesn't + matter. + +wake_up_new_task() -- Since task_fork_fair() left p->vruntime in + relative state we can safely migrate, the activate_task() on the + remote rq will call update_rq_clock() and causes the clock to be + synced (enough). + +Tested-by: Jack Daniel +Tested-by: Philby John +Signed-off-by: Peter Zijlstra +LKML-Reference: <1281002322.1923.1708.camel@laptop> +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched_fair.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -1963,6 +1963,8 @@ static void task_fork_fair(struct task_s + + spin_lock_irqsave(&rq->lock, flags); + ++ update_rq_clock(rq); ++ + if (unlikely(task_cpu(p) != this_cpu)) + __set_task_cpu(p, this_cpu); + diff --git a/queue-2.6.32/sched-fix-select_idle_sibling-logic-in-select_task_rq_fair.patch b/queue-2.6.32/sched-fix-select_idle_sibling-logic-in-select_task_rq_fair.patch new file mode 100644 index 00000000000..08651a2596e --- /dev/null +++ b/queue-2.6.32/sched-fix-select_idle_sibling-logic-in-select_task_rq_fair.patch @@ -0,0 +1,174 @@ +From suresh.b.siddha@intel.com Fri Sep 17 18:20:36 2010 +From: Suresh Siddha +Date: Wed, 31 Mar 2010 16:47:45 -0700 +Subject: sched: Fix select_idle_sibling() logic in select_task_rq_fair() +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: <7c9917f68be3e57e65b938ff15cc6a2b1cc0da16.1283514307.git.efault@gmx.de> + +From: Suresh Siddha + +commit 99bd5e2f245d8cd17d040c82d40becdb3efd9b69 upstream + +Issues in the current select_idle_sibling() logic in select_task_rq_fair() +in the context of a task wake-up: + +a) Once we select the idle sibling, we use that domain (spanning the cpu that + the task is currently woken-up and the idle sibling that we found) in our + wake_affine() decisions. This domain is completely different from the + domain(we are supposed to use) that spans the cpu that the task currently + woken-up and the cpu where the task previously ran. + +b) We do select_idle_sibling() check only for the cpu that the task is + currently woken-up on. 
If select_task_rq_fair() selects the previously run + cpu for waking the task, doing a select_idle_sibling() check + for that cpu also helps and we don't do this currently. + +c) In the scenarios where the cpu that the task is woken-up is busy but + with its HT siblings are idle, we are selecting the task be woken-up + on the idle HT sibling instead of a core that it previously ran + and currently completely idle. i.e., we are not taking decisions based on + wake_affine() but directly selecting an idle sibling that can cause + an imbalance at the SMT/MC level which will be later corrected by the + periodic load balancer. + +Fix this by first going through the load imbalance calculations using +wake_affine() and once we make a decision of woken-up cpu vs previously-ran cpu, +then choose a possible idle sibling for waking up the task on. + +Signed-off-by: Suresh Siddha +Signed-off-by: Peter Zijlstra +LKML-Reference: <1270079265.7835.8.camel@sbs-t61.sc.intel.com> +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched_fair.c | 82 +++++++++++++++++++++++++--------------------------- + 1 file changed, 40 insertions(+), 42 deletions(-) + +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -1386,29 +1386,48 @@ find_idlest_cpu(struct sched_group *grou + /* + * Try and locate an idle CPU in the sched_domain. + */ +-static int +-select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) ++static int select_idle_sibling(struct task_struct *p, int target) + { + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); ++ struct sched_domain *sd; + int i; + + /* +- * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE +- * test in select_task_rq_fair) and the prev_cpu is idle then that's +- * always a better target than the current cpu. ++ * If the task is going to be woken-up on this cpu and if it is ++ * already idle, then it is the right target. ++ */ ++ if (target == cpu && idle_cpu(cpu)) ++ return cpu; ++ ++ /* ++ * If the task is going to be woken-up on the cpu where it previously ++ * ran and if it is currently idle, then it the right target. + */ +- if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) ++ if (target == prev_cpu && idle_cpu(prev_cpu)) + return prev_cpu; + + /* +- * Otherwise, iterate the domain and find an elegible idle cpu. ++ * Otherwise, iterate the domains and find an elegible idle cpu. + */ +- for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { +- if (!cpu_rq(i)->cfs.nr_running) { +- target = i; ++ for_each_domain(target, sd) { ++ if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) + break; ++ ++ for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { ++ if (idle_cpu(i)) { ++ target = i; ++ break; ++ } + } ++ ++ /* ++ * Lets stop looking for an idle sibling when we reached ++ * the domain that spans the current cpu and prev_cpu. 
++ */ ++ if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && ++ cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) ++ break; + } + + return target; +@@ -1432,7 +1451,7 @@ select_task_rq_fair(struct rq *rq, struc + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int new_cpu = cpu; +- int want_affine = 0, cpu_idle = !current->pid; ++ int want_affine = 0; + int want_sd = 1; + int sync = wake_flags & WF_SYNC; + +@@ -1472,36 +1491,13 @@ select_task_rq_fair(struct rq *rq, struc + } + + /* +- * While iterating the domains looking for a spanning +- * WAKE_AFFINE domain, adjust the affine target to any idle cpu +- * in cache sharing domains along the way. ++ * If both cpu and prev_cpu are part of this domain, ++ * cpu is a valid SD_WAKE_AFFINE target. + */ +- if (want_affine) { +- int target = -1; +- +- /* +- * If both cpu and prev_cpu are part of this domain, +- * cpu is a valid SD_WAKE_AFFINE target. +- */ +- if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) +- target = cpu; +- +- /* +- * If there's an idle sibling in this domain, make that +- * the wake_affine target instead of the current cpu. +- */ +- if (!cpu_idle && tmp->flags & SD_SHARE_PKG_RESOURCES) +- target = select_idle_sibling(p, tmp, target); +- +- if (target >= 0) { +- if (tmp->flags & SD_WAKE_AFFINE) { +- affine_sd = tmp; +- want_affine = 0; +- if (target != cpu) +- cpu_idle = 1; +- } +- cpu = target; +- } ++ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && ++ cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { ++ affine_sd = tmp; ++ want_affine = 0; + } + + if (!want_sd && !want_affine) +@@ -1532,8 +1528,10 @@ select_task_rq_fair(struct rq *rq, struc + #endif + + if (affine_sd) { +- if (cpu_idle || cpu == prev_cpu || wake_affine(affine_sd, p, sync)) +- return cpu; ++ if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) ++ return select_idle_sibling(p, cpu); ++ else ++ return select_idle_sibling(p, prev_cpu); + } + + while (sd) { diff --git a/queue-2.6.32/sched-fix-select_idle_sibling.patch b/queue-2.6.32/sched-fix-select_idle_sibling.patch new file mode 100644 index 00000000000..79629d17e6a --- /dev/null +++ b/queue-2.6.32/sched-fix-select_idle_sibling.patch @@ -0,0 +1,76 @@ +From efault@gmx.de Fri Sep 17 18:20:11 2010 +From: Mike Galbraith +Date: Thu, 11 Mar 2010 17:17:16 +0100 +Subject: sched: Fix select_idle_sibling() +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: <2dc48f18ab671dc1c87c87dba674ff4b755d17ff.1283514307.git.efault@gmx.de> + +From: Mike Galbraith + +commit 8b911acdf08477c059d1c36c21113ab1696c612b upstream + +Don't bother with selection when the current cpu is idle. Recent load +balancing changes also make it no longer necessary to check wake_affine() +success before returning the selected sibling, so we now always use it. 
+ +Signed-off-by: Mike Galbraith +Signed-off-by: Peter Zijlstra +LKML-Reference: <1268301369.6785.36.camel@marge.simson.net> +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched_fair.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -1432,7 +1432,7 @@ select_task_rq_fair(struct rq *rq, struc + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int new_cpu = cpu; +- int want_affine = 0; ++ int want_affine = 0, cpu_idle = !current->pid; + int want_sd = 1; + int sync = wake_flags & WF_SYNC; + +@@ -1490,13 +1490,15 @@ select_task_rq_fair(struct rq *rq, struc + * If there's an idle sibling in this domain, make that + * the wake_affine target instead of the current cpu. + */ +- if (tmp->flags & SD_SHARE_PKG_RESOURCES) ++ if (!cpu_idle && tmp->flags & SD_SHARE_PKG_RESOURCES) + target = select_idle_sibling(p, tmp, target); + + if (target >= 0) { + if (tmp->flags & SD_WAKE_AFFINE) { + affine_sd = tmp; + want_affine = 0; ++ if (target != cpu) ++ cpu_idle = 1; + } + cpu = target; + } +@@ -1512,6 +1514,7 @@ select_task_rq_fair(struct rq *rq, struc + sd = tmp; + } + ++#ifdef CONFIG_FAIR_GROUP_SCHED + if (sched_feat(LB_SHARES_UPDATE)) { + /* + * Pick the largest domain to update shares over +@@ -1528,9 +1531,12 @@ select_task_rq_fair(struct rq *rq, struc + spin_lock(&rq->lock); + } + } ++#endif + +- if (affine_sd && wake_affine(affine_sd, p, sync)) +- return cpu; ++ if (affine_sd) { ++ if (cpu_idle || cpu == prev_cpu || wake_affine(affine_sd, p, sync)) ++ return cpu; ++ } + + while (sd) { + int load_idx = sd->forkexec_idx; diff --git a/queue-2.6.32/sched-fix-task_waking-vs-fork-deadlock.patch b/queue-2.6.32/sched-fix-task_waking-vs-fork-deadlock.patch new file mode 100644 index 00000000000..fdd34c0878e --- /dev/null +++ b/queue-2.6.32/sched-fix-task_waking-vs-fork-deadlock.patch @@ -0,0 +1,246 @@ +From a.p.zijlstra@chello.nl Fri Sep 17 18:18:02 2010 +From: Peter Zijlstra +Date: Wed, 24 Mar 2010 18:34:10 +0100 +Subject: sched: Fix TASK_WAKING vs fork deadlock +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: <1620f28b03b31be9190132c280a85fc1d08141a8.1283514307.git.efault@gmx.de> + +From: Peter Zijlstra + +commit 0017d735092844118bef006696a750a0e4ef6ebd upstream + +Oleg noticed a few races with the TASK_WAKING usage on fork. + + - since TASK_WAKING is basically a spinlock, it should be IRQ safe + - since we set TASK_WAKING (*) without holding rq->lock it could + be there still is a rq->lock holder, thereby not actually + providing full serialization. + +(*) in fact we clear PF_STARTING, which in effect enables TASK_WAKING. + +Cure the second issue by not setting TASK_WAKING in sched_fork(), but +only temporarily in wake_up_new_task() while calling select_task_rq(). + +Cure the first by holding rq->lock around the select_task_rq() call, +this will disable IRQs, this however requires that we push down the +rq->lock release into select_task_rq_fair()'s cgroup stuff. + +Because select_task_rq_fair() still needs to drop the rq->lock we +cannot fully get rid of TASK_WAKING. 
+ +Reported-by: Oleg Nesterov +Signed-off-by: Peter Zijlstra +LKML-Reference: +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/sched.h | 3 +- + kernel/sched.c | 65 +++++++++++++++++------------------------------- + kernel/sched_fair.c | 8 ++++- + kernel/sched_idletask.c | 3 +- + kernel/sched_rt.c | 5 +-- + 5 files changed, 36 insertions(+), 48 deletions(-) + +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1082,7 +1082,8 @@ struct sched_class { + void (*put_prev_task) (struct rq *rq, struct task_struct *p); + + #ifdef CONFIG_SMP +- int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); ++ int (*select_task_rq)(struct rq *rq, struct task_struct *p, ++ int sd_flag, int flags); + + unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, + struct rq *busiest, unsigned long max_load_move, +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -944,14 +944,10 @@ static inline void finish_lock_switch(st + /* + * Check whether the task is waking, we use this to synchronize against + * ttwu() so that task_cpu() reports a stable number. +- * +- * We need to make an exception for PF_STARTING tasks because the fork +- * path might require task_rq_lock() to work, eg. it can call +- * set_cpus_allowed_ptr() from the cpuset clone_ns code. + */ + static inline int task_is_waking(struct task_struct *p) + { +- return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); ++ return unlikely(p->state == TASK_WAKING); + } + + /* +@@ -2373,9 +2369,9 @@ static int select_fallback_rq(int cpu, s + * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. + */ + static inline +-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) ++int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) + { +- int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); ++ int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); + + /* + * In order not to call set_task_cpu() on a blocking task we need +@@ -2450,17 +2446,10 @@ static int try_to_wake_up(struct task_st + if (p->sched_class->task_waking) + p->sched_class->task_waking(rq, p); + +- __task_rq_unlock(rq); +- +- cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); +- if (cpu != orig_cpu) { +- /* +- * Since we migrate the task without holding any rq->lock, +- * we need to be careful with task_rq_lock(), since that +- * might end up locking an invalid rq. +- */ ++ cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); ++ if (cpu != orig_cpu) + set_task_cpu(p, cpu); +- } ++ __task_rq_unlock(rq); + + rq = cpu_rq(cpu); + spin_lock(&rq->lock); +@@ -2638,11 +2627,11 @@ void sched_fork(struct task_struct *p, i + + __sched_fork(p); + /* +- * We mark the process as waking here. This guarantees that ++ * We mark the process as running here. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ +- p->state = TASK_WAKING; ++ p->state = TASK_RUNNING; + + /* + * Revert to default priority/policy on fork if requested. 
+@@ -2709,28 +2698,25 @@ void wake_up_new_task(struct task_struct + int cpu = get_cpu(); + + #ifdef CONFIG_SMP ++ rq = task_rq_lock(p, &flags); ++ p->state = TASK_WAKING; ++ + /* + * Fork balancing, do it here and not earlier because: + * - cpus_allowed can change in the fork path + * - any previously selected cpu might disappear through hotplug + * +- * We still have TASK_WAKING but PF_STARTING is gone now, meaning +- * ->cpus_allowed is stable, we have preemption disabled, meaning +- * cpu_online_mask is stable. ++ * We set TASK_WAKING so that select_task_rq() can drop rq->lock ++ * without people poking at ->cpus_allowed. + */ +- cpu = select_task_rq(p, SD_BALANCE_FORK, 0); ++ cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); + set_task_cpu(p, cpu); +-#endif +- +- /* +- * Since the task is not on the rq and we still have TASK_WAKING set +- * nobody else will migrate this task. +- */ +- rq = cpu_rq(cpu); +- spin_lock_irqsave(&rq->lock, flags); + +- BUG_ON(p->state != TASK_WAKING); + p->state = TASK_RUNNING; ++ task_rq_unlock(rq, &flags); ++#endif ++ ++ rq = task_rq_lock(p, &flags); + update_rq_clock(rq); + activate_task(rq, p, 0); + trace_sched_wakeup_new(rq, p, 1); +@@ -3215,19 +3201,15 @@ void sched_exec(void) + { + struct task_struct *p = current; + struct migration_req req; +- int dest_cpu, this_cpu; + unsigned long flags; + struct rq *rq; +- +- this_cpu = get_cpu(); +- dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); +- if (dest_cpu == this_cpu) { +- put_cpu(); +- return; +- } ++ int dest_cpu; + + rq = task_rq_lock(p, &flags); +- put_cpu(); ++ dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); ++ if (dest_cpu == smp_processor_id()) ++ goto unlock; ++ + /* + * select_task_rq() can race against ->cpus_allowed + */ +@@ -3245,6 +3227,7 @@ void sched_exec(void) + + return; + } ++unlock: + task_rq_unlock(rq, &flags); + } + +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -1392,7 +1392,8 @@ find_idlest_cpu(struct sched_group *grou + * + * preempt must be disabled. 
+ */ +-static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) ++static int ++select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) + { + struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; + int cpu = smp_processor_id(); +@@ -1492,8 +1493,11 @@ static int select_task_rq_fair(struct ta + cpumask_weight(sched_domain_span(sd)))) + tmp = affine_sd; + +- if (tmp) ++ if (tmp) { ++ spin_unlock(&rq->lock); + update_shares(tmp); ++ spin_lock(&rq->lock); ++ } + } + + if (affine_sd && wake_affine(affine_sd, p, sync)) { +--- a/kernel/sched_idletask.c ++++ b/kernel/sched_idletask.c +@@ -6,7 +6,8 @@ + */ + + #ifdef CONFIG_SMP +-static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) ++static int ++select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) + { + return task_cpu(p); /* IDLE tasks as never migrated */ + } +--- a/kernel/sched_rt.c ++++ b/kernel/sched_rt.c +@@ -942,10 +942,9 @@ static void yield_task_rt(struct rq *rq) + #ifdef CONFIG_SMP + static int find_lowest_rq(struct task_struct *task); + +-static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) ++static int ++select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) + { +- struct rq *rq = task_rq(p); +- + if (sd_flag != SD_BALANCE_WAKE) + return smp_processor_id(); + diff --git a/queue-2.6.32/sched-fix-vmark-regression-on-big-machines.patch b/queue-2.6.32/sched-fix-vmark-regression-on-big-machines.patch new file mode 100644 index 00000000000..6d0899ddff3 --- /dev/null +++ b/queue-2.6.32/sched-fix-vmark-regression-on-big-machines.patch @@ -0,0 +1,51 @@ +From efault@gmx.de Fri Sep 17 18:19:56 2010 +From: Mike Galbraith +Date: Mon, 4 Jan 2010 14:44:56 +0100 +Subject: sched: Fix vmark regression on big machines +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: + +From: Mike Galbraith + +commit 50b926e439620c469565e8be0f28be78f5fca1ce upstream + +SD_PREFER_SIBLING is set at the CPU domain level if power saving isn't +enabled, leading to many cache misses on large machines as we traverse +looking for an idle shared cache to wake to. Change the enabler of +select_idle_sibling() to SD_SHARE_PKG_RESOURCES, and enable same at the +sibling domain level. + +Reported-by: Lin Ming +Signed-off-by: Mike Galbraith +Signed-off-by: Peter Zijlstra +LKML-Reference: <1262612696.15495.15.camel@marge.simson.net> +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/topology.h | 2 +- + kernel/sched_fair.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +--- a/include/linux/topology.h ++++ b/include/linux/topology.h +@@ -99,7 +99,7 @@ int arch_update_cpu_topology(void); + | 1*SD_WAKE_AFFINE \ + | 1*SD_SHARE_CPUPOWER \ + | 0*SD_POWERSAVINGS_BALANCE \ +- | 0*SD_SHARE_PKG_RESOURCES \ ++ | 1*SD_SHARE_PKG_RESOURCES \ + | 0*SD_SERIALIZE \ + | 0*SD_PREFER_SIBLING \ + , \ +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -1490,7 +1490,7 @@ select_task_rq_fair(struct rq *rq, struc + * If there's an idle sibling in this domain, make that + * the wake_affine target instead of the current cpu. 
+ */ +- if (tmp->flags & SD_PREFER_SIBLING) ++ if (tmp->flags & SD_SHARE_PKG_RESOURCES) + target = select_idle_sibling(p, tmp, target); + + if (target >= 0) { diff --git a/queue-2.6.32/sched-implement-head-queueing-for-sched_rt.patch b/queue-2.6.32/sched-implement-head-queueing-for-sched_rt.patch new file mode 100644 index 00000000000..04f0a091d29 --- /dev/null +++ b/queue-2.6.32/sched-implement-head-queueing-for-sched_rt.patch @@ -0,0 +1,101 @@ +From tglx@linutronix.de Fri Sep 17 18:14:11 2010 +From: Thomas Gleixner +Date: Wed, 20 Jan 2010 20:59:01 +0000 +Subject: sched: Implement head queueing for sched_rt +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: <06654220e9d17d06d30535777dfbcdf5ab2d7e57.1283514307.git.efault@gmx.de> + +From: Thomas Gleixner + +commit 37dad3fce97f01e5149d69de0833d8452c0e862e upstream + +The ability of enqueueing a task to the head of a SCHED_FIFO priority +list is required to fix some violations of POSIX scheduling policy. + +Implement the functionality in sched_rt. + +Signed-off-by: Thomas Gleixner +Acked-by: Peter Zijlstra +Tested-by: Carsten Emde +Tested-by: Mathias Weber +LKML-Reference: <20100120171629.772169931@linutronix.de> +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched_rt.c | 19 +++++++++++-------- + 1 file changed, 11 insertions(+), 8 deletions(-) + +--- a/kernel/sched_rt.c ++++ b/kernel/sched_rt.c +@@ -194,7 +194,7 @@ static inline struct rt_rq *group_rt_rq( + return rt_se->my_q; + } + +-static void enqueue_rt_entity(struct sched_rt_entity *rt_se); ++static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); + static void dequeue_rt_entity(struct sched_rt_entity *rt_se); + + static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) +@@ -204,7 +204,7 @@ static void sched_rt_rq_enqueue(struct r + + if (rt_rq->rt_nr_running) { + if (rt_se && !on_rt_rq(rt_se)) +- enqueue_rt_entity(rt_se); ++ enqueue_rt_entity(rt_se, false); + if (rt_rq->highest_prio.curr < curr->prio) + resched_task(curr); + } +@@ -803,7 +803,7 @@ void dec_rt_tasks(struct sched_rt_entity + dec_rt_group(rt_se, rt_rq); + } + +-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) ++static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) + { + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + struct rt_prio_array *array = &rt_rq->active; +@@ -819,7 +819,10 @@ static void __enqueue_rt_entity(struct s + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) + return; + +- list_add_tail(&rt_se->run_list, queue); ++ if (head) ++ list_add(&rt_se->run_list, queue); ++ else ++ list_add_tail(&rt_se->run_list, queue); + __set_bit(rt_se_prio(rt_se), array->bitmap); + + inc_rt_tasks(rt_se, rt_rq); +@@ -856,11 +859,11 @@ static void dequeue_rt_stack(struct sche + } + } + +-static void enqueue_rt_entity(struct sched_rt_entity *rt_se) ++static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) + { + dequeue_rt_stack(rt_se); + for_each_sched_rt_entity(rt_se) +- __enqueue_rt_entity(rt_se); ++ __enqueue_rt_entity(rt_se, head); + } + + static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +@@ -871,7 +874,7 @@ static void dequeue_rt_entity(struct sch + struct rt_rq *rt_rq = group_rt_rq(rt_se); + + if (rt_rq && rt_rq->rt_nr_running) +- __enqueue_rt_entity(rt_se); ++ __enqueue_rt_entity(rt_se, false); + } + } + +@@ -886,7 +889,7 @@ enqueue_task_rt(struct rq *rq, struct ta + if (wakeup) + rt_se->timeout = 0; + +- enqueue_rt_entity(rt_se); ++ enqueue_rt_entity(rt_se, head); + + if 
(!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) + enqueue_pushable_task(rq, p); diff --git a/queue-2.6.32/sched-kill-the-broken-and-deadlockable-cpuset_lock-cpuset_cpus_allowed_locked-code.patch b/queue-2.6.32/sched-kill-the-broken-and-deadlockable-cpuset_lock-cpuset_cpus_allowed_locked-code.patch new file mode 100644 index 00000000000..8c401077534 --- /dev/null +++ b/queue-2.6.32/sched-kill-the-broken-and-deadlockable-cpuset_lock-cpuset_cpus_allowed_locked-code.patch @@ -0,0 +1,171 @@ +From oleg@redhat.com Fri Sep 17 18:14:53 2010 +From: Oleg Nesterov +Date: Mon, 15 Mar 2010 10:10:03 +0100 +Subject: sched: Kill the broken and deadlockable cpuset_lock/cpuset_cpus_allowed_locked code +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: <2ed3dbb00c3052ccb7ffda1e7a1d112e3d3f53f1.1283514307.git.efault@gmx.de> + +From: Oleg Nesterov + +commit 897f0b3c3ff40b443c84e271bef19bd6ae885195 upstream + +This patch just states the fact the cpusets/cpuhotplug interaction is +broken and removes the deadlockable code which only pretends to work. + +- cpuset_lock() doesn't really work. It is needed for + cpuset_cpus_allowed_locked() but we can't take this lock in + try_to_wake_up()->select_fallback_rq() path. + +- cpuset_lock() is deadlockable. Suppose that a task T bound to CPU takes + callback_mutex. If cpu_down(CPU) happens before T drops callback_mutex + stop_machine() preempts T, then migration_call(CPU_DEAD) tries to take + cpuset_lock() and hangs forever because CPU is already dead and thus + T can't be scheduled. + +- cpuset_cpus_allowed_locked() is deadlockable too. It takes task_lock() + which is not irq-safe, but try_to_wake_up() can be called from irq. + +Kill them, and change select_fallback_rq() to use cpu_possible_mask, like +we currently do without CONFIG_CPUSETS. + +Also, with or without this patch, with or without CONFIG_CPUSETS, the +callers of select_fallback_rq() can race with each other or with +set_cpus_allowed() pathes. + +The subsequent patches try to to fix these problems. 
+ +Signed-off-by: Oleg Nesterov +Signed-off-by: Peter Zijlstra +LKML-Reference: <20100315091003.GA9123@redhat.com> +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/cpuset.h | 13 ------------- + kernel/cpuset.c | 27 +-------------------------- + kernel/sched.c | 10 +++------- + 3 files changed, 4 insertions(+), 46 deletions(-) + +--- a/include/linux/cpuset.h ++++ b/include/linux/cpuset.h +@@ -21,8 +21,6 @@ extern int number_of_cpusets; /* How man + extern int cpuset_init(void); + extern void cpuset_init_smp(void); + extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); +-extern void cpuset_cpus_allowed_locked(struct task_struct *p, +- struct cpumask *mask); + extern nodemask_t cpuset_mems_allowed(struct task_struct *p); + #define cpuset_current_mems_allowed (current->mems_allowed) + void cpuset_init_current_mems_allowed(void); +@@ -69,9 +67,6 @@ struct seq_file; + extern void cpuset_task_status_allowed(struct seq_file *m, + struct task_struct *task); + +-extern void cpuset_lock(void); +-extern void cpuset_unlock(void); +- + extern int cpuset_mem_spread_node(void); + + static inline int cpuset_do_page_mem_spread(void) +@@ -105,11 +100,6 @@ static inline void cpuset_cpus_allowed(s + { + cpumask_copy(mask, cpu_possible_mask); + } +-static inline void cpuset_cpus_allowed_locked(struct task_struct *p, +- struct cpumask *mask) +-{ +- cpumask_copy(mask, cpu_possible_mask); +-} + + static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) + { +@@ -157,9 +147,6 @@ static inline void cpuset_task_status_al + { + } + +-static inline void cpuset_lock(void) {} +-static inline void cpuset_unlock(void) {} +- + static inline int cpuset_mem_spread_node(void) + { + return 0; +--- a/kernel/cpuset.c ++++ b/kernel/cpuset.c +@@ -2145,19 +2145,10 @@ void __init cpuset_init_smp(void) + void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) + { + mutex_lock(&callback_mutex); +- cpuset_cpus_allowed_locked(tsk, pmask); +- mutex_unlock(&callback_mutex); +-} +- +-/** +- * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. +- * Must be called with callback_mutex held. +- **/ +-void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask) +-{ + task_lock(tsk); + guarantee_online_cpus(task_cs(tsk), pmask); + task_unlock(tsk); ++ mutex_unlock(&callback_mutex); + } + + void cpuset_init_current_mems_allowed(void) +@@ -2346,22 +2337,6 @@ int __cpuset_node_allowed_hardwall(int n + } + + /** +- * cpuset_lock - lock out any changes to cpuset structures +- * +- * The out of memory (oom) code needs to mutex_lock cpusets +- * from being changed while it scans the tasklist looking for a +- * task in an overlapping cpuset. Expose callback_mutex via this +- * cpuset_lock() routine, so the oom code can lock it, before +- * locking the task list. The tasklist_lock is a spinlock, so +- * must be taken inside callback_mutex. +- */ +- +-void cpuset_lock(void) +-{ +- mutex_lock(&callback_mutex); +-} +- +-/** + * cpuset_unlock - release lock on cpuset changes + * + * Undo the lock taken in a previous cpuset_lock() call. +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -2349,11 +2349,9 @@ static int select_fallback_rq(int cpu, s + return dest_cpu; + + /* No more Mr. Nice Guy. 
*/ +- if (dest_cpu >= nr_cpu_ids) { +- rcu_read_lock(); +- cpuset_cpus_allowed_locked(p, &p->cpus_allowed); +- rcu_read_unlock(); +- dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); ++ if (unlikely(dest_cpu >= nr_cpu_ids)) { ++ cpumask_copy(&p->cpus_allowed, cpu_possible_mask); ++ dest_cpu = cpumask_any(cpu_active_mask); + + /* + * Don't tell them about moving exiting tasks or +@@ -7833,7 +7831,6 @@ migration_call(struct notifier_block *nf + + case CPU_DEAD: + case CPU_DEAD_FROZEN: +- cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ + migrate_live_tasks(cpu); + rq = cpu_rq(cpu); + /* Idle task back to normal (off runqueue, low prio) */ +@@ -7844,7 +7841,6 @@ migration_call(struct notifier_block *nf + rq->idle->sched_class = &idle_sched_class; + migrate_dead_tasks(cpu); + spin_unlock_irq(&rq->lock); +- cpuset_unlock(); + migrate_nr_uninterruptible(rq); + BUG_ON(rq->nr_running != 0); + calc_global_load_remove(rq); diff --git a/queue-2.6.32/sched-make-select_fallback_rq-cpuset-friendly.patch b/queue-2.6.32/sched-make-select_fallback_rq-cpuset-friendly.patch new file mode 100644 index 00000000000..833c7cc05c1 --- /dev/null +++ b/queue-2.6.32/sched-make-select_fallback_rq-cpuset-friendly.patch @@ -0,0 +1,123 @@ +From oleg@redhat.com Fri Sep 17 18:17:45 2010 +From: Oleg Nesterov +Date: Mon, 15 Mar 2010 10:10:27 +0100 +Subject: sched: Make select_fallback_rq() cpuset friendly +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: + +From: Oleg Nesterov + +commit 9084bb8246ea935b98320554229e2f371f7f52fa upstream + +Introduce cpuset_cpus_allowed_fallback() helper to fix the cpuset problems +with select_fallback_rq(). It can be called from any context and can't use +any cpuset locks including task_lock(). It is called when the task doesn't +have online cpus in ->cpus_allowed but ttwu/etc must be able to find a +suitable cpu. + +I am not proud of this patch. Everything which needs such a fat comment +can't be good even if correct. But I'd prefer to not change the locking +rules in the code I hardly understand, and in any case I believe this +simple change make the code much more correct compared to deadlocks we +currently have. 
+ +Signed-off-by: Oleg Nesterov +Signed-off-by: Peter Zijlstra +LKML-Reference: <20100315091027.GA9155@redhat.com> +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/cpuset.h | 7 +++++++ + kernel/cpuset.c | 42 ++++++++++++++++++++++++++++++++++++++++++ + kernel/sched.c | 4 +--- + 3 files changed, 50 insertions(+), 3 deletions(-) + +--- a/include/linux/cpuset.h ++++ b/include/linux/cpuset.h +@@ -21,6 +21,7 @@ extern int number_of_cpusets; /* How man + extern int cpuset_init(void); + extern void cpuset_init_smp(void); + extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); ++extern int cpuset_cpus_allowed_fallback(struct task_struct *p); + extern nodemask_t cpuset_mems_allowed(struct task_struct *p); + #define cpuset_current_mems_allowed (current->mems_allowed) + void cpuset_init_current_mems_allowed(void); +@@ -101,6 +102,12 @@ static inline void cpuset_cpus_allowed(s + cpumask_copy(mask, cpu_possible_mask); + } + ++static inline int cpuset_cpus_allowed_fallback(struct task_struct *p) ++{ ++ cpumask_copy(&p->cpus_allowed, cpu_possible_mask); ++ return cpumask_any(cpu_active_mask); ++} ++ + static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) + { + return node_possible_map; +--- a/kernel/cpuset.c ++++ b/kernel/cpuset.c +@@ -2151,6 +2151,48 @@ void cpuset_cpus_allowed(struct task_str + mutex_unlock(&callback_mutex); + } + ++int cpuset_cpus_allowed_fallback(struct task_struct *tsk) ++{ ++ const struct cpuset *cs; ++ int cpu; ++ ++ rcu_read_lock(); ++ cs = task_cs(tsk); ++ if (cs) ++ cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed); ++ rcu_read_unlock(); ++ ++ /* ++ * We own tsk->cpus_allowed, nobody can change it under us. ++ * ++ * But we used cs && cs->cpus_allowed lockless and thus can ++ * race with cgroup_attach_task() or update_cpumask() and get ++ * the wrong tsk->cpus_allowed. However, both cases imply the ++ * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr() ++ * which takes task_rq_lock(). ++ * ++ * If we are called after it dropped the lock we must see all ++ * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary ++ * set any mask even if it is not right from task_cs() pov, ++ * the pending set_cpus_allowed_ptr() will fix things. ++ */ ++ ++ cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask); ++ if (cpu >= nr_cpu_ids) { ++ /* ++ * Either tsk->cpus_allowed is wrong (see above) or it ++ * is actually empty. The latter case is only possible ++ * if we are racing with remove_tasks_in_empty_cpuset(). ++ * Like above we can temporary set any mask and rely on ++ * set_cpus_allowed_ptr() as synchronization point. ++ */ ++ cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask); ++ cpu = cpumask_any(cpu_active_mask); ++ } ++ ++ return cpu; ++} ++ + void cpuset_init_current_mems_allowed(void) + { + nodes_setall(current->mems_allowed); +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -2353,9 +2353,7 @@ static int select_fallback_rq(int cpu, s + + /* No more Mr. Nice Guy. 
*/ + if (unlikely(dest_cpu >= nr_cpu_ids)) { +- cpumask_copy(&p->cpus_allowed, cpu_possible_mask); +- dest_cpu = cpumask_any(cpu_active_mask); +- ++ dest_cpu = cpuset_cpus_allowed_fallback(p); + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never diff --git a/queue-2.6.32/sched-more-generic-wake_affine-vs-select_idle_sibling.patch b/queue-2.6.32/sched-more-generic-wake_affine-vs-select_idle_sibling.patch new file mode 100644 index 00000000000..40b58b5d53f --- /dev/null +++ b/queue-2.6.32/sched-more-generic-wake_affine-vs-select_idle_sibling.patch @@ -0,0 +1,91 @@ +From a.p.zijlstra@chello.nl Fri Sep 17 18:19:43 2010 +From: Peter Zijlstra +Date: Thu, 12 Nov 2009 15:55:29 +0100 +Subject: sched: More generic WAKE_AFFINE vs select_idle_sibling() +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: <4fe736bd5f08977bf198f67dd272162a061c1a02.1283514307.git.efault@gmx.de> + +From: Peter Zijlstra + +commit fe3bcfe1f6c1fc4ea7706ac2d05e579fd9092682 upstream + +Instead of only considering SD_WAKE_AFFINE | SD_PREFER_SIBLING +domains also allow all SD_PREFER_SIBLING domains below a +SD_WAKE_AFFINE domain to change the affinity target. + +Signed-off-by: Peter Zijlstra +Cc: Mike Galbraith +LKML-Reference: <20091112145610.909723612@chello.nl> +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched_fair.c | 33 ++++++++++++++++----------------- + 1 file changed, 16 insertions(+), 17 deletions(-) + +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -1398,20 +1398,16 @@ select_idle_sibling(struct task_struct * + * test in select_task_rq_fair) and the prev_cpu is idle then that's + * always a better target than the current cpu. + */ +- if (target == cpu) { +- if (!cpu_rq(prev_cpu)->cfs.nr_running) +- target = prev_cpu; +- } ++ if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) ++ return prev_cpu; + + /* + * Otherwise, iterate the domain and find an elegible idle cpu. + */ +- if (target == -1 || target == cpu) { +- for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { +- if (!cpu_rq(i)->cfs.nr_running) { +- target = i; +- break; +- } ++ for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { ++ if (!cpu_rq(i)->cfs.nr_running) { ++ target = i; ++ break; + } + } + +@@ -1475,7 +1471,12 @@ select_task_rq_fair(struct rq *rq, struc + want_sd = 0; + } + +- if (want_affine && (tmp->flags & SD_WAKE_AFFINE)) { ++ /* ++ * While iterating the domains looking for a spanning ++ * WAKE_AFFINE domain, adjust the affine target to any idle cpu ++ * in cache sharing domains along the way. ++ */ ++ if (want_affine) { + int target = -1; + + /* +@@ -1488,17 +1489,15 @@ select_task_rq_fair(struct rq *rq, struc + /* + * If there's an idle sibling in this domain, make that + * the wake_affine target instead of the current cpu. +- * +- * XXX: should we possibly do this outside of +- * WAKE_AFFINE, in case the shared cache domain is +- * smaller than the WAKE_AFFINE domain? 
+ */ + if (tmp->flags & SD_PREFER_SIBLING) + target = select_idle_sibling(p, tmp, target); + + if (target >= 0) { +- affine_sd = tmp; +- want_affine = 0; ++ if (tmp->flags & SD_WAKE_AFFINE) { ++ affine_sd = tmp; ++ want_affine = 0; ++ } + cpu = target; + } + } diff --git a/queue-2.6.32/sched-move_task_off_dead_cpu-remove-retry-logic.patch b/queue-2.6.32/sched-move_task_off_dead_cpu-remove-retry-logic.patch new file mode 100644 index 00000000000..bce1a34a198 --- /dev/null +++ b/queue-2.6.32/sched-move_task_off_dead_cpu-remove-retry-logic.patch @@ -0,0 +1,62 @@ +From oleg@redhat.com Fri Sep 17 18:15:27 2010 +From: Oleg Nesterov +Date: Mon, 15 Mar 2010 10:10:14 +0100 +Subject: sched: move_task_off_dead_cpu(): Remove retry logic +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: + +From: Oleg Nesterov + +commit c1804d547dc098363443667609c272d1e4d15ee8 upstream + +The previous patch preserved the retry logic, but it looks unneeded. + +__migrate_task() can only fail if we raced with migration after we dropped +the lock, but in this case the caller of set_cpus_allowed/etc must initiate +migration itself if ->on_rq == T. + +We already fixed p->cpus_allowed, the changes in active/online masks must +be visible to racer, it should migrate the task to online cpu correctly. + +Signed-off-by: Oleg Nesterov +Signed-off-by: Peter Zijlstra +LKML-Reference: <20100315091014.GA9138@redhat.com> +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched.c | 13 ++++++------- + 1 file changed, 6 insertions(+), 7 deletions(-) + +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -7407,7 +7407,7 @@ static void move_task_off_dead_cpu(int d + struct rq *rq = cpu_rq(dead_cpu); + int needs_cpu, uninitialized_var(dest_cpu); + unsigned long flags; +-again: ++ + local_irq_save(flags); + + spin_lock(&rq->lock); +@@ -7415,14 +7415,13 @@ again: + if (needs_cpu) + dest_cpu = select_fallback_rq(dead_cpu, p); + spin_unlock(&rq->lock); +- +- /* It can have affinity changed while we were choosing. */ ++ /* ++ * It can only fail if we race with set_cpus_allowed(), ++ * in the racer should migrate the task anyway. ++ */ + if (needs_cpu) +- needs_cpu = !__migrate_task(p, dead_cpu, dest_cpu); ++ __migrate_task(p, dead_cpu, dest_cpu); + local_irq_restore(flags); +- +- if (unlikely(needs_cpu)) +- goto again; + } + + /* diff --git a/queue-2.6.32/sched-move_task_off_dead_cpu-take-rq-lock-around-select_fallback_rq.patch b/queue-2.6.32/sched-move_task_off_dead_cpu-take-rq-lock-around-select_fallback_rq.patch new file mode 100644 index 00000000000..37aa5c71eea --- /dev/null +++ b/queue-2.6.32/sched-move_task_off_dead_cpu-take-rq-lock-around-select_fallback_rq.patch @@ -0,0 +1,88 @@ +From oleg@redhat.com Fri Sep 17 18:15:12 2010 +From: Oleg Nesterov +Date: Mon, 15 Mar 2010 10:10:10 +0100 +Subject: sched: move_task_off_dead_cpu(): Take rq->lock around select_fallback_rq() +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: + +From: Oleg Nesterov + +commit 1445c08d06c5594895b4fae952ef8a457e89c390 upstream + +move_task_off_dead_cpu()->select_fallback_rq() reads/updates ->cpus_allowed +lockless. We can race with set_cpus_allowed() running in parallel. + +Change it to take rq->lock around select_fallback_rq(). Note that it is not +trivial to move this spin_lock() into select_fallback_rq(), we must recheck +the task was not migrated after we take the lock and other callers do not +need this lock. 
+ +To avoid the races with other callers of select_fallback_rq() which rely on +TASK_WAKING, we also check p->state != TASK_WAKING and do nothing otherwise. +The owner of TASK_WAKING must update ->cpus_allowed and choose the correct +CPU anyway, and the subsequent __migrate_task() is just meaningless because +p->se.on_rq must be false. + +Alternatively, we could change select_task_rq() to take rq->lock right +after it calls sched_class->select_task_rq(), but this looks a bit ugly. + +Also, change it to not assume irqs are disabled and absorb __migrate_task_irq(). + +Signed-off-by: Oleg Nesterov +Signed-off-by: Peter Zijlstra +LKML-Reference: <20100315091010.GA9131@redhat.com> +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched.c | 30 +++++++++++++++--------------- + 1 file changed, 15 insertions(+), 15 deletions(-) + +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -7399,29 +7399,29 @@ static int migration_thread(void *data) + } + + #ifdef CONFIG_HOTPLUG_CPU +- +-static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) +-{ +- int ret; +- +- local_irq_disable(); +- ret = __migrate_task(p, src_cpu, dest_cpu); +- local_irq_enable(); +- return ret; +-} +- + /* + * Figure out where task on dead CPU should go, use force if necessary. + */ + static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) + { +- int dest_cpu; +- ++ struct rq *rq = cpu_rq(dead_cpu); ++ int needs_cpu, uninitialized_var(dest_cpu); ++ unsigned long flags; + again: +- dest_cpu = select_fallback_rq(dead_cpu, p); ++ local_irq_save(flags); ++ ++ spin_lock(&rq->lock); ++ needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); ++ if (needs_cpu) ++ dest_cpu = select_fallback_rq(dead_cpu, p); ++ spin_unlock(&rq->lock); + + /* It can have affinity changed while we were choosing. */ +- if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) ++ if (needs_cpu) ++ needs_cpu = !__migrate_task(p, dead_cpu, dest_cpu); ++ local_irq_restore(flags); ++ ++ if (unlikely(needs_cpu)) + goto again; + } + diff --git a/queue-2.6.32/sched-optimize-task_rq_lock.patch b/queue-2.6.32/sched-optimize-task_rq_lock.patch new file mode 100644 index 00000000000..ccd09f685f5 --- /dev/null +++ b/queue-2.6.32/sched-optimize-task_rq_lock.patch @@ -0,0 +1,87 @@ +From a.p.zijlstra@chello.nl Fri Sep 17 18:18:19 2010 +From: Peter Zijlstra +Date: Thu, 25 Mar 2010 21:05:16 +0100 +Subject: sched: Optimize task_rq_lock() +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: + +From: Peter Zijlstra + +commit 65cc8e4859ff29a9ddc989c88557d6059834c2a2 upstream + +Now that we hold the rq->lock over set_task_cpu() again, we can do +away with most of the TASK_WAKING checks and reduce them again to +set_cpus_allowed_ptr(). + +Removes some conditionals from scheduling hot-paths. + +Signed-off-by: Peter Zijlstra +Cc: Oleg Nesterov +LKML-Reference: +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched.c | 23 +++++++++++++++-------- + 1 file changed, 15 insertions(+), 8 deletions(-) + +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -942,8 +942,8 @@ static inline void finish_lock_switch(st + #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ + + /* +- * Check whether the task is waking, we use this to synchronize against +- * ttwu() so that task_cpu() reports a stable number. ++ * Check whether the task is waking, we use this to synchronize ->cpus_allowed ++ * against ttwu(). 
+ */ + static inline int task_is_waking(struct task_struct *p) + { +@@ -960,11 +960,9 @@ static inline struct rq *__task_rq_lock( + struct rq *rq; + + for (;;) { +- while (task_is_waking(p)) +- cpu_relax(); + rq = task_rq(p); + spin_lock(&rq->lock); +- if (likely(rq == task_rq(p) && !task_is_waking(p))) ++ if (likely(rq == task_rq(p))) + return rq; + spin_unlock(&rq->lock); + } +@@ -981,12 +979,10 @@ static struct rq *task_rq_lock(struct ta + struct rq *rq; + + for (;;) { +- while (task_is_waking(p)) +- cpu_relax(); + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); +- if (likely(rq == task_rq(p) && !task_is_waking(p))) ++ if (likely(rq == task_rq(p))) + return rq; + spin_unlock_irqrestore(&rq->lock, *flags); + } +@@ -7213,7 +7209,18 @@ int set_cpus_allowed_ptr(struct task_str + struct rq *rq; + int ret = 0; + ++ /* ++ * Serialize against TASK_WAKING so that ttwu() and wunt() can ++ * drop the rq->lock and still rely on ->cpus_allowed. ++ */ ++again: ++ while (task_is_waking(p)) ++ cpu_relax(); + rq = task_rq_lock(p, &flags); ++ if (task_is_waking(p)) { ++ task_rq_unlock(rq, &flags); ++ goto again; ++ } + + if (!cpumask_intersects(new_mask, cpu_active_mask)) { + ret = -EINVAL; diff --git a/queue-2.6.32/sched-pre-compute-cpumask_weight-sched_domain_span-sd.patch b/queue-2.6.32/sched-pre-compute-cpumask_weight-sched_domain_span-sd.patch new file mode 100644 index 00000000000..fb27407d1a1 --- /dev/null +++ b/queue-2.6.32/sched-pre-compute-cpumask_weight-sched_domain_span-sd.patch @@ -0,0 +1,95 @@ +From a.p.zijlstra@chello.nl Fri Sep 17 18:20:23 2010 +From: Peter Zijlstra +Date: Fri, 16 Apr 2010 14:59:29 +0200 +Subject: sched: Pre-compute cpumask_weight(sched_domain_span(sd)) +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: <0c6c762bcaa163e06a13da32043ad968d1473188.1283514307.git.efault@gmx.de> + +From: Peter Zijlstra + +commit 669c55e9f99b90e46eaa0f98a67ec53d46dc969a upstream + +Dave reported that his large SPARC machines spend lots of time in +hweight64(), try and optimize some of those needless cpumask_weight() +invocations (esp. with the large offstack cpumasks these are very +expensive indeed). + +Reported-by: David Miller +Signed-off-by: Peter Zijlstra +LKML-Reference: +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/sched.h | 1 + + kernel/sched.c | 7 +++++-- + kernel/sched_fair.c | 8 +++----- + 3 files changed, 9 insertions(+), 7 deletions(-) + +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1000,6 +1000,7 @@ struct sched_domain { + char *name; + #endif + ++ unsigned int span_weight; + /* + * Span of all CPUs in this domain. 
+ * +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -3678,7 +3678,7 @@ unsigned long __weak arch_scale_freq_pow + + unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) + { +- unsigned long weight = cpumask_weight(sched_domain_span(sd)); ++ unsigned long weight = sd->span_weight; + unsigned long smt_gain = sd->smt_gain; + + smt_gain /= weight; +@@ -3711,7 +3711,7 @@ unsigned long scale_rt_power(int cpu) + + static void update_cpu_power(struct sched_domain *sd, int cpu) + { +- unsigned long weight = cpumask_weight(sched_domain_span(sd)); ++ unsigned long weight = sd->span_weight; + unsigned long power = SCHED_LOAD_SCALE; + struct sched_group *sdg = sd->groups; + +@@ -8166,6 +8166,9 @@ cpu_attach_domain(struct sched_domain *s + struct rq *rq = cpu_rq(cpu); + struct sched_domain *tmp; + ++ for (tmp = sd; tmp; tmp = tmp->parent) ++ tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); ++ + /* Remove the sched domains which do not contribute to scheduling. */ + for (tmp = sd; tmp; ) { + struct sched_domain *parent = tmp->parent; +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -1520,9 +1520,7 @@ select_task_rq_fair(struct rq *rq, struc + * Pick the largest domain to update shares over + */ + tmp = sd; +- if (affine_sd && (!tmp || +- cpumask_weight(sched_domain_span(affine_sd)) > +- cpumask_weight(sched_domain_span(sd)))) ++ if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) + tmp = affine_sd; + + if (tmp) { +@@ -1566,10 +1564,10 @@ select_task_rq_fair(struct rq *rq, struc + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = new_cpu; +- weight = cpumask_weight(sched_domain_span(sd)); ++ weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { +- if (weight <= cpumask_weight(sched_domain_span(tmp))) ++ if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; diff --git a/queue-2.6.32/sched-queue-a-deboosted-task-to-the-head-of-the-rt-prio-queue.patch b/queue-2.6.32/sched-queue-a-deboosted-task-to-the-head-of-the-rt-prio-queue.patch new file mode 100644 index 00000000000..61683df70f3 --- /dev/null +++ b/queue-2.6.32/sched-queue-a-deboosted-task-to-the-head-of-the-rt-prio-queue.patch @@ -0,0 +1,56 @@ +From tglx@linutronix.de Fri Sep 17 18:14:25 2010 +From: Thomas Gleixner +Date: Wed, 20 Jan 2010 20:59:06 +0000 +Subject: sched: Queue a deboosted task to the head of the RT prio queue +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: <55050ebe52e5ca5834a6f847d19809cba5dc10a0.1283514307.git.efault@gmx.de> + +From: Thomas Gleixner + +commit 60db48cacb9b253d5607a5ff206112a59cd09e34 upstream + +rtmutex_set_prio() is used to implement priority inheritance for +futexes. When a task is deboosted it gets enqueued at the tail of its +RT priority list. This is violating the POSIX scheduling semantics: + +rt priority list X contains two runnable tasks A and B + +task A runs with priority X and holds mutex M +task C preempts A and is blocked on mutex M + -> task A is boosted to priority of task C (Y) +task A unlocks the mutex M and deboosts itself + -> A is dequeued from rt priority list Y + -> A is enqueued to the tail of rt priority list X +task C schedules away +task B runs + +This is wrong as task A did not schedule away and therefor violates +the POSIX scheduling semantics. + +Enqueue the task to the head of the priority list instead. 
+ +Reported-by: Mathias Weber +Reported-by: Carsten Emde +Signed-off-by: Thomas Gleixner +Acked-by: Peter Zijlstra +Tested-by: Carsten Emde +Tested-by: Mathias Weber +LKML-Reference: <20100120171629.809074113@linutronix.de> +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -6184,7 +6184,7 @@ void rt_mutex_setprio(struct task_struct + if (running) + p->sched_class->set_curr_task(rq); + if (on_rq) { +- enqueue_task(rq, p, 0, false); ++ enqueue_task(rq, p, 0, oldprio < prio); + + check_class_changed(rq, p, prev_class, oldprio, running); + } diff --git a/queue-2.6.32/sched-remove-unnecessary-rcu-exclusion.patch b/queue-2.6.32/sched-remove-unnecessary-rcu-exclusion.patch new file mode 100644 index 00000000000..4d2f370c542 --- /dev/null +++ b/queue-2.6.32/sched-remove-unnecessary-rcu-exclusion.patch @@ -0,0 +1,62 @@ +From a.p.zijlstra@chello.nl Fri Sep 17 18:19:01 2010 +From: Peter Zijlstra +Date: Tue, 1 Dec 2009 12:21:47 +0100 +Subject: sched: Remove unnecessary RCU exclusion +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: <96e351935dd8b98a2e436bf3e254fa3d91f4bd2d.1283514307.git.efault@gmx.de> + +From: Peter Zijlstra + +commit fb58bac5c75bfff8bbf7d02071a10a62f32fe28b upstream + +As Nick pointed out, and realized by myself when doing: + sched: Fix balance vs hotplug race +the patch: + sched: for_each_domain() vs RCU + +is wrong, sched_domains are freed after synchronize_sched(), which +means disabling preemption is enough. + +Reported-by: Nick Piggin +Signed-off-by: Peter Zijlstra +LKML-Reference: +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched_fair.c | 9 ++------- + 1 file changed, 2 insertions(+), 7 deletions(-) + +--- a/kernel/sched_fair.c ++++ b/kernel/sched_fair.c +@@ -1410,7 +1410,6 @@ select_task_rq_fair(struct rq *rq, struc + new_cpu = prev_cpu; + } + +- rcu_read_lock(); + for_each_domain(cpu, tmp) { + if (!(tmp->flags & SD_LOAD_BALANCE)) + continue; +@@ -1500,10 +1499,8 @@ select_task_rq_fair(struct rq *rq, struc + } + } + +- if (affine_sd && wake_affine(affine_sd, p, sync)) { +- new_cpu = cpu; +- goto out; +- } ++ if (affine_sd && wake_affine(affine_sd, p, sync)) ++ return cpu; + + while (sd) { + int load_idx = sd->forkexec_idx; +@@ -1544,8 +1541,6 @@ select_task_rq_fair(struct rq *rq, struc + /* while loop will break here if sd == NULL */ + } + +-out: +- rcu_read_unlock(); + return new_cpu; + } + #endif /* CONFIG_SMP */ diff --git a/queue-2.6.32/sched-sched_exec-remove-the-select_fallback_rq-logic.patch b/queue-2.6.32/sched-sched_exec-remove-the-select_fallback_rq-logic.patch new file mode 100644 index 00000000000..7db83a47943 --- /dev/null +++ b/queue-2.6.32/sched-sched_exec-remove-the-select_fallback_rq-logic.patch @@ -0,0 +1,96 @@ +From 30da688ef6b76e01969b00608202fff1eed2accc Mon Sep 17 00:00:00 2001 +From: Oleg Nesterov +Date: Mon, 15 Mar 2010 10:10:19 +0100 +Subject: sched: sched_exec(): Remove the select_fallback_rq() logic + +From: Oleg Nesterov + +commit 30da688ef6b76e01969b00608202fff1eed2accc upstream. + +sched_exec()->select_task_rq() reads/updates ->cpus_allowed lockless. +This can race with other CPUs updating our ->cpus_allowed, and this +looks meaningless to me. + +The task is current and running, it must have online cpus in ->cpus_allowed, +the fallback mode is bogus. 
And, if ->sched_class returns the "wrong" cpu, +this likely means we raced with set_cpus_allowed() which was called +for reason, why should sched_exec() retry and call ->select_task_rq() +again? + +Change the code to call sched_class->select_task_rq() directly and do +nothing if the returned cpu is wrong after re-checking under rq->lock. + +From now task_struct->cpus_allowed is always stable under TASK_WAKING, +select_fallback_rq() is always called under rq-lock or the caller or +the caller owns TASK_WAKING (select_task_rq). + +Signed-off-by: Oleg Nesterov +Signed-off-by: Peter Zijlstra +LKML-Reference: <20100315091019.GA9141@redhat.com> +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sched.c | 25 ++++++++----------------- + 1 file changed, 8 insertions(+), 17 deletions(-) + +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -2333,6 +2333,9 @@ void task_oncpu_function_call(struct tas + } + + #ifdef CONFIG_SMP ++/* ++ * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. ++ */ + static int select_fallback_rq(int cpu, struct task_struct *p) + { + int dest_cpu; +@@ -2369,12 +2372,7 @@ static int select_fallback_rq(int cpu, s + } + + /* +- * Gets called from 3 sites (exec, fork, wakeup), since it is called without +- * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done +- * by: +- * +- * exec: is unstable, retry loop +- * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING ++ * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. + */ + static inline + int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +@@ -3223,9 +3221,8 @@ void sched_exec(void) + unsigned long flags; + struct rq *rq; + +-again: + this_cpu = get_cpu(); +- dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0); ++ dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); + if (dest_cpu == this_cpu) { + put_cpu(); + return; +@@ -3233,18 +3230,12 @@ again: + + rq = task_rq_lock(p, &flags); + put_cpu(); +- + /* + * select_task_rq() can race against ->cpus_allowed + */ +- if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) +- || unlikely(!cpu_active(dest_cpu))) { +- task_rq_unlock(rq, &flags); +- goto again; +- } +- +- /* force the process onto the specified CPU */ +- if (migrate_task(p, dest_cpu, &req)) { ++ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && ++ likely(cpu_active(dest_cpu)) && ++ migrate_task(p, dest_cpu, &req)) { + /* Need to wait for migration thread (might exit: take ref). */ + struct task_struct *mt = rq->migration_thread; + diff --git a/queue-2.6.32/sched-set_cpus_allowed_ptr-don-t-use-rq-migration_thread-after-unlock.patch b/queue-2.6.32/sched-set_cpus_allowed_ptr-don-t-use-rq-migration_thread-after-unlock.patch new file mode 100644 index 00000000000..bc81017c0c4 --- /dev/null +++ b/queue-2.6.32/sched-set_cpus_allowed_ptr-don-t-use-rq-migration_thread-after-unlock.patch @@ -0,0 +1,37 @@ +From oleg@redhat.com Fri Sep 17 18:14:40 2010 +From: Oleg Nesterov +Date: Tue, 30 Mar 2010 18:58:29 +0200 +Subject: sched: set_cpus_allowed_ptr(): Don't use rq->migration_thread after unlock +To: stable +Cc: Ingo Molnar , Peter Zijlstra , Greg KH +Message-ID: + +From: Oleg Nesterov + +commit 47a70985e5c093ae03d8ccf633c70a93761d86f2 upstream + +Trivial typo fix. rq->migration_thread can be NULL after +task_rq_unlock(), this is why we have "mt" which should be + used instead. 
+ +Signed-off-by: Oleg Nesterov +Signed-off-by: Peter Zijlstra +LKML-Reference: <20100330165829.GA18284@redhat.com> +Signed-off-by: Ingo Molnar +Signed-off-by: Mike Galbraith +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -7273,7 +7273,7 @@ int set_cpus_allowed_ptr(struct task_str + + get_task_struct(mt); + task_rq_unlock(rq, &flags); +- wake_up_process(rq->migration_thread); ++ wake_up_process(mt); + put_task_struct(mt); + wait_for_completion(&req.done); + tlb_migrate_finish(p->mm); diff --git a/queue-2.6.32/series b/queue-2.6.32/series index 58ea8fddf28..7fd031394e1 100644 --- a/queue-2.6.32/series +++ b/queue-2.6.32/series @@ -84,3 +84,28 @@ sched-add-pre-and-post-wakeup-hooks.patch sched-remove-the-cfs_rq-dependency-from-set_task_cpu.patch sched-fix-hotplug-hang.patch sched-fix-fork-vs-hotplug-vs-cpuset-namespaces.patch +sched-fix-incorrect-sanity-check.patch +sched-fix-race-between-ttwu-and-task_rq_lock.patch +sched-extend-enqueue_task-to-allow-head-queueing.patch +sched-implement-head-queueing-for-sched_rt.patch +sched-queue-a-deboosted-task-to-the-head-of-the-rt-prio-queue.patch +sched-set_cpus_allowed_ptr-don-t-use-rq-migration_thread-after-unlock.patch +sched-kill-the-broken-and-deadlockable-cpuset_lock-cpuset_cpus_allowed_locked-code.patch +sched-move_task_off_dead_cpu-take-rq-lock-around-select_fallback_rq.patch +sched-move_task_off_dead_cpu-remove-retry-logic.patch +sched-sched_exec-remove-the-select_fallback_rq-logic.patch +sched-_cpu_down-don-t-play-with-current-cpus_allowed.patch +sched-make-select_fallback_rq-cpuset-friendly.patch +sched-fix-task_waking-vs-fork-deadlock.patch +sched-optimize-task_rq_lock.patch +sched-fix-nr_uninterruptible-count.patch +sched-fix-rq-clock-synchronization-when-migrating-tasks.patch +sched-remove-unnecessary-rcu-exclusion.patch +sched-apply-rcu-protection-to-wake_affine.patch +sched-cleanup-select_task_rq_fair.patch +sched-more-generic-wake_affine-vs-select_idle_sibling.patch +sched-fix-vmark-regression-on-big-machines.patch +sched-fix-select_idle_sibling.patch +sched-pre-compute-cpumask_weight-sched_domain_span-sd.patch +sched-fix-select_idle_sibling-logic-in-select_task_rq_fair.patch +sched-cpuacct-use-bigger-percpu-counter-batch-values-for-stats-counters.patch