From 6ad4c18884e864cf4c77f9074d3d1816063f99cd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Nov 2009 13:31:39 +0100
Subject: sched: Fix balance vs hotplug race

From: Peter Zijlstra <a.p.zijlstra@chello.nl>

commit 6ad4c18884e864cf4c77f9074d3d1816063f99cd upstream.

Since (e761b77: cpu hotplug, sched: Introduce cpu_active_map and redo
sched domain managment) we have cpu_active_mask, which is supposed to
rule scheduler migration and load-balancing, except it never (fully) did.

The particular problem being solved here is a crash in try_to_wake_up()
where select_task_rq() ends up selecting an offline cpu because
select_task_rq_fair() trusts the sched_domain tree to reflect the
current state of affairs; similarly, select_task_rq_rt() trusts the
root_domain.

However, the sched_domains are updated from CPU_DEAD, which is after the
cpu is taken offline and after stop_machine is done. Therefore it can
race perfectly well with code assuming the domains are right.

Cure this by building the domains from cpu_active_mask on
CPU_DOWN_PREPARE.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Holger Hoffstätte <holger.hoffstaette@googlemail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

---
 include/linux/cpumask.h |    2 ++
 kernel/cpu.c            |   18 +++++++++++++-----
 kernel/cpuset.c         |   16 +++++++++-------
 kernel/sched.c          |   32 +++++++++++++++++---------------
 4 files changed, 41 insertions(+), 27 deletions(-)

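Every hunk below follows one pattern: migration and load-balancing decisions
consult cpu_active_mask, which this patch clears at CPU_DOWN_PREPARE (i.e.
before stop_machine runs), instead of cpu_online_mask, which only drops the
cpu later, around CPU_DEAD. A minimal sketch of that pattern, mirroring the
move_task_off_dead_cpu() hunk in kernel/sched.c; it is illustrative only and
not part of the applied diff, and pick_fallback_cpu() is a made-up name,
though the cpumask helpers are the ones the patch itself uses:

#include <linux/cpumask.h>
#include <linux/sched.h>

/* Sketch only: pick a migration target from the *active* mask. */
static int pick_fallback_cpu(struct task_struct *p)
{
	/* Prefer a CPU that is both allowed for p and still active. */
	int dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);

	if (dest_cpu < nr_cpu_ids)
		return dest_cpu;

	/* Nothing in p's mask is active any more; fall back to any active CPU. */
	return cpumask_any(cpu_active_mask);
}
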
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -84,6 +84,7 @@ extern const struct cpumask *const cpu_a
 #define num_online_cpus() cpumask_weight(cpu_online_mask)
 #define num_possible_cpus() cpumask_weight(cpu_possible_mask)
 #define num_present_cpus() cpumask_weight(cpu_present_mask)
+#define num_active_cpus() cpumask_weight(cpu_active_mask)
 #define cpu_online(cpu) cpumask_test_cpu((cpu), cpu_online_mask)
 #define cpu_possible(cpu) cpumask_test_cpu((cpu), cpu_possible_mask)
 #define cpu_present(cpu) cpumask_test_cpu((cpu), cpu_present_mask)
@@ -92,6 +93,7 @@ extern const struct cpumask *const cpu_a
 #define num_online_cpus() 1
 #define num_possible_cpus() 1
 #define num_present_cpus() 1
+#define num_active_cpus() 1
 #define cpu_online(cpu) ((cpu) == 0)
 #define cpu_possible(cpu) ((cpu) == 0)
 #define cpu_present(cpu) ((cpu) == 0)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -212,6 +212,8 @@ static int __ref _cpu_down(unsigned int
 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
 hcpu, -1, &nr_calls);
 if (err == NOTIFY_BAD) {
+ set_cpu_active(cpu, true);
+
 nr_calls--;
 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
 hcpu, nr_calls, NULL);
@@ -223,11 +225,11 @@ static int __ref _cpu_down(unsigned int

 /* Ensure that we are not runnable on dying cpu */
 cpumask_copy(old_allowed, &current->cpus_allowed);
- set_cpus_allowed_ptr(current,
- cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
+ set_cpus_allowed_ptr(current, cpu_active_mask);

 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 if (err) {
+ set_cpu_active(cpu, true);
 /* CPU didn't die: tell everyone. Can't complain. */
 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
 hcpu) == NOTIFY_BAD)
@@ -292,9 +294,6 @@ int __ref cpu_down(unsigned int cpu)

 err = _cpu_down(cpu, 0);

- if (cpu_online(cpu))
- set_cpu_active(cpu, true);
-
 out:
 cpu_maps_update_done();
 stop_machine_destroy();
@@ -387,6 +386,15 @@ int disable_nonboot_cpus(void)
 * with the userspace trying to use the CPU hotplug at the same time
 */
 cpumask_clear(frozen_cpus);
+
+ for_each_online_cpu(cpu) {
+ if (cpu == first_cpu)
+ continue;
+ set_cpu_active(cpu, false);
+ }
+
+ synchronize_sched();
+
 printk("Disabling non-boot CPUs ...\n");
 for_each_online_cpu(cpu) {
 if (cpu == first_cpu)
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -873,7 +873,7 @@ static int update_cpumask(struct cpuset
 if (retval < 0)
 return retval;

- if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
+ if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
 return -EINVAL;
 }
 retval = validate_change(cs, trialcs);
@@ -2011,7 +2011,7 @@ static void scan_for_empty_cpusets(struc
 }

 /* Continue past cpusets with all cpus, mems online */
- if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
+ if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
 continue;

@@ -2020,7 +2020,7 @@ static void scan_for_empty_cpusets(struc
 /* Remove offline cpus and mems from this cpuset. */
 mutex_lock(&callback_mutex);
 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
- cpu_online_mask);
+ cpu_active_mask);
 nodes_and(cp->mems_allowed, cp->mems_allowed,
 node_states[N_HIGH_MEMORY]);
 mutex_unlock(&callback_mutex);
@@ -2058,8 +2058,10 @@ static int cpuset_track_online_cpus(stru
 switch (phase) {
 case CPU_ONLINE:
 case CPU_ONLINE_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
 break;

 default:
@@ -2068,7 +2070,7 @@ static int cpuset_track_online_cpus(stru

 cgroup_lock();
 mutex_lock(&callback_mutex);
- cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 mutex_unlock(&callback_mutex);
 scan_for_empty_cpusets(&top_cpuset);
 ndoms = generate_sched_domains(&doms, &attr);
@@ -2115,7 +2117,7 @@ static int cpuset_track_online_nodes(str

 void __init cpuset_init_smp(void)
 {
- cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];

 hotcpu_notifier(cpuset_track_online_cpus, 0);
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4139,7 +4139,7 @@ static int load_balance(int this_cpu, st
 unsigned long flags;
 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

- cpumask_copy(cpus, cpu_online_mask);
+ cpumask_copy(cpus, cpu_active_mask);

 /*
 * When power savings policy is enabled for the parent domain, idle
@@ -4302,7 +4302,7 @@ load_balance_newidle(int this_cpu, struc
 int all_pinned = 0;
 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

- cpumask_copy(cpus, cpu_online_mask);
+ cpumask_copy(cpus, cpu_active_mask);

 /*
 * When power savings policy is enabled for the parent domain, idle
@@ -4699,7 +4699,7 @@ int select_nohz_load_balancer(int stop_t
 cpumask_set_cpu(cpu, nohz.cpu_mask);

 /* time for ilb owner also to sleep */
- if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
+ if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
 if (atomic_read(&nohz.load_balancer) == cpu)
 atomic_set(&nohz.load_balancer, -1);
 return 0;
@@ -7075,7 +7075,7 @@ int set_cpus_allowed_ptr(struct task_str
 int ret = 0;

 rq = task_rq_lock(p, &flags);
- if (!cpumask_intersects(new_mask, cpu_online_mask)) {
+ if (!cpumask_intersects(new_mask, cpu_active_mask)) {
 ret = -EINVAL;
 goto out;
 }
@@ -7097,7 +7097,7 @@ int set_cpus_allowed_ptr(struct task_str
 if (cpumask_test_cpu(task_cpu(p), new_mask))
 goto out;

- if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
+ if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
 /* Need help from migration thread: drop lock and wait. */
 struct task_struct *mt = rq->migration_thread;

@@ -7251,19 +7251,19 @@ static void move_task_off_dead_cpu(int d

 again:
 /* Look for allowed, online CPU in same node. */
- for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
+ for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
 goto move;

 /* Any allowed, online CPU? */
- dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
+ dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
 if (dest_cpu < nr_cpu_ids)
 goto move;

 /* No more Mr. Nice Guy. */
 if (dest_cpu >= nr_cpu_ids) {
 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
- dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
+ dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);

 /*
 * Don't tell them about moving exiting tasks or
@@ -7292,7 +7292,7 @@ move:
 */
 static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
- struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
+ struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
 unsigned long flags;

 local_irq_save(flags);
@@ -7546,7 +7546,7 @@ static ctl_table *sd_alloc_ctl_cpu_table
 static struct ctl_table_header *sd_sysctl_header;
 static void register_sched_domain_sysctl(void)
 {
- int i, cpu_num = num_online_cpus();
+ int i, cpu_num = num_possible_cpus();
 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
 char buf[32];

@@ -7556,7 +7556,7 @@ static void register_sched_domain_sysctl
 if (entry == NULL)
 return;

- for_each_online_cpu(i) {
+ for_each_possible_cpu(i) {
 snprintf(buf, 32, "cpu%d", i);
 entry->procname = kstrdup(buf, GFP_KERNEL);
 entry->mode = 0555;
@@ -9042,7 +9042,7 @@ match1:
 if (doms_new == NULL) {
 ndoms_cur = 0;
 doms_new = fallback_doms;
- cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
+ cpumask_andnot(&doms_new[0], cpu_active_mask, cpu_isolated_map);
 WARN_ON_ONCE(dattr_new);
 }

@@ -9173,8 +9173,10 @@ static int update_sched_domains(struct n
 switch (action) {
 case CPU_ONLINE:
 case CPU_ONLINE_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
 partition_sched_domains(1, NULL, NULL);
 return NOTIFY_OK;

@@ -9221,7 +9223,7 @@ void __init sched_init_smp(void)
 #endif
 get_online_cpus();
 mutex_lock(&sched_domains_mutex);
- arch_init_sched_domains(cpu_online_mask);
+ arch_init_sched_domains(cpu_active_mask);
 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
 if (cpumask_empty(non_isolated_cpus))
 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);