From 6ad4c18884e864cf4c77f9074d3d1816063f99cd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Nov 2009 13:31:39 +0100
Subject: sched: Fix balance vs hotplug race

From: Peter Zijlstra <a.p.zijlstra@chello.nl>

commit 6ad4c18884e864cf4c77f9074d3d1816063f99cd upstream.

Since (e761b77: cpu hotplug, sched: Introduce cpu_active_map and redo
sched domain managment) we have cpu_active_mask, which is supposed to
rule scheduler migration and load-balancing, except it never (fully) did.

The particular problem being solved here is a crash in try_to_wake_up()
where select_task_rq() ends up selecting an offline cpu because
select_task_rq_fair() trusts the sched_domain tree to reflect the
current state of affairs; similarly, select_task_rq_rt() trusts the
root_domain.

However, the sched_domains are updated from CPU_DEAD, which is after the
cpu is taken offline and after stop_machine is done. Therefore it can
race perfectly well with code assuming the domains are right.

Cure this by building the domains from cpu_active_mask on
CPU_DOWN_PREPARE.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Holger Hoffstätte <holger.hoffstaette@googlemail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

---
 include/linux/cpumask.h |    2 ++
 kernel/cpu.c            |   18 +++++++++++++-----
 kernel/cpuset.c         |   16 +++++++++-------
 kernel/sched.c          |   32 +++++++++++++++++---------------
 4 files changed, 41 insertions(+), 27 deletions(-)

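Every hunk below follows one pattern: migration and load-balancing decisions
consult cpu_active_mask, which this patch clears at CPU_DOWN_PREPARE (i.e.
before stop_machine runs), instead of cpu_online_mask, which only drops the
cpu later, around CPU_DEAD. A minimal sketch of that pattern, mirroring the
move_task_off_dead_cpu() hunk in kernel/sched.c; it is illustrative only and
not part of the applied diff, and pick_fallback_cpu() is a made-up name,
though the cpumask helpers are the ones the patch itself uses:

#include <linux/cpumask.h>
#include <linux/sched.h>

/* Sketch only: pick a migration target from the *active* mask. */
static int pick_fallback_cpu(struct task_struct *p)
{
	/* Prefer a CPU that is both allowed for p and still active. */
	int dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);

	if (dest_cpu < nr_cpu_ids)
		return dest_cpu;

	/* Nothing in p's mask is active any more; fall back to any active CPU. */
	return cpumask_any(cpu_active_mask);
}
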
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -84,6 +84,7 @@ extern const struct cpumask *const cpu_a
 #define num_online_cpus() cpumask_weight(cpu_online_mask)
 #define num_possible_cpus() cpumask_weight(cpu_possible_mask)
 #define num_present_cpus() cpumask_weight(cpu_present_mask)
+#define num_active_cpus() cpumask_weight(cpu_active_mask)
 #define cpu_online(cpu) cpumask_test_cpu((cpu), cpu_online_mask)
 #define cpu_possible(cpu) cpumask_test_cpu((cpu), cpu_possible_mask)
 #define cpu_present(cpu) cpumask_test_cpu((cpu), cpu_present_mask)
@@ -92,6 +93,7 @@ extern const struct cpumask *const cpu_a
 #define num_online_cpus() 1
 #define num_possible_cpus() 1
 #define num_present_cpus() 1
+#define num_active_cpus() 1
 #define cpu_online(cpu) ((cpu) == 0)
 #define cpu_possible(cpu) ((cpu) == 0)
 #define cpu_present(cpu) ((cpu) == 0)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -212,6 +212,8 @@ static int __ref _cpu_down(unsigned int
 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
 hcpu, -1, &nr_calls);
 if (err == NOTIFY_BAD) {
+ set_cpu_active(cpu, true);
+
 nr_calls--;
 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
 hcpu, nr_calls, NULL);
@@ -223,11 +225,11 @@ static int __ref _cpu_down(unsigned int

 /* Ensure that we are not runnable on dying cpu */
 cpumask_copy(old_allowed, &current->cpus_allowed);
- set_cpus_allowed_ptr(current,
- cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
+ set_cpus_allowed_ptr(current, cpu_active_mask);

 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 if (err) {
+ set_cpu_active(cpu, true);
 /* CPU didn't die: tell everyone. Can't complain. */
 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
 hcpu) == NOTIFY_BAD)
@@ -292,9 +294,6 @@ int __ref cpu_down(unsigned int cpu)

 err = _cpu_down(cpu, 0);

- if (cpu_online(cpu))
- set_cpu_active(cpu, true);
-
 out:
 cpu_maps_update_done();
 stop_machine_destroy();
@@ -387,6 +386,15 @@ int disable_nonboot_cpus(void)
 * with the userspace trying to use the CPU hotplug at the same time
 */
 cpumask_clear(frozen_cpus);
+
+ for_each_online_cpu(cpu) {
+ if (cpu == first_cpu)
+ continue;
+ set_cpu_active(cpu, false);
+ }
+
+ synchronize_sched();
+
 printk("Disabling non-boot CPUs ...\n");
 for_each_online_cpu(cpu) {
 if (cpu == first_cpu)
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -873,7 +873,7 @@ static int update_cpumask(struct cpuset
 if (retval < 0)
 return retval;

- if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
+ if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
 return -EINVAL;
 }
 retval = validate_change(cs, trialcs);
@@ -2011,7 +2011,7 @@ static void scan_for_empty_cpusets(struc
 }

 /* Continue past cpusets with all cpus, mems online */
- if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
+ if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
 continue;

@@ -2020,7 +2020,7 @@ static void scan_for_empty_cpusets(struc
 /* Remove offline cpus and mems from this cpuset. */
 mutex_lock(&callback_mutex);
 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
- cpu_online_mask);
+ cpu_active_mask);
 nodes_and(cp->mems_allowed, cp->mems_allowed,
 node_states[N_HIGH_MEMORY]);
 mutex_unlock(&callback_mutex);
@@ -2058,8 +2058,10 @@ static int cpuset_track_online_cpus(stru
 switch (phase) {
 case CPU_ONLINE:
 case CPU_ONLINE_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
 break;

 default:
@@ -2068,7 +2070,7 @@ static int cpuset_track_online_cpus(stru

 cgroup_lock();
 mutex_lock(&callback_mutex);
- cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 mutex_unlock(&callback_mutex);
 scan_for_empty_cpusets(&top_cpuset);
 ndoms = generate_sched_domains(&doms, &attr);
@@ -2115,7 +2117,7 @@ static int cpuset_track_online_nodes(str

 void __init cpuset_init_smp(void)
 {
- cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];

 hotcpu_notifier(cpuset_track_online_cpus, 0);
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4139,7 +4139,7 @@ static int load_balance(int this_cpu, st
 unsigned long flags;
 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

- cpumask_copy(cpus, cpu_online_mask);
+ cpumask_copy(cpus, cpu_active_mask);

 /*
 * When power savings policy is enabled for the parent domain, idle
@@ -4302,7 +4302,7 @@ load_balance_newidle(int this_cpu, struc
 int all_pinned = 0;
 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

- cpumask_copy(cpus, cpu_online_mask);
+ cpumask_copy(cpus, cpu_active_mask);

 /*
 * When power savings policy is enabled for the parent domain, idle
@@ -4699,7 +4699,7 @@ int select_nohz_load_balancer(int stop_t
 cpumask_set_cpu(cpu, nohz.cpu_mask);

 /* time for ilb owner also to sleep */
- if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
+ if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
 if (atomic_read(&nohz.load_balancer) == cpu)
 atomic_set(&nohz.load_balancer, -1);
 return 0;
@@ -7075,7 +7075,7 @@ int set_cpus_allowed_ptr(struct task_str
 int ret = 0;

 rq = task_rq_lock(p, &flags);
- if (!cpumask_intersects(new_mask, cpu_online_mask)) {
+ if (!cpumask_intersects(new_mask, cpu_active_mask)) {
 ret = -EINVAL;
 goto out;
 }
@@ -7097,7 +7097,7 @@ int set_cpus_allowed_ptr(struct task_str
 if (cpumask_test_cpu(task_cpu(p), new_mask))
 goto out;

- if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
+ if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
 /* Need help from migration thread: drop lock and wait. */
 struct task_struct *mt = rq->migration_thread;

@@ -7251,19 +7251,19 @@ static void move_task_off_dead_cpu(int d

 again:
 /* Look for allowed, online CPU in same node. */
- for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
+ for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
 goto move;

 /* Any allowed, online CPU? */
- dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
+ dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
 if (dest_cpu < nr_cpu_ids)
 goto move;

 /* No more Mr. Nice Guy. */
 if (dest_cpu >= nr_cpu_ids) {
 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
- dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
+ dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);

 /*
 * Don't tell them about moving exiting tasks or
@@ -7292,7 +7292,7 @@ move:
 */
 static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
- struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
+ struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
 unsigned long flags;

 local_irq_save(flags);
@@ -7546,7 +7546,7 @@ static ctl_table *sd_alloc_ctl_cpu_table
 static struct ctl_table_header *sd_sysctl_header;
 static void register_sched_domain_sysctl(void)
 {
- int i, cpu_num = num_online_cpus();
+ int i, cpu_num = num_possible_cpus();
 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
 char buf[32];

@@ -7556,7 +7556,7 @@ static void register_sched_domain_sysctl
 if (entry == NULL)
 return;

- for_each_online_cpu(i) {
+ for_each_possible_cpu(i) {
 snprintf(buf, 32, "cpu%d", i);
 entry->procname = kstrdup(buf, GFP_KERNEL);
 entry->mode = 0555;
@@ -9042,7 +9042,7 @@ match1:
 if (doms_new == NULL) {
 ndoms_cur = 0;
 doms_new = fallback_doms;
- cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
+ cpumask_andnot(&doms_new[0], cpu_active_mask, cpu_isolated_map);
 WARN_ON_ONCE(dattr_new);
 }

@@ -9173,8 +9173,10 @@ static int update_sched_domains(struct n
 switch (action) {
 case CPU_ONLINE:
 case CPU_ONLINE_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
 partition_sched_domains(1, NULL, NULL);
 return NOTIFY_OK;

@@ -9221,7 +9223,7 @@ void __init sched_init_smp(void)
 #endif
 get_online_cpus();
 mutex_lock(&sched_domains_mutex);
- arch_init_sched_domains(cpu_online_mask);
+ arch_init_sched_domains(cpu_active_mask);
 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
 if (cpumask_empty(non_isolated_cpus))
 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);