--- /dev/null
+From: Andi Kleen <andi@firstfloor.org>
+Date: Thu, 12 Feb 2009 12:39:29 +0000 (+0100)
+Subject: x86, mce: switch machine check polling to per CPU timer
+Patch-mainline: 2.6.30-rc1
+Git-commit: 52d168e28bc11dd026b620fe1767cadde5a747cd
+References: bnc#507557
+
+x86, mce: switch machine check polling to per CPU timer
+
+Impact: Higher priority bug fix
+
+The machine check poller runs a single timer and then broadcasts an
+IPI to all CPUs to check them. This leads to unnecessary
+synchronization between CPUs. The CPU running the timer potentially
+has to wait a long time for all the other CPUs to answer. It is also
+unfriendly to real time workloads and generally inefficient.
+
+This was especially a problem on systems with many events, where the
+poller runs at a higher frequency after processing some events. More
+and more CPU time could be wasted this way, to the point of
+significantly slowing down machines.
+
+The machine check polling is actually fully independent per CPU, so
+there's no reason not to do it all with per CPU timers. This patch
+implements that.
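+
+In sketch form (simplified from mce_init_timer() in the diff below,
+with the interval checks elided), the per CPU timer setup is:
+
+	static DEFINE_PER_CPU(struct timer_list, mce_timer);
+
+	/* Called once on each CPU as it boots, with preemption off */
+	static void mce_init_timer(void)
+	{
+		struct timer_list *t = &__get_cpu_var(mce_timer);
+
+		/* Timer data is the owning CPU, for a sanity check */
+		setup_timer(t, mcheck_timer, smp_processor_id());
+		t->expires = round_jiffies_relative(jiffies + next_interval);
+		add_timer(t);
+	}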
+
+Also switch the poller to use standard timers instead of work
+queues. It was using work queues to be able to execute a user program
+on an event, but mce_notify_user() now handles that case with a
+separate callback. So instead always run the poll code in a standard
+per CPU timer, which means there is less overhead in the common case
+of not having to execute a trigger.
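+
+The timer handler then re-arms itself locally instead of going
+through schedule_delayed_work(), roughly (again simplified from the
+diff below):
+
+	static void mcheck_timer(unsigned long data)
+	{
+		struct timer_list *t = &per_cpu(mce_timer, data);
+
+		/* Catch timers that migrated to the wrong CPU */
+		WARN_ON(smp_processor_id() != data);
+
+		/* ... poll this CPU's banks, adjust next_interval ... */
+
+		t->expires = jiffies + next_interval;	/* re-arm locally */
+		add_timer(t);
+	}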
+
+This also allows the initialization to be cleaned up significantly,
+because the standard timer infrastructure is already up when machine
+checks are initialized, so multiple initialization functions are no
+longer needed.
+
+Thanks to Thomas Gleixner for some help.
+
+Cc: thockin@google.com
+v2: Use del_timer_sync() on cpu shutdown and don't try to handle
+migrated timers.
+v3: Add WARN_ON for timer running on unexpected CPU
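+
+For CPU hotplug this boils down to the two notifier cases added
+below (sketch):
+
+	case CPU_DOWN_PREPARE:
+		del_timer_sync(t);	/* wait for a running handler */
+		break;
+	case CPU_DOWN_FAILED:
+		/* The CPU stays online after all: re-arm on it */
+		t->expires = round_jiffies_relative(jiffies + next_interval);
+		add_timer_on(t, cpu);
+		break;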
+
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Acked-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: H. Peter Anvin <hpa@zytor.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+---
+ arch/x86/kernel/cpu/mcheck/mce_64.c | 68 +++++++++++++++++++++++-------------
+ 1 file changed, 45 insertions(+), 23 deletions(-)
+
+--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
++++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
+@@ -353,18 +353,17 @@ void mce_log_therm_throt_event(unsigned
+
+ static int check_interval = 5 * 60; /* 5 minutes */
+ static int next_interval; /* in jiffies */
+-static void mcheck_timer(struct work_struct *work);
+-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
++static void mcheck_timer(unsigned long);
++static DEFINE_PER_CPU(struct timer_list, mce_timer);
+
+-static void mcheck_check_cpu(void *info)
++static void mcheck_timer(unsigned long data)
+ {
++ struct timer_list *t = &per_cpu(mce_timer, data);
++
++ WARN_ON(smp_processor_id() != data);
++
+ if (mce_available(&current_cpu_data))
+ do_machine_check(NULL, 0);
+-}
+-
+-static void mcheck_timer(struct work_struct *work)
+-{
+- on_each_cpu(mcheck_check_cpu, NULL, 1);
+
+ /*
+ * Alert userspace if needed. If we logged an MCE, reduce the
+@@ -377,7 +376,8 @@ static void mcheck_timer(struct work_str
+ (int)round_jiffies_relative(check_interval*HZ));
+ }
+
+- schedule_delayed_work(&mcheck_work, next_interval);
++ t->expires = jiffies + next_interval;
++ add_timer(t);
+ }
+
+ /*
+@@ -425,16 +425,11 @@ static struct notifier_block mce_idle_no
+
+ static __init int periodic_mcheck_init(void)
+ {
+- next_interval = check_interval * HZ;
+- if (next_interval)
+- schedule_delayed_work(&mcheck_work,
+- round_jiffies_relative(next_interval));
+- idle_notifier_register(&mce_idle_notifier);
+- return 0;
++ idle_notifier_register(&mce_idle_notifier);
++ return 0;
+ }
+ __initcall(periodic_mcheck_init);
+
+-
+ /*
+ * Initialize Machine Checks for a CPU.
+ */
+@@ -504,6 +499,20 @@ static void __cpuinit mce_cpu_features(s
+ }
+ }
+
++static void mce_init_timer(void)
++{
++ struct timer_list *t = &__get_cpu_var(mce_timer);
++
++ /* data race harmless because everyone sets to the same value */
++ if (!next_interval)
++ next_interval = check_interval * HZ;
++ if (!next_interval)
++ return;
++ setup_timer(t, mcheck_timer, smp_processor_id());
++ t->expires = round_jiffies_relative(jiffies + next_interval);
++ add_timer(t);
++}
++
+ /*
+ * Called for each booted CPU to set up machine checks.
+ * Must be called with preempt off.
+@@ -521,6 +530,7 @@ void __cpuinit mcheck_init(struct cpuinf
+
+ mce_init(NULL);
+ mce_cpu_features(c);
++ mce_init_timer();
+ }
+
+ /*
+@@ -740,17 +750,19 @@ static int mce_resume(struct sys_device
+ return 0;
+ }
+
++static void mce_cpu_restart(void *data)
++{
++ del_timer_sync(&__get_cpu_var(mce_timer));
++ if (mce_available(&current_cpu_data))
++ mce_init(NULL);
++ mce_init_timer();
++}
++
+ /* Reinit MCEs after user configuration changes */
+ static void mce_restart(void)
+ {
+- if (next_interval)
+- cancel_delayed_work(&mcheck_work);
+- /* Timer race is harmless here */
+- on_each_cpu(mce_init, NULL, 1);
+ next_interval = check_interval * HZ;
+- if (next_interval)
+- schedule_delayed_work(&mcheck_work,
+- round_jiffies_relative(next_interval));
++ on_each_cpu(mce_cpu_restart, NULL, 1);
+ }
+
+ static struct sysdev_class mce_sysclass = {
+@@ -879,6 +891,7 @@ static int __cpuinit mce_cpu_callback(st
+ unsigned long action, void *hcpu)
+ {
+ unsigned int cpu = (unsigned long)hcpu;
++ struct timer_list *t = &per_cpu(mce_timer, cpu);
+
+ switch (action) {
+ case CPU_ONLINE:
+@@ -893,6 +906,15 @@ static int __cpuinit mce_cpu_callback(st
+ threshold_cpu_callback(action, cpu);
+ mce_remove_device(cpu);
+ break;
++ case CPU_DOWN_PREPARE:
++ case CPU_DOWN_PREPARE_FROZEN:
++ del_timer_sync(t);
++ break;
++ case CPU_DOWN_FAILED:
++ case CPU_DOWN_FAILED_FROZEN:
++ t->expires = round_jiffies_relative(jiffies + next_interval);
++ add_timer_on(t, cpu);
++ break;
+ }
+ return NOTIFY_OK;
+ }