--- /dev/null
+From 14c63f17b1fde5a575a28e96547a22b451c71fb5 Mon Sep 17 00:00:00 2001
+From: Dave Hansen <dave.hansen@linux.intel.com>
+Date: Fri, 21 Jun 2013 08:51:36 -0700
+Subject: perf: Drop sample rate when sampling is too slow
+
+From: Dave Hansen <dave.hansen@linux.intel.com>
+
+commit 14c63f17b1fde5a575a28e96547a22b451c71fb5 upstream.
+
+This patch keeps track of how long perf's NMI handler is taking,
+and also calculates how many samples perf can take a second. If
+the sample length times the expected max number of samples
+exceeds a configurable threshold, it drops the sample rate.
+
+This way, we don't have a runaway sampling process eating up the
+CPU.
+
+This patch can tend to drop the sample rate down to a level where
+perf doesn't work very well. *BUT* the alternative is that my
+system hangs because it spends all of its time handling NMIs.
+
+I'll take a busted performance tool over an entire system that's
+busted and undebuggable any day.
+
+BTW, my suspicion is that there's still an underlying bug here.
+Using the HPET instead of the TSC is definitely a contributing
+factor, but I suspect there are some other things going on.
+But, I can't go dig down on a bug like that with my machine
+hanging all the time.
+
+Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
+Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Cc: paulus@samba.org
+Cc: acme@ghostprotocols.net
+Cc: Dave Hansen <dave@sr71.net>
+[ Prettified it a bit. ]
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Cc: Weng Meiling <wengmeiling.weng@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
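+A rough sketch of the throttling check described above, using the
+default values from this patch (an illustrative userspace
+approximation, not the kernel code itself):
+
+    #include <stdint.h>
+    #include <stdio.h>
+
+    #define NSEC_PER_SEC 1000000000ULL
+    #define SAMPLE_RATE  100000  /* default perf_event_max_sample_rate */
+    #define CPU_MAX_PCT  25      /* default perf_cpu_time_max_percent */
+
+    int main(void)
+    {
+        /* time budget for one sample: sample period * allowed CPU share */
+        uint64_t period_ns  = NSEC_PER_SEC / SAMPLE_RATE;    /* 10000 ns */
+        uint64_t allowed_ns = period_ns * CPU_MAX_PCT / 100; /*  2500 ns */
+        uint64_t avg_ns     = 4000;  /* pretend NMI handling averages 4us */
+
+        if (avg_ns > allowed_ns)  /* samples too slow: back off the rate */
+            printf("lowering sample rate to %d\n", SAMPLE_RATE / 2);
+        return 0;
+    }
+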
+---
+ Documentation/sysctl/kernel.txt | 26 +++++++++++
+ arch/x86/kernel/cpu/perf_event.c | 12 ++++-
+ include/linux/perf_event.h | 7 ++
+ kernel/events/core.c | 92 +++++++++++++++++++++++++++++++++++++--
+ kernel/sysctl.c | 9 +++
+ 5 files changed, 141 insertions(+), 5 deletions(-)
+
+--- a/Documentation/sysctl/kernel.txt
++++ b/Documentation/sysctl/kernel.txt
+@@ -438,6 +438,32 @@ This file shows up if CONFIG_DEBUG_STACK
+
+ ==============================================================
+
++perf_cpu_time_max_percent:
++
++Hints to the kernel how much CPU time it should be allowed to
++use to handle perf sampling events. If the perf subsystem
++is informed that its samples are exceeding this limit, it
++will drop its sampling frequency to attempt to reduce its CPU
++usage.
++
++Some perf sampling happens in NMIs. If these samples
++unexpectedly take too long to execute, the NMIs can become
++stacked up next to each other so much that nothing else is
++allowed to execute.
++
++0: disable the mechanism. Do not monitor or correct perf's
++ sampling rate no matter how much CPU time it takes.
++
++1-100: attempt to throttle perf's sample rate to this
++ percentage of CPU. Note: the kernel calculates an
++ "expected" length of each sample event. 100 here means
++ 100% of that expected length. Even if this is set to
++ 100, you may still see sample throttling if this
++ length is exceeded. Set to 0 if you truly do not care
++ how much CPU is consumed.
++
++==============================================================
++
+
+ pid_max:
+
+--- a/arch/x86/kernel/cpu/perf_event.c
++++ b/arch/x86/kernel/cpu/perf_event.c
+@@ -1252,10 +1252,20 @@ void perf_events_lapic_init(void)
+ static int __kprobes
+ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+ {
++ int ret;
++ u64 start_clock;
++ u64 finish_clock;
++
+ if (!atomic_read(&active_events))
+ return NMI_DONE;
+
+- return x86_pmu.handle_irq(regs);
++ start_clock = local_clock();
++ ret = x86_pmu.handle_irq(regs);
++ finish_clock = local_clock();
++
++ perf_sample_event_took(finish_clock - start_clock);
++
++ return ret;
+ }
+
+ struct event_constraint emptyconstraint;
+--- a/include/linux/perf_event.h
++++ b/include/linux/perf_event.h
+@@ -695,10 +695,17 @@ static inline void perf_callchain_store(
+ extern int sysctl_perf_event_paranoid;
+ extern int sysctl_perf_event_mlock;
+ extern int sysctl_perf_event_sample_rate;
++extern int sysctl_perf_cpu_time_max_percent;
++
++extern void perf_sample_event_took(u64 sample_len_ns);
+
+ extern int perf_proc_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
++extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
++ void __user *buffer, size_t *lenp,
++ loff_t *ppos);
++
+
+ static inline bool perf_paranoid_tracepoint_raw(void)
+ {
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -165,10 +165,26 @@ int sysctl_perf_event_mlock __read_mostl
+ /*
+ * max perf event sample rate
+ */
+-#define DEFAULT_MAX_SAMPLE_RATE 100000
+-int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+-static int max_samples_per_tick __read_mostly =
+- DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
++#define DEFAULT_MAX_SAMPLE_RATE 100000
++#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
++#define DEFAULT_CPU_TIME_MAX_PERCENT 25
++
++int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
++
++static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
++static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
++
++static atomic_t perf_sample_allowed_ns __read_mostly =
++ ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
++
++void update_perf_cpu_limits(void)
++{
++ u64 tmp = perf_sample_period_ns;
++
++ tmp *= sysctl_perf_cpu_time_max_percent;
++ tmp = do_div(tmp, 100);
++ atomic_set(&perf_sample_allowed_ns, tmp);
++}
+
+ int perf_proc_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+@@ -180,10 +196,78 @@ int perf_proc_update_handler(struct ctl_
+ return ret;
+
+ max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
++ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
++ update_perf_cpu_limits();
++
++ return 0;
++}
++
++int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
++
++int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
++ void __user *buffer, size_t *lenp,
++ loff_t *ppos)
++{
++ int ret = proc_dointvec(table, write, buffer, lenp, ppos);
++
++ if (ret || !write)
++ return ret;
++
++ update_perf_cpu_limits();
+
+ return 0;
+ }
+
++/*
++ * perf samples are done in some very critical code paths (NMIs).
++ * If they take too much CPU time, the system can lock up and not
++ * get any real work done. This will drop the sample rate when
++ * we detect that events are taking too long.
++ */
++#define NR_ACCUMULATED_SAMPLES 128
++DEFINE_PER_CPU(u64, running_sample_length);
++
++void perf_sample_event_took(u64 sample_len_ns)
++{
++ u64 avg_local_sample_len;
++ u64 local_samples_len = __get_cpu_var(running_sample_length);
++
++ if (atomic_read(&perf_sample_allowed_ns) == 0)
++ return;
++
++ /* decay the counter by 1 average sample */
++ local_samples_len = __get_cpu_var(running_sample_length);
++ local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
++ local_samples_len += sample_len_ns;
++ __get_cpu_var(running_sample_length) = local_samples_len;
++
++ /*
++ * note: this will be biased artifically low until we have
++ * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
++ * from having to maintain a count.
++ */
++ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
++
++ if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
++ return;
++
++ if (max_samples_per_tick <= 1)
++ return;
++
++ max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
++ sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
++ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
++
++ printk_ratelimited(KERN_WARNING
++ "perf samples too long (%lld > %d), lowering "
++ "kernel.perf_event_max_sample_rate to %d\n",
++ avg_local_sample_len,
++ atomic_read(&perf_sample_allowed_ns),
++ sysctl_perf_event_sample_rate);
++
++ update_perf_cpu_limits();
++}
++
+ static atomic64_t perf_event_id;
+
+ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+--- a/kernel/sysctl.c
++++ b/kernel/sysctl.c
+@@ -1050,6 +1050,15 @@ static struct ctl_table kern_table[] = {
+ .mode = 0644,
+ .proc_handler = perf_proc_update_handler,
+ },
++ {
++ .procname = "perf_cpu_time_max_percent",
++ .data = &sysctl_perf_cpu_time_max_percent,
++ .maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
++ .mode = 0644,
++ .proc_handler = perf_cpu_time_max_percent_handler,
++ .extra1 = &zero,
++ .extra2 = &one_hundred,
++ },
+ #endif
+ #ifdef CONFIG_KMEMCHECK
+ {
--- /dev/null
+From 723478c8a471403c53cf144999701f6e0c4bbd11 Mon Sep 17 00:00:00 2001
+From: Knut Petersen <Knut_Petersen@t-online.de>
+Date: Wed, 25 Sep 2013 14:29:37 +0200
+Subject: perf: Enforce 1 as lower limit for perf_event_max_sample_rate
+
+From: Knut Petersen <Knut_Petersen@t-online.de>
+
+commit 723478c8a471403c53cf144999701f6e0c4bbd11 upstream.
+
+/proc/sys/kernel/perf_event_max_sample_rate will accept
+negative values as well as 0.
+
+Negative values are unreasonable, and 0 causes a
+divide by zero exception in perf_proc_update_handler.
+
+This patch enforces a lower limit of 1.
+
+Signed-off-by: Knut Petersen <Knut_Petersen@t-online.de>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Link: http://lkml.kernel.org/r/5242DB0C.4070005@t-online.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Cc: Weng Meiling <wengmeiling.weng@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
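+The division that a value of 0 would reach is in
+perf_proc_update_handler(), which recomputes the sample period as
+NSEC_PER_SEC / sysctl_perf_event_sample_rate. With
+proc_dointvec_minmax() and .extra1 = &one, out-of-range writes are
+rejected before that division runs. A minimal userspace sketch of the
+same guard (hypothetical names, not the kernel code):
+
+    #include <errno.h>
+    #include <stdint.h>
+    #include <stdio.h>
+
+    #define NSEC_PER_SEC 1000000000ULL
+
+    static const int min_rate = 1;  /* mirrors .extra1 = &one */
+
+    /* returns 0 on success, -EINVAL for out-of-range input */
+    static int set_sample_rate(int rate, uint64_t *period_ns)
+    {
+        if (rate < min_rate)
+            return -EINVAL;  /* reject 0 and negative values */
+        *period_ns = NSEC_PER_SEC / rate;  /* safe: rate >= 1 */
+        return 0;
+    }
+
+    int main(void)
+    {
+        uint64_t period;
+
+        printf("write 0: %d\n", set_sample_rate(0, &period));  /* -22 */
+        printf("write 1: %d\n", set_sample_rate(1, &period));  /*   0 */
+        return 0;
+    }
+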
+---
+ kernel/events/core.c | 2 +-
+ kernel/sysctl.c | 1 +
+ 2 files changed, 2 insertions(+), 1 deletion(-)
+
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -190,7 +190,7 @@ int perf_proc_update_handler(struct ctl_
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+ {
+- int ret = proc_dointvec(table, write, buffer, lenp, ppos);
++ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+--- a/kernel/sysctl.c
++++ b/kernel/sysctl.c
+@@ -1049,6 +1049,7 @@ static struct ctl_table kern_table[] = {
+ .maxlen = sizeof(sysctl_perf_event_sample_rate),
+ .mode = 0644,
+ .proc_handler = perf_proc_update_handler,
++ .extra1 = &one,
+ },
+ {
+ .procname = "perf_cpu_time_max_percent",
--- /dev/null
+From e5302920da9ef23f9d19d4e9ac85704cc25bee7a Mon Sep 17 00:00:00 2001
+From: Stephane Eranian <eranian@google.com>
+Date: Fri, 5 Jul 2013 00:30:11 +0200
+Subject: perf: Fix interrupt handler timing harness
+
+From: Stephane Eranian <eranian@google.com>
+
+commit e5302920da9ef23f9d19d4e9ac85704cc25bee7a upstream.
+
+This patch fixes a serious bug in:
+
+ 14c63f17b1fd perf: Drop sample rate when sampling is too slow
+
+There was a misunderstanding of the API of the do_div()
+macro: it returns the remainder of the division, which was
+not what the function expected, leading to the interrupt
+latency watchdog being disabled.
+
+This patch also removes a duplicate assignment in
+perf_sample_event_took().
+
+Signed-off-by: Stephane Eranian <eranian@google.com>
+Cc: peterz@infradead.org
+Cc: dave.hansen@linux.intel.com
+Cc: ak@linux.intel.com
+Cc: jolsa@redhat.com
+Link: http://lkml.kernel.org/r/20130704223010.GA30625@quad
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Cc: Weng Meiling <wengmeiling.weng@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
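+For reference, do_div(n, base) divides the 64-bit lvalue n in place
+and returns the remainder, so "tmp = do_div(tmp, 100)" overwrote tmp
+with the remainder (zero for the default values) instead of the
+quotient, which is what zeroed perf_sample_allowed_ns and disabled the
+watchdog. A small userspace sketch of the difference (do_div_like()
+stands in for the kernel macro):
+
+    #include <stdint.h>
+    #include <stdio.h>
+
+    static uint32_t do_div_like(uint64_t *n, uint32_t base)
+    {
+        uint32_t rem = *n % base;
+        *n /= base;   /* quotient is left in *n */
+        return rem;   /* remainder is returned  */
+    }
+
+    int main(void)
+    {
+        /* default 10000ns sample period * default 25 percent CPU limit */
+        uint64_t buggy = 10000 * 25, fixed = 10000 * 25;
+
+        buggy = do_div_like(&buggy, 100);  /* buggy == 0: limit disabled    */
+        do_div_like(&fixed, 100);          /* fixed == 2500: intended limit */
+        printf("buggy=%llu fixed=%llu\n",
+               (unsigned long long)buggy, (unsigned long long)fixed);
+        return 0;
+    }
+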
+---
+ kernel/events/core.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -182,7 +182,7 @@ void update_perf_cpu_limits(void)
+ u64 tmp = perf_sample_period_ns;
+
+ tmp *= sysctl_perf_cpu_time_max_percent;
+- tmp = do_div(tmp, 100);
++ do_div(tmp, 100);
+ atomic_set(&perf_sample_allowed_ns, tmp);
+ }
+
+@@ -230,7 +230,7 @@ DEFINE_PER_CPU(u64, running_sample_lengt
+ void perf_sample_event_took(u64 sample_len_ns)
+ {
+ u64 avg_local_sample_len;
+- u64 local_samples_len = __get_cpu_var(running_sample_length);
++ u64 local_samples_len;
+
+ if (atomic_read(&perf_sample_allowed_ns) == 0)
+ return;