From b2d2f5e2aa68d43c8f92d894a6a209ce75c039e5 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Tue, 24 Aug 2010 13:19:03 -0700
Subject: [PATCH] .35 patches

---
 ...e-scalability-of-optimistic-spinning.patch | 134 ++++++++++++++++++
 queue-2.6.35/series                           |   2 +
 queue-2.6.35/tracing-fix-timer-tracing.patch  |  75 ++++++++++
 3 files changed, 211 insertions(+)
 create mode 100644 queue-2.6.35/mutex-improve-the-scalability-of-optimistic-spinning.patch
 create mode 100644 queue-2.6.35/tracing-fix-timer-tracing.patch

diff --git a/queue-2.6.35/mutex-improve-the-scalability-of-optimistic-spinning.patch b/queue-2.6.35/mutex-improve-the-scalability-of-optimistic-spinning.patch
new file mode 100644
index 00000000000..379cafcaa99
--- /dev/null
+++ b/queue-2.6.35/mutex-improve-the-scalability-of-optimistic-spinning.patch
@@ -0,0 +1,134 @@
+From 9d0f4dcc5c4d1c5dd01172172684a45b5f49d740 Mon Sep 17 00:00:00 2001
+From: Tim Chen <tim.c.chen@linux.intel.com>
+Date: Wed, 18 Aug 2010 15:00:27 -0700
+Subject: mutex: Improve the scalability of optimistic spinning
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Tim Chen <tim.c.chen@linux.intel.com>
+
+commit 9d0f4dcc5c4d1c5dd01172172684a45b5f49d740 upstream.
+
+There is a scalability issue for current implementation of optimistic
+mutex spin in the kernel.  It is found on a 8 node 64 core Nehalem-EX
+system (HT mode).
+
+The intention of the optimistic mutex spin is to busy wait and spin on a
+mutex if the owner of the mutex is running, in the hope that the mutex
+will be released soon and be acquired, without the thread trying to
+acquire mutex going to sleep. However, when we have a large number of
+threads, contending for the mutex, we could have the mutex grabbed by
+other thread, and then another ……, and we will keep spinning, wasting cpu
+cycles and adding to the contention.  One possible fix is to quit
+spinning and put the current thread on wait-list if mutex lock switch to
+a new owner while we spin, indicating heavy contention (see the patch
+included).
+
+I did some testing on a 8 socket Nehalem-EX system with a total of 64
+cores. Using Ingo's test-mutex program that creates/delete files with 256
+threads (http://lkml.org/lkml/2006/1/8/50) , I see the following speed up
+after putting in the mutex spin fix:
+
+ ./mutex-test V 256 10
+                 Ops/sec
+ 2.6.34          62864
+ With fix        197200
+
+Repeating the test with Aim7 fserver workload, again there is a speed up
+with the fix:
+
+                 Jobs/min
+ 2.6.34          91657
+ With fix        149325
+
+To look at the impact on the distribution of mutex acquisition time, I
+collected the mutex acquisition time on Aim7 fserver workload with some
+instrumentation.  The average acquisition time is reduced by 48% and
+number of contentions reduced by 32%.
+
+                 #contentions    Time to acquire mutex (cycles)
+ 2.6.34          72973           44765791
+ With fix        49210           23067129
+
+The histogram of mutex acquisition time is listed below.  The acquisition
+time is in 2^bin cycles.  We see that without the fix, the acquisition
+time is mostly around 2^26 cycles.  With the fix, we see the distribution get
+spread out a lot more towards the lower cycles, starting from 2^13.
+However, there is an increase of the tail distribution with the fix at
+2^28 and 2^29 cycles.  It seems a small price to pay for the reduced
+average acquisition time and also getting the cpu to do useful work.
+
+ Mutex acquisition time distribution (acq time = 2^bin cycles):
+         2.6.34                  With Fix
+ bin     #occurrence     %       #occurrence     %
+ 11      2               0.00%   120             0.24%
+ 12      10              0.01%   790             1.61%
+ 13      14              0.02%   2058            4.18%
+ 14      86              0.12%   3378            6.86%
+ 15      393             0.54%   4831            9.82%
+ 16      710             0.97%   4893            9.94%
+ 17      815             1.12%   4667            9.48%
+ 18      790             1.08%   5147            10.46%
+ 19      580             0.80%   6250            12.70%
+ 20      429             0.59%   6870            13.96%
+ 21      311             0.43%   1809            3.68%
+ 22      255             0.35%   2305            4.68%
+ 23      317             0.44%   916             1.86%
+ 24      610             0.84%   233             0.47%
+ 25      3128            4.29%   95              0.19%
+ 26      63902           87.69%  122             0.25%
+ 27      619             0.85%   286             0.58%
+ 28      0               0.00%   3536            7.19%
+ 29      0               0.00%   903             1.83%
+ 30      0               0.00%   0               0.00%
+
+I've done similar experiments with 2.6.35 kernel on smaller boxes as
+well.  One is on a dual-socket Westmere box (12 cores total, with HT).
+Another experiment is on an old dual-socket Core 2 box (4 cores total, no
+HT)
+
+On the 12-core Westmere box, I see a 250% increase for Ingo's mutex-test
+program with my mutex patch but no significant difference in aim7's
+fserver workload.
+
+On the 4-core Core 2 box, I see the difference with the patch for both
+mutex-test and aim7 fserver are negligible.
+
+So far, it seems like the patch has not caused regression on smaller
+systems.
+
+Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
+Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Frederic Weisbecker <fweisbec@gmail.com>
+LKML-Reference: <1282168827.9542.72.camel@schen9-DESK>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ kernel/sched.c |   10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -3694,8 +3694,16 @@ int mutex_spin_on_owner(struct mutex *lo
+ 		/*
+ 		 * Owner changed, break to re-assess state.
+ 		 */
+-		if (lock->owner != owner)
++		if (lock->owner != owner) {
++			/*
++			 * If the lock has switched to a different owner,
++			 * we likely have heavy contention. Return 0 to quit
++			 * optimistic spinning and not contend further:
++			 */
++			if (lock->owner)
++				return 0;
+ 			break;
++		}
+ 
+ 		/*
+ 		 * Is that owner really running on that cpu?
diff --git a/queue-2.6.35/series b/queue-2.6.35/series
index 9501727c652..3ef20d7e7c5 100644
--- a/queue-2.6.35/series
+++ b/queue-2.6.35/series
@@ -110,3 +110,5 @@ usb-io_ti-check-firmware-version-before-updating.patch
 usb-xhci-remove-buggy-assignment-in-next_trb.patch
 usb-ftdi_sio-add-id-for-ionics-plugcomputer.patch
 usb-ftdi_sio-add-product-id-for-lenz-li-usb.patch
+mutex-improve-the-scalability-of-optimistic-spinning.patch
+tracing-fix-timer-tracing.patch
diff --git a/queue-2.6.35/tracing-fix-timer-tracing.patch b/queue-2.6.35/tracing-fix-timer-tracing.patch
new file mode 100644
index 00000000000..518dc91a917
--- /dev/null
+++ b/queue-2.6.35/tracing-fix-timer-tracing.patch
@@ -0,0 +1,75 @@
+From ede1b4290781ae82ccf0f2ecc6dada8d3dd35779 Mon Sep 17 00:00:00 2001
+From: Arjan van de Ven <arjan@linux.intel.com>
+Date: Wed, 18 Aug 2010 15:33:13 -0700
+Subject: tracing: Fix timer tracing
+
+From: Arjan van de Ven <arjan@linux.intel.com>
+
+commit ede1b4290781ae82ccf0f2ecc6dada8d3dd35779 upstream.
+
+PowerTOP would like to be able to trace timers.
+
+Unfortunately, the current timer tracing is not very useful: the
+actual timer function is not recorded in the trace at the start
+of timer execution.
+
+Although this is recorded for timer "start" time (when it gets
+armed), this is not useful; most timers get started early, and a
+tracer like PowerTOP will never see this event, but will only
+see the actual running of the timer.
+
+This patch just adds the function to the timer tracing; I've
+verified with PowerTOP that now it can get useful information
+about timers.
+
+Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
+Cc: xiaoguangrong@cn.fujitsu.com
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Frederic Weisbecker <fweisbec@gmail.com>
+Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <4C6C5FA9.3000405@linux.intel.com>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ include/trace/events/timer.h |    8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/include/trace/events/timer.h
++++ b/include/trace/events/timer.h
+@@ -74,14 +74,16 @@ TRACE_EVENT(timer_expire_entry,
+ 	TP_STRUCT__entry(
+ 		__field( void *,	timer	)
+ 		__field( unsigned long,	now	)
++		__field( void *,	function)
+ 	),
+ 
+ 	TP_fast_assign(
+ 		__entry->timer		= timer;
+ 		__entry->now		= jiffies;
++		__entry->function	= timer->function;
+ 	),
+ 
+-	TP_printk("timer=%p now=%lu", __entry->timer, __entry->now)
++	TP_printk("timer=%p function=%pf now=%lu", __entry->timer, __entry->function,__entry->now)
+ );
+ 
+ /**
+@@ -213,14 +215,16 @@ TRACE_EVENT(hrtimer_expire_entry,
+ 	TP_STRUCT__entry(
+ 		__field( void *,	hrtimer	)
+ 		__field( s64,		now	)
++		__field( void *,	function)
+ 	),
+ 
+ 	TP_fast_assign(
+ 		__entry->hrtimer	= hrtimer;
+ 		__entry->now		= now->tv64;
++		__entry->function	= hrtimer->function;
+ 	),
+ 
+-	TP_printk("hrtimer=%p now=%llu", __entry->hrtimer,
++	TP_printk("hrtimer=%p function=%pf now=%llu", __entry->hrtimer, __entry->function,
+ 		  (unsigned long long)ktime_to_ns((ktime_t) { .tv64 = __entry->now }))
+  );
+ 
-- 
2.47.3