From b2d2f5e2aa68d43c8f92d894a6a209ce75c039e5 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 24 Aug 2010 13:19:03 -0700 Subject: [PATCH] .35 patches --- ...e-scalability-of-optimistic-spinning.patch | 134 ++++++++++++++++++ queue-2.6.35/series | 2 + queue-2.6.35/tracing-fix-timer-tracing.patch | 75 ++++++++++ 3 files changed, 211 insertions(+) create mode 100644 queue-2.6.35/mutex-improve-the-scalability-of-optimistic-spinning.patch create mode 100644 queue-2.6.35/tracing-fix-timer-tracing.patch diff --git a/queue-2.6.35/mutex-improve-the-scalability-of-optimistic-spinning.patch b/queue-2.6.35/mutex-improve-the-scalability-of-optimistic-spinning.patch new file mode 100644 index 00000000000..379cafcaa99 --- /dev/null +++ b/queue-2.6.35/mutex-improve-the-scalability-of-optimistic-spinning.patch @@ -0,0 +1,134 @@ +From 9d0f4dcc5c4d1c5dd01172172684a45b5f49d740 Mon Sep 17 00:00:00 2001 +From: Tim Chen +Date: Wed, 18 Aug 2010 15:00:27 -0700 +Subject: mutex: Improve the scalability of optimistic spinning +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +commit 9d0f4dcc5c4d1c5dd01172172684a45b5f49d740 upstream. + +There is a scalability issue for current implementation of optimistic +mutex spin in the kernel. It is found on a 8 node 64 core Nehalem-EX +system (HT mode). + +The intention of the optimistic mutex spin is to busy wait and spin on a +mutex if the owner of the mutex is running, in the hope that the mutex +will be released soon and be acquired, without the thread trying to +acquire mutex going to sleep. However, when we have a large number of +threads, contending for the mutex, we could have the mutex grabbed by +other thread, and then another ……, and we will keep spinning, wasting cpu +cycles and adding to the contention. 
One possible fix is to quit +spinning and put the current thread on wait-list if mutex lock switches to +a new owner while we spin, indicating heavy contention (see the patch +included). + +I did some testing on a 8 socket Nehalem-EX system with a total of 64 +cores. Using Ingo's test-mutex program that creates/deletes files with 256 +threads (http://lkml.org/lkml/2006/1/8/50), I see the following speed up +after putting in the mutex spin fix: + + ./mutex-test V 256 10 + Ops/sec + 2.6.34 62864 + With fix 197200 + +Repeating the test with Aim7 fserver workload, again there is a speed up +with the fix: + + Jobs/min + 2.6.34 91657 + With fix 149325 + +To look at the impact on the distribution of mutex acquisition time, I +collected the mutex acquisition time on Aim7 fserver workload with some +instrumentation. The average acquisition time is reduced by 48% and +number of contentions reduced by 32%. + + #contentions Time to acquire mutex (cycles) + 2.6.34 72973 44765791 + With fix 49210 23067129 + +The histogram of mutex acquisition time is listed below. The acquisition +time is in 2^bin cycles. We see that without the fix, the acquisition +time is mostly around 2^26 cycles. With the fix, the distribution gets +spread out a lot more towards the lower cycles, starting from 2^13. +However, there is an increase of the tail distribution with the fix at +2^28 and 2^29 cycles. It seems a small price to pay for the reduced +average acquisition time and also getting the cpu to do useful work. 
+ + Mutex acquisition time distribution (acq time = 2^bin cycles): + 2.6.34 With Fix + bin #occurrence % #occurrence % + 11 2 0.00% 120 0.24% + 12 10 0.01% 790 1.61% + 13 14 0.02% 2058 4.18% + 14 86 0.12% 3378 6.86% + 15 393 0.54% 4831 9.82% + 16 710 0.97% 4893 9.94% + 17 815 1.12% 4667 9.48% + 18 790 1.08% 5147 10.46% + 19 580 0.80% 6250 12.70% + 20 429 0.59% 6870 13.96% + 21 311 0.43% 1809 3.68% + 22 255 0.35% 2305 4.68% + 23 317 0.44% 916 1.86% + 24 610 0.84% 233 0.47% + 25 3128 4.29% 95 0.19% + 26 63902 87.69% 122 0.25% + 27 619 0.85% 286 0.58% + 28 0 0.00% 3536 7.19% + 29 0 0.00% 903 1.83% + 30 0 0.00% 0 0.00% + +I've done similar experiments with 2.6.35 kernel on smaller boxes as +well. One is on a dual-socket Westmere box (12 cores total, with HT). +Another experiment is on an old dual-socket Core 2 box (4 cores total, no +HT) + +On the 12-core Westmere box, I see a 250% increase for Ingo's mutex-test +program with my mutex patch but no significant difference in aim7's +fserver workload. + +On the 4-core Core 2 box, I see the difference with the patch for both +mutex-test and aim7 fserver are negligible. + +So far, it seems like the patch has not caused regression on smaller +systems. + +Signed-off-by: Tim Chen +Acked-by: Peter Zijlstra +Cc: Linus Torvalds +Cc: Andrew Morton +Cc: Thomas Gleixner +Cc: Frederic Weisbecker +LKML-Reference: <1282168827.9542.72.camel@schen9-DESK> +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sched.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -3694,8 +3694,16 @@ int mutex_spin_on_owner(struct mutex *lo + /* + * Owner changed, break to re-assess state. + */ +- if (lock->owner != owner) ++ if (lock->owner != owner) { ++ /* ++ * If the lock has switched to a different owner, ++ * we likely have heavy contention. 
Return 0 to quit ++ * optimistic spinning and not contend further: ++ */ ++ if (lock->owner) ++ return 0; + break; ++ } + + /* + * Is that owner really running on that cpu? diff --git a/queue-2.6.35/series b/queue-2.6.35/series index 9501727c652..3ef20d7e7c5 100644 --- a/queue-2.6.35/series +++ b/queue-2.6.35/series @@ -110,3 +110,5 @@ usb-io_ti-check-firmware-version-before-updating.patch usb-xhci-remove-buggy-assignment-in-next_trb.patch usb-ftdi_sio-add-id-for-ionics-plugcomputer.patch usb-ftdi_sio-add-product-id-for-lenz-li-usb.patch +mutex-improve-the-scalability-of-optimistic-spinning.patch +tracing-fix-timer-tracing.patch diff --git a/queue-2.6.35/tracing-fix-timer-tracing.patch b/queue-2.6.35/tracing-fix-timer-tracing.patch new file mode 100644 index 00000000000..518dc91a917 --- /dev/null +++ b/queue-2.6.35/tracing-fix-timer-tracing.patch @@ -0,0 +1,75 @@ +From ede1b4290781ae82ccf0f2ecc6dada8d3dd35779 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Wed, 18 Aug 2010 15:33:13 -0700 +Subject: tracing: Fix timer tracing + +From: Arjan van de Ven + +commit ede1b4290781ae82ccf0f2ecc6dada8d3dd35779 upstream. + +PowerTOP would like to be able to trace timers. + +Unfortunately, the current timer tracing is not very useful: the +actual timer function is not recorded in the trace at the start +of timer execution. + +Although this is recorded for timer "start" time (when it gets +armed), this is not useful; most timers get started early, and a +tracer like PowerTOP will never see this event, but will only +see the actual running of the timer. + +This patch just adds the function to the timer tracing; I've +verified with PowerTOP that now it can get useful information +about timers. 
+ +Signed-off-by: Arjan van de Ven +Cc: xiaoguangrong@cn.fujitsu.com +Cc: Steven Rostedt +Cc: Frederic Weisbecker +Cc: Peter Zijlstra +LKML-Reference: <4C6C5FA9.3000405@linux.intel.com> +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + include/trace/events/timer.h | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/include/trace/events/timer.h ++++ b/include/trace/events/timer.h +@@ -74,14 +74,16 @@ TRACE_EVENT(timer_expire_entry, + TP_STRUCT__entry( + __field( void *, timer ) + __field( unsigned long, now ) ++ __field( void *, function) + ), + + TP_fast_assign( + __entry->timer = timer; + __entry->now = jiffies; ++ __entry->function = timer->function; + ), + +- TP_printk("timer=%p now=%lu", __entry->timer, __entry->now) ++ TP_printk("timer=%p function=%pf now=%lu", __entry->timer, __entry->function,__entry->now) + ); + + /** +@@ -213,14 +215,16 @@ TRACE_EVENT(hrtimer_expire_entry, + TP_STRUCT__entry( + __field( void *, hrtimer ) + __field( s64, now ) ++ __field( void *, function) + ), + + TP_fast_assign( + __entry->hrtimer = hrtimer; + __entry->now = now->tv64; ++ __entry->function = hrtimer->function; + ), + +- TP_printk("hrtimer=%p now=%llu", __entry->hrtimer, ++ TP_printk("hrtimer=%p function=%pf now=%llu", __entry->hrtimer, __entry->function, + (unsigned long long)ktime_to_ns((ktime_t) { .tv64 = __entry->now })) + ); + -- 2.47.3