3.9-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fri, 10 May 2013 20:14:46 +0000 (13:14 -0700)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Fri, 10 May 2013 20:14:46 +0000 (13:14 -0700)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 10 May 2013 20:14:46 +0000 (13:14 -0700)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 10 May 2013 20:14:46 +0000 (13:14 -0700)
diff --git a/queue-3.9/sched-lower-chances-of-cputime-scaling-overflow.patch b/queue-3.9/sched-lower-chances-of-cputime-scaling-overflow.patch

new file mode 100644 (file)

index 0000000..f2b6820
--- /dev/null
+++ b/queue-3.9/sched-lower-chances-of-cputime-scaling-overflow.patch
@@ -0,0 +1,138 @@
+From d9a3c9823a2e6a543eb7807fb3d15d8233817ec5 Mon Sep 17 00:00:00 2001
+From: Frederic Weisbecker <fweisbec@gmail.com>
+Date: Wed, 20 Feb 2013 18:54:55 +0100
+Subject: sched: Lower chances of cputime scaling overflow
+
+From: Frederic Weisbecker <fweisbec@gmail.com>
+
+commit d9a3c9823a2e6a543eb7807fb3d15d8233817ec5 upstream.
+
+Some users have reported that after running a process with
+hundreds of threads on intensive CPU-bound loads, the cputime
+of the group started to freeze after a few days.
+
+This is due to how we scale the tick-based cputime against
+the scheduler precise execution time value.
+
+We add the values of all threads in the group and we multiply
+that against the sum of the scheduler exec runtime of the whole
+group.
+
+This easily overflows after a few days/weeks of execution.
+
+A proposed solution to solve this was to compute that multiplication
+on stime instead of utime:
+   62188451f0d63add7ad0cd2a1ae269d600c1663d
+   ("cputime: Avoid multiplication overflow on utime scaling")
+
+The rationale behind that was that it's easy for a thread to
+spend most of its time in userspace under intensive CPU-bound workload
+but it's much harder to do CPU-bound intensive long run in the kernel.
+
+This postulate got defeated when a user recently reported he was still
+seeing cputime freezes after the above patch. The workload that
+triggers this issue relates to intensive networking workloads where
+most of the cputime is consumed in the kernel.
+
+To further reduce the opportunities for multiplication overflow,
+let's reduce the multiplication factors to the remainders of the division
+between sched exec runtime and cputime. Assuming the difference between
+these shouldn't ever be that large, it could work in many situations.
+
+This gets the same results as in the upstream scaling code except for
+a small difference: the upstream code always rounds the results to
+the nearest integer not greater than what would be the precise result.
+The new code rounds to the nearest integer either greater or not
+greater. In practice this difference probably shouldn't matter but
+it's worth mentioning.
+
+If this solution appears not to be enough in the end, we'll
+need to partly revert back to the behaviour prior to commit
+     0cf55e1ec08bb5a22e068309e2d8ba1180ab4239
+     ("sched, cputime: Introduce thread_group_times()")
+
+Back then, the scaling was done on exit() time before adding the cputime
+of an exiting thread to the signal struct. And then we'll need to
+scale one-by-one the live threads cputime in thread_group_cputime(). The
+drawback may be a slightly slower code on exit time.
+
+Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
+Cc: Stanislaw Gruszka <sgruszka@redhat.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
+Acked-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/cputime.c |   46 ++++++++++++++++++++++++++++++++++------------
+ 1 file changed, 34 insertions(+), 12 deletions(-)
+
+--- a/kernel/sched/cputime.c
++++ b/kernel/sched/cputime.c
+@@ -521,18 +521,36 @@ EXPORT_SYMBOL_GPL(vtime_account_irq_ente
+ 
+ #else /* !CONFIG_VIRT_CPU_ACCOUNTING */
+ 
+-static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
++/*
++ * Perform (stime * rtime) / total with reduced chances
++ * of multiplication overflows by using smaller factors
++ * like quotient and remainders of divisions between
++ * rtime and total.
++ */
++static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
+ {
+-      u64 temp = (__force u64) rtime;
+-
+-      temp *= (__force u64) stime;
++      u64 rem, res, scaled;
+ 
+-      if (sizeof(cputime_t) == 4)
+-              temp = div_u64(temp, (__force u32) total);
+-      else
+-              temp = div64_u64(temp, (__force u64) total);
++      if (rtime >= total) {
++              /*
++               * Scale up to rtime / total then add
++               * the remainder scaled to stime / total.
++               */
++              res = div64_u64_rem(rtime, total, &rem);
++              scaled = stime * res;
++              scaled += div64_u64(stime * rem, total);
++      } else {
++              /*
++               * Same in reverse: scale down to total / rtime
+               * then subtract that result scaled to
+               * the remaining part.
++               */
++              res = div64_u64_rem(total, rtime, &rem);
++              scaled = div64_u64(stime, res);
++              scaled -= div64_u64(scaled * rem, total);
++      }
+ 
+-      return (__force cputime_t) temp;
++      return (__force cputime_t) scaled;
+ }
+ 
+ /*
+@@ -560,10 +578,14 @@ static void cputime_adjust(struct task_c
+        */
+       rtime = nsecs_to_cputime(curr->sum_exec_runtime);
+ 
+-      if (total)
+-              stime = scale_stime(stime, rtime, total);
+-      else
++      if (!rtime) {
++              stime = 0;
++      } else if (!total) {
+               stime = rtime;
++      } else {
++              stime = scale_stime((__force u64)stime,
++                                  (__force u64)rtime, (__force u64)total);
++      }
+ 
+       /*
+        * If the tick based count grows faster than the scheduler one,
diff --git a/queue-3.9/series b/queue-3.9/series

index 03b198ef56f26c82717bca5fa063a907be83bb98..1262b2c08a6ae01bee613bf5e4b71cb2592333c4 100644 (file)
--- a/queue-3.9/series
+++ b/queue-3.9/series
@@ -16,3 +16,4 @@ dm-bufio-avoid-a-possible-__vmalloc-deadlock.patch
  dm-snapshot-fix-error-return-code-in-snapshot_ctr.patch
  dm-cache-fix-error-return-code-in-cache_create.patch
  math64-new-div64_u64_rem-helper.patch
+sched-lower-chances-of-cputime-scaling-overflow.patch
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Fri, 10 May 2013 20:14:46 +0000 (13:14 -0700)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Fri, 10 May 2013 20:14:46 +0000 (13:14 -0700)
queue-3.9/sched-lower-chances-of-cputime-scaling-overflow.patch	[new file with mode: 0644]	patch \| blob
queue-3.9/series		patch \| blob \| blame \| history