--- /dev/null
+From d9a3c9823a2e6a543eb7807fb3d15d8233817ec5 Mon Sep 17 00:00:00 2001
+From: Frederic Weisbecker <fweisbec@gmail.com>
+Date: Wed, 20 Feb 2013 18:54:55 +0100
+Subject: sched: Lower chances of cputime scaling overflow
+
+From: Frederic Weisbecker <fweisbec@gmail.com>
+
+commit d9a3c9823a2e6a543eb7807fb3d15d8233817ec5 upstream.
+
+Some users have reported that after running a process with
+hundreds of threads on intensive CPU-bound loads, the cputime
+of the group started to freeze after a few days.
+
+This is due to how we scale the tick-based cputime against
+the scheduler precise execution time value.
+
+We sum up the tick-based cputime of all threads in the group and
+multiply that against the sum of the scheduler exec runtime of the
+whole group.
+
+This easily overflows after a few days/weeks of execution.
+
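+As a rough, back-of-the-envelope illustration (assuming HZ=1000
+tick-based cputime and, say, 500 busy threads; the numbers are
+illustrative only, not taken from the reports):
+
+	per-thread ticks per day   ~ 86400 * 1000     ~ 8.6e7
+	group total after 3 days   ~ 500 * 3 * 8.6e7  ~ 1.3e11 ~ 2^37
+	rtime * stime              ~ 2^37 * 2^37      = 2^74   > 2^64
+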
+One solution proposed for this was to compute the multiplication
+on stime instead of utime:
+ 62188451f0d63add7ad0cd2a1ae269d600c1663d
+ ("cputime: Avoid multiplication overflow on utime scaling")
+
+The rationale was that it's easy for a thread to spend most of its
+time in userspace under an intensive CPU-bound workload, but it's much
+harder to sustain a long CPU-bound run in the kernel.
+
+This assumption was defeated when a user recently reported that he was
+still seeing cputime freezes after the above patch. The workload that
+triggers this issue is an intensive networking workload where most of
+the cputime is consumed in the kernel.
+
+To further reduce the opportunities for multiplication overflow, let's
+reduce the multiplication factors to the quotient and remainder of the
+division between sched exec runtime and cputime. Assuming the difference
+between these shouldn't ever be that large, this should work in many
+situations.
+
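+As a rough user-space sketch of the quotient/remainder trick (plain C;
+the kernel code below uses the div64_u64_rem()/div64_u64() helpers
+instead of plain '/', and the function name here is made up):
+
+	static unsigned long long
+	scale_sketch(unsigned long long stime, unsigned long long rtime,
+		     unsigned long long total)
+	{
+		unsigned long long q, rem, scaled;
+
+		if (rtime >= total) {
+			/* stime * (quotient of rtime/total), plus the
+			 * remainder part scaled down by total. */
+			q = rtime / total;
+			rem = rtime % total;
+			scaled = stime * q + (stime * rem) / total;
+		} else {
+			/* stime / (quotient of total/rtime), minus the
+			 * remainder's share: total - rem == q * rtime. */
+			q = total / rtime;
+			rem = total % rtime;
+			scaled = stime / q;
+			scaled -= (scaled * rem) / total;
+		}
+		return scaled;
+	}
+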
+This gets the same results as the upstream scaling code except for
+a small difference: the upstream code always rounds the result down,
+to the nearest integer not greater than the precise value, while the
+new code may end up slightly above or below it. In practice this
+difference probably shouldn't matter, but it's worth mentioning.
+
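+For instance (purely illustrative numbers), with stime = 10, rtime = 3
+and total = 7, the precise result is 30/7 ~ 4.29:
+
+	old code:  (10 * 3) / 7           = 4               (not above 4.29)
+	new code:  q = 7/3 = 2, rem = 1
+	           scaled = 10/2 = 5;  5 - (5 * 1)/7 = 5    (above 4.29)
+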
+If this solution turns out not to be enough in the end, we'll
+need to partly revert to the behaviour prior to commit
+ 0cf55e1ec08bb5a22e068309e2d8ba1180ab4239
+ ("sched, cputime: Introduce thread_group_times()")
+
+Back then, the scaling was done at exit() time, before adding the cputime
+of an exiting thread to the signal struct. And then we would need to
+scale the cputime of the live threads one by one in thread_group_cputime().
+The drawback may be slightly slower code at exit time.
+
+Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
+Cc: Stanislaw Gruszka <sgruszka@redhat.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
+Acked-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/cputime.c | 46 ++++++++++++++++++++++++++++++++++------------
+ 1 file changed, 34 insertions(+), 12 deletions(-)
+
+--- a/kernel/sched/cputime.c
++++ b/kernel/sched/cputime.c
+@@ -521,18 +521,36 @@ EXPORT_SYMBOL_GPL(vtime_account_irq_ente
+
+ #else /* !CONFIG_VIRT_CPU_ACCOUNTING */
+
+-static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
++/*
++ * Perform (stime * rtime) / total with reduced chances
++ * of multiplication overflows by using smaller factors
++ * like the quotient and remainder of the division between
++ * rtime and total.
++ */
++static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
+ {
+- u64 temp = (__force u64) rtime;
+-
+- temp *= (__force u64) stime;
++ u64 rem, res, scaled;
+
+- if (sizeof(cputime_t) == 4)
+- temp = div_u64(temp, (__force u32) total);
+- else
+- temp = div64_u64(temp, (__force u64) total);
++ if (rtime >= total) {
++ /*
++ * Scale up to rtime / total then add
++ * the remainder scaled to stime / total.
++ */
++ res = div64_u64_rem(rtime, total, &rem);
++ scaled = stime * res;
++ scaled += div64_u64(stime * rem, total);
++ } else {
++ /*
++ * Same in reverse: scale down to total / rtime
++ * then subtract that result scaled to
++ * the remaining part.
++ */
++ res = div64_u64_rem(total, rtime, &rem);
++ scaled = div64_u64(stime, res);
++ scaled -= div64_u64(scaled * rem, total);
++ }
+
+- return (__force cputime_t) temp;
++ return (__force cputime_t) scaled;
+ }
+
+ /*
+@@ -560,10 +578,14 @@ static void cputime_adjust(struct task_c
+ */
+ rtime = nsecs_to_cputime(curr->sum_exec_runtime);
+
+- if (total)
+- stime = scale_stime(stime, rtime, total);
+- else
++ if (!rtime) {
++ stime = 0;
++ } else if (!total) {
+ stime = rtime;
++ } else {
++ stime = scale_stime((__force u64)stime,
++ (__force u64)rtime, (__force u64)total);
++ }
+
+ /*
+ * If the tick based count grows faster than the scheduler one,