From: Greg Kroah-Hartman
Date: Fri, 10 May 2013 20:14:46 +0000 (-0700)
Subject: 3.9-stable patches
X-Git-Tag: v3.9.2~5
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=bb656d86cfa14b004366d759e6e389d37b752566;p=thirdparty%2Fkernel%2Fstable-queue.git

3.9-stable patches

added patches:
	sched-lower-chances-of-cputime-scaling-overflow.patch
---

diff --git a/queue-3.9/sched-lower-chances-of-cputime-scaling-overflow.patch b/queue-3.9/sched-lower-chances-of-cputime-scaling-overflow.patch
new file mode 100644
index 00000000000..f2b68205138
--- /dev/null
+++ b/queue-3.9/sched-lower-chances-of-cputime-scaling-overflow.patch
@@ -0,0 +1,138 @@
+From d9a3c9823a2e6a543eb7807fb3d15d8233817ec5 Mon Sep 17 00:00:00 2001
+From: Frederic Weisbecker
+Date: Wed, 20 Feb 2013 18:54:55 +0100
+Subject: sched: Lower chances of cputime scaling overflow
+
+From: Frederic Weisbecker
+
+commit d9a3c9823a2e6a543eb7807fb3d15d8233817ec5 upstream.
+
+Some users have reported that after running a process with
+hundreds of threads under intensive CPU-bound loads, the cputime
+of the group started to freeze after a few days.
+
+This is due to how we scale the tick-based cputime against
+the scheduler's precise execution time value.
+
+We sum the tick-based cputime of all threads in the group and
+multiply that by the summed scheduler exec runtime of the whole
+group.
+
+This easily overflows after a few days/weeks of execution.
+
+A proposed solution was to compute that multiplication
+on stime instead of utime:
+   62188451f0d63add7ad0cd2a1ae269d600c1663d
+   ("cputime: Avoid multiplication overflow on utime scaling")
+
+The rationale behind that was that it's easy for a thread to
+spend most of its time in userspace under an intensive CPU-bound
+workload, but much harder to sustain a long CPU-bound run in
+the kernel.
+
+This postulate was defeated when a user recently reported that
+he was still seeing cputime freezes after the above patch. The
+workload that triggers this issue is intensive networking, where
+most of the cputime is consumed in the kernel.
+
+To further reduce the opportunities for multiplication overflow,
+let's reduce the multiplication factors to the quotient and
+remainder of the division between sched exec runtime and cputime.
+Assuming the difference between these should never be that large,
+this should work in many situations.
+
+This gets the same results as the upstream scaling code except
+for one small difference: the upstream code always rounds down
+to the nearest integer not greater than the precise result,
+whereas the new code may round either up or down. In practice
+this difference probably shouldn't matter, but it's worth
+mentioning.
+
+If this solution turns out not to be enough in the end, we'll
+need to partly revert to the behaviour prior to commit
+     0cf55e1ec08bb5a22e068309e2d8ba1180ab4239
+     ("sched, cputime: Introduce thread_group_times()")
+
+Back then, the scaling was done at exit() time, before adding the
+cputime of an exiting thread to the signal struct. We would then
+need to scale the cputime of the live threads one by one in
+thread_group_cputime(). The drawback would be slightly slower
+code at exit time.
+
+Signed-off-by: Frederic Weisbecker
+Cc: Stanislaw Gruszka
+Cc: Steven Rostedt
+Cc: Peter Zijlstra
+Cc: Ingo Molnar
+Cc: Andrew Morton
+Signed-off-by: Stanislaw Gruszka
+Acked-by: Ingo Molnar
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ kernel/sched/cputime.c |   46 ++++++++++++++++++++++++++++++++++------------
+ 1 file changed, 34 insertions(+), 12 deletions(-)
+
+--- a/kernel/sched/cputime.c
++++ b/kernel/sched/cputime.c
+@@ -521,18 +521,36 @@ EXPORT_SYMBOL_GPL(vtime_account_irq_ente
+ 
+ #else /* !CONFIG_VIRT_CPU_ACCOUNTING */
+ 
+-static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
++/*
++ * Perform (stime * rtime) / total with reduced chances
++ * of multiplication overflows by using smaller factors:
++ * the quotient and remainder of the division between
++ * rtime and total.
++ */
++static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
+ {
+-	u64 temp = (__force u64) rtime;
+-
+-	temp *= (__force u64) stime;
++	u64 rem, res, scaled;
+ 
+-	if (sizeof(cputime_t) == 4)
+-		temp = div_u64(temp, (__force u32) total);
+-	else
+-		temp = div64_u64(temp, (__force u64) total);
++	if (rtime >= total) {
++		/*
++		 * Scale stime up by the quotient rtime / total,
++		 * then add the remainder scaled by stime / total.
++		 */
++		res = div64_u64_rem(rtime, total, &rem);
++		scaled = stime * res;
++		scaled += div64_u64(stime * rem, total);
++	} else {
++		/*
++		 * Same in reverse: scale stime down by the
++		 * quotient total / rtime, then subtract that
++		 * result scaled by the remaining part.
++		 */
++		res = div64_u64_rem(total, rtime, &rem);
++		scaled = div64_u64(stime, res);
++		scaled -= div64_u64(scaled * rem, total);
++	}
+ 
+-	return (__force cputime_t) temp;
++	return (__force cputime_t) scaled;
+ }
+ 
+ /*
+@@ -560,10 +578,14 @@ static void cputime_adjust(struct task_c
+ 	 */
+ 	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
+ 
+-	if (total)
+-		stime = scale_stime(stime, rtime, total);
+-	else
++	if (!rtime) {
++		stime = 0;
++	} else if (!total) {
+ 		stime = rtime;
++	} else {
++		stime = scale_stime((__force u64)stime,
++				(__force u64)rtime, (__force u64)total);
++	}
+ 
+ 	/*
+ 	 * If the tick based count grows faster than the scheduler one,
diff --git a/queue-3.9/series b/queue-3.9/series
index 03b198ef56f..1262b2c08a6 100644
--- a/queue-3.9/series
+++ b/queue-3.9/series
@@ -16,3 +16,4 @@ dm-bufio-avoid-a-possible-__vmalloc-deadlock.patch
 dm-snapshot-fix-error-return-code-in-snapshot_ctr.patch
 dm-cache-fix-error-return-code-in-cache_create.patch
 math64-new-div64_u64_rem-helper.patch
+sched-lower-chances-of-cputime-scaling-overflow.patch
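
For readers following the arithmetic, here is a minimal user-space sketch of
the quotient/remainder decomposition that scale_stime() applies above. The
function name demo_scale_stime and the sample figures are assumptions of this
sketch, and plain C division stands in for the kernel's div64_u64_rem() and
div64_u64() helpers; it is an illustration, not the kernel code itself.

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch: compute (stime * rtime) / total as
 *   stime * (rtime / total) + stime * (rtime % total) / total
 * when rtime >= total, keeping the factors small enough that the
 * intermediate products fit in 64 bits far more often than the
 * direct stime * rtime would. As the subject line says, the chance
 * of overflow is lowered, not eliminated: stime * rem can still
 * overflow when rtime and total diverge widely. Callers must keep
 * rtime and total nonzero, as cputime_adjust() does in the patch.
 */
static uint64_t demo_scale_stime(uint64_t stime, uint64_t rtime,
                                 uint64_t total)
{
        uint64_t quot, rem, scaled;

        if (rtime >= total) {
                /* rtime = quot * total + rem */
                quot = rtime / total;
                rem  = rtime % total;
                scaled  = stime * quot;
                scaled += stime * rem / total;
        } else {
                /* total = quot * rtime + rem: divide stime down,
                 * then subtract the share the remainder accounts for. */
                quot = total / rtime;
                rem  = total % rtime;
                scaled  = stime / quot;
                scaled -= scaled * rem / total;
        }
        return scaled;
}

int main(void)
{
        /* Hypothetical figures: the direct product stime * rtime
         * (~5e30) overflows uint64_t, but with rtime close to total
         * the decomposed form stays in range. */
        uint64_t stime = 2000000000000000ULL;   /* 2.0e15 */
        uint64_t total = 2500000000000000ULL;   /* 2.5e15 */
        uint64_t rtime = 2500000000001000ULL;   /* total + 1000 */

        /* quot = 1, rem = 1000: 2.0e15 + (2.0e15 * 1000) / 2.5e15
         * = 2000000000000800, all intermediates within 64 bits. */
        printf("scaled = %llu\n",
               (unsigned long long)demo_scale_stime(stime, rtime, total));
        return 0;
}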
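
The rounding caveat in the changelog is also easy to reproduce with small
numbers: the direct (stime * rtime) / total always rounds down, while the
rtime < total branch of the decomposition can land above the precise value.
Again a hypothetical standalone sketch under the same assumptions, not
kernel code:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t stime = 10, rtime = 3, total = 7;

        /* Direct scaling: floor(10 * 3 / 7) = 4 (precise value ~4.29). */
        uint64_t direct = stime * rtime / total;

        /* Decomposed scaling, rtime < total branch:
         * quot = 7 / 3 = 2, rem = 7 % 3 = 1,
         * scaled = 10 / 2 = 5, minus (5 * 1) / 7 = 0, giving 5,
         * i.e. above the precise value rather than below it. */
        uint64_t quot = total / rtime;
        uint64_t rem  = total % rtime;
        uint64_t scaled = stime / quot;
        scaled -= scaled * rem / total;

        printf("direct = %llu, decomposed = %llu\n",
               (unsigned long long)direct, (unsigned long long)scaled);
        return 0;
}

This prints direct = 4, decomposed = 5 against a precise value of
30/7 = ~4.29, matching the changelog's note that the new code may round
either up or down while the old code only rounded down.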