--- /dev/null
+From d670ec13178d0fd8680e6742a2bc6e04f28f87d8 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Date: Thu, 1 Sep 2011 12:42:04 +0200
+Subject: posix-cpu-timers: Cure SMP wobbles
+
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+
+commit d670ec13178d0fd8680e6742a2bc6e04f28f87d8 upstream.
+
+David reported:
+
+ Attached below is a watered-down version of rt/tst-cpuclock2.c from
+ GLIBC. Just build it with "gcc -o test test.c -lpthread -lrt" or
+ similar.
+
+ Run it several times, and you will see cases where the main thread
+ will measure a process clock difference before and after the nanosleep
+ which is smaller than the cpu-burner thread's individual thread clock
+ difference. This doesn't make any sense since the cpu-burner thread
+ is part of the top-level process's thread group.
+
+ I've reproduced this on both x86-64 and sparc64 (using both 32-bit and
+ 64-bit binaries).
+
+ For example:
+
+ [davem@boricha build-x86_64-linux]$ ./test
+ process: before(0.001221967) after(0.498624371) diff(497402404)
+ thread: before(0.000081692) after(0.498316431) diff(498234739)
+ self: before(0.001223521) after(0.001240219) diff(16698)
+ [davem@boricha build-x86_64-linux]$
+
+ The diff of 'process' should always be >= the diff of 'thread'.
+
+ I make sure to wrap the 'thread' clock measurements the most tightly
+ around the nanosleep() call, and that the 'process' clock measurements
+ are the outer-most ones.
+
+ ---
+ #include <unistd.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <time.h>
+ #include <fcntl.h>
+ #include <string.h>
+ #include <errno.h>
+ #include <pthread.h>
+
+ static pthread_barrier_t barrier;
+
+ static void *chew_cpu(void *arg)
+ {
+ pthread_barrier_wait(&barrier);
+ while (1)
+ __asm__ __volatile__("" : : : "memory");
+ return NULL;
+ }
+
+ int main(void)
+ {
+ clockid_t process_clock, my_thread_clock, th_clock;
+ struct timespec process_before, process_after;
+ struct timespec me_before, me_after;
+ struct timespec th_before, th_after;
+ struct timespec sleeptime;
+ unsigned long diff;
+ pthread_t th;
+ int err;
+
+ err = clock_getcpuclockid(0, &process_clock);
+ if (err)
+ return 1;
+
+ err = pthread_getcpuclockid(pthread_self(), &my_thread_clock);
+ if (err)
+ return 1;
+
+ pthread_barrier_init(&barrier, NULL, 2);
+ err = pthread_create(&th, NULL, chew_cpu, NULL);
+ if (err)
+ return 1;
+
+ err = pthread_getcpuclockid(th, &th_clock);
+ if (err)
+ return 1;
+
+ pthread_barrier_wait(&barrier);
+
+ err = clock_gettime(process_clock, &process_before);
+ if (err)
+ return 1;
+
+ err = clock_gettime(my_thread_clock, &me_before);
+ if (err)
+ return 1;
+
+ err = clock_gettime(th_clock, &th_before);
+ if (err)
+ return 1;
+
+ sleeptime.tv_sec = 0;
+ sleeptime.tv_nsec = 500000000;
+ nanosleep(&sleeptime, NULL);
+
+ err = clock_gettime(th_clock, &th_after);
+ if (err)
+ return 1;
+
+ err = clock_gettime(my_thread_clock, &me_after);
+ if (err)
+ return 1;
+
+ err = clock_gettime(process_clock, &process_after);
+ if (err)
+ return 1;
+
+ diff = process_after.tv_nsec - process_before.tv_nsec;
+ printf("process: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n",
+ process_before.tv_sec, process_before.tv_nsec,
+ process_after.tv_sec, process_after.tv_nsec, diff);
+ diff = th_after.tv_nsec - th_before.tv_nsec;
+ printf("thread: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n",
+ th_before.tv_sec, th_before.tv_nsec,
+ th_after.tv_sec, th_after.tv_nsec, diff);
+ diff = me_after.tv_nsec - me_before.tv_nsec;
+ printf("self: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n",
+ me_before.tv_sec, me_before.tv_nsec,
+ me_after.tv_sec, me_after.tv_nsec, diff);
+
+ return 0;
+ }
+
+This is due to us using p->se.sum_exec_runtime in
+thread_group_cputime() where we iterate the thread group and sum all
+data. This does not take time since the last schedule operation (tick
+or otherwise) into account. We can cure this by using
+task_sched_runtime() at the cost of having to take locks.
+
+This also means we can (and must) do away with
+thread_group_sched_runtime() since the modified thread_group_cputime()
+is now more accurate and would deadlock when called from
+thread_group_sched_runtime().
+
+Aside of that it makes the function safe on 32 bit systems. The old
+code added t->se.sum_exec_runtime unprotected. sum_exec_runtime is a
+64bit value and could be changed on another cpu at the same time.
+
+Reported-by: David Miller <davem@davemloft.net>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/1314874459.7945.22.camel@twins
+Tested-by: David Miller <davem@davemloft.net>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ include/linux/sched.h | 1 -
+ kernel/posix-cpu-timers.c | 5 +++--
+ kernel/sched.c | 24 ------------------------
+ 3 files changed, 3 insertions(+), 27 deletions(-)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1937,7 +1937,6 @@ static inline void disable_sched_clock_i
+
+ extern unsigned long long
+ task_sched_runtime(struct task_struct *task);
+-extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
+
+ /* sched_exec is called by processes performing an exec */
+ #ifdef CONFIG_SMP
+--- a/kernel/posix-cpu-timers.c
++++ b/kernel/posix-cpu-timers.c
+@@ -250,7 +250,7 @@ void thread_group_cputime(struct task_st
+ do {
+ times->utime = cputime_add(times->utime, t->utime);
+ times->stime = cputime_add(times->stime, t->stime);
+- times->sum_exec_runtime += t->se.sum_exec_runtime;
++ times->sum_exec_runtime += task_sched_runtime(t);
+ } while_each_thread(tsk, t);
+ out:
+ rcu_read_unlock();
+@@ -312,7 +312,8 @@ static int cpu_clock_sample_group(const
+ cpu->cpu = cputime.utime;
+ break;
+ case CPUCLOCK_SCHED:
+- cpu->sched = thread_group_sched_runtime(p);
++ thread_group_cputime(p, &cputime);
++ cpu->sched = cputime.sum_exec_runtime;
+ break;
+ }
+ return 0;
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -3713,30 +3713,6 @@ unsigned long long task_sched_runtime(st
+ }
+
+ /*
+- * Return sum_exec_runtime for the thread group.
+- * In case the task is currently running, return the sum plus current's
+- * pending runtime that have not been accounted yet.
+- *
+- * Note that the thread group might have other running tasks as well,
+- * so the return value not includes other pending runtime that other
+- * running tasks might have.
+- */
+-unsigned long long thread_group_sched_runtime(struct task_struct *p)
+-{
+- struct task_cputime totals;
+- unsigned long flags;
+- struct rq *rq;
+- u64 ns;
+-
+- rq = task_rq_lock(p, &flags);
+- thread_group_cputime(p, &totals);
+- ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
+- task_rq_unlock(rq, p, &flags);
+-
+- return ns;
+-}
+-
+-/*
+ * Account user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in user space since the last update