--- /dev/null
+From 112166f88cf83dd11486cf1818672d42b540865b Mon Sep 17 00:00:00 2001
+From: Christoph Lameter <cl@linux.com>
+Date: Wed, 12 Jul 2017 14:33:11 -0700
+Subject: kernel/fork.c: virtually mapped stacks: do not disable interrupts
+
+From: Christoph Lameter <cl@linux.com>
+
+commit 112166f88cf83dd11486cf1818672d42b540865b upstream.
+
+The reason for disabling interrupts seems to be to avoid switching to a
+different processor while handling per-cpu data using individual loads and
+stores. If we use per-cpu RMW primitives instead, we do not have to disable
+interrupts.
+
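+A minimal sketch of the before/after locking pattern, using the
+cached_stacks[] array from the hunk below (illustration only, not part of
+the change itself):
+
+    /* Before (sketch): the load/store pair needs interrupts off */
+    local_irq_disable();
+    s = this_cpu_read(cached_stacks[i]);
+    if (s)
+            this_cpu_write(cached_stacks[i], NULL);
+    local_irq_enable();
+
+    /* After (sketch): a single per-cpu RMW, no irq toggling required */
+    s = this_cpu_xchg(cached_stacks[i], NULL);
+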
+Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1705171055130.5898@east.gentwo.org
+Signed-off-by: Christoph Lameter <cl@linux.com>
+Cc: Andy Lutomirski <luto@amacapital.net>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/fork.c | 16 +++++-----------
+ 1 file changed, 5 insertions(+), 11 deletions(-)
+
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -205,19 +205,17 @@ static unsigned long *alloc_thread_stack
+ void *stack;
+ int i;
+
+- local_irq_disable();
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+- struct vm_struct *s = this_cpu_read(cached_stacks[i]);
++ struct vm_struct *s;
++
++ s = this_cpu_xchg(cached_stacks[i], NULL);
+
+ if (!s)
+ continue;
+- this_cpu_write(cached_stacks[i], NULL);
+
+ tsk->stack_vm_area = s;
+- local_irq_enable();
+ return s->addr;
+ }
+- local_irq_enable();
+
+ stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
+ VMALLOC_START, VMALLOC_END,
+@@ -245,19 +243,15 @@ static inline void free_thread_stack(str
+ {
+ #ifdef CONFIG_VMAP_STACK
+ if (task_stack_vm_area(tsk)) {
+- unsigned long flags;
+ int i;
+
+- local_irq_save(flags);
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+- if (this_cpu_read(cached_stacks[i]))
++ if (this_cpu_cmpxchg(cached_stacks[i],
++ NULL, tsk->stack_vm_area) != NULL)
+ continue;
+
+- this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
+- local_irq_restore(flags);
+ return;
+ }
+- local_irq_restore(flags);
+
+ vfree_atomic(tsk->stack);
+ return;
--- /dev/null
+From 104b4e5139fe384431ac11c3b8a6cf4a529edf4a Mon Sep 17 00:00:00 2001
+From: Nikolay Borisov <nborisov@suse.com>
+Date: Tue, 20 Jun 2017 21:01:20 +0300
+Subject: percpu_counter: Rename __percpu_counter_add to percpu_counter_add_batch
+
+From: Nikolay Borisov <nborisov@suse.com>
+
+commit 104b4e5139fe384431ac11c3b8a6cf4a529edf4a upstream.
+
+Currently, percpu_counter_add is a wrapper around __percpu_counter_add,
+which is preempt-safe due to explicit calls to preempt_disable. Given
+how the __ prefix is used in percpu-related interfaces, the naming
+unfortunately creates the false sense that __percpu_counter_add is
+less safe than percpu_counter_add. In terms of context-safety,
+they're equivalent. The only difference is that the __ version takes
+a batch parameter.
+
+Make this a bit more explicit by just renaming __percpu_counter_add to
+percpu_counter_add_batch.
+
+This patch doesn't cause any functional changes.
+
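+Illustration only (not part of the change): with any struct percpu_counter
+*fbc, the relationship between the two calls after the rename is:
+
+    /* percpu_counter_add() stays a thin wrapper with the default batch... */
+    percpu_counter_add(fbc, 1);
+    /* ...which is now spelled out as: */
+    percpu_counter_add_batch(fbc, 1, percpu_counter_batch);
+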
+tj: Minor updates to patch description for clarity. Cosmetic
+ indentation updates.
+
+Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Cc: Chris Mason <clm@fb.com>
+Cc: Josef Bacik <jbacik@fb.com>
+Cc: David Sterba <dsterba@suse.com>
+Cc: Darrick J. Wong <darrick.wong@oracle.com>
+Cc: Jan Kara <jack@suse.com>
+Cc: Jens Axboe <axboe@fb.com>
+Cc: linux-mm@kvack.org
+Cc: "David S. Miller" <davem@davemloft.net>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/disk-io.c | 12 ++++++------
+ fs/btrfs/extent_io.c | 6 +++---
+ fs/btrfs/inode.c | 8 ++++----
+ fs/xfs/xfs_mount.c | 4 ++--
+ include/linux/backing-dev.h | 2 +-
+ include/linux/blk-cgroup.h | 6 +++---
+ include/linux/mman.h | 2 +-
+ include/linux/percpu_counter.h | 7 ++++---
+ include/net/inet_frag.h | 4 ++--
+ lib/flex_proportions.c | 6 +++---
+ lib/percpu_counter.c | 4 ++--
+ 11 files changed, 31 insertions(+), 30 deletions(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -1255,9 +1255,9 @@ void clean_tree_block(struct btrfs_fs_in
+ btrfs_assert_tree_locked(buf);
+
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
+- __percpu_counter_add(&fs_info->dirty_metadata_bytes,
+- -buf->len,
+- fs_info->dirty_metadata_batch);
++ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
++ -buf->len,
++ fs_info->dirty_metadata_batch);
+ /* ugh, clear_extent_buffer_dirty needs to lock the page */
+ btrfs_set_lock_blocking(buf);
+ clear_extent_buffer_dirty(buf);
+@@ -4049,9 +4049,9 @@ void btrfs_mark_buffer_dirty(struct exte
+ buf->start, transid, fs_info->generation);
+ was_dirty = set_extent_buffer_dirty(buf);
+ if (!was_dirty)
+- __percpu_counter_add(&fs_info->dirty_metadata_bytes,
+- buf->len,
+- fs_info->dirty_metadata_batch);
++ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
++ buf->len,
++ fs_info->dirty_metadata_batch);
+ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
+ btrfs_print_leaf(fs_info, buf);
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -3597,9 +3597,9 @@ lock_extent_buffer_for_io(struct extent_
+ set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+ spin_unlock(&eb->refs_lock);
+ btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+- __percpu_counter_add(&fs_info->dirty_metadata_bytes,
+- -eb->len,
+- fs_info->dirty_metadata_batch);
++ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
++ -eb->len,
++ fs_info->dirty_metadata_batch);
+ ret = 1;
+ } else {
+ spin_unlock(&eb->refs_lock);
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -1766,8 +1766,8 @@ static void btrfs_set_bit_hook(struct in
+ if (btrfs_is_testing(fs_info))
+ return;
+
+- __percpu_counter_add(&fs_info->delalloc_bytes, len,
+- fs_info->delalloc_batch);
++ percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
++ fs_info->delalloc_batch);
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->delalloc_bytes += len;
+ if (*bits & EXTENT_DEFRAG)
+@@ -1840,8 +1840,8 @@ static void btrfs_clear_bit_hook(struct
+ &inode->vfs_inode,
+ state->start, len);
+
+- __percpu_counter_add(&fs_info->delalloc_bytes, -len,
+- fs_info->delalloc_batch);
++ percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
++ fs_info->delalloc_batch);
+ spin_lock(&inode->lock);
+ inode->delalloc_bytes -= len;
+ if (do_list && inode->delalloc_bytes == 0 &&
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -1209,7 +1209,7 @@ xfs_mod_icount(
+ struct xfs_mount *mp,
+ int64_t delta)
+ {
+- __percpu_counter_add(&mp->m_icount, delta, XFS_ICOUNT_BATCH);
++ percpu_counter_add_batch(&mp->m_icount, delta, XFS_ICOUNT_BATCH);
+ if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) {
+ ASSERT(0);
+ percpu_counter_add(&mp->m_icount, -delta);
+@@ -1288,7 +1288,7 @@ xfs_mod_fdblocks(
+ else
+ batch = XFS_FDBLOCKS_BATCH;
+
+- __percpu_counter_add(&mp->m_fdblocks, delta, batch);
++ percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
+ if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside,
+ XFS_FDBLOCKS_BATCH) >= 0) {
+ /* we had space! */
+--- a/include/linux/backing-dev.h
++++ b/include/linux/backing-dev.h
+@@ -66,7 +66,7 @@ static inline bool bdi_has_dirty_io(stru
+ static inline void __add_wb_stat(struct bdi_writeback *wb,
+ enum wb_stat_item item, s64 amount)
+ {
+- __percpu_counter_add(&wb->stat[item], amount, WB_STAT_BATCH);
++ percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
+ }
+
+ static inline void __inc_wb_stat(struct bdi_writeback *wb,
+--- a/include/linux/blk-cgroup.h
++++ b/include/linux/blk-cgroup.h
+@@ -518,7 +518,7 @@ static inline void blkg_stat_exit(struct
+ */
+ static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
+ {
+- __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
++ percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
+ }
+
+ /**
+@@ -597,14 +597,14 @@ static inline void blkg_rwstat_add(struc
+ else
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
+
+- __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
++ percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
+
+ if (op_is_sync(op))
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
+ else
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
+
+- __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
++ percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
+ }
+
+ /**
+--- a/include/linux/mman.h
++++ b/include/linux/mman.h
+@@ -22,7 +22,7 @@ unsigned long vm_memory_committed(void);
+
+ static inline void vm_acct_memory(long pages)
+ {
+- __percpu_counter_add(&vm_committed_as, pages, vm_committed_as_batch);
++ percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch);
+ }
+
+ static inline void vm_unacct_memory(long pages)
+--- a/include/linux/percpu_counter.h
++++ b/include/linux/percpu_counter.h
+@@ -39,7 +39,8 @@ int __percpu_counter_init(struct percpu_
+
+ void percpu_counter_destroy(struct percpu_counter *fbc);
+ void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
+-void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
++void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
++ s32 batch);
+ s64 __percpu_counter_sum(struct percpu_counter *fbc);
+ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
+
+@@ -50,7 +51,7 @@ static inline int percpu_counter_compare
+
+ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
+ {
+- __percpu_counter_add(fbc, amount, percpu_counter_batch);
++ percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
+ }
+
+ static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
+@@ -136,7 +137,7 @@ percpu_counter_add(struct percpu_counter
+ }
+
+ static inline void
+-__percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
++percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
+ {
+ percpu_counter_add(fbc, amount);
+ }
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -154,12 +154,12 @@ static inline int frag_mem_limit(struct
+
+ static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
+ {
+- __percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch);
++ percpu_counter_add_batch(&nf->mem, -i, frag_percpu_counter_batch);
+ }
+
+ static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
+ {
+- __percpu_counter_add(&nf->mem, i, frag_percpu_counter_batch);
++ percpu_counter_add_batch(&nf->mem, i, frag_percpu_counter_batch);
+ }
+
+ static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
+--- a/lib/flex_proportions.c
++++ b/lib/flex_proportions.c
+@@ -207,7 +207,7 @@ static void fprop_reflect_period_percpu(
+ if (val < (nr_cpu_ids * PROP_BATCH))
+ val = percpu_counter_sum(&pl->events);
+
+- __percpu_counter_add(&pl->events,
++ percpu_counter_add_batch(&pl->events,
+ -val + (val >> (period-pl->period)), PROP_BATCH);
+ } else
+ percpu_counter_set(&pl->events, 0);
+@@ -219,7 +219,7 @@ static void fprop_reflect_period_percpu(
+ void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
+ {
+ fprop_reflect_period_percpu(p, pl);
+- __percpu_counter_add(&pl->events, 1, PROP_BATCH);
++ percpu_counter_add_batch(&pl->events, 1, PROP_BATCH);
+ percpu_counter_add(&p->events, 1);
+ }
+
+@@ -267,6 +267,6 @@ void __fprop_inc_percpu_max(struct fprop
+ return;
+ } else
+ fprop_reflect_period_percpu(p, pl);
+- __percpu_counter_add(&pl->events, 1, PROP_BATCH);
++ percpu_counter_add_batch(&pl->events, 1, PROP_BATCH);
+ percpu_counter_add(&p->events, 1);
+ }
+--- a/lib/percpu_counter.c
++++ b/lib/percpu_counter.c
+@@ -72,7 +72,7 @@ void percpu_counter_set(struct percpu_co
+ }
+ EXPORT_SYMBOL(percpu_counter_set);
+
+-void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
++void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
+ {
+ s64 count;
+
+@@ -89,7 +89,7 @@ void __percpu_counter_add(struct percpu_
+ }
+ preempt_enable();
+ }
+-EXPORT_SYMBOL(__percpu_counter_add);
++EXPORT_SYMBOL(percpu_counter_add_batch);
+
+ /*
+ * Add up all the per-cpu counts, return the result. This is a more accurate
--- /dev/null
+From 2a42eb9594a1480b4ead9e036e06ee1290e5fa6d Mon Sep 17 00:00:00 2001
+From: Wanpeng Li <wanpeng.li@hotmail.com>
+Date: Thu, 29 Jun 2017 19:15:11 +0200
+Subject: sched/cputime: Accumulate vtime on top of nsec clocksource
+
+From: Wanpeng Li <wanpeng.li@hotmail.com>
+
+commit 2a42eb9594a1480b4ead9e036e06ee1290e5fa6d upstream.
+
+Currently the cputime source used by vtime is jiffies. When we cross
+a context boundary and jiffies have changed since the last snapshot, the
+pending cputime is accounted to the context being switched out.
+
+This system works fine if the ticks are not aligned across CPUs. If they
+instead are aligned (i.e. they all fire at the same time) and the CPUs run in
+userspace, the jiffies change is only observed on tick exit and therefore
+the user cputime is accounted as system cputime. This is because the
+CPU that maintains timekeeping fires its tick at the same time as the
+others. It updates jiffies in the middle of the tick and the other CPUs
+see that update on IRQ exit:
+
+ CPU 0 (timekeeper) CPU 1
+ ------------------- -------------
+ jiffies = N
+ ... run in userspace for a jiffy
+ tick entry tick entry (sees jiffies = N)
+ set jiffies = N + 1
+ tick exit tick exit (sees jiffies = N + 1)
+ account 1 jiffy as stime
+
+Fix this by using a nanosecond clock source instead of jiffies. The
+cputime is then accumulated and flushed every time the pending delta
+reaches a jiffy, in order to mitigate the accounting overhead.
+
+[ fweisbec: changelog, rebase on struct vtime, field renames, add delta
+ on cputime readers, keep idle vtime as-is (low overhead accounting),
+ harmonize clock sources. ]
+
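+For reference, the accumulate-and-flush scheme introduced below boils down
+to the following pattern (shown for system time; user and guest time are
+handled the same way):
+
+    vtime->stime += get_vtime_delta(vtime);  /* nsec delta via sched_clock_cpu() */
+    if (vtime->stime >= TICK_NSEC) {         /* flush once a jiffy's worth accumulated */
+            account_system_time(tsk, irq_count(), vtime->stime);
+            vtime->stime = 0;
+    }
+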
+Suggested-by: Thomas Gleixner <tglx@linutronix.de>
+Reported-by: Luiz Capitulino <lcapitulino@redhat.com>
+Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
+Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
+Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Rik van Riel <riel@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Wanpeng Li <kernellwp@gmail.com>
+Link: http://lkml.kernel.org/r/1498756511-11714-6-git-send-email-fweisbec@gmail.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/sched.h | 3 ++
+ kernel/sched/cputime.c | 64 ++++++++++++++++++++++++++++++++-----------------
+ 2 files changed, 45 insertions(+), 22 deletions(-)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -236,6 +236,9 @@ struct vtime {
+ seqcount_t seqcount;
+ unsigned long long starttime;
+ enum vtime_state state;
++ u64 utime;
++ u64 stime;
++ u64 gtime;
+ };
+
+ struct sched_info {
+--- a/kernel/sched/cputime.c
++++ b/kernel/sched/cputime.c
+@@ -681,18 +681,19 @@ void thread_group_cputime_adjusted(struc
+ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ static u64 vtime_delta(struct vtime *vtime)
+ {
+- unsigned long now = READ_ONCE(jiffies);
++ unsigned long long clock;
+
+- if (time_before(now, (unsigned long)vtime->starttime))
++ clock = sched_clock_cpu(smp_processor_id());
++ if (clock < vtime->starttime)
+ return 0;
+
+- return jiffies_to_nsecs(now - vtime->starttime);
++ return clock - vtime->starttime;
+ }
+
+ static u64 get_vtime_delta(struct vtime *vtime)
+ {
+- unsigned long now = READ_ONCE(jiffies);
+- u64 delta, other;
++ u64 delta = vtime_delta(vtime);
++ u64 other;
+
+ /*
+ * Unlike tick based timing, vtime based timing never has lost
+@@ -701,17 +702,31 @@ static u64 get_vtime_delta(struct vtime
+ * elapsed time. Limit account_other_time to prevent rounding
+ * errors from causing elapsed vtime to go negative.
+ */
+- delta = jiffies_to_nsecs(now - vtime->starttime);
+ other = account_other_time(delta);
+ WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
+- vtime->starttime = now;
++ vtime->starttime += delta;
+
+ return delta - other;
+ }
+
+-static void __vtime_account_system(struct task_struct *tsk)
++static void __vtime_account_system(struct task_struct *tsk,
++ struct vtime *vtime)
+ {
+- account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime));
++ vtime->stime += get_vtime_delta(vtime);
++ if (vtime->stime >= TICK_NSEC) {
++ account_system_time(tsk, irq_count(), vtime->stime);
++ vtime->stime = 0;
++ }
++}
++
++static void vtime_account_guest(struct task_struct *tsk,
++ struct vtime *vtime)
++{
++ vtime->gtime += get_vtime_delta(vtime);
++ if (vtime->gtime >= TICK_NSEC) {
++ account_guest_time(tsk, vtime->gtime);
++ vtime->gtime = 0;
++ }
+ }
+
+ void vtime_account_system(struct task_struct *tsk)
+@@ -722,7 +737,11 @@ void vtime_account_system(struct task_st
+ return;
+
+ write_seqcount_begin(&vtime->seqcount);
+- __vtime_account_system(tsk);
++ /* We might have scheduled out from guest path */
++ if (current->flags & PF_VCPU)
++ vtime_account_guest(tsk, vtime);
++ else
++ __vtime_account_system(tsk, vtime);
+ write_seqcount_end(&vtime->seqcount);
+ }
+
+@@ -731,8 +750,7 @@ void vtime_user_enter(struct task_struct
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+- if (vtime_delta(vtime))
+- __vtime_account_system(tsk);
++ __vtime_account_system(tsk, vtime);
+ vtime->state = VTIME_USER;
+ write_seqcount_end(&vtime->seqcount);
+ }
+@@ -742,8 +760,11 @@ void vtime_user_exit(struct task_struct
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+- if (vtime_delta(vtime))
+- account_user_time(tsk, get_vtime_delta(vtime));
++ vtime->utime += get_vtime_delta(vtime);
++ if (vtime->utime >= TICK_NSEC) {
++ account_user_time(tsk, vtime->utime);
++ vtime->utime = 0;
++ }
+ vtime->state = VTIME_SYS;
+ write_seqcount_end(&vtime->seqcount);
+ }
+@@ -759,8 +780,7 @@ void vtime_guest_enter(struct task_struc
+ * that can thus safely catch up with a tickless delta.
+ */
+ write_seqcount_begin(&vtime->seqcount);
+- if (vtime_delta(vtime))
+- __vtime_account_system(tsk);
++ __vtime_account_system(tsk, vtime);
+ current->flags |= PF_VCPU;
+ write_seqcount_end(&vtime->seqcount);
+ }
+@@ -771,7 +791,7 @@ void vtime_guest_exit(struct task_struct
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+- __vtime_account_system(tsk);
++ vtime_account_guest(tsk, vtime);
+ current->flags &= ~PF_VCPU;
+ write_seqcount_end(&vtime->seqcount);
+ }
+@@ -794,7 +814,7 @@ void arch_vtime_task_switch(struct task_
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_SYS;
+- vtime->starttime = jiffies;
++ vtime->starttime = sched_clock_cpu(smp_processor_id());
+ write_seqcount_end(&vtime->seqcount);
+ }
+
+@@ -806,7 +826,7 @@ void vtime_init_idle(struct task_struct
+ local_irq_save(flags);
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_SYS;
+- vtime->starttime = jiffies;
++ vtime->starttime = sched_clock_cpu(cpu);
+ write_seqcount_end(&vtime->seqcount);
+ local_irq_restore(flags);
+ }
+@@ -825,7 +845,7 @@ u64 task_gtime(struct task_struct *t)
+
+ gtime = t->gtime;
+ if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+- gtime += vtime_delta(vtime);
++ gtime += vtime->gtime + vtime_delta(vtime);
+
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+
+@@ -866,9 +886,9 @@ void task_cputime(struct task_struct *t,
+ * the right place.
+ */
+ if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
+- *utime += delta;
++ *utime += vtime->utime + delta;
+ else if (vtime->state == VTIME_SYS)
+- *stime += delta;
++ *stime += vtime->stime + delta;
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+ }
+ #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
--- /dev/null
+From 9fa57cf5a5c4aed1e45879b335fe433048709327 Mon Sep 17 00:00:00 2001
+From: Frederic Weisbecker <fweisbec@gmail.com>
+Date: Thu, 29 Jun 2017 19:15:08 +0200
+Subject: sched/cputime: Always set tsk->vtime_snap_whence after accounting vtime
+
+From: Frederic Weisbecker <fweisbec@gmail.com>
+
+commit 9fa57cf5a5c4aed1e45879b335fe433048709327 upstream.
+
+Even though it doesn't have functional consequences, setting
+the task's new context state after we have actually accounted the pending
+vtime from the old context state makes more sense from a review
+perspective.
+
+vtime_user_exit() is the only function that doesn't follow that rule,
+which can confuse the reviewer for a little while until they realize there
+is no reason for this special case.
+
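+After the change vtime_user_exit() reads like its siblings: account the
+pending delta first, then switch the state. The function as it ends up
+after the hunk below:
+
+    void vtime_user_exit(struct task_struct *tsk)
+    {
+            write_seqcount_begin(&tsk->vtime_seqcount);
+            if (vtime_delta(tsk))
+                    account_user_time(tsk, get_vtime_delta(tsk));
+            tsk->vtime_snap_whence = VTIME_SYS;  /* state updated after accounting */
+            write_seqcount_end(&tsk->vtime_seqcount);
+    }
+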
+Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
+Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Rik van Riel <riel@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Wanpeng Li <kernellwp@gmail.com>
+Link: http://lkml.kernel.org/r/1498756511-11714-3-git-send-email-fweisbec@gmail.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/cputime.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/sched/cputime.c
++++ b/kernel/sched/cputime.c
+@@ -736,9 +736,9 @@ void vtime_user_enter(struct task_struct
+ void vtime_user_exit(struct task_struct *tsk)
+ {
+ write_seqcount_begin(&tsk->vtime_seqcount);
+- tsk->vtime_snap_whence = VTIME_SYS;
+ if (vtime_delta(tsk))
+ account_user_time(tsk, get_vtime_delta(tsk));
++ tsk->vtime_snap_whence = VTIME_SYS;
+ write_seqcount_end(&tsk->vtime_seqcount);
+ }
+
--- /dev/null
+From bac5b6b6b11560f323e71d0ebac4061cfe5f56c0 Mon Sep 17 00:00:00 2001
+From: Frederic Weisbecker <fweisbec@gmail.com>
+Date: Thu, 29 Jun 2017 19:15:10 +0200
+Subject: sched/cputime: Move the vtime task fields to their own struct
+
+From: Frederic Weisbecker <fweisbec@gmail.com>
+
+commit bac5b6b6b11560f323e71d0ebac4061cfe5f56c0 upstream.
+
+We are about to add vtime accumulation fields to the task struct. Let's
+avoid bloating it further and gather the vtime information into its own
+struct.
+
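+For reference, the new container added below and the resulting access
+pattern (callers switch from tsk->vtime_seqcount and friends to the fields
+of tsk->vtime):
+
+    struct vtime {
+            seqcount_t              seqcount;
+            unsigned long long      starttime;
+            enum vtime_state        state;
+    };
+
+    write_seqcount_begin(&tsk->vtime.seqcount);
+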
+Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
+Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Rik van Riel <riel@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Wanpeng Li <kernellwp@gmail.com>
+Link: http://lkml.kernel.org/r/1498756511-11714-5-git-send-email-fweisbec@gmail.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/init_task.h | 6 +-
+ include/linux/sched.h | 26 ++++++----
+ kernel/fork.c | 6 +-
+ kernel/sched/cputime.c | 114 ++++++++++++++++++++++++++--------------------
+ 4 files changed, 87 insertions(+), 65 deletions(-)
+
+--- a/include/linux/init_task.h
++++ b/include/linux/init_task.h
+@@ -170,9 +170,9 @@ extern struct cred init_cred;
+
+ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ # define INIT_VTIME(tsk) \
+- .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount), \
+- .vtime_starttime = 0, \
+- .vtime_state = VTIME_SYS,
++ .vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount), \
++ .vtime.starttime = 0, \
++ .vtime.state = VTIME_SYS,
+ #else
+ # define INIT_VTIME(tsk)
+ #endif
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -223,6 +223,21 @@ struct task_cputime {
+ #define prof_exp stime
+ #define sched_exp sum_exec_runtime
+
++enum vtime_state {
++ /* Task is sleeping or running in a CPU with VTIME inactive: */
++ VTIME_INACTIVE = 0,
++ /* Task runs in userspace in a CPU with VTIME active: */
++ VTIME_USER,
++ /* Task runs in kernelspace in a CPU with VTIME active: */
++ VTIME_SYS,
++};
++
++struct vtime {
++ seqcount_t seqcount;
++ unsigned long long starttime;
++ enum vtime_state state;
++};
++
+ struct sched_info {
+ #ifdef CONFIG_SCHED_INFO
+ /* Cumulative counters: */
+@@ -670,16 +685,7 @@ struct task_struct {
+ u64 gtime;
+ struct prev_cputime prev_cputime;
+ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+- seqcount_t vtime_seqcount;
+- unsigned long long vtime_starttime;
+- enum {
+- /* Task is sleeping or running in a CPU with VTIME inactive: */
+- VTIME_INACTIVE = 0,
+- /* Task runs in userspace in a CPU with VTIME active: */
+- VTIME_USER,
+- /* Task runs in kernelspace in a CPU with VTIME active: */
+- VTIME_SYS,
+- } vtime_state;
++ struct vtime vtime;
+ #endif
+
+ #ifdef CONFIG_NO_HZ_FULL
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1637,9 +1637,9 @@ static __latent_entropy struct task_stru
+ prev_cputime_init(&p->prev_cputime);
+
+ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+- seqcount_init(&p->vtime_seqcount);
+- p->vtime_starttime = 0;
+- p->vtime_state = VTIME_INACTIVE;
++ seqcount_init(&p->vtime.seqcount);
++ p->vtime.starttime = 0;
++ p->vtime.state = VTIME_INACTIVE;
+ #endif
+
+ #if defined(SPLIT_RSS_COUNTING)
+--- a/kernel/sched/cputime.c
++++ b/kernel/sched/cputime.c
+@@ -679,17 +679,17 @@ void thread_group_cputime_adjusted(struc
+ #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+
+ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+-static u64 vtime_delta(struct task_struct *tsk)
++static u64 vtime_delta(struct vtime *vtime)
+ {
+ unsigned long now = READ_ONCE(jiffies);
+
+- if (time_before(now, (unsigned long)tsk->vtime_starttime))
++ if (time_before(now, (unsigned long)vtime->starttime))
+ return 0;
+
+- return jiffies_to_nsecs(now - tsk->vtime_starttime);
++ return jiffies_to_nsecs(now - vtime->starttime);
+ }
+
+-static u64 get_vtime_delta(struct task_struct *tsk)
++static u64 get_vtime_delta(struct vtime *vtime)
+ {
+ unsigned long now = READ_ONCE(jiffies);
+ u64 delta, other;
+@@ -701,49 +701,56 @@ static u64 get_vtime_delta(struct task_s
+ * elapsed time. Limit account_other_time to prevent rounding
+ * errors from causing elapsed vtime to go negative.
+ */
+- delta = jiffies_to_nsecs(now - tsk->vtime_starttime);
++ delta = jiffies_to_nsecs(now - vtime->starttime);
+ other = account_other_time(delta);
+- WARN_ON_ONCE(tsk->vtime_state == VTIME_INACTIVE);
+- tsk->vtime_starttime = now;
++ WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
++ vtime->starttime = now;
+
+ return delta - other;
+ }
+
+ static void __vtime_account_system(struct task_struct *tsk)
+ {
+- account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
++ account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime));
+ }
+
+ void vtime_account_system(struct task_struct *tsk)
+ {
+- if (!vtime_delta(tsk))
++ struct vtime *vtime = &tsk->vtime;
++
++ if (!vtime_delta(vtime))
+ return;
+
+- write_seqcount_begin(&tsk->vtime_seqcount);
++ write_seqcount_begin(&vtime->seqcount);
+ __vtime_account_system(tsk);
+- write_seqcount_end(&tsk->vtime_seqcount);
++ write_seqcount_end(&vtime->seqcount);
+ }
+
+ void vtime_user_enter(struct task_struct *tsk)
+ {
+- write_seqcount_begin(&tsk->vtime_seqcount);
+- if (vtime_delta(tsk))
++ struct vtime *vtime = &tsk->vtime;
++
++ write_seqcount_begin(&vtime->seqcount);
++ if (vtime_delta(vtime))
+ __vtime_account_system(tsk);
+- tsk->vtime_snap_whence = VTIME_USER;
+- write_seqcount_end(&tsk->vtime_seqcount);
++ vtime->state = VTIME_USER;
++ write_seqcount_end(&vtime->seqcount);
+ }
+
+ void vtime_user_exit(struct task_struct *tsk)
+ {
+- write_seqcount_begin(&tsk->vtime_seqcount);
+- if (vtime_delta(tsk))
+- account_user_time(tsk, get_vtime_delta(tsk));
+- tsk->vtime_snap_whence = VTIME_SYS;
+- write_seqcount_end(&tsk->vtime_seqcount);
++ struct vtime *vtime = &tsk->vtime;
++
++ write_seqcount_begin(&vtime->seqcount);
++ if (vtime_delta(vtime))
++ account_user_time(tsk, get_vtime_delta(vtime));
++ vtime->state = VTIME_SYS;
++ write_seqcount_end(&vtime->seqcount);
+ }
+
+ void vtime_guest_enter(struct task_struct *tsk)
+ {
++ struct vtime *vtime = &tsk->vtime;
+ /*
+ * The flags must be updated under the lock with
+ * the vtime_starttime flush and update.
+@@ -751,54 +758,62 @@ void vtime_guest_enter(struct task_struc
+ * synchronization against the reader (task_gtime())
+ * that can thus safely catch up with a tickless delta.
+ */
+- write_seqcount_begin(&tsk->vtime_seqcount);
+- if (vtime_delta(tsk))
++ write_seqcount_begin(&vtime->seqcount);
++ if (vtime_delta(vtime))
+ __vtime_account_system(tsk);
+ current->flags |= PF_VCPU;
+- write_seqcount_end(&tsk->vtime_seqcount);
++ write_seqcount_end(&vtime->seqcount);
+ }
+ EXPORT_SYMBOL_GPL(vtime_guest_enter);
+
+ void vtime_guest_exit(struct task_struct *tsk)
+ {
+- write_seqcount_begin(&tsk->vtime_seqcount);
++ struct vtime *vtime = &tsk->vtime;
++
++ write_seqcount_begin(&vtime->seqcount);
+ __vtime_account_system(tsk);
+ current->flags &= ~PF_VCPU;
+- write_seqcount_end(&tsk->vtime_seqcount);
++ write_seqcount_end(&vtime->seqcount);
+ }
+ EXPORT_SYMBOL_GPL(vtime_guest_exit);
+
+ void vtime_account_idle(struct task_struct *tsk)
+ {
+- account_idle_time(get_vtime_delta(tsk));
++ account_idle_time(get_vtime_delta(&tsk->vtime));
+ }
+
+ void arch_vtime_task_switch(struct task_struct *prev)
+ {
+- write_seqcount_begin(&prev->vtime_seqcount);
+- prev->vtime_state = VTIME_INACTIVE;
+- write_seqcount_end(&prev->vtime_seqcount);
+-
+- write_seqcount_begin(¤t->vtime_seqcount);
+- current->vtime_state = VTIME_SYS;
+- current->vtime_starttime = jiffies;
+- write_seqcount_end(¤t->vtime_seqcount);
++ struct vtime *vtime = &prev->vtime;
++
++ write_seqcount_begin(&vtime->seqcount);
++ vtime->state = VTIME_INACTIVE;
++ write_seqcount_end(&vtime->seqcount);
++
++ vtime = ¤t->vtime;
++
++ write_seqcount_begin(&vtime->seqcount);
++ vtime->state = VTIME_SYS;
++ vtime->starttime = jiffies;
++ write_seqcount_end(&vtime->seqcount);
+ }
+
+ void vtime_init_idle(struct task_struct *t, int cpu)
+ {
++ struct vtime *vtime = &t->vtime;
+ unsigned long flags;
+
+ local_irq_save(flags);
+- write_seqcount_begin(&t->vtime_seqcount);
+- t->vtime_state = VTIME_SYS;
+- t->vtime_starttime = jiffies;
+- write_seqcount_end(&t->vtime_seqcount);
++ write_seqcount_begin(&vtime->seqcount);
++ vtime->state = VTIME_SYS;
++ vtime->starttime = jiffies;
++ write_seqcount_end(&vtime->seqcount);
+ local_irq_restore(flags);
+ }
+
+ u64 task_gtime(struct task_struct *t)
+ {
++ struct vtime *vtime = &t->vtime;
+ unsigned int seq;
+ u64 gtime;
+
+@@ -806,13 +821,13 @@ u64 task_gtime(struct task_struct *t)
+ return t->gtime;
+
+ do {
+- seq = read_seqcount_begin(&t->vtime_seqcount);
++ seq = read_seqcount_begin(&vtime->seqcount);
+
+ gtime = t->gtime;
+- if (t->vtime_state == VTIME_SYS && t->flags & PF_VCPU)
+- gtime += vtime_delta(t);
++ if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
++ gtime += vtime_delta(vtime);
+
+- } while (read_seqcount_retry(&t->vtime_seqcount, seq));
++ } while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return gtime;
+ }
+@@ -824,8 +839,9 @@ u64 task_gtime(struct task_struct *t)
+ */
+ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
+ {
+- u64 delta;
++ struct vtime *vtime = &t->vtime;
+ unsigned int seq;
++ u64 delta;
+
+ if (!vtime_accounting_enabled()) {
+ *utime = t->utime;
+@@ -834,25 +850,25 @@ void task_cputime(struct task_struct *t,
+ }
+
+ do {
+- seq = read_seqcount_begin(&t->vtime_seqcount);
++ seq = read_seqcount_begin(&vtime->seqcount);
+
+ *utime = t->utime;
+ *stime = t->stime;
+
+ /* Task is sleeping, nothing to add */
+- if (t->vtime_state == VTIME_INACTIVE || is_idle_task(t))
++ if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
+ continue;
+
+- delta = vtime_delta(t);
++ delta = vtime_delta(vtime);
+
+ /*
+ * Task runs either in user or kernel space, add pending nohz time to
+ * the right place.
+ */
+- if (t->vtime_state == VTIME_USER || t->flags & PF_VCPU)
++ if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
+ *utime += delta;
+- else if (t->vtime_state == VTIME_SYS)
++ else if (vtime->state == VTIME_SYS)
+ *stime += delta;
+- } while (read_seqcount_retry(&t->vtime_seqcount, seq));
++ } while (read_seqcount_retry(&vtime->seqcount, seq));
+ }
+ #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
--- /dev/null
+From 60a9ce57e7c5ac1df3a39fb941022bbfa40c0862 Mon Sep 17 00:00:00 2001
+From: Frederic Weisbecker <fweisbec@gmail.com>
+Date: Thu, 29 Jun 2017 19:15:09 +0200
+Subject: sched/cputime: Rename vtime fields
+
+From: Frederic Weisbecker <fweisbec@gmail.com>
+
+commit 60a9ce57e7c5ac1df3a39fb941022bbfa40c0862 upstream.
+
+The current "snapshot" based naming on vtime fields suggests we record
+some past event but that's a low level picture of their actual purpose
+which comes out blurry. The real point of these fields is to run a basic
+state machine that tracks down cputime entry while switching between
+contexts.
+
+So lets reflect that with more meaningful names.
+
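+For quick reference while reading the diff, the renames are:
+
+    /* vtime_snap        -> vtime_starttime  (when the current state began) */
+    /* vtime_snap_whence -> vtime_state      (which state the task is in)   */
+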
+Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
+Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Rik van Riel <riel@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Wanpeng Li <kernellwp@gmail.com>
+Link: http://lkml.kernel.org/r/1498756511-11714-4-git-send-email-fweisbec@gmail.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/init_task.h | 4 ++--
+ include/linux/sched.h | 4 ++--
+ kernel/fork.c | 4 ++--
+ kernel/sched/cputime.c | 30 +++++++++++++++---------------
+ 4 files changed, 21 insertions(+), 21 deletions(-)
+
+--- a/include/linux/init_task.h
++++ b/include/linux/init_task.h
+@@ -171,8 +171,8 @@ extern struct cred init_cred;
+ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ # define INIT_VTIME(tsk) \
+ .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount), \
+- .vtime_snap = 0, \
+- .vtime_snap_whence = VTIME_SYS,
++ .vtime_starttime = 0, \
++ .vtime_state = VTIME_SYS,
+ #else
+ # define INIT_VTIME(tsk)
+ #endif
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -671,7 +671,7 @@ struct task_struct {
+ struct prev_cputime prev_cputime;
+ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ seqcount_t vtime_seqcount;
+- unsigned long long vtime_snap;
++ unsigned long long vtime_starttime;
+ enum {
+ /* Task is sleeping or running in a CPU with VTIME inactive: */
+ VTIME_INACTIVE = 0,
+@@ -679,7 +679,7 @@ struct task_struct {
+ VTIME_USER,
+ /* Task runs in kernelspace in a CPU with VTIME active: */
+ VTIME_SYS,
+- } vtime_snap_whence;
++ } vtime_state;
+ #endif
+
+ #ifdef CONFIG_NO_HZ_FULL
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1638,8 +1638,8 @@ static __latent_entropy struct task_stru
+
+ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ seqcount_init(&p->vtime_seqcount);
+- p->vtime_snap = 0;
+- p->vtime_snap_whence = VTIME_INACTIVE;
++ p->vtime_starttime = 0;
++ p->vtime_state = VTIME_INACTIVE;
+ #endif
+
+ #if defined(SPLIT_RSS_COUNTING)
+--- a/kernel/sched/cputime.c
++++ b/kernel/sched/cputime.c
+@@ -683,10 +683,10 @@ static u64 vtime_delta(struct task_struc
+ {
+ unsigned long now = READ_ONCE(jiffies);
+
+- if (time_before(now, (unsigned long)tsk->vtime_snap))
++ if (time_before(now, (unsigned long)tsk->vtime_starttime))
+ return 0;
+
+- return jiffies_to_nsecs(now - tsk->vtime_snap);
++ return jiffies_to_nsecs(now - tsk->vtime_starttime);
+ }
+
+ static u64 get_vtime_delta(struct task_struct *tsk)
+@@ -701,10 +701,10 @@ static u64 get_vtime_delta(struct task_s
+ * elapsed time. Limit account_other_time to prevent rounding
+ * errors from causing elapsed vtime to go negative.
+ */
+- delta = jiffies_to_nsecs(now - tsk->vtime_snap);
++ delta = jiffies_to_nsecs(now - tsk->vtime_starttime);
+ other = account_other_time(delta);
+- WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
+- tsk->vtime_snap = now;
++ WARN_ON_ONCE(tsk->vtime_state == VTIME_INACTIVE);
++ tsk->vtime_starttime = now;
+
+ return delta - other;
+ }
+@@ -746,7 +746,7 @@ void vtime_guest_enter(struct task_struc
+ {
+ /*
+ * The flags must be updated under the lock with
+- * the vtime_snap flush and update.
++ * the vtime_starttime flush and update.
+ * That enforces a right ordering and update sequence
+ * synchronization against the reader (task_gtime())
+ * that can thus safely catch up with a tickless delta.
+@@ -776,12 +776,12 @@ void vtime_account_idle(struct task_stru
+ void arch_vtime_task_switch(struct task_struct *prev)
+ {
+ write_seqcount_begin(&prev->vtime_seqcount);
+- prev->vtime_snap_whence = VTIME_INACTIVE;
++ prev->vtime_state = VTIME_INACTIVE;
+ write_seqcount_end(&prev->vtime_seqcount);
+
+ write_seqcount_begin(¤t->vtime_seqcount);
+- current->vtime_snap_whence = VTIME_SYS;
+- current->vtime_snap = jiffies;
++ current->vtime_state = VTIME_SYS;
++ current->vtime_starttime = jiffies;
+ write_seqcount_end(¤t->vtime_seqcount);
+ }
+
+@@ -791,8 +791,8 @@ void vtime_init_idle(struct task_struct
+
+ local_irq_save(flags);
+ write_seqcount_begin(&t->vtime_seqcount);
+- t->vtime_snap_whence = VTIME_SYS;
+- t->vtime_snap = jiffies;
++ t->vtime_state = VTIME_SYS;
++ t->vtime_starttime = jiffies;
+ write_seqcount_end(&t->vtime_seqcount);
+ local_irq_restore(flags);
+ }
+@@ -809,7 +809,7 @@ u64 task_gtime(struct task_struct *t)
+ seq = read_seqcount_begin(&t->vtime_seqcount);
+
+ gtime = t->gtime;
+- if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
++ if (t->vtime_state == VTIME_SYS && t->flags & PF_VCPU)
+ gtime += vtime_delta(t);
+
+ } while (read_seqcount_retry(&t->vtime_seqcount, seq));
+@@ -840,7 +840,7 @@ void task_cputime(struct task_struct *t,
+ *stime = t->stime;
+
+ /* Task is sleeping, nothing to add */
+- if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
++ if (t->vtime_state == VTIME_INACTIVE || is_idle_task(t))
+ continue;
+
+ delta = vtime_delta(t);
+@@ -849,9 +849,9 @@ void task_cputime(struct task_struct *t,
+ * Task runs either in user or kernel space, add pending nohz time to
+ * the right place.
+ */
+- if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
++ if (t->vtime_state == VTIME_USER || t->flags & PF_VCPU)
+ *utime += delta;
+- else if (t->vtime_snap_whence == VTIME_SYS)
++ else if (t->vtime_state == VTIME_SYS)
+ *stime += delta;
+ } while (read_seqcount_retry(&t->vtime_seqcount, seq));
+ }
--- /dev/null
+From 65a4433aebe36c8c6abeb69b99ef00274b971c6c Mon Sep 17 00:00:00 2001
+From: Jeffrey Hugo <jhugo@codeaurora.org>
+Date: Wed, 7 Jun 2017 13:18:57 -0600
+Subject: sched/fair: Fix load_balance() affinity redo path
+
+From: Jeffrey Hugo <jhugo@codeaurora.org>
+
+commit 65a4433aebe36c8c6abeb69b99ef00274b971c6c upstream.
+
+If load_balance() fails to migrate any tasks because all tasks were
+affined, load_balance() removes the source CPU from consideration and
+attempts to redo and balance among the new subset of CPUs.
+
+There is a bug in this code path where the algorithm considers all active
+CPUs in the system (minus the source that was just masked out). This is
+not valid for two reasons: some active CPUs may not be in the current
+scheduling domain and one of the active CPUs is dst_cpu. These CPUs should
+not be considered, as we cannot pull load from them.
+
+Instead of failing out of load_balance(), we may end up redoing the search
+with no valid CPUs and incorrectly concluding the domain is balanced.
+Additionally, if the group_imbalance flag was just set, it may also be
+incorrectly unset, thus the flag will not be seen by other CPUs in future
+load_balance() runs as that algorithm intends.
+
+Fix the check by removing CPUs not in the current domain and the dst_cpu
+from consideration, thus limiting the evaluation to the valid remaining CPUs
+from which load might be migrated.
+
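+A sketch of the resulting logic (see the hunks below for the real code):
+
+    /* Start from the CPUs of this sched_domain, not every active CPU: */
+    cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
+
+    /* Redo only while candidate CPUs outside the destination group remain: */
+    if (!cpumask_subset(cpus, env.dst_grpmask)) {
+            env.loop = 0;
+            env.loop_break = sched_nr_migrate_break;
+            goto redo;
+    }
+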
+Co-authored-by: Austin Christ <austinwc@codeaurora.org>
+Co-authored-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Tested-by: Tyler Baicar <tbaicar@codeaurora.org>
+Signed-off-by: Jeffrey Hugo <jhugo@codeaurora.org>
+Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Cc: Austin Christ <austinwc@codeaurora.org>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Timur Tabi <timur@codeaurora.org>
+Link: http://lkml.kernel.org/r/1496863138-11322-2-git-send-email-jhugo@codeaurora.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/fair.c | 32 ++++++++++++++++++++------------
+ 1 file changed, 20 insertions(+), 12 deletions(-)
+
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -6619,10 +6619,10 @@ int can_migrate_task(struct task_struct
+ * our sched_group. We may want to revisit it if we couldn't
+ * meet load balance goals by pulling other tasks on src_cpu.
+ *
+- * Also avoid computing new_dst_cpu if we have already computed
+- * one in current iteration.
++ * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
++ * already computed one in current iteration.
+ */
+- if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
++ if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
+ return 0;
+
+ /* Prevent to re-select dst_cpu via env's cpus */
+@@ -7973,14 +7973,7 @@ static int load_balance(int this_cpu, st
+ .tasks = LIST_HEAD_INIT(env.tasks),
+ };
+
+- /*
+- * For NEWLY_IDLE load_balancing, we don't need to consider
+- * other cpus in our group
+- */
+- if (idle == CPU_NEWLY_IDLE)
+- env.dst_grpmask = NULL;
+-
+- cpumask_copy(cpus, cpu_active_mask);
++ cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
+
+ schedstat_inc(sd->lb_count[idle]);
+
+@@ -8102,7 +8095,15 @@ more_balance:
+ /* All tasks on this runqueue were pinned by CPU affinity */
+ if (unlikely(env.flags & LBF_ALL_PINNED)) {
+ cpumask_clear_cpu(cpu_of(busiest), cpus);
+- if (!cpumask_empty(cpus)) {
++ /*
++ * Attempting to continue load balancing at the current
++ * sched_domain level only makes sense if there are
++ * active CPUs remaining as possible busiest CPUs to
++ * pull load from which are not contained within the
++ * destination group that is receiving any migrated
++ * load.
++ */
++ if (!cpumask_subset(cpus, env.dst_grpmask)) {
+ env.loop = 0;
+ env.loop_break = sched_nr_migrate_break;
+ goto redo;
+@@ -8398,6 +8399,13 @@ static int active_load_balance_cpu_stop(
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
++ /*
++ * can_migrate_task() doesn't need to compute new_dst_cpu
++ * for active balancing. Since we have CPU_IDLE, but no
++ * @dst_grpmask we need to make that test go away with lying
++ * about DST_PINNED.
++ */
++ .flags = LBF_DST_PINNED,
+ };
+
+ schedstat_inc(sd->alb_count);
drm-i915-make-dp-mst-connector-info-work.patch
mlx5-avoid-that-mlx5_ib_sg_to_klms-overflows-the-klms-array.patch
hfsplus-don-t-clear-sgid-when-inheriting-acls.patch
+vtime-sched-cputime-remove-vtime_account_user.patch
+sched-cputime-always-set-tsk-vtime_snap_whence-after-accounting-vtime.patch
+sched-cputime-rename-vtime-fields.patch
+sched-cputime-move-the-vtime-task-fields-to-their-own-struct.patch
+sched-cputime-accumulate-vtime-on-top-of-nsec-clocksource.patch
+sched-fair-fix-load_balance-affinity-redo-path.patch
+percpu_counter-rename-__percpu_counter_add-to-percpu_counter_add_batch.patch
+writeback-rework-wb__stat-family-of-functions.patch
+kernel-fork.c-virtually-mapped-stacks-do-not-disable-interrupts.patch
--- /dev/null
+From 1c3eda01a79b8e9237d91c52c5a75b20983f47c6 Mon Sep 17 00:00:00 2001
+From: Frederic Weisbecker <fweisbec@gmail.com>
+Date: Thu, 29 Jun 2017 19:15:07 +0200
+Subject: vtime, sched/cputime: Remove vtime_account_user()
+
+From: Frederic Weisbecker <fweisbec@gmail.com>
+
+commit 1c3eda01a79b8e9237d91c52c5a75b20983f47c6 upstream.
+
+It's an unnecessary function between vtime_user_exit() and
+account_user_time().
+
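+The call chain simply collapses (see the hunks below):
+
+    /* before: vtime_user_exit() -> vtime_account_user() -> account_user_time() */
+    /* after:  vtime_user_exit() -> account_user_time()                          */
+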
+Tested-by: Luiz Capitulino <lcapitulino@redhat.com>
+Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Rik van Riel <riel@redhat.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Wanpeng Li <kernellwp@gmail.com>
+Link: http://lkml.kernel.org/r/1498756511-11714-2-git-send-email-fweisbec@gmail.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/vtime.h | 9 +--------
+ kernel/sched/cputime.c | 12 ++++++------
+ 2 files changed, 7 insertions(+), 14 deletions(-)
+
+--- a/include/linux/vtime.h
++++ b/include/linux/vtime.h
+@@ -67,19 +67,12 @@ static inline void vtime_account_system(
+
+ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+ extern void arch_vtime_task_switch(struct task_struct *tsk);
+-extern void vtime_account_user(struct task_struct *tsk);
+ extern void vtime_user_enter(struct task_struct *tsk);
+-
+-static inline void vtime_user_exit(struct task_struct *tsk)
+-{
+- vtime_account_user(tsk);
+-}
+-
++extern void vtime_user_exit(struct task_struct *tsk);
+ extern void vtime_guest_enter(struct task_struct *tsk);
+ extern void vtime_guest_exit(struct task_struct *tsk);
+ extern void vtime_init_idle(struct task_struct *tsk, int cpu);
+ #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */
+-static inline void vtime_account_user(struct task_struct *tsk) { }
+ static inline void vtime_user_enter(struct task_struct *tsk) { }
+ static inline void vtime_user_exit(struct task_struct *tsk) { }
+ static inline void vtime_guest_enter(struct task_struct *tsk) { }
+--- a/kernel/sched/cputime.c
++++ b/kernel/sched/cputime.c
+@@ -724,21 +724,21 @@ void vtime_account_system(struct task_st
+ write_seqcount_end(&tsk->vtime_seqcount);
+ }
+
+-void vtime_account_user(struct task_struct *tsk)
++void vtime_user_enter(struct task_struct *tsk)
+ {
+ write_seqcount_begin(&tsk->vtime_seqcount);
+- tsk->vtime_snap_whence = VTIME_SYS;
+ if (vtime_delta(tsk))
+- account_user_time(tsk, get_vtime_delta(tsk));
++ __vtime_account_system(tsk);
++ tsk->vtime_snap_whence = VTIME_USER;
+ write_seqcount_end(&tsk->vtime_seqcount);
+ }
+
+-void vtime_user_enter(struct task_struct *tsk)
++void vtime_user_exit(struct task_struct *tsk)
+ {
+ write_seqcount_begin(&tsk->vtime_seqcount);
++ tsk->vtime_snap_whence = VTIME_SYS;
+ if (vtime_delta(tsk))
+- __vtime_account_system(tsk);
+- tsk->vtime_snap_whence = VTIME_USER;
++ account_user_time(tsk, get_vtime_delta(tsk));
+ write_seqcount_end(&tsk->vtime_seqcount);
+ }
+
--- /dev/null
+From 3e8f399da490e6ac20a3cfd6aa404c9aa961a9a2 Mon Sep 17 00:00:00 2001
+From: Nikolay Borisov <nborisov@suse.com>
+Date: Wed, 12 Jul 2017 14:37:51 -0700
+Subject: writeback: rework wb_[dec|inc]_stat family of functions
+
+From: Nikolay Borisov <nborisov@suse.com>
+
+commit 3e8f399da490e6ac20a3cfd6aa404c9aa961a9a2 upstream.
+
+Currently the writeback statistics code uses percpu counters to hold
+various statistics. Furthermore, we have two families of functions: those
+which disable local irqs and those which don't and whose names begin
+with a double underscore. However, they both end up calling
+__add_wb_stat, which in turn calls percpu_counter_add_batch, which is
+already irq-safe.
+
+Exploiting this fact allows us to eliminate the __wb_* functions, since
+they don't add any protection beyond what we already have.
+Furthermore, refactor the wb_* functions to call __add_wb_stat directly
+without the irq-disabling dance. This will likely result in better
+runtime for code which modifies the stat counters.
+
+While at it, also document why percpu_counter_add_batch is in fact
+preempt- and irq-safe, since at least 3 people got confused.
+
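+Illustration only: after the rework the increment/decrement helpers reduce
+to plain calls into the already irq-safe __add_wb_stat():
+
+    static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
+    {
+            __add_wb_stat(wb, item, 1);  /* percpu_counter_add_batch() underneath */
+    }
+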
+Link: http://lkml.kernel.org/r/1498029937-27293-1-git-send-email-nborisov@suse.com
+Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+Acked-by: Tejun Heo <tj@kernel.org>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Cc: Josef Bacik <jbacik@fb.com>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Jeff Layton <jlayton@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/fs-writeback.c | 8 ++++----
+ include/linux/backing-dev.h | 24 ++----------------------
+ lib/percpu_counter.c | 7 +++++++
+ mm/page-writeback.c | 10 +++++-----
+ 4 files changed, 18 insertions(+), 31 deletions(-)
+
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -380,8 +380,8 @@ static void inode_switch_wbs_work_fn(str
+ struct page *page = radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock);
+ if (likely(page) && PageDirty(page)) {
+- __dec_wb_stat(old_wb, WB_RECLAIMABLE);
+- __inc_wb_stat(new_wb, WB_RECLAIMABLE);
++ dec_wb_stat(old_wb, WB_RECLAIMABLE);
++ inc_wb_stat(new_wb, WB_RECLAIMABLE);
+ }
+ }
+
+@@ -391,8 +391,8 @@ static void inode_switch_wbs_work_fn(str
+ &mapping->tree_lock);
+ if (likely(page)) {
+ WARN_ON_ONCE(!PageWriteback(page));
+- __dec_wb_stat(old_wb, WB_WRITEBACK);
+- __inc_wb_stat(new_wb, WB_WRITEBACK);
++ dec_wb_stat(old_wb, WB_WRITEBACK);
++ inc_wb_stat(new_wb, WB_WRITEBACK);
+ }
+ }
+
+--- a/include/linux/backing-dev.h
++++ b/include/linux/backing-dev.h
+@@ -69,34 +69,14 @@ static inline void __add_wb_stat(struct
+ percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
+ }
+
+-static inline void __inc_wb_stat(struct bdi_writeback *wb,
+- enum wb_stat_item item)
+-{
+- __add_wb_stat(wb, item, 1);
+-}
+-
+ static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
+ {
+- unsigned long flags;
+-
+- local_irq_save(flags);
+- __inc_wb_stat(wb, item);
+- local_irq_restore(flags);
+-}
+-
+-static inline void __dec_wb_stat(struct bdi_writeback *wb,
+- enum wb_stat_item item)
+-{
+- __add_wb_stat(wb, item, -1);
++ __add_wb_stat(wb, item, 1);
+ }
+
+ static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
+ {
+- unsigned long flags;
+-
+- local_irq_save(flags);
+- __dec_wb_stat(wb, item);
+- local_irq_restore(flags);
++ __add_wb_stat(wb, item, -1);
+ }
+
+ static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
+--- a/lib/percpu_counter.c
++++ b/lib/percpu_counter.c
+@@ -72,6 +72,13 @@ void percpu_counter_set(struct percpu_co
+ }
+ EXPORT_SYMBOL(percpu_counter_set);
+
++/**
++ * This function is both preempt and irq safe. The former is due to explicit
++ * preemption disable. The latter is guaranteed by the fact that the slow path
++ * is explicitly protected by an irq-safe spinlock whereas the fast patch uses
++ * this_cpu_add which is irq-safe by definition. Hence there is no need muck
++ * with irq state before calling this one
++ */
+ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
+ {
+ s64 count;
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -601,7 +601,7 @@ static inline void __wb_writeout_inc(str
+ {
+ struct wb_domain *cgdom;
+
+- __inc_wb_stat(wb, WB_WRITTEN);
++ inc_wb_stat(wb, WB_WRITTEN);
+ wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
+ wb->bdi->max_prop_frac);
+
+@@ -2437,8 +2437,8 @@ void account_page_dirtied(struct page *p
+ __inc_node_page_state(page, NR_FILE_DIRTY);
+ __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
+ __inc_node_page_state(page, NR_DIRTIED);
+- __inc_wb_stat(wb, WB_RECLAIMABLE);
+- __inc_wb_stat(wb, WB_DIRTIED);
++ inc_wb_stat(wb, WB_RECLAIMABLE);
++ inc_wb_stat(wb, WB_DIRTIED);
+ task_io_account_write(PAGE_SIZE);
+ current->nr_dirtied++;
+ this_cpu_inc(bdp_ratelimits);
+@@ -2745,7 +2745,7 @@ int test_clear_page_writeback(struct pag
+ if (bdi_cap_account_writeback(bdi)) {
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+- __dec_wb_stat(wb, WB_WRITEBACK);
++ dec_wb_stat(wb, WB_WRITEBACK);
+ __wb_writeout_inc(wb);
+ }
+ }
+@@ -2791,7 +2791,7 @@ int __test_set_page_writeback(struct pag
+ page_index(page),
+ PAGECACHE_TAG_WRITEBACK);
+ if (bdi_cap_account_writeback(bdi))
+- __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
++ inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
+
+ /*
+ * We can come through here when swapping anonymous