From 9ea078294fda173608ea4ee270f32090ad5ec6a6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 25 Jul 2017 08:18:32 -0700 Subject: [PATCH] 4.12-stable patches added patches: kernel-fork.c-virtually-mapped-stacks-do-not-disable-interrupts.patch percpu_counter-rename-__percpu_counter_add-to-percpu_counter_add_batch.patch sched-cputime-accumulate-vtime-on-top-of-nsec-clocksource.patch sched-cputime-always-set-tsk-vtime_snap_whence-after-accounting-vtime.patch sched-cputime-move-the-vtime-task-fields-to-their-own-struct.patch sched-cputime-rename-vtime-fields.patch sched-fair-fix-load_balance-affinity-redo-path.patch vtime-sched-cputime-remove-vtime_account_user.patch writeback-rework-wb__stat-family-of-functions.patch --- ...ped-stacks-do-not-disable-interrupts.patch | 73 ++++ ...nter_add-to-percpu_counter_add_batch.patch | 284 +++++++++++++++ ...ate-vtime-on-top-of-nsec-clocksource.patch | 228 ++++++++++++ ...e_snap_whence-after-accounting-vtime.patch | 47 +++ ...time-task-fields-to-their-own-struct.patch | 343 ++++++++++++++++++ .../sched-cputime-rename-vtime-fields.patch | 177 +++++++++ ...-fix-load_balance-affinity-redo-path.patch | 112 ++++++ queue-4.12/series | 9 + ...ed-cputime-remove-vtime_account_user.patch | 82 +++++ ...-rework-wb__stat-family-of-functions.patch | 163 +++++++++ 10 files changed, 1518 insertions(+) create mode 100644 queue-4.12/kernel-fork.c-virtually-mapped-stacks-do-not-disable-interrupts.patch create mode 100644 queue-4.12/percpu_counter-rename-__percpu_counter_add-to-percpu_counter_add_batch.patch create mode 100644 queue-4.12/sched-cputime-accumulate-vtime-on-top-of-nsec-clocksource.patch create mode 100644 queue-4.12/sched-cputime-always-set-tsk-vtime_snap_whence-after-accounting-vtime.patch create mode 100644 queue-4.12/sched-cputime-move-the-vtime-task-fields-to-their-own-struct.patch create mode 100644 queue-4.12/sched-cputime-rename-vtime-fields.patch create mode 100644 queue-4.12/sched-fair-fix-load_balance-affinity-redo-path.patch create mode 100644 queue-4.12/vtime-sched-cputime-remove-vtime_account_user.patch create mode 100644 queue-4.12/writeback-rework-wb__stat-family-of-functions.patch diff --git a/queue-4.12/kernel-fork.c-virtually-mapped-stacks-do-not-disable-interrupts.patch b/queue-4.12/kernel-fork.c-virtually-mapped-stacks-do-not-disable-interrupts.patch new file mode 100644 index 00000000000..9d4a1c9b8a5 --- /dev/null +++ b/queue-4.12/kernel-fork.c-virtually-mapped-stacks-do-not-disable-interrupts.patch @@ -0,0 +1,73 @@ +From 112166f88cf83dd11486cf1818672d42b540865b Mon Sep 17 00:00:00 2001 +From: Christoph Lameter +Date: Wed, 12 Jul 2017 14:33:11 -0700 +Subject: kernel/fork.c: virtually mapped stacks: do not disable interrupts + +From: Christoph Lameter + +commit 112166f88cf83dd11486cf1818672d42b540865b upstream. + +The reason to disable interrupts seems to be to avoid switching to a +different processor while handling per cpu data using individual loads and +stores. If we use per cpu RMV primitives we will not have to disable +interrupts. 
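
A minimal userspace sketch of the claim/return pattern the change switches to, written with C11 atomics rather than the kernel's this_cpu_xchg()/this_cpu_cmpxchg() (the array, slot count, and function names below are invented for illustration): a cached entry is claimed by atomically swapping its slot with NULL, and handed back by compare-and-swapping NULL into an empty slot, so the read-check-write sequence that previously needed interrupts disabled collapses into a single atomic read-modify-write.

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

#define NR_SLOTS 2	/* stands in for NR_CACHED_STACKS */

static _Atomic(void *) cached[NR_SLOTS];

static void *cache_claim(void)
{
	for (int i = 0; i < NR_SLOTS; i++) {
		/* take the slot and empty it in one atomic step */
		void *s = atomic_exchange(&cached[i], NULL);

		if (s)
			return s;
	}
	return NULL;	/* caller falls back to a fresh allocation */
}

static int cache_return(void *obj)
{
	for (int i = 0; i < NR_SLOTS; i++) {
		void *expected = NULL;

		/* install only into a slot that is currently empty */
		if (atomic_compare_exchange_strong(&cached[i], &expected, obj))
			return 1;
	}
	return 0;	/* cache full, caller frees the object */
}

int main(void)
{
	static int stack_stand_in;

	cache_return(&stack_stand_in);
	printf("claimed back: %p\n", cache_claim());
	return 0;
}

With a single atomic read-modify-write per slot there is no window between the load and the store, so there is nothing left for the interrupt disabling to protect.
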
+ +Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1705171055130.5898@east.gentwo.org +Signed-off-by: Christoph Lameter +Cc: Andy Lutomirski +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/fork.c | 16 +++++----------- + 1 file changed, 5 insertions(+), 11 deletions(-) + +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -205,19 +205,17 @@ static unsigned long *alloc_thread_stack + void *stack; + int i; + +- local_irq_disable(); + for (i = 0; i < NR_CACHED_STACKS; i++) { +- struct vm_struct *s = this_cpu_read(cached_stacks[i]); ++ struct vm_struct *s; ++ ++ s = this_cpu_xchg(cached_stacks[i], NULL); + + if (!s) + continue; +- this_cpu_write(cached_stacks[i], NULL); + + tsk->stack_vm_area = s; +- local_irq_enable(); + return s->addr; + } +- local_irq_enable(); + + stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, + VMALLOC_START, VMALLOC_END, +@@ -245,19 +243,15 @@ static inline void free_thread_stack(str + { + #ifdef CONFIG_VMAP_STACK + if (task_stack_vm_area(tsk)) { +- unsigned long flags; + int i; + +- local_irq_save(flags); + for (i = 0; i < NR_CACHED_STACKS; i++) { +- if (this_cpu_read(cached_stacks[i])) ++ if (this_cpu_cmpxchg(cached_stacks[i], ++ NULL, tsk->stack_vm_area) != NULL) + continue; + +- this_cpu_write(cached_stacks[i], tsk->stack_vm_area); +- local_irq_restore(flags); + return; + } +- local_irq_restore(flags); + + vfree_atomic(tsk->stack); + return; diff --git a/queue-4.12/percpu_counter-rename-__percpu_counter_add-to-percpu_counter_add_batch.patch b/queue-4.12/percpu_counter-rename-__percpu_counter_add-to-percpu_counter_add_batch.patch new file mode 100644 index 00000000000..310b698bc0e --- /dev/null +++ b/queue-4.12/percpu_counter-rename-__percpu_counter_add-to-percpu_counter_add_batch.patch @@ -0,0 +1,284 @@ +From 104b4e5139fe384431ac11c3b8a6cf4a529edf4a Mon Sep 17 00:00:00 2001 +From: Nikolay Borisov +Date: Tue, 20 Jun 2017 21:01:20 +0300 +Subject: percpu_counter: Rename __percpu_counter_add to percpu_counter_add_batch + +From: Nikolay Borisov + +commit 104b4e5139fe384431ac11c3b8a6cf4a529edf4a upstream. + +Currently, percpu_counter_add is a wrapper around __percpu_counter_add +which is preempt safe due to explicit calls to preempt_disable. Given +how __ prefix is used in percpu related interfaces, the naming +unfortunately creates the false sense that __percpu_counter_add is +less safe than percpu_counter_add. In terms of context-safety, +they're equivalent. The only difference is that the __ version takes +a batch parameter. + +Make this a bit more explicit by just renaming __percpu_counter_add to +percpu_counter_add_batch. + +This patch doesn't cause any functional changes. + +tj: Minor updates to patch description for clarity. Cosmetic + indentation updates. + +Signed-off-by: Nikolay Borisov +Signed-off-by: Tejun Heo +Cc: Chris Mason +Cc: Josef Bacik +Cc: David Sterba +Cc: Darrick J. Wong +Cc: Jan Kara +Cc: Jens Axboe +Cc: linux-mm@kvack.org +Cc: "David S. 
Miller" +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/disk-io.c | 12 ++++++------ + fs/btrfs/extent_io.c | 6 +++--- + fs/btrfs/inode.c | 8 ++++---- + fs/xfs/xfs_mount.c | 4 ++-- + include/linux/backing-dev.h | 2 +- + include/linux/blk-cgroup.h | 6 +++--- + include/linux/mman.h | 2 +- + include/linux/percpu_counter.h | 7 ++++--- + include/net/inet_frag.h | 4 ++-- + lib/flex_proportions.c | 6 +++--- + lib/percpu_counter.c | 4 ++-- + 11 files changed, 31 insertions(+), 30 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -1255,9 +1255,9 @@ void clean_tree_block(struct btrfs_fs_in + btrfs_assert_tree_locked(buf); + + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { +- __percpu_counter_add(&fs_info->dirty_metadata_bytes, +- -buf->len, +- fs_info->dirty_metadata_batch); ++ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, ++ -buf->len, ++ fs_info->dirty_metadata_batch); + /* ugh, clear_extent_buffer_dirty needs to lock the page */ + btrfs_set_lock_blocking(buf); + clear_extent_buffer_dirty(buf); +@@ -4049,9 +4049,9 @@ void btrfs_mark_buffer_dirty(struct exte + buf->start, transid, fs_info->generation); + was_dirty = set_extent_buffer_dirty(buf); + if (!was_dirty) +- __percpu_counter_add(&fs_info->dirty_metadata_bytes, +- buf->len, +- fs_info->dirty_metadata_batch); ++ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, ++ buf->len, ++ fs_info->dirty_metadata_batch); + #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { + btrfs_print_leaf(fs_info, buf); +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -3597,9 +3597,9 @@ lock_extent_buffer_for_io(struct extent_ + set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + spin_unlock(&eb->refs_lock); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); +- __percpu_counter_add(&fs_info->dirty_metadata_bytes, +- -eb->len, +- fs_info->dirty_metadata_batch); ++ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, ++ -eb->len, ++ fs_info->dirty_metadata_batch); + ret = 1; + } else { + spin_unlock(&eb->refs_lock); +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -1766,8 +1766,8 @@ static void btrfs_set_bit_hook(struct in + if (btrfs_is_testing(fs_info)) + return; + +- __percpu_counter_add(&fs_info->delalloc_bytes, len, +- fs_info->delalloc_batch); ++ percpu_counter_add_batch(&fs_info->delalloc_bytes, len, ++ fs_info->delalloc_batch); + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->delalloc_bytes += len; + if (*bits & EXTENT_DEFRAG) +@@ -1840,8 +1840,8 @@ static void btrfs_clear_bit_hook(struct + &inode->vfs_inode, + state->start, len); + +- __percpu_counter_add(&fs_info->delalloc_bytes, -len, +- fs_info->delalloc_batch); ++ percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, ++ fs_info->delalloc_batch); + spin_lock(&inode->lock); + inode->delalloc_bytes -= len; + if (do_list && inode->delalloc_bytes == 0 && +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -1209,7 +1209,7 @@ xfs_mod_icount( + struct xfs_mount *mp, + int64_t delta) + { +- __percpu_counter_add(&mp->m_icount, delta, XFS_ICOUNT_BATCH); ++ percpu_counter_add_batch(&mp->m_icount, delta, XFS_ICOUNT_BATCH); + if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) { + ASSERT(0); + percpu_counter_add(&mp->m_icount, -delta); +@@ -1288,7 +1288,7 @@ xfs_mod_fdblocks( + else + batch = XFS_FDBLOCKS_BATCH; + +- __percpu_counter_add(&mp->m_fdblocks, delta, batch); ++ percpu_counter_add_batch(&mp->m_fdblocks, delta, batch); + if 
(__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside, + XFS_FDBLOCKS_BATCH) >= 0) { + /* we had space! */ +--- a/include/linux/backing-dev.h ++++ b/include/linux/backing-dev.h +@@ -66,7 +66,7 @@ static inline bool bdi_has_dirty_io(stru + static inline void __add_wb_stat(struct bdi_writeback *wb, + enum wb_stat_item item, s64 amount) + { +- __percpu_counter_add(&wb->stat[item], amount, WB_STAT_BATCH); ++ percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); + } + + static inline void __inc_wb_stat(struct bdi_writeback *wb, +--- a/include/linux/blk-cgroup.h ++++ b/include/linux/blk-cgroup.h +@@ -518,7 +518,7 @@ static inline void blkg_stat_exit(struct + */ + static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) + { +- __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); ++ percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); + } + + /** +@@ -597,14 +597,14 @@ static inline void blkg_rwstat_add(struc + else + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; + +- __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); ++ percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); + + if (op_is_sync(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; + else + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; + +- __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); ++ percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); + } + + /** +--- a/include/linux/mman.h ++++ b/include/linux/mman.h +@@ -22,7 +22,7 @@ unsigned long vm_memory_committed(void); + + static inline void vm_acct_memory(long pages) + { +- __percpu_counter_add(&vm_committed_as, pages, vm_committed_as_batch); ++ percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch); + } + + static inline void vm_unacct_memory(long pages) +--- a/include/linux/percpu_counter.h ++++ b/include/linux/percpu_counter.h +@@ -39,7 +39,8 @@ int __percpu_counter_init(struct percpu_ + + void percpu_counter_destroy(struct percpu_counter *fbc); + void percpu_counter_set(struct percpu_counter *fbc, s64 amount); +-void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch); ++void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, ++ s32 batch); + s64 __percpu_counter_sum(struct percpu_counter *fbc); + int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch); + +@@ -50,7 +51,7 @@ static inline int percpu_counter_compare + + static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) + { +- __percpu_counter_add(fbc, amount, percpu_counter_batch); ++ percpu_counter_add_batch(fbc, amount, percpu_counter_batch); + } + + static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) +@@ -136,7 +137,7 @@ percpu_counter_add(struct percpu_counter + } + + static inline void +-__percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) ++percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) + { + percpu_counter_add(fbc, amount); + } +--- a/include/net/inet_frag.h ++++ b/include/net/inet_frag.h +@@ -154,12 +154,12 @@ static inline int frag_mem_limit(struct + + static inline void sub_frag_mem_limit(struct netns_frags *nf, int i) + { +- __percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch); ++ percpu_counter_add_batch(&nf->mem, -i, frag_percpu_counter_batch); + } + + static inline void add_frag_mem_limit(struct netns_frags *nf, int i) + { +- __percpu_counter_add(&nf->mem, i, frag_percpu_counter_batch); ++ percpu_counter_add_batch(&nf->mem, i, frag_percpu_counter_batch); + } + + static inline 
unsigned int sum_frag_mem_limit(struct netns_frags *nf) +--- a/lib/flex_proportions.c ++++ b/lib/flex_proportions.c +@@ -207,7 +207,7 @@ static void fprop_reflect_period_percpu( + if (val < (nr_cpu_ids * PROP_BATCH)) + val = percpu_counter_sum(&pl->events); + +- __percpu_counter_add(&pl->events, ++ percpu_counter_add_batch(&pl->events, + -val + (val >> (period-pl->period)), PROP_BATCH); + } else + percpu_counter_set(&pl->events, 0); +@@ -219,7 +219,7 @@ static void fprop_reflect_period_percpu( + void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl) + { + fprop_reflect_period_percpu(p, pl); +- __percpu_counter_add(&pl->events, 1, PROP_BATCH); ++ percpu_counter_add_batch(&pl->events, 1, PROP_BATCH); + percpu_counter_add(&p->events, 1); + } + +@@ -267,6 +267,6 @@ void __fprop_inc_percpu_max(struct fprop + return; + } else + fprop_reflect_period_percpu(p, pl); +- __percpu_counter_add(&pl->events, 1, PROP_BATCH); ++ percpu_counter_add_batch(&pl->events, 1, PROP_BATCH); + percpu_counter_add(&p->events, 1); + } +--- a/lib/percpu_counter.c ++++ b/lib/percpu_counter.c +@@ -72,7 +72,7 @@ void percpu_counter_set(struct percpu_co + } + EXPORT_SYMBOL(percpu_counter_set); + +-void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch) ++void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) + { + s64 count; + +@@ -89,7 +89,7 @@ void __percpu_counter_add(struct percpu_ + } + preempt_enable(); + } +-EXPORT_SYMBOL(__percpu_counter_add); ++EXPORT_SYMBOL(percpu_counter_add_batch); + + /* + * Add up all the per-cpu counts, return the result. This is a more accurate diff --git a/queue-4.12/sched-cputime-accumulate-vtime-on-top-of-nsec-clocksource.patch b/queue-4.12/sched-cputime-accumulate-vtime-on-top-of-nsec-clocksource.patch new file mode 100644 index 00000000000..bcb2a80b5ad --- /dev/null +++ b/queue-4.12/sched-cputime-accumulate-vtime-on-top-of-nsec-clocksource.patch @@ -0,0 +1,228 @@ +From 2a42eb9594a1480b4ead9e036e06ee1290e5fa6d Mon Sep 17 00:00:00 2001 +From: Wanpeng Li +Date: Thu, 29 Jun 2017 19:15:11 +0200 +Subject: sched/cputime: Accumulate vtime on top of nsec clocksource + +From: Wanpeng Li + +commit 2a42eb9594a1480b4ead9e036e06ee1290e5fa6d upstream. + +Currently the cputime source used by vtime is jiffies. When we cross +a context boundary and jiffies have changed since the last snapshot, the +pending cputime is accounted to the switching out context. + +This system works ok if the ticks are not aligned across CPUs. If they +instead are aligned (ie: all fire at the same time) and the CPUs run in +userspace, the jiffies change is only observed on tick exit and therefore +the user cputime is accounted as system cputime. This is because the +CPU that maintains timekeeping fires its tick at the same time as the +others. It updates jiffies in the middle of the tick and the other CPUs +see that update on IRQ exit: + + CPU 0 (timekeeper) CPU 1 + ------------------- ------------- + jiffies = N + ... run in userspace for a jiffy + tick entry tick entry (sees jiffies = N) + set jiffies = N + 1 + tick exit tick exit (sees jiffies = N + 1) + account 1 jiffy as stime + +Fix this with using a nanosec clock source instead of jiffies. The +cputime is then accumulated and flushed everytime the pending delta +reaches a jiffy in order to mitigate the accounting overhead. + +[ fweisbec: changelog, rebase on struct vtime, field renames, add delta + on cputime readers, keep idle vtime as-is (low overhead accounting), + harmonize clock sources. 
] + +Suggested-by: Thomas Gleixner +Reported-by: Luiz Capitulino +Tested-by: Luiz Capitulino +Signed-off-by: Wanpeng Li +Signed-off-by: Frederic Weisbecker +Reviewed-by: Thomas Gleixner +Acked-by: Rik van Riel +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Wanpeng Li +Link: http://lkml.kernel.org/r/1498756511-11714-6-git-send-email-fweisbec@gmail.com +Signed-off-by: Ingo Molnar +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/sched.h | 3 ++ + kernel/sched/cputime.c | 64 ++++++++++++++++++++++++++++++++----------------- + 2 files changed, 45 insertions(+), 22 deletions(-) + +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -236,6 +236,9 @@ struct vtime { + seqcount_t seqcount; + unsigned long long starttime; + enum vtime_state state; ++ u64 utime; ++ u64 stime; ++ u64 gtime; + }; + + struct sched_info { +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -681,18 +681,19 @@ void thread_group_cputime_adjusted(struc + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN + static u64 vtime_delta(struct vtime *vtime) + { +- unsigned long now = READ_ONCE(jiffies); ++ unsigned long long clock; + +- if (time_before(now, (unsigned long)vtime->starttime)) ++ clock = sched_clock_cpu(smp_processor_id()); ++ if (clock < vtime->starttime) + return 0; + +- return jiffies_to_nsecs(now - vtime->starttime); ++ return clock - vtime->starttime; + } + + static u64 get_vtime_delta(struct vtime *vtime) + { +- unsigned long now = READ_ONCE(jiffies); +- u64 delta, other; ++ u64 delta = vtime_delta(vtime); ++ u64 other; + + /* + * Unlike tick based timing, vtime based timing never has lost +@@ -701,17 +702,31 @@ static u64 get_vtime_delta(struct vtime + * elapsed time. Limit account_other_time to prevent rounding + * errors from causing elapsed vtime to go negative. 
+ */ +- delta = jiffies_to_nsecs(now - vtime->starttime); + other = account_other_time(delta); + WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); +- vtime->starttime = now; ++ vtime->starttime += delta; + + return delta - other; + } + +-static void __vtime_account_system(struct task_struct *tsk) ++static void __vtime_account_system(struct task_struct *tsk, ++ struct vtime *vtime) + { +- account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime)); ++ vtime->stime += get_vtime_delta(vtime); ++ if (vtime->stime >= TICK_NSEC) { ++ account_system_time(tsk, irq_count(), vtime->stime); ++ vtime->stime = 0; ++ } ++} ++ ++static void vtime_account_guest(struct task_struct *tsk, ++ struct vtime *vtime) ++{ ++ vtime->gtime += get_vtime_delta(vtime); ++ if (vtime->gtime >= TICK_NSEC) { ++ account_guest_time(tsk, vtime->gtime); ++ vtime->gtime = 0; ++ } + } + + void vtime_account_system(struct task_struct *tsk) +@@ -722,7 +737,11 @@ void vtime_account_system(struct task_st + return; + + write_seqcount_begin(&vtime->seqcount); +- __vtime_account_system(tsk); ++ /* We might have scheduled out from guest path */ ++ if (current->flags & PF_VCPU) ++ vtime_account_guest(tsk, vtime); ++ else ++ __vtime_account_system(tsk, vtime); + write_seqcount_end(&vtime->seqcount); + } + +@@ -731,8 +750,7 @@ void vtime_user_enter(struct task_struct + struct vtime *vtime = &tsk->vtime; + + write_seqcount_begin(&vtime->seqcount); +- if (vtime_delta(vtime)) +- __vtime_account_system(tsk); ++ __vtime_account_system(tsk, vtime); + vtime->state = VTIME_USER; + write_seqcount_end(&vtime->seqcount); + } +@@ -742,8 +760,11 @@ void vtime_user_exit(struct task_struct + struct vtime *vtime = &tsk->vtime; + + write_seqcount_begin(&vtime->seqcount); +- if (vtime_delta(vtime)) +- account_user_time(tsk, get_vtime_delta(vtime)); ++ vtime->utime += get_vtime_delta(vtime); ++ if (vtime->utime >= TICK_NSEC) { ++ account_user_time(tsk, vtime->utime); ++ vtime->utime = 0; ++ } + vtime->state = VTIME_SYS; + write_seqcount_end(&vtime->seqcount); + } +@@ -759,8 +780,7 @@ void vtime_guest_enter(struct task_struc + * that can thus safely catch up with a tickless delta. 
+ */ + write_seqcount_begin(&vtime->seqcount); +- if (vtime_delta(vtime)) +- __vtime_account_system(tsk); ++ __vtime_account_system(tsk, vtime); + current->flags |= PF_VCPU; + write_seqcount_end(&vtime->seqcount); + } +@@ -771,7 +791,7 @@ void vtime_guest_exit(struct task_struct + struct vtime *vtime = &tsk->vtime; + + write_seqcount_begin(&vtime->seqcount); +- __vtime_account_system(tsk); ++ vtime_account_guest(tsk, vtime); + current->flags &= ~PF_VCPU; + write_seqcount_end(&vtime->seqcount); + } +@@ -794,7 +814,7 @@ void arch_vtime_task_switch(struct task_ + + write_seqcount_begin(&vtime->seqcount); + vtime->state = VTIME_SYS; +- vtime->starttime = jiffies; ++ vtime->starttime = sched_clock_cpu(smp_processor_id()); + write_seqcount_end(&vtime->seqcount); + } + +@@ -806,7 +826,7 @@ void vtime_init_idle(struct task_struct + local_irq_save(flags); + write_seqcount_begin(&vtime->seqcount); + vtime->state = VTIME_SYS; +- vtime->starttime = jiffies; ++ vtime->starttime = sched_clock_cpu(cpu); + write_seqcount_end(&vtime->seqcount); + local_irq_restore(flags); + } +@@ -825,7 +845,7 @@ u64 task_gtime(struct task_struct *t) + + gtime = t->gtime; + if (vtime->state == VTIME_SYS && t->flags & PF_VCPU) +- gtime += vtime_delta(vtime); ++ gtime += vtime->gtime + vtime_delta(vtime); + + } while (read_seqcount_retry(&vtime->seqcount, seq)); + +@@ -866,9 +886,9 @@ void task_cputime(struct task_struct *t, + * the right place. + */ + if (vtime->state == VTIME_USER || t->flags & PF_VCPU) +- *utime += delta; ++ *utime += vtime->utime + delta; + else if (vtime->state == VTIME_SYS) +- *stime += delta; ++ *stime += vtime->stime + delta; + } while (read_seqcount_retry(&vtime->seqcount, seq)); + } + #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/queue-4.12/sched-cputime-always-set-tsk-vtime_snap_whence-after-accounting-vtime.patch b/queue-4.12/sched-cputime-always-set-tsk-vtime_snap_whence-after-accounting-vtime.patch new file mode 100644 index 00000000000..000fb8a4813 --- /dev/null +++ b/queue-4.12/sched-cputime-always-set-tsk-vtime_snap_whence-after-accounting-vtime.patch @@ -0,0 +1,47 @@ +From 9fa57cf5a5c4aed1e45879b335fe433048709327 Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker +Date: Thu, 29 Jun 2017 19:15:08 +0200 +Subject: sched/cputime: Always set tsk->vtime_snap_whence after accounting vtime + +From: Frederic Weisbecker + +commit 9fa57cf5a5c4aed1e45879b335fe433048709327 upstream. + +Even though it doesn't have functional consequences, setting +the task's new context state after we actually accounted the pending +vtime from the old context state makes more sense from a review +perspective. + +vtime_user_exit() is the only function that doesn't follow that rule +and that can bug the reviewer for a little while until he realizes there +is no reason for this special case. 
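
The commit is a pure consistency cleanup, but the convention it enforces can be shown as a small self-contained sketch (plain C with an invented clock source and state names, not the kernel's seqcount-protected code): charge the elapsed delta to the context being left first, and only afterwards record the new context, so every transition helper reads the same way.

#include <stdio.h>
#include <time.h>

enum ctx { CTX_USER, CTX_SYS };

struct acct {
	enum ctx state;
	clock_t stamp;
	double user_sec, sys_sec;
};

/* Charge the pending time to the state being left, then switch state. */
static void ctx_switch(struct acct *a, enum ctx next)
{
	clock_t now = clock();
	double delta = (double)(now - a->stamp) / CLOCKS_PER_SEC;

	if (a->state == CTX_USER)
		a->user_sec += delta;
	else
		a->sys_sec += delta;

	a->state = next;	/* new state is recorded only after accounting */
	a->stamp = now;
}

int main(void)
{
	struct acct a = { .state = CTX_USER, .stamp = clock() };

	for (volatile long i = 0; i < 50000000; i++)
		;		/* pretend to run in userspace for a while */

	ctx_switch(&a, CTX_SYS);
	printf("user %.6fs, sys %.6fs\n", a.user_sec, a.sys_sec);
	return 0;
}

vtime_user_exit() was the one helper that set the new state before accounting, which is what the one-line move in the diff below corrects.
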
+ +Tested-by: Luiz Capitulino +Signed-off-by: Frederic Weisbecker +Reviewed-by: Thomas Gleixner +Acked-by: Rik van Riel +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Wanpeng Li +Link: http://lkml.kernel.org/r/1498756511-11714-3-git-send-email-fweisbec@gmail.com +Signed-off-by: Ingo Molnar +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sched/cputime.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -736,9 +736,9 @@ void vtime_user_enter(struct task_struct + void vtime_user_exit(struct task_struct *tsk) + { + write_seqcount_begin(&tsk->vtime_seqcount); +- tsk->vtime_snap_whence = VTIME_SYS; + if (vtime_delta(tsk)) + account_user_time(tsk, get_vtime_delta(tsk)); ++ tsk->vtime_snap_whence = VTIME_SYS; + write_seqcount_end(&tsk->vtime_seqcount); + } + diff --git a/queue-4.12/sched-cputime-move-the-vtime-task-fields-to-their-own-struct.patch b/queue-4.12/sched-cputime-move-the-vtime-task-fields-to-their-own-struct.patch new file mode 100644 index 00000000000..ee68d3dd9ca --- /dev/null +++ b/queue-4.12/sched-cputime-move-the-vtime-task-fields-to-their-own-struct.patch @@ -0,0 +1,343 @@ +From bac5b6b6b11560f323e71d0ebac4061cfe5f56c0 Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker +Date: Thu, 29 Jun 2017 19:15:10 +0200 +Subject: sched/cputime: Move the vtime task fields to their own struct + +From: Frederic Weisbecker + +commit bac5b6b6b11560f323e71d0ebac4061cfe5f56c0 upstream. + +We are about to add vtime accumulation fields to the task struct. Let's +avoid more bloatification and gather vtime information to their own +struct. + +Tested-by: Luiz Capitulino +Signed-off-by: Frederic Weisbecker +Reviewed-by: Thomas Gleixner +Acked-by: Rik van Riel +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Wanpeng Li +Link: http://lkml.kernel.org/r/1498756511-11714-5-git-send-email-fweisbec@gmail.com +Signed-off-by: Ingo Molnar +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/init_task.h | 6 +- + include/linux/sched.h | 26 ++++++---- + kernel/fork.c | 6 +- + kernel/sched/cputime.c | 114 ++++++++++++++++++++++++++-------------------- + 4 files changed, 87 insertions(+), 65 deletions(-) + +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -170,9 +170,9 @@ extern struct cred init_cred; + + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN + # define INIT_VTIME(tsk) \ +- .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount), \ +- .vtime_starttime = 0, \ +- .vtime_state = VTIME_SYS, ++ .vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount), \ ++ .vtime.starttime = 0, \ ++ .vtime.state = VTIME_SYS, + #else + # define INIT_VTIME(tsk) + #endif +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -223,6 +223,21 @@ struct task_cputime { + #define prof_exp stime + #define sched_exp sum_exec_runtime + ++enum vtime_state { ++ /* Task is sleeping or running in a CPU with VTIME inactive: */ ++ VTIME_INACTIVE = 0, ++ /* Task runs in userspace in a CPU with VTIME active: */ ++ VTIME_USER, ++ /* Task runs in kernelspace in a CPU with VTIME active: */ ++ VTIME_SYS, ++}; ++ ++struct vtime { ++ seqcount_t seqcount; ++ unsigned long long starttime; ++ enum vtime_state state; ++}; ++ + struct sched_info { + #ifdef CONFIG_SCHED_INFO + /* Cumulative counters: */ +@@ -670,16 +685,7 @@ struct task_struct { + u64 gtime; + struct prev_cputime prev_cputime; + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +- seqcount_t vtime_seqcount; +- unsigned long long vtime_starttime; +- enum { +- /* Task is sleeping 
or running in a CPU with VTIME inactive: */ +- VTIME_INACTIVE = 0, +- /* Task runs in userspace in a CPU with VTIME active: */ +- VTIME_USER, +- /* Task runs in kernelspace in a CPU with VTIME active: */ +- VTIME_SYS, +- } vtime_state; ++ struct vtime vtime; + #endif + + #ifdef CONFIG_NO_HZ_FULL +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1637,9 +1637,9 @@ static __latent_entropy struct task_stru + prev_cputime_init(&p->prev_cputime); + + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +- seqcount_init(&p->vtime_seqcount); +- p->vtime_starttime = 0; +- p->vtime_state = VTIME_INACTIVE; ++ seqcount_init(&p->vtime.seqcount); ++ p->vtime.starttime = 0; ++ p->vtime.state = VTIME_INACTIVE; + #endif + + #if defined(SPLIT_RSS_COUNTING) +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -679,17 +679,17 @@ void thread_group_cputime_adjusted(struc + #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +-static u64 vtime_delta(struct task_struct *tsk) ++static u64 vtime_delta(struct vtime *vtime) + { + unsigned long now = READ_ONCE(jiffies); + +- if (time_before(now, (unsigned long)tsk->vtime_starttime)) ++ if (time_before(now, (unsigned long)vtime->starttime)) + return 0; + +- return jiffies_to_nsecs(now - tsk->vtime_starttime); ++ return jiffies_to_nsecs(now - vtime->starttime); + } + +-static u64 get_vtime_delta(struct task_struct *tsk) ++static u64 get_vtime_delta(struct vtime *vtime) + { + unsigned long now = READ_ONCE(jiffies); + u64 delta, other; +@@ -701,49 +701,56 @@ static u64 get_vtime_delta(struct task_s + * elapsed time. Limit account_other_time to prevent rounding + * errors from causing elapsed vtime to go negative. + */ +- delta = jiffies_to_nsecs(now - tsk->vtime_starttime); ++ delta = jiffies_to_nsecs(now - vtime->starttime); + other = account_other_time(delta); +- WARN_ON_ONCE(tsk->vtime_state == VTIME_INACTIVE); +- tsk->vtime_starttime = now; ++ WARN_ON_ONCE(vtime->state == VTIME_INACTIVE); ++ vtime->starttime = now; + + return delta - other; + } + + static void __vtime_account_system(struct task_struct *tsk) + { +- account_system_time(tsk, irq_count(), get_vtime_delta(tsk)); ++ account_system_time(tsk, irq_count(), get_vtime_delta(&tsk->vtime)); + } + + void vtime_account_system(struct task_struct *tsk) + { +- if (!vtime_delta(tsk)) ++ struct vtime *vtime = &tsk->vtime; ++ ++ if (!vtime_delta(vtime)) + return; + +- write_seqcount_begin(&tsk->vtime_seqcount); ++ write_seqcount_begin(&vtime->seqcount); + __vtime_account_system(tsk); +- write_seqcount_end(&tsk->vtime_seqcount); ++ write_seqcount_end(&vtime->seqcount); + } + + void vtime_user_enter(struct task_struct *tsk) + { +- write_seqcount_begin(&tsk->vtime_seqcount); +- if (vtime_delta(tsk)) ++ struct vtime *vtime = &tsk->vtime; ++ ++ write_seqcount_begin(&vtime->seqcount); ++ if (vtime_delta(vtime)) + __vtime_account_system(tsk); +- tsk->vtime_snap_whence = VTIME_USER; +- write_seqcount_end(&tsk->vtime_seqcount); ++ vtime->state = VTIME_USER; ++ write_seqcount_end(&vtime->seqcount); + } + + void vtime_user_exit(struct task_struct *tsk) + { +- write_seqcount_begin(&tsk->vtime_seqcount); +- if (vtime_delta(tsk)) +- account_user_time(tsk, get_vtime_delta(tsk)); +- tsk->vtime_snap_whence = VTIME_SYS; +- write_seqcount_end(&tsk->vtime_seqcount); ++ struct vtime *vtime = &tsk->vtime; ++ ++ write_seqcount_begin(&vtime->seqcount); ++ if (vtime_delta(vtime)) ++ account_user_time(tsk, get_vtime_delta(vtime)); ++ vtime->state = VTIME_SYS; ++ write_seqcount_end(&vtime->seqcount); + } + + void 
vtime_guest_enter(struct task_struct *tsk) + { ++ struct vtime *vtime = &tsk->vtime; + /* + * The flags must be updated under the lock with + * the vtime_starttime flush and update. +@@ -751,54 +758,62 @@ void vtime_guest_enter(struct task_struc + * synchronization against the reader (task_gtime()) + * that can thus safely catch up with a tickless delta. + */ +- write_seqcount_begin(&tsk->vtime_seqcount); +- if (vtime_delta(tsk)) ++ write_seqcount_begin(&vtime->seqcount); ++ if (vtime_delta(vtime)) + __vtime_account_system(tsk); + current->flags |= PF_VCPU; +- write_seqcount_end(&tsk->vtime_seqcount); ++ write_seqcount_end(&vtime->seqcount); + } + EXPORT_SYMBOL_GPL(vtime_guest_enter); + + void vtime_guest_exit(struct task_struct *tsk) + { +- write_seqcount_begin(&tsk->vtime_seqcount); ++ struct vtime *vtime = &tsk->vtime; ++ ++ write_seqcount_begin(&vtime->seqcount); + __vtime_account_system(tsk); + current->flags &= ~PF_VCPU; +- write_seqcount_end(&tsk->vtime_seqcount); ++ write_seqcount_end(&vtime->seqcount); + } + EXPORT_SYMBOL_GPL(vtime_guest_exit); + + void vtime_account_idle(struct task_struct *tsk) + { +- account_idle_time(get_vtime_delta(tsk)); ++ account_idle_time(get_vtime_delta(&tsk->vtime)); + } + + void arch_vtime_task_switch(struct task_struct *prev) + { +- write_seqcount_begin(&prev->vtime_seqcount); +- prev->vtime_state = VTIME_INACTIVE; +- write_seqcount_end(&prev->vtime_seqcount); +- +- write_seqcount_begin(¤t->vtime_seqcount); +- current->vtime_state = VTIME_SYS; +- current->vtime_starttime = jiffies; +- write_seqcount_end(¤t->vtime_seqcount); ++ struct vtime *vtime = &prev->vtime; ++ ++ write_seqcount_begin(&vtime->seqcount); ++ vtime->state = VTIME_INACTIVE; ++ write_seqcount_end(&vtime->seqcount); ++ ++ vtime = ¤t->vtime; ++ ++ write_seqcount_begin(&vtime->seqcount); ++ vtime->state = VTIME_SYS; ++ vtime->starttime = jiffies; ++ write_seqcount_end(&vtime->seqcount); + } + + void vtime_init_idle(struct task_struct *t, int cpu) + { ++ struct vtime *vtime = &t->vtime; + unsigned long flags; + + local_irq_save(flags); +- write_seqcount_begin(&t->vtime_seqcount); +- t->vtime_state = VTIME_SYS; +- t->vtime_starttime = jiffies; +- write_seqcount_end(&t->vtime_seqcount); ++ write_seqcount_begin(&vtime->seqcount); ++ vtime->state = VTIME_SYS; ++ vtime->starttime = jiffies; ++ write_seqcount_end(&vtime->seqcount); + local_irq_restore(flags); + } + + u64 task_gtime(struct task_struct *t) + { ++ struct vtime *vtime = &t->vtime; + unsigned int seq; + u64 gtime; + +@@ -806,13 +821,13 @@ u64 task_gtime(struct task_struct *t) + return t->gtime; + + do { +- seq = read_seqcount_begin(&t->vtime_seqcount); ++ seq = read_seqcount_begin(&vtime->seqcount); + + gtime = t->gtime; +- if (t->vtime_state == VTIME_SYS && t->flags & PF_VCPU) +- gtime += vtime_delta(t); ++ if (vtime->state == VTIME_SYS && t->flags & PF_VCPU) ++ gtime += vtime_delta(vtime); + +- } while (read_seqcount_retry(&t->vtime_seqcount, seq)); ++ } while (read_seqcount_retry(&vtime->seqcount, seq)); + + return gtime; + } +@@ -824,8 +839,9 @@ u64 task_gtime(struct task_struct *t) + */ + void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) + { +- u64 delta; ++ struct vtime *vtime = &t->vtime; + unsigned int seq; ++ u64 delta; + + if (!vtime_accounting_enabled()) { + *utime = t->utime; +@@ -834,25 +850,25 @@ void task_cputime(struct task_struct *t, + } + + do { +- seq = read_seqcount_begin(&t->vtime_seqcount); ++ seq = read_seqcount_begin(&vtime->seqcount); + + *utime = t->utime; + *stime = t->stime; + + /* Task is 
sleeping, nothing to add */ +- if (t->vtime_state == VTIME_INACTIVE || is_idle_task(t)) ++ if (vtime->state == VTIME_INACTIVE || is_idle_task(t)) + continue; + +- delta = vtime_delta(t); ++ delta = vtime_delta(vtime); + + /* + * Task runs either in user or kernel space, add pending nohz time to + * the right place. + */ +- if (t->vtime_state == VTIME_USER || t->flags & PF_VCPU) ++ if (vtime->state == VTIME_USER || t->flags & PF_VCPU) + *utime += delta; +- else if (t->vtime_state == VTIME_SYS) ++ else if (vtime->state == VTIME_SYS) + *stime += delta; +- } while (read_seqcount_retry(&t->vtime_seqcount, seq)); ++ } while (read_seqcount_retry(&vtime->seqcount, seq)); + } + #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/queue-4.12/sched-cputime-rename-vtime-fields.patch b/queue-4.12/sched-cputime-rename-vtime-fields.patch new file mode 100644 index 00000000000..8d2f4d4cd0e --- /dev/null +++ b/queue-4.12/sched-cputime-rename-vtime-fields.patch @@ -0,0 +1,177 @@ +From 60a9ce57e7c5ac1df3a39fb941022bbfa40c0862 Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker +Date: Thu, 29 Jun 2017 19:15:09 +0200 +Subject: sched/cputime: Rename vtime fields + +From: Frederic Weisbecker + +commit 60a9ce57e7c5ac1df3a39fb941022bbfa40c0862 upstream. + +The current "snapshot" based naming on vtime fields suggests we record +some past event but that's a low level picture of their actual purpose +which comes out blurry. The real point of these fields is to run a basic +state machine that tracks down cputime entry while switching between +contexts. + +So lets reflect that with more meaningful names. + +Tested-by: Luiz Capitulino +Signed-off-by: Frederic Weisbecker +Reviewed-by: Thomas Gleixner +Acked-by: Rik van Riel +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Wanpeng Li +Link: http://lkml.kernel.org/r/1498756511-11714-4-git-send-email-fweisbec@gmail.com +Signed-off-by: Ingo Molnar +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/init_task.h | 4 ++-- + include/linux/sched.h | 4 ++-- + kernel/fork.c | 4 ++-- + kernel/sched/cputime.c | 30 +++++++++++++++--------------- + 4 files changed, 21 insertions(+), 21 deletions(-) + +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -171,8 +171,8 @@ extern struct cred init_cred; + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN + # define INIT_VTIME(tsk) \ + .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount), \ +- .vtime_snap = 0, \ +- .vtime_snap_whence = VTIME_SYS, ++ .vtime_starttime = 0, \ ++ .vtime_state = VTIME_SYS, + #else + # define INIT_VTIME(tsk) + #endif +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -671,7 +671,7 @@ struct task_struct { + struct prev_cputime prev_cputime; + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN + seqcount_t vtime_seqcount; +- unsigned long long vtime_snap; ++ unsigned long long vtime_starttime; + enum { + /* Task is sleeping or running in a CPU with VTIME inactive: */ + VTIME_INACTIVE = 0, +@@ -679,7 +679,7 @@ struct task_struct { + VTIME_USER, + /* Task runs in kernelspace in a CPU with VTIME active: */ + VTIME_SYS, +- } vtime_snap_whence; ++ } vtime_state; + #endif + + #ifdef CONFIG_NO_HZ_FULL +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1638,8 +1638,8 @@ static __latent_entropy struct task_stru + + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN + seqcount_init(&p->vtime_seqcount); +- p->vtime_snap = 0; +- p->vtime_snap_whence = VTIME_INACTIVE; ++ p->vtime_starttime = 0; ++ p->vtime_state = VTIME_INACTIVE; + #endif + + #if defined(SPLIT_RSS_COUNTING) +--- a/kernel/sched/cputime.c ++++ 
b/kernel/sched/cputime.c +@@ -683,10 +683,10 @@ static u64 vtime_delta(struct task_struc + { + unsigned long now = READ_ONCE(jiffies); + +- if (time_before(now, (unsigned long)tsk->vtime_snap)) ++ if (time_before(now, (unsigned long)tsk->vtime_starttime)) + return 0; + +- return jiffies_to_nsecs(now - tsk->vtime_snap); ++ return jiffies_to_nsecs(now - tsk->vtime_starttime); + } + + static u64 get_vtime_delta(struct task_struct *tsk) +@@ -701,10 +701,10 @@ static u64 get_vtime_delta(struct task_s + * elapsed time. Limit account_other_time to prevent rounding + * errors from causing elapsed vtime to go negative. + */ +- delta = jiffies_to_nsecs(now - tsk->vtime_snap); ++ delta = jiffies_to_nsecs(now - tsk->vtime_starttime); + other = account_other_time(delta); +- WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); +- tsk->vtime_snap = now; ++ WARN_ON_ONCE(tsk->vtime_state == VTIME_INACTIVE); ++ tsk->vtime_starttime = now; + + return delta - other; + } +@@ -746,7 +746,7 @@ void vtime_guest_enter(struct task_struc + { + /* + * The flags must be updated under the lock with +- * the vtime_snap flush and update. ++ * the vtime_starttime flush and update. + * That enforces a right ordering and update sequence + * synchronization against the reader (task_gtime()) + * that can thus safely catch up with a tickless delta. +@@ -776,12 +776,12 @@ void vtime_account_idle(struct task_stru + void arch_vtime_task_switch(struct task_struct *prev) + { + write_seqcount_begin(&prev->vtime_seqcount); +- prev->vtime_snap_whence = VTIME_INACTIVE; ++ prev->vtime_state = VTIME_INACTIVE; + write_seqcount_end(&prev->vtime_seqcount); + + write_seqcount_begin(¤t->vtime_seqcount); +- current->vtime_snap_whence = VTIME_SYS; +- current->vtime_snap = jiffies; ++ current->vtime_state = VTIME_SYS; ++ current->vtime_starttime = jiffies; + write_seqcount_end(¤t->vtime_seqcount); + } + +@@ -791,8 +791,8 @@ void vtime_init_idle(struct task_struct + + local_irq_save(flags); + write_seqcount_begin(&t->vtime_seqcount); +- t->vtime_snap_whence = VTIME_SYS; +- t->vtime_snap = jiffies; ++ t->vtime_state = VTIME_SYS; ++ t->vtime_starttime = jiffies; + write_seqcount_end(&t->vtime_seqcount); + local_irq_restore(flags); + } +@@ -809,7 +809,7 @@ u64 task_gtime(struct task_struct *t) + seq = read_seqcount_begin(&t->vtime_seqcount); + + gtime = t->gtime; +- if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) ++ if (t->vtime_state == VTIME_SYS && t->flags & PF_VCPU) + gtime += vtime_delta(t); + + } while (read_seqcount_retry(&t->vtime_seqcount, seq)); +@@ -840,7 +840,7 @@ void task_cputime(struct task_struct *t, + *stime = t->stime; + + /* Task is sleeping, nothing to add */ +- if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) ++ if (t->vtime_state == VTIME_INACTIVE || is_idle_task(t)) + continue; + + delta = vtime_delta(t); +@@ -849,9 +849,9 @@ void task_cputime(struct task_struct *t, + * Task runs either in user or kernel space, add pending nohz time to + * the right place. 
+ */ +- if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) ++ if (t->vtime_state == VTIME_USER || t->flags & PF_VCPU) + *utime += delta; +- else if (t->vtime_snap_whence == VTIME_SYS) ++ else if (t->vtime_state == VTIME_SYS) + *stime += delta; + } while (read_seqcount_retry(&t->vtime_seqcount, seq)); + } diff --git a/queue-4.12/sched-fair-fix-load_balance-affinity-redo-path.patch b/queue-4.12/sched-fair-fix-load_balance-affinity-redo-path.patch new file mode 100644 index 00000000000..c763d23f236 --- /dev/null +++ b/queue-4.12/sched-fair-fix-load_balance-affinity-redo-path.patch @@ -0,0 +1,112 @@ +From 65a4433aebe36c8c6abeb69b99ef00274b971c6c Mon Sep 17 00:00:00 2001 +From: Jeffrey Hugo +Date: Wed, 7 Jun 2017 13:18:57 -0600 +Subject: sched/fair: Fix load_balance() affinity redo path + +From: Jeffrey Hugo + +commit 65a4433aebe36c8c6abeb69b99ef00274b971c6c upstream. + +If load_balance() fails to migrate any tasks because all tasks were +affined, load_balance() removes the source CPU from consideration and +attempts to redo and balance among the new subset of CPUs. + +There is a bug in this code path where the algorithm considers all active +CPUs in the system (minus the source that was just masked out). This is +not valid for two reasons: some active CPUs may not be in the current +scheduling domain and one of the active CPUs is dst_cpu. These CPUs should +not be considered, as we cannot pull load from them. + +Instead of failing out of load_balance(), we may end up redoing the search +with no valid CPUs and incorrectly concluding the domain is balanced. +Additionally, if the group_imbalance flag was just set, it may also be +incorrectly unset, thus the flag will not be seen by other CPUs in future +load_balance() runs as that algorithm intends. + +Fix the check by removing CPUs not in the current domain and the dst_cpu +from considertation, thus limiting the evaluation to valid remaining CPUs +from which load might be migrated. + +Co-authored-by: Austin Christ +Co-authored-by: Dietmar Eggemann +Tested-by: Tyler Baicar +Signed-off-by: Jeffrey Hugo +Acked-by: Peter Zijlstra +Cc: Austin Christ +Cc: Dietmar Eggemann +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: Timur Tabi +Link: http://lkml.kernel.org/r/1496863138-11322-2-git-send-email-jhugo@codeaurora.org +Signed-off-by: Ingo Molnar +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sched/fair.c | 32 ++++++++++++++++++++------------ + 1 file changed, 20 insertions(+), 12 deletions(-) + +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -6619,10 +6619,10 @@ int can_migrate_task(struct task_struct + * our sched_group. We may want to revisit it if we couldn't + * meet load balance goals by pulling other tasks on src_cpu. + * +- * Also avoid computing new_dst_cpu if we have already computed +- * one in current iteration. ++ * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have ++ * already computed one in current iteration. 
+ */ +- if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED)) ++ if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) + return 0; + + /* Prevent to re-select dst_cpu via env's cpus */ +@@ -7973,14 +7973,7 @@ static int load_balance(int this_cpu, st + .tasks = LIST_HEAD_INIT(env.tasks), + }; + +- /* +- * For NEWLY_IDLE load_balancing, we don't need to consider +- * other cpus in our group +- */ +- if (idle == CPU_NEWLY_IDLE) +- env.dst_grpmask = NULL; +- +- cpumask_copy(cpus, cpu_active_mask); ++ cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); + + schedstat_inc(sd->lb_count[idle]); + +@@ -8102,7 +8095,15 @@ more_balance: + /* All tasks on this runqueue were pinned by CPU affinity */ + if (unlikely(env.flags & LBF_ALL_PINNED)) { + cpumask_clear_cpu(cpu_of(busiest), cpus); +- if (!cpumask_empty(cpus)) { ++ /* ++ * Attempting to continue load balancing at the current ++ * sched_domain level only makes sense if there are ++ * active CPUs remaining as possible busiest CPUs to ++ * pull load from which are not contained within the ++ * destination group that is receiving any migrated ++ * load. ++ */ ++ if (!cpumask_subset(cpus, env.dst_grpmask)) { + env.loop = 0; + env.loop_break = sched_nr_migrate_break; + goto redo; +@@ -8398,6 +8399,13 @@ static int active_load_balance_cpu_stop( + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, ++ /* ++ * can_migrate_task() doesn't need to compute new_dst_cpu ++ * for active balancing. Since we have CPU_IDLE, but no ++ * @dst_grpmask we need to make that test go away with lying ++ * about DST_PINNED. ++ */ ++ .flags = LBF_DST_PINNED, + }; + + schedstat_inc(sd->alb_count); diff --git a/queue-4.12/series b/queue-4.12/series index cfd8b10bedd..08cc14f0713 100644 --- a/queue-4.12/series +++ b/queue-4.12/series @@ -171,3 +171,12 @@ drm-mst-avoid-processing-partially-received-up-down-message-transactions.patch drm-i915-make-dp-mst-connector-info-work.patch mlx5-avoid-that-mlx5_ib_sg_to_klms-overflows-the-klms-array.patch hfsplus-don-t-clear-sgid-when-inheriting-acls.patch +vtime-sched-cputime-remove-vtime_account_user.patch +sched-cputime-always-set-tsk-vtime_snap_whence-after-accounting-vtime.patch +sched-cputime-rename-vtime-fields.patch +sched-cputime-move-the-vtime-task-fields-to-their-own-struct.patch +sched-cputime-accumulate-vtime-on-top-of-nsec-clocksource.patch +sched-fair-fix-load_balance-affinity-redo-path.patch +percpu_counter-rename-__percpu_counter_add-to-percpu_counter_add_batch.patch +writeback-rework-wb__stat-family-of-functions.patch +kernel-fork.c-virtually-mapped-stacks-do-not-disable-interrupts.patch diff --git a/queue-4.12/vtime-sched-cputime-remove-vtime_account_user.patch b/queue-4.12/vtime-sched-cputime-remove-vtime_account_user.patch new file mode 100644 index 00000000000..6e635783930 --- /dev/null +++ b/queue-4.12/vtime-sched-cputime-remove-vtime_account_user.patch @@ -0,0 +1,82 @@ +From 1c3eda01a79b8e9237d91c52c5a75b20983f47c6 Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker +Date: Thu, 29 Jun 2017 19:15:07 +0200 +Subject: vtime, sched/cputime: Remove vtime_account_user() + +From: Frederic Weisbecker + +commit 1c3eda01a79b8e9237d91c52c5a75b20983f47c6 upstream. + +It's an unnecessary function between vtime_user_exit() and +account_user_time(). 
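
Schematically (stub names and signatures are invented here; the real functions take a task_struct and live in kernel/sched/cputime.c), the change removes a pure forwarding layer so the user-exit path calls the accounting function directly:

/* before: vtime_user_exit() -> vtime_account_user() -> account_user_time() */
/* after:  vtime_user_exit() -> account_user_time()                         */

#include <stdio.h>

static void account_user_time_stub(long ns)
{
	printf("accounting %ld ns of user time\n", ns);
}

/* the layer being removed: it only forwarded its argument */
static void vtime_account_user_stub(long ns)
{
	account_user_time_stub(ns);
}

static void vtime_user_exit_old(long ns)
{
	vtime_account_user_stub(ns);
}

static void vtime_user_exit_new(long ns)
{
	account_user_time_stub(ns);	/* no intermediate helper */
}

int main(void)
{
	vtime_user_exit_old(1000);
	vtime_user_exit_new(1000);
	return 0;
}
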
+ +Tested-by: Luiz Capitulino +Signed-off-by: Frederic Weisbecker +Reviewed-by: Thomas Gleixner +Acked-by: Rik van Riel +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Wanpeng Li +Link: http://lkml.kernel.org/r/1498756511-11714-2-git-send-email-fweisbec@gmail.com +Signed-off-by: Ingo Molnar +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/vtime.h | 9 +-------- + kernel/sched/cputime.c | 12 ++++++------ + 2 files changed, 7 insertions(+), 14 deletions(-) + +--- a/include/linux/vtime.h ++++ b/include/linux/vtime.h +@@ -67,19 +67,12 @@ static inline void vtime_account_system( + + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN + extern void arch_vtime_task_switch(struct task_struct *tsk); +-extern void vtime_account_user(struct task_struct *tsk); + extern void vtime_user_enter(struct task_struct *tsk); +- +-static inline void vtime_user_exit(struct task_struct *tsk) +-{ +- vtime_account_user(tsk); +-} +- ++extern void vtime_user_exit(struct task_struct *tsk); + extern void vtime_guest_enter(struct task_struct *tsk); + extern void vtime_guest_exit(struct task_struct *tsk); + extern void vtime_init_idle(struct task_struct *tsk, int cpu); + #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ +-static inline void vtime_account_user(struct task_struct *tsk) { } + static inline void vtime_user_enter(struct task_struct *tsk) { } + static inline void vtime_user_exit(struct task_struct *tsk) { } + static inline void vtime_guest_enter(struct task_struct *tsk) { } +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -724,21 +724,21 @@ void vtime_account_system(struct task_st + write_seqcount_end(&tsk->vtime_seqcount); + } + +-void vtime_account_user(struct task_struct *tsk) ++void vtime_user_enter(struct task_struct *tsk) + { + write_seqcount_begin(&tsk->vtime_seqcount); +- tsk->vtime_snap_whence = VTIME_SYS; + if (vtime_delta(tsk)) +- account_user_time(tsk, get_vtime_delta(tsk)); ++ __vtime_account_system(tsk); ++ tsk->vtime_snap_whence = VTIME_USER; + write_seqcount_end(&tsk->vtime_seqcount); + } + +-void vtime_user_enter(struct task_struct *tsk) ++void vtime_user_exit(struct task_struct *tsk) + { + write_seqcount_begin(&tsk->vtime_seqcount); ++ tsk->vtime_snap_whence = VTIME_SYS; + if (vtime_delta(tsk)) +- __vtime_account_system(tsk); +- tsk->vtime_snap_whence = VTIME_USER; ++ account_user_time(tsk, get_vtime_delta(tsk)); + write_seqcount_end(&tsk->vtime_seqcount); + } + diff --git a/queue-4.12/writeback-rework-wb__stat-family-of-functions.patch b/queue-4.12/writeback-rework-wb__stat-family-of-functions.patch new file mode 100644 index 00000000000..2be38093bb3 --- /dev/null +++ b/queue-4.12/writeback-rework-wb__stat-family-of-functions.patch @@ -0,0 +1,163 @@ +From 3e8f399da490e6ac20a3cfd6aa404c9aa961a9a2 Mon Sep 17 00:00:00 2001 +From: Nikolay Borisov +Date: Wed, 12 Jul 2017 14:37:51 -0700 +Subject: writeback: rework wb_[dec|inc]_stat family of functions + +From: Nikolay Borisov + +commit 3e8f399da490e6ac20a3cfd6aa404c9aa961a9a2 upstream. + +Currently the writeback statistics code uses a percpu counters to hold +various statistics. Furthermore we have 2 families of functions - those +which disable local irq and those which doesn't and whose names begin +with double underscore. However, they both end up calling +__add_wb_stats which in turn calls percpu_counter_add_batch which is +already irq-safe. + +Exploiting this fact allows to eliminated the __wb_* functions since +they don't add any further protection than we already have. 
+Furthermore, refactor the wb_* function to call __add_wb_stat directly +without the irq-disabling dance. This will likely result in better +runtime of code which deals with modifying the stat counters. + +While at it also document why percpu_counter_add_batch is in fact +preempt and irq-safe since at least 3 people got confused. + +Link: http://lkml.kernel.org/r/1498029937-27293-1-git-send-email-nborisov@suse.com +Signed-off-by: Nikolay Borisov +Acked-by: Tejun Heo +Reviewed-by: Jan Kara +Cc: Josef Bacik +Cc: Mel Gorman +Cc: Jeff Layton +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Mel Gorman +Signed-off-by: Greg Kroah-Hartman + +--- + fs/fs-writeback.c | 8 ++++---- + include/linux/backing-dev.h | 24 ++---------------------- + lib/percpu_counter.c | 7 +++++++ + mm/page-writeback.c | 10 +++++----- + 4 files changed, 18 insertions(+), 31 deletions(-) + +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -380,8 +380,8 @@ static void inode_switch_wbs_work_fn(str + struct page *page = radix_tree_deref_slot_protected(slot, + &mapping->tree_lock); + if (likely(page) && PageDirty(page)) { +- __dec_wb_stat(old_wb, WB_RECLAIMABLE); +- __inc_wb_stat(new_wb, WB_RECLAIMABLE); ++ dec_wb_stat(old_wb, WB_RECLAIMABLE); ++ inc_wb_stat(new_wb, WB_RECLAIMABLE); + } + } + +@@ -391,8 +391,8 @@ static void inode_switch_wbs_work_fn(str + &mapping->tree_lock); + if (likely(page)) { + WARN_ON_ONCE(!PageWriteback(page)); +- __dec_wb_stat(old_wb, WB_WRITEBACK); +- __inc_wb_stat(new_wb, WB_WRITEBACK); ++ dec_wb_stat(old_wb, WB_WRITEBACK); ++ inc_wb_stat(new_wb, WB_WRITEBACK); + } + } + +--- a/include/linux/backing-dev.h ++++ b/include/linux/backing-dev.h +@@ -69,34 +69,14 @@ static inline void __add_wb_stat(struct + percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); + } + +-static inline void __inc_wb_stat(struct bdi_writeback *wb, +- enum wb_stat_item item) +-{ +- __add_wb_stat(wb, item, 1); +-} +- + static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) + { +- unsigned long flags; +- +- local_irq_save(flags); +- __inc_wb_stat(wb, item); +- local_irq_restore(flags); +-} +- +-static inline void __dec_wb_stat(struct bdi_writeback *wb, +- enum wb_stat_item item) +-{ +- __add_wb_stat(wb, item, -1); ++ __add_wb_stat(wb, item, 1); + } + + static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) + { +- unsigned long flags; +- +- local_irq_save(flags); +- __dec_wb_stat(wb, item); +- local_irq_restore(flags); ++ __add_wb_stat(wb, item, -1); + } + + static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) +--- a/lib/percpu_counter.c ++++ b/lib/percpu_counter.c +@@ -72,6 +72,13 @@ void percpu_counter_set(struct percpu_co + } + EXPORT_SYMBOL(percpu_counter_set); + ++/** ++ * This function is both preempt and irq safe. The former is due to explicit ++ * preemption disable. The latter is guaranteed by the fact that the slow path ++ * is explicitly protected by an irq-safe spinlock whereas the fast patch uses ++ * this_cpu_add which is irq-safe by definition. 
Hence there is no need muck ++ * with irq state before calling this one ++ */ + void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) + { + s64 count; +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -601,7 +601,7 @@ static inline void __wb_writeout_inc(str + { + struct wb_domain *cgdom; + +- __inc_wb_stat(wb, WB_WRITTEN); ++ inc_wb_stat(wb, WB_WRITTEN); + wb_domain_writeout_inc(&global_wb_domain, &wb->completions, + wb->bdi->max_prop_frac); + +@@ -2437,8 +2437,8 @@ void account_page_dirtied(struct page *p + __inc_node_page_state(page, NR_FILE_DIRTY); + __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); + __inc_node_page_state(page, NR_DIRTIED); +- __inc_wb_stat(wb, WB_RECLAIMABLE); +- __inc_wb_stat(wb, WB_DIRTIED); ++ inc_wb_stat(wb, WB_RECLAIMABLE); ++ inc_wb_stat(wb, WB_DIRTIED); + task_io_account_write(PAGE_SIZE); + current->nr_dirtied++; + this_cpu_inc(bdp_ratelimits); +@@ -2745,7 +2745,7 @@ int test_clear_page_writeback(struct pag + if (bdi_cap_account_writeback(bdi)) { + struct bdi_writeback *wb = inode_to_wb(inode); + +- __dec_wb_stat(wb, WB_WRITEBACK); ++ dec_wb_stat(wb, WB_WRITEBACK); + __wb_writeout_inc(wb); + } + } +@@ -2791,7 +2791,7 @@ int __test_set_page_writeback(struct pag + page_index(page), + PAGECACHE_TAG_WRITEBACK); + if (bdi_cap_account_writeback(bdi)) +- __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); ++ inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); + + /* + * We can come through here when swapping anonymous -- 2.47.3