From 48e5314137f207a963dab3d72d76280abbdc4acc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 29 Apr 2015 14:06:35 +0200 Subject: [PATCH] 4.0-stable patches added patches: btrfs-don-t-accept-bare-namespace-as-a-valid-xattr.patch btrfs-fix-inode-eviction-infinite-loop-after-cloning-into-it.patch btrfs-fix-inode-eviction-infinite-loop-after-extent_same-ioctl.patch btrfs-fix-log-tree-corruption-when-fs-mounted-with-o-discard.patch kvm-x86-fix-msr_ia32_bndcfgs-in-msrs_to_save.patch md-fix-md-io-stats-accounting-broken.patch mm-hugetlb-use-pmd_page-in-follow_huge_pmd.patch perf-x86-intel-fix-core2-atom-nhm-wsm-cycles-pp-events.patch powerpc-hugetlb-call-mm_dec_nr_pmds-in-hugetlb_free_pmd_range.patch sched-idle-x86-optimize-unnecessary-mwait_idle-resched-ipis.patch sched-idle-x86-restore-mwait_idle-to-fix-boot-hangs-to-improve-power-savings-and-to-improve-performance.patch x86-asm-decoder-fix-and-enforce-max-instruction-size-in-the-insn-decoder.patch x86-fix-special-__probe_kernel_write-tail-zeroing-case.patch x86-kvm-revert-remove-sched-notifier-for-cross-cpu-migrations.patch x86-vdso-fix-pvclock-races-with-task-migration.patch --- ...cept-bare-namespace-as-a-valid-xattr.patch | 134 ++++++++++++ ...-infinite-loop-after-cloning-into-it.patch | 103 +++++++++ ...nfinite-loop-after-extent_same-ioctl.patch | 49 +++++ ...ption-when-fs-mounted-with-o-discard.patch | 57 +++++ ...fix-msr_ia32_bndcfgs-in-msrs_to_save.patch | 51 +++++ ...md-fix-md-io-stats-accounting-broken.patch | 59 +++++ ...etlb-use-pmd_page-in-follow_huge_pmd.patch | 44 ++++ ...-core2-atom-nhm-wsm-cycles-pp-events.patch | 70 ++++++ ...ec_nr_pmds-in-hugetlb_free_pmd_range.patch | 39 ++++ ...-unnecessary-mwait_idle-resched-ipis.patch | 72 ++++++ ...r-savings-and-to-improve-performance.patch | 152 +++++++++++++ queue-4.0/series | 15 ++ ...instruction-size-in-the-insn-decoder.patch | 62 ++++++ ...probe_kernel_write-tail-zeroing-case.patch | 58 +++++ ...ed-notifier-for-cross-cpu-migrations.patch | 205 ++++++++++++++++++ ...ix-pvclock-races-with-task-migration.patch | 65 ++++++ 16 files changed, 1235 insertions(+) create mode 100644 queue-4.0/btrfs-don-t-accept-bare-namespace-as-a-valid-xattr.patch create mode 100644 queue-4.0/btrfs-fix-inode-eviction-infinite-loop-after-cloning-into-it.patch create mode 100644 queue-4.0/btrfs-fix-inode-eviction-infinite-loop-after-extent_same-ioctl.patch create mode 100644 queue-4.0/btrfs-fix-log-tree-corruption-when-fs-mounted-with-o-discard.patch create mode 100644 queue-4.0/kvm-x86-fix-msr_ia32_bndcfgs-in-msrs_to_save.patch create mode 100644 queue-4.0/md-fix-md-io-stats-accounting-broken.patch create mode 100644 queue-4.0/mm-hugetlb-use-pmd_page-in-follow_huge_pmd.patch create mode 100644 queue-4.0/perf-x86-intel-fix-core2-atom-nhm-wsm-cycles-pp-events.patch create mode 100644 queue-4.0/powerpc-hugetlb-call-mm_dec_nr_pmds-in-hugetlb_free_pmd_range.patch create mode 100644 queue-4.0/sched-idle-x86-optimize-unnecessary-mwait_idle-resched-ipis.patch create mode 100644 queue-4.0/sched-idle-x86-restore-mwait_idle-to-fix-boot-hangs-to-improve-power-savings-and-to-improve-performance.patch create mode 100644 queue-4.0/x86-asm-decoder-fix-and-enforce-max-instruction-size-in-the-insn-decoder.patch create mode 100644 queue-4.0/x86-fix-special-__probe_kernel_write-tail-zeroing-case.patch create mode 100644 queue-4.0/x86-kvm-revert-remove-sched-notifier-for-cross-cpu-migrations.patch create mode 100644 queue-4.0/x86-vdso-fix-pvclock-races-with-task-migration.patch diff --git 
a/queue-4.0/btrfs-don-t-accept-bare-namespace-as-a-valid-xattr.patch b/queue-4.0/btrfs-don-t-accept-bare-namespace-as-a-valid-xattr.patch new file mode 100644 index 00000000000..df59c3300e1 --- /dev/null +++ b/queue-4.0/btrfs-don-t-accept-bare-namespace-as-a-valid-xattr.patch @@ -0,0 +1,134 @@ +From 3c3b04d10ff1811a27f86684ccd2f5ba6983211d Mon Sep 17 00:00:00 2001 +From: David Sterba +Date: Wed, 25 Mar 2015 19:26:41 +0100 +Subject: btrfs: don't accept bare namespace as a valid xattr + +From: David Sterba + +commit 3c3b04d10ff1811a27f86684ccd2f5ba6983211d upstream. + +Due to insufficient check in btrfs_is_valid_xattr, this unexpectedly +works: + + $ touch file + $ setfattr -n user. -v 1 file + $ getfattr -d file +user.="1" + +ie. the missing attribute name after the namespace. + +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=94291 +Reported-by: William Douglas +Signed-off-by: David Sterba +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/xattr.c | 53 +++++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 39 insertions(+), 14 deletions(-) + +--- a/fs/btrfs/xattr.c ++++ b/fs/btrfs/xattr.c +@@ -364,22 +364,42 @@ const struct xattr_handler *btrfs_xattr_ + /* + * Check if the attribute is in a supported namespace. + * +- * This applied after the check for the synthetic attributes in the system ++ * This is applied after the check for the synthetic attributes in the system + * namespace. + */ +-static bool btrfs_is_valid_xattr(const char *name) ++static int btrfs_is_valid_xattr(const char *name) + { +- return !strncmp(name, XATTR_SECURITY_PREFIX, +- XATTR_SECURITY_PREFIX_LEN) || +- !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || +- !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || +- !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) || +- !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN); ++ int len = strlen(name); ++ int prefixlen = 0; ++ ++ if (!strncmp(name, XATTR_SECURITY_PREFIX, ++ XATTR_SECURITY_PREFIX_LEN)) ++ prefixlen = XATTR_SECURITY_PREFIX_LEN; ++ else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) ++ prefixlen = XATTR_SYSTEM_PREFIX_LEN; ++ else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) ++ prefixlen = XATTR_TRUSTED_PREFIX_LEN; ++ else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) ++ prefixlen = XATTR_USER_PREFIX_LEN; ++ else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) ++ prefixlen = XATTR_BTRFS_PREFIX_LEN; ++ else ++ return -EOPNOTSUPP; ++ ++ /* ++ * The name cannot consist of just prefix ++ */ ++ if (len <= prefixlen) ++ return -EINVAL; ++ ++ return 0; + } + + ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) + { ++ int ret; ++ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler +@@ -388,8 +408,9 @@ ssize_t btrfs_getxattr(struct dentry *de + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, buffer, size); + +- if (!btrfs_is_valid_xattr(name)) +- return -EOPNOTSUPP; ++ ret = btrfs_is_valid_xattr(name); ++ if (ret) ++ return ret; + return __btrfs_getxattr(dentry->d_inode, name, buffer, size); + } + +@@ -397,6 +418,7 @@ int btrfs_setxattr(struct dentry *dentry + size_t size, int flags) + { + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; ++ int ret; + + /* + * The permission on security.* and system.* is not 
checked +@@ -413,8 +435,9 @@ int btrfs_setxattr(struct dentry + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + +- if (!btrfs_is_valid_xattr(name)) +- return -EOPNOTSUPP; ++ ret = btrfs_is_valid_xattr(name); ++ if (ret) ++ return ret; + + if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + return btrfs_set_prop(dentry->d_inode, name, +@@ -430,6 +453,7 @@ int btrfs_setxattr(struct dentry + int btrfs_removexattr(struct dentry *dentry, const char *name) + { + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; ++ int ret; + + /* + * The permission on security.* and system.* is not checked +@@ -446,8 +470,9 @@ int btrfs_removexattr(struct den + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + +- if (!btrfs_is_valid_xattr(name)) +- return -EOPNOTSUPP; ++ ret = btrfs_is_valid_xattr(name); ++ if (ret) ++ return ret; + + if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + return btrfs_set_prop(dentry->d_inode, name, diff --git a/queue-4.0/btrfs-fix-inode-eviction-infinite-loop-after-cloning-into-it.patch b/queue-4.0/btrfs-fix-inode-eviction-infinite-loop-after-cloning-into-it.patch new file mode 100644 index 00000000000..1804e0175b3 --- /dev/null +++ b/queue-4.0/btrfs-fix-inode-eviction-infinite-loop-after-cloning-into-it.patch @@ -0,0 +1,103 @@ +From ccccf3d67294714af2d72a6fd6fd7d73b01c9329 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 30 Mar 2015 18:23:59 +0100 +Subject: Btrfs: fix inode eviction infinite loop after cloning into it + +From: Filipe Manana + +commit ccccf3d67294714af2d72a6fd6fd7d73b01c9329 upstream. + +If we attempt to clone a 0 length region into a file we can end up +inserting a range in the inode's extent_io tree with a start offset +that is greater than the end offset, which immediately triggers the +following warning: + +[ 3914.619057] WARNING: CPU: 17 PID: 4199 at fs/btrfs/extent_io.c:435 insert_state+0x4b/0x10b [btrfs]() +[ 3914.620886] BTRFS: end < start 4095 4096 +(...) +[ 3914.638093] Call Trace: +[ 3914.638636] [] dump_stack+0x4c/0x65 +[ 3914.639620] [] warn_slowpath_common+0xa1/0xbb +[ 3914.640789] [] ? insert_state+0x4b/0x10b [btrfs] +[ 3914.642041] [] warn_slowpath_fmt+0x46/0x48 +[ 3914.643236] [] insert_state+0x4b/0x10b [btrfs] +[ 3914.644441] [] __set_extent_bit+0x107/0x3f4 [btrfs] +[ 3914.645711] [] lock_extent_bits+0x65/0x1bf [btrfs] +[ 3914.646914] [] ? _raw_spin_unlock+0x28/0x33 +[ 3914.648058] [] ? test_range_bit+0xcc/0xde [btrfs] +[ 3914.650105] [] lock_extent+0x13/0x15 [btrfs] +[ 3914.651361] [] lock_extent_range+0x3d/0xcd [btrfs] +[ 3914.652761] [] btrfs_ioctl_clone+0x278/0x388 [btrfs] +[ 3914.654128] [] ? might_fault+0x58/0xb5 +[ 3914.655320] [] btrfs_ioctl+0xb51/0x2195 [btrfs] +(...) +[ 3914.669271] ---[ end trace 14843d3e2e622fc1 ]--- + +This later makes the inode eviction handler enter an infinite loop that +keeps dumping the following warning over and over: + +[ 3915.117629] WARNING: CPU: 22 PID: 4228 at fs/btrfs/extent_io.c:435 insert_state+0x4b/0x10b [btrfs]() +[ 3915.119913] BTRFS: end < start 4095 4096 +(...) +[ 3915.137394] Call Trace: +[ 3915.137913] [] dump_stack+0x4c/0x65 +[ 3915.139154] [] warn_slowpath_common+0xa1/0xbb +[ 3915.140316] [] ?
insert_state+0x4b/0x10b [btrfs] +[ 3915.141505] [] warn_slowpath_fmt+0x46/0x48 +[ 3915.142709] [] insert_state+0x4b/0x10b [btrfs] +[ 3915.143849] [] __set_extent_bit+0x107/0x3f4 [btrfs] +[ 3915.145120] [] ? btrfs_kill_super+0x17/0x23 [btrfs] +[ 3915.146352] [] ? deactivate_locked_super+0x3b/0x50 +[ 3915.147565] [] lock_extent_bits+0x65/0x1bf [btrfs] +[ 3915.148785] [] ? _raw_write_unlock+0x28/0x33 +[ 3915.149931] [] btrfs_evict_inode+0x196/0x482 [btrfs] +[ 3915.151154] [] evict+0xa0/0x148 +[ 3915.152094] [] dispose_list+0x39/0x43 +[ 3915.153081] [] evict_inodes+0xdc/0xeb +[ 3915.154062] [] generic_shutdown_super+0x49/0xef +[ 3915.155193] [] kill_anon_super+0x13/0x1e +[ 3915.156274] [] btrfs_kill_super+0x17/0x23 [btrfs] +(...) +[ 3915.167404] ---[ end trace 14843d3e2e622fc2 ]--- + +So just bail out of the clone ioctl if the length of the region to clone +is zero, without locking any extent range, in order to prevent this issue +(same behaviour as a pwrite with a 0 length for example). + +This is trivial to reproduce. For example, the steps for the test I just +made for fstests: + + mkfs.btrfs -f SCRATCH_DEV + mount SCRATCH_DEV $SCRATCH_MNT + + touch $SCRATCH_MNT/foo + touch $SCRATCH_MNT/bar + + $CLONER_PROG -s 0 -d 4096 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar + umount $SCRATCH_MNT + +A test case for fstests follows soon. + +Signed-off-by: Filipe Manana +Reviewed-by: Omar Sandoval +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ioctl.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -3626,6 +3626,11 @@ static noinline long btrfs_ioctl_clone(s + if (off + len == src->i_size) + len = ALIGN(src->i_size, bs) - off; + ++ if (len == 0) { ++ ret = 0; ++ goto out_unlock; ++ } ++ + /* verify the end result is block aligned */ + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || + !IS_ALIGNED(destoff, bs)) diff --git a/queue-4.0/btrfs-fix-inode-eviction-infinite-loop-after-extent_same-ioctl.patch b/queue-4.0/btrfs-fix-inode-eviction-infinite-loop-after-extent_same-ioctl.patch new file mode 100644 index 00000000000..63c518dc96e --- /dev/null +++ b/queue-4.0/btrfs-fix-inode-eviction-infinite-loop-after-extent_same-ioctl.patch @@ -0,0 +1,49 @@ +From 113e8283869b9855c8b999796aadd506bbac155f Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 30 Mar 2015 18:26:47 +0100 +Subject: Btrfs: fix inode eviction infinite loop after extent_same ioctl + +From: Filipe Manana + +commit 113e8283869b9855c8b999796aadd506bbac155f upstream. + +If we pass a length of 0 to the extent_same ioctl, we end up locking an +extent range with a start offset greater than its end offset (if the +destination file's offset is greater than zero). This results in a warning +from extent_io.c:insert_state through the following call chain: + + btrfs_extent_same() + btrfs_double_lock() + lock_extent_range() + lock_extent(inode->io_tree, offset, offset + len - 1) + lock_extent_bits() + __set_extent_bit() + insert_state() + --> WARN_ON(end < start) + +This leads to an infinite loop when evicting the inode. This is the same +problem that my previous patch titled +"Btrfs: fix inode eviction infinite loop after cloning into it" addressed +but for the extent_same ioctl instead of the clone ioctl.
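+ +For illustration only (not part of the upstream patch), the inverted range +in the warning falls out of the lock arithmetic when len is 0: + + u64 dst_loff = 4096, len = 0; + u64 lock_start = dst_loff; /* 4096 */ + u64 lock_end = dst_loff + len - 1; /* 4095, i.e. end < start */ + +Returning 0 before btrfs_double_lock() means no such inverted range is +ever inserted into the inode's io_tree.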
+ +Signed-off-by: Filipe Manana +Reviewed-by: Omar Sandoval +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ioctl.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -2897,6 +2897,9 @@ static int btrfs_extent_same(struct inod + if (src == dst) + return -EINVAL; + ++ if (len == 0) ++ return 0; ++ + btrfs_double_lock(src, loff, dst, dst_loff, len); + + ret = extent_same_check_offsets(src, loff, len); diff --git a/queue-4.0/btrfs-fix-log-tree-corruption-when-fs-mounted-with-o-discard.patch b/queue-4.0/btrfs-fix-log-tree-corruption-when-fs-mounted-with-o-discard.patch new file mode 100644 index 00000000000..97be7a8cdbb --- /dev/null +++ b/queue-4.0/btrfs-fix-log-tree-corruption-when-fs-mounted-with-o-discard.patch @@ -0,0 +1,57 @@ +From dcc82f4783ad91d4ab654f89f37ae9291cdc846a Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 23 Mar 2015 14:07:40 +0000 +Subject: Btrfs: fix log tree corruption when fs mounted with -o discard + +From: Filipe Manana + +commit dcc82f4783ad91d4ab654f89f37ae9291cdc846a upstream. + +While committing a transaction we free the log roots before we write the +new super block. Freeing the log roots implies marking the disk location +of every node/leaf (metadata extent) as pinned before the new super block +is written. This is to prevent the disk location of log metadata extents +from being reused before the new super block is written, otherwise we +would have a corrupted log tree if before the new super block is written +a crash/reboot happens and the location of any log tree metadata extent +ended up being reused and rewritten. + +Even though we pinned the log tree's metadata extents, we were issuing a +discard against them if the fs was mounted with the -o discard option, +resulting in corruption of the log tree if a crash/reboot happened before +writing the new super block - the next time the fs was mounted, during +the log replay process we would find nodes/leafs of the log btree with +a content full of zeroes, causing the process to fail and require the +use of the tool btrfs-zero-log to wipe out the log tree (and all data +previously fsynced becoming lost forever). + +Fix this by not doing a discard when pinning an extent. The discard will +be done later when it's safe (after the new super block is committed) at +extent-tree.c:btrfs_finish_extent_commit().
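+ +For reference, the deferred discard happens in the unpin loop during the +transaction commit; a simplified sketch (not verbatim 4.0 code) of +extent-tree.c:btrfs_finish_extent_commit() looks roughly like: + + while (!find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, NULL)) { + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, + end + 1 - start, NULL); + clear_extent_dirty(unpin, start, end, GFP_NOFS); + unpin_extent_range(root, start, end, true); + } + +By that point the new super block is on disk, so reusing (or discarding) +the old log extents is safe.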
+ +Fixes: e688b7252f78 (Btrfs: fix extent pinning bugs in the tree log) +Signed-off-by: Filipe Manana +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -6956,12 +6956,11 @@ static int __btrfs_free_reserved_extent( + return -ENOSPC; + } + +- if (btrfs_test_opt(root, DISCARD)) +- ret = btrfs_discard_extent(root, start, len, NULL); +- + if (pin) + pin_down_extent(root, cache, start, len, 1); + else { ++ if (btrfs_test_opt(root, DISCARD)) ++ ret = btrfs_discard_extent(root, start, len, NULL); + btrfs_add_free_space(cache, start, len); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); + } diff --git a/queue-4.0/kvm-x86-fix-msr_ia32_bndcfgs-in-msrs_to_save.patch b/queue-4.0/kvm-x86-fix-msr_ia32_bndcfgs-in-msrs_to_save.patch new file mode 100644 index 00000000000..8f8d6f47dcf --- /dev/null +++ b/queue-4.0/kvm-x86-fix-msr_ia32_bndcfgs-in-msrs_to_save.patch @@ -0,0 +1,51 @@ +From 9e9c3fe40bcd28e3f98f0ad8408435f4503f2781 Mon Sep 17 00:00:00 2001 +From: Nadav Amit +Date: Sun, 12 Apr 2015 21:47:15 +0300 +Subject: KVM: x86: Fix MSR_IA32_BNDCFGS in msrs_to_save + +From: Nadav Amit + +commit 9e9c3fe40bcd28e3f98f0ad8408435f4503f2781 upstream. + +kvm_init_msr_list is currently called before hardware_setup. As a result, +vmx_mpx_supported always returns false when kvm_init_msr_list checks whether to +save MSR_IA32_BNDCFGS. + +Move kvm_init_msr_list after vmx_hardware_setup is called to fix this issue. + +Signed-off-by: Nadav Amit +Signed-off-by: Greg Kroah-Hartman + +Message-Id: <1428864435-4732-1-git-send-email-namit@cs.technion.ac.il> +Signed-off-by: Paolo Bonzini + +--- + arch/x86/kvm/x86.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -5775,7 +5775,6 @@ int kvm_arch_init(void *opaque) + kvm_set_mmio_spte_mask(); + + kvm_x86_ops = ops; +- kvm_init_msr_list(); + + kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, + PT_DIRTY_MASK, PT64_NX_MASK, 0); +@@ -7209,7 +7208,14 @@ void kvm_arch_hardware_disable(void) + + int kvm_arch_hardware_setup(void) + { +- return kvm_x86_ops->hardware_setup(); ++ int r; ++ ++ r = kvm_x86_ops->hardware_setup(); ++ if (r != 0) ++ return r; ++ ++ kvm_init_msr_list(); ++ return 0; + } + + void kvm_arch_hardware_unsetup(void) diff --git a/queue-4.0/md-fix-md-io-stats-accounting-broken.patch b/queue-4.0/md-fix-md-io-stats-accounting-broken.patch new file mode 100644 index 00000000000..e52934180f8 --- /dev/null +++ b/queue-4.0/md-fix-md-io-stats-accounting-broken.patch @@ -0,0 +1,59 @@ +From 74672d069b298b03e9f657fd70915e055739882e Mon Sep 17 00:00:00 2001 +From: Gu Zheng +Date: Fri, 3 Apr 2015 08:44:47 +0800 +Subject: md: fix md io stats accounting broken + +From: Gu Zheng + +commit 74672d069b298b03e9f657fd70915e055739882e upstream. + +Simon reported the md io stats accounting issue: +" +I'm seeing "iostat -x -k 1" print this after a RAID1 rebuild on 4.0-rc5. 
It's not abnormal other than it's 3-disk, with one being SSD (sdc) and +the other two being write-mostly: + +Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util +sda 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +sdb 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +sdc 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +md0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 345.00 0.00 0.00 0.00 0.00 100.00 +md2 0.00 0.00 0.00 0.00 0.00 0.00 0.00 58779.00 0.00 0.00 0.00 0.00 100.00 +md1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 12.00 0.00 0.00 0.00 0.00 100.00 +" +The cause is commit 18c0b223cf9901727ef3b02da6711ac930b4e5d4, which uses +generic_start_io_acct() to account the disk stats instead of the open-coded +version, but it also increments .in_flight[rw], which md does not need. So +we re-use the open code here to fix it. + +Reported-by: Simon Kirby +Signed-off-by: Gu Zheng +Signed-off-by: NeilBrown +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/md/md.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -249,6 +249,7 @@ static void md_make_request(struct reque + const int rw = bio_data_dir(bio); + struct mddev *mddev = q->queuedata; + unsigned int sectors; ++ int cpu; + + if (mddev == NULL || mddev->pers == NULL + || !mddev->ready) { +@@ -284,7 +285,10 @@ static void md_make_request(struct reque + sectors = bio_sectors(bio); + mddev->pers->make_request(mddev, bio); + +- generic_start_io_acct(rw, sectors, &mddev->gendisk->part0); ++ cpu = part_stat_lock(); ++ part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); ++ part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); ++ part_stat_unlock(); + + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + wake_up(&mddev->sb_wait); diff --git a/queue-4.0/mm-hugetlb-use-pmd_page-in-follow_huge_pmd.patch b/queue-4.0/mm-hugetlb-use-pmd_page-in-follow_huge_pmd.patch new file mode 100644 index 00000000000..bf6ef98b9c7 --- /dev/null +++ b/queue-4.0/mm-hugetlb-use-pmd_page-in-follow_huge_pmd.patch @@ -0,0 +1,44 @@ +From 97534127012f0e396eddea4691f4c9b170aed74b Mon Sep 17 00:00:00 2001 +From: Gerald Schaefer +Date: Tue, 14 Apr 2015 15:42:30 -0700 +Subject: mm/hugetlb: use pmd_page() in follow_huge_pmd() + +From: Gerald Schaefer + +commit 97534127012f0e396eddea4691f4c9b170aed74b upstream. + +Commit 61f77eda9bbf ("mm/hugetlb: reduce arch dependent code around +follow_huge_*") broke follow_huge_pmd() on s390, where pmd and pte +layout differ and using pte_page() on a huge pmd will return wrong +results. Using pmd_page() instead fixes this. + +All architectures that were touched by that commit have pmd_page() +defined, so this should not break anything on other architectures.
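+ +The gist of the change, shown here for orientation (the full hunk is in +the diff below): + + /* before: reinterprets the pmd as a pte, wrong where layouts differ */ + page = pte_page(*(pte_t *)pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); + /* after: derive the head page from the pmd itself */ + page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);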
+ +Fixes: 61f77eda "mm/hugetlb: reduce arch dependent code around follow_huge_*" +Signed-off-by: Gerald Schaefer +Acked-by: Naoya Horiguchi +Cc: Hugh Dickins +Cc: Michal Hocko , Andrea Arcangeli +Cc: Martin Schwidefsky +Acked-by: David Rientjes +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/hugetlb.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -3735,8 +3735,7 @@ retry: + if (!pmd_huge(*pmd)) + goto out; + if (pmd_present(*pmd)) { +- page = pte_page(*(pte_t *)pmd) + +- ((address & ~PMD_MASK) >> PAGE_SHIFT); ++ page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); + if (flags & FOLL_GET) + get_page(page); + } else { diff --git a/queue-4.0/perf-x86-intel-fix-core2-atom-nhm-wsm-cycles-pp-events.patch b/queue-4.0/perf-x86-intel-fix-core2-atom-nhm-wsm-cycles-pp-events.patch new file mode 100644 index 00000000000..15adbd01a21 --- /dev/null +++ b/queue-4.0/perf-x86-intel-fix-core2-atom-nhm-wsm-cycles-pp-events.patch @@ -0,0 +1,70 @@ +From 517e6341fa123ec3a2f9ea78ad547be910529881 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Sat, 11 Apr 2015 12:16:22 +0200 +Subject: perf/x86/intel: Fix Core2,Atom,NHM,WSM cycles:pp events + +From: Peter Zijlstra + +commit 517e6341fa123ec3a2f9ea78ad547be910529881 upstream. + +Ingo reported that cycles:pp didn't work for him on some machines. + +It turns out that in this commit: + + af4bdcf675cf perf/x86/intel: Disallow flags for most Core2/Atom/Nehalem/Westmere events + +Andi forgot to explicitly allow that event when he +disabled event flags for PEBS on those uarchs. + +Reported-by: Ingo Molnar +Signed-off-by: Peter Zijlstra (Intel) +Cc: Arnaldo Carvalho de Melo +Cc: Jiri Olsa +Cc: Linus Torvalds +Cc: Peter Zijlstra +Fixes: af4bdcf675cf ("perf/x86/intel: Disallow flags for most Core2/Atom/Nehalem/Westmere events") +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kernel/cpu/perf_event_intel_ds.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c ++++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c +@@ -557,6 +557,8 @@ struct event_constraint intel_core2_pebs + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ ++ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ ++ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), + EVENT_CONSTRAINT_END + }; + +@@ -564,6 +566,8 @@ struct event_constraint intel_atom_pebs_ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ ++ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ ++ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), + EVENT_CONSTRAINT_END + }; + +@@ -587,6 +591,8 @@ struct event_constraint intel_nehalem_pe + INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ ++ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). 
*/ ++ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), + EVENT_CONSTRAINT_END + }; + +@@ -602,6 +608,8 @@ struct event_constraint intel_westmere_p + INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ ++ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ ++ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), + EVENT_CONSTRAINT_END + }; + diff --git a/queue-4.0/powerpc-hugetlb-call-mm_dec_nr_pmds-in-hugetlb_free_pmd_range.patch b/queue-4.0/powerpc-hugetlb-call-mm_dec_nr_pmds-in-hugetlb_free_pmd_range.patch new file mode 100644 index 00000000000..1b85f8691dc --- /dev/null +++ b/queue-4.0/powerpc-hugetlb-call-mm_dec_nr_pmds-in-hugetlb_free_pmd_range.patch @@ -0,0 +1,39 @@ +From 50c6a665b383cb5839e45d04e36faeeefaffa052 Mon Sep 17 00:00:00 2001 +From: Scott Wood +Date: Fri, 10 Apr 2015 19:37:34 -0500 +Subject: powerpc/hugetlb: Call mm_dec_nr_pmds() in hugetlb_free_pmd_range() + +From: Scott Wood + +commit 50c6a665b383cb5839e45d04e36faeeefaffa052 upstream. + +Commit dc6c9a35b66b5 ("mm: account pmd page tables to the process") +added a counter that is incremented whenever a PMD is allocated and +decremented whenever a PMD is freed. For hugepages on PPC, common code +is used to allocate PMDs, but arch-specific code is used to free PMDs. + +This results in kernel output such as "BUG: non-zero nr_pmds on freeing +mm: 1" when using hugepages. + +Update the PPC hugepage PMD freeing code to decrement the count, just +as the above commit did for free_pmd_range(). + +Fixes: dc6c9a35b66b5 ("mm: account pmd page tables to the process") +Signed-off-by: Scott Wood +Reviewed-by: Aneesh Kumar K.V +Signed-off-by: Greg Kroah-Hartman + +--- + arch/powerpc/mm/hugetlbpage.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/powerpc/mm/hugetlbpage.c ++++ b/arch/powerpc/mm/hugetlbpage.c +@@ -581,6 +581,7 @@ static void hugetlb_free_pmd_range(struc + pmd = pmd_offset(pud, start); + pud_clear(pud); + pmd_free_tlb(tlb, pmd, start); ++ mm_dec_nr_pmds(tlb->mm); + } + + static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, diff --git a/queue-4.0/sched-idle-x86-optimize-unnecessary-mwait_idle-resched-ipis.patch b/queue-4.0/sched-idle-x86-optimize-unnecessary-mwait_idle-resched-ipis.patch new file mode 100644 index 00000000000..edb656be460 --- /dev/null +++ b/queue-4.0/sched-idle-x86-optimize-unnecessary-mwait_idle-resched-ipis.patch @@ -0,0 +1,72 @@ +From f8e617f4582995f7c25ef25b4167213120ad122b Mon Sep 17 00:00:00 2001 +From: Mike Galbraith +Date: Sat, 18 Jan 2014 17:14:44 +0100 +Subject: sched/idle/x86: Optimize unnecessary mwait_idle() resched IPIs + +From: Mike Galbraith + +commit f8e617f4582995f7c25ef25b4167213120ad122b upstream. + +To fully take advantage of MWAIT, apparently the CLFLUSH instruction needs +another quirk on certain CPUs: proper barriers around it on certain machines. + +On a Q6600 SMP system, pipe-test scheduling performance, cross core, +improves significantly: + + 3.8.13 487.2 KHz 1.000 + 3.13.0-master 415.5 KHz .852 + 3.13.0-master+ 415.2 KHz .852 + restore mwait_idle + 3.13.0-master++ 488.5 KHz 1.002 + restore mwait_idle + IPI fix + +Since X86_BUG_CLFLUSH_MONITOR is already a quirk, don't create a separate +quirk for the extra smp_mb()s. + +Signed-off-by: Mike Galbraith +Cc: Borislav Petkov +Cc: H.
Peter Anvin +Cc: Ian Malone +Cc: Josh Boyer +Cc: Len Brown +Cc: Len Brown +Cc: Linus Torvalds +Cc: Mike Galbraith +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/1390061684.5566.4.camel@marge.simpson.net +[ Ported to recent kernel, added comments about the quirk. ] +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kernel/process.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -429,18 +429,22 @@ static int prefer_mwait_c1_over_halt(con + + static void mwait_idle(void) + { +- if (!need_resched()) { +- if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) ++ if (!current_set_polling_and_test()) { ++ if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { ++ smp_mb(); /* quirk */ + clflush((void *)¤t_thread_info()->flags); ++ smp_mb(); /* quirk */ ++ } + + __monitor((void *)¤t_thread_info()->flags, 0, 0); +- smp_mb(); + if (!need_resched()) + __sti_mwait(0, 0); + else + local_irq_enable(); +- } else ++ } else { + local_irq_enable(); ++ } ++ __current_clr_polling(); + } + + void select_idle_routine(const struct cpuinfo_x86 *c) diff --git a/queue-4.0/sched-idle-x86-restore-mwait_idle-to-fix-boot-hangs-to-improve-power-savings-and-to-improve-performance.patch b/queue-4.0/sched-idle-x86-restore-mwait_idle-to-fix-boot-hangs-to-improve-power-savings-and-to-improve-performance.patch new file mode 100644 index 00000000000..fdfde7a89e1 --- /dev/null +++ b/queue-4.0/sched-idle-x86-restore-mwait_idle-to-fix-boot-hangs-to-improve-power-savings-and-to-improve-performance.patch @@ -0,0 +1,152 @@ +From b253149b843f89cd300cbdbea27ce1f847506f99 Mon Sep 17 00:00:00 2001 +From: Len Brown +Date: Wed, 15 Jan 2014 00:37:34 -0500 +Subject: sched/idle/x86: Restore mwait_idle() to fix boot hangs, to improve power savings and to improve performance + +From: Len Brown + +commit b253149b843f89cd300cbdbea27ce1f847506f99 upstream. + +In Linux-3.9 we removed the mwait_idle() loop: + + 69fb3676df33 ("x86 idle: remove mwait_idle() and "idle=mwait" cmdline param") + +The reasoning was that modern machines should be sufficiently +happy during the boot process using the default_idle() HALT +loop, until cpuidle loads and either acpi_idle or intel_idle +invoke the newer MWAIT-with-hints idle loop. + +But two machines reported problems: + + 1. Certain Core2-era machines support MWAIT-C1 and HALT only. + MWAIT-C1 is preferred for optimal power and performance. + But if they support just C1, cpuidle never loads and + so they use the boot-time default idle loop forever. + + 2. Some laptops will boot-hang if HALT is used, + but will boot successfully if MWAIT is used. + This appears to be a hidden assumption in BIOS SMI, + that is presumably valid on the proprietary OS + where the BIOS was validated. + + https://bugzilla.kernel.org/show_bug.cgi?id=60770 + +So here we effectively revert the patch above, restoring +the mwait_idle() loop. However, we don't bother restoring +the idle=mwait cmdline parameter, since it appears to add +no value. + +Maintainer notes: + + For 3.9, simply revert 69fb3676df + for 3.10, patch -F3 applies, fuzz needed due to __cpuinit use in + context For 3.11, 3.12, 3.13, this patch applies cleanly + +Tested-by: Mike Galbraith +Signed-off-by: Len Brown +Acked-by: Mike Galbraith +Cc: Borislav Petkov +Cc: H. 
Peter Anvin +Cc: Ian Malone +Cc: Josh Boyer +Cc: Linus Torvalds +Cc: Mike Galbraith +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/345254a551eb5a6a866e048d7ab570fd2193aca4.1389763084.git.len.brown@intel.com +[ Ported to recent kernels. ] +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/mwait.h | 8 +++++++ + arch/x86/kernel/process.c | 47 +++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 55 insertions(+) + +--- a/arch/x86/include/asm/mwait.h ++++ b/arch/x86/include/asm/mwait.h +@@ -30,6 +30,14 @@ static inline void __mwait(unsigned long + :: "a" (eax), "c" (ecx)); + } + ++static inline void __sti_mwait(unsigned long eax, unsigned long ecx) ++{ ++ trace_hardirqs_on(); ++ /* "mwait %eax, %ecx;" */ ++ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" ++ :: "a" (eax), "c" (ecx)); ++} ++ + /* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -399,6 +400,49 @@ static void amd_e400_idle(void) + default_idle(); + } + ++/* ++ * Intel Core2 and older machines prefer MWAIT over HALT for C1. ++ * We can't rely on cpuidle installing MWAIT, because it will not load ++ * on systems that support only C1 -- so the boot default must be MWAIT. ++ * ++ * Some AMD machines are the opposite, they depend on using HALT. ++ * ++ * So for default C1, which is used during boot until cpuidle loads, ++ * use MWAIT-C1 on Intel HW that has it, else use HALT. ++ */ ++static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) ++{ ++ if (c->x86_vendor != X86_VENDOR_INTEL) ++ return 0; ++ ++ if (!cpu_has(c, X86_FEATURE_MWAIT)) ++ return 0; ++ ++ return 1; ++} ++ ++/* ++ * MONITOR/MWAIT with no hints, used for default default C1 state. ++ * This invokes MWAIT with interrutps enabled and no flags, ++ * which is backwards compatible with the original MWAIT implementation. 
++ */ ++ ++static void mwait_idle(void) ++{ ++ if (!need_resched()) { ++ if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) ++ clflush((void *)¤t_thread_info()->flags); ++ ++ __monitor((void *)¤t_thread_info()->flags, 0, 0); ++ smp_mb(); ++ if (!need_resched()) ++ __sti_mwait(0, 0); ++ else ++ local_irq_enable(); ++ } else ++ local_irq_enable(); ++} ++ + void select_idle_routine(const struct cpuinfo_x86 *c) + { + #ifdef CONFIG_SMP +@@ -412,6 +456,9 @@ void select_idle_routine(const struct cp + /* E400: APIC timer interrupt does not wake up CPU from C1e */ + pr_info("using AMD E400 aware idle routine\n"); + x86_idle = amd_e400_idle; ++ } else if (prefer_mwait_c1_over_halt(c)) { ++ pr_info("using mwait in idle threads\n"); ++ x86_idle = mwait_idle; + } else + x86_idle = default_idle; + } diff --git a/queue-4.0/series b/queue-4.0/series index c9bbcbca25e..dd6e43010d6 100644 --- a/queue-4.0/series +++ b/queue-4.0/series @@ -8,3 +8,18 @@ net-fix-crash-in-build_skb.patch pxa168-fix-double-deallocation-of-managed-resources.patch net-rfs-fix-crash-in-get_rps_cpus.patch net-mlx4_en-prevent-setting-invalid-rss-hash-function.patch +md-fix-md-io-stats-accounting-broken.patch +x86-asm-decoder-fix-and-enforce-max-instruction-size-in-the-insn-decoder.patch +x86-kvm-revert-remove-sched-notifier-for-cross-cpu-migrations.patch +x86-vdso-fix-pvclock-races-with-task-migration.patch +sched-idle-x86-restore-mwait_idle-to-fix-boot-hangs-to-improve-power-savings-and-to-improve-performance.patch +sched-idle-x86-optimize-unnecessary-mwait_idle-resched-ipis.patch +perf-x86-intel-fix-core2-atom-nhm-wsm-cycles-pp-events.patch +x86-fix-special-__probe_kernel_write-tail-zeroing-case.patch +kvm-x86-fix-msr_ia32_bndcfgs-in-msrs_to_save.patch +btrfs-fix-log-tree-corruption-when-fs-mounted-with-o-discard.patch +btrfs-don-t-accept-bare-namespace-as-a-valid-xattr.patch +btrfs-fix-inode-eviction-infinite-loop-after-cloning-into-it.patch +btrfs-fix-inode-eviction-infinite-loop-after-extent_same-ioctl.patch +mm-hugetlb-use-pmd_page-in-follow_huge_pmd.patch +powerpc-hugetlb-call-mm_dec_nr_pmds-in-hugetlb_free_pmd_range.patch diff --git a/queue-4.0/x86-asm-decoder-fix-and-enforce-max-instruction-size-in-the-insn-decoder.patch b/queue-4.0/x86-asm-decoder-fix-and-enforce-max-instruction-size-in-the-insn-decoder.patch new file mode 100644 index 00000000000..1c1fc1cc94e --- /dev/null +++ b/queue-4.0/x86-asm-decoder-fix-and-enforce-max-instruction-size-in-the-insn-decoder.patch @@ -0,0 +1,62 @@ +From 91e5ed49fca09c2b83b262b9757d1376ee2b46c3 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Tue, 27 Jan 2015 16:06:02 -0800 +Subject: x86/asm/decoder: Fix and enforce max instruction size in the insn decoder + +From: Andy Lutomirski + +commit 91e5ed49fca09c2b83b262b9757d1376ee2b46c3 upstream. + +x86 instructions cannot exceed 15 bytes, and the instruction +decoder should enforce that. Prior to 6ba48ff46f76, the +instruction length limit was implicitly set to 16, which was an +approximation of 15, but there is currently no limit at all. + +Fix MAX_INSN_SIZE (it should be 15, not 16), and fix the decoder +to reject instructions that exceed MAX_INSN_SIZE. + +Other than potentially confusing some of the decoder sanity +checks, I'm not aware of any actual problems that omitting this +check would cause, nor am I aware of any practical problems +caused by the MAX_INSN_SIZE error. + +Signed-off-by: Andy Lutomirski +Acked-by: Masami Hiramatsu +Cc: Dave Hansen +Fixes: 6ba48ff46f76 ("x86: Remove arbitrary instruction size limit ... 
+Link: http://lkml.kernel.org/r/f8f0bc9b8c58cfd6830f7d88400bf1396cbdcd0f.1422403511.git.luto@amacapital.net +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/insn.h | 2 +- + arch/x86/lib/insn.c | 7 +++++++ + 2 files changed, 8 insertions(+), 1 deletion(-) + +--- a/arch/x86/include/asm/insn.h ++++ b/arch/x86/include/asm/insn.h +@@ -69,7 +69,7 @@ struct insn { + const insn_byte_t *next_byte; + }; + +-#define MAX_INSN_SIZE 16 ++#define MAX_INSN_SIZE 15 + + #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) + #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) +--- a/arch/x86/lib/insn.c ++++ b/arch/x86/lib/insn.c +@@ -52,6 +52,13 @@ + */ + void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) + { ++ /* ++ * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid ++ * even if the input buffer is long enough to hold them. ++ */ ++ if (buf_len > MAX_INSN_SIZE) ++ buf_len = MAX_INSN_SIZE; ++ + memset(insn, 0, sizeof(*insn)); + insn->kaddr = kaddr; + insn->end_kaddr = kaddr + buf_len; diff --git a/queue-4.0/x86-fix-special-__probe_kernel_write-tail-zeroing-case.patch b/queue-4.0/x86-fix-special-__probe_kernel_write-tail-zeroing-case.patch new file mode 100644 index 00000000000..33e7f021d66 --- /dev/null +++ b/queue-4.0/x86-fix-special-__probe_kernel_write-tail-zeroing-case.patch @@ -0,0 +1,58 @@ +From d869844bd081081bf537e806a44811884230643e Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Thu, 23 Apr 2015 08:33:59 -0700 +Subject: x86: fix special __probe_kernel_write() tail zeroing case + +From: Linus Torvalds + +commit d869844bd081081bf537e806a44811884230643e upstream. + +Commit cae2a173fe94 ("x86: clean up/fix 'copy_in_user()' tail zeroing") +fixed the failure case tail zeroing of one special case of the x86-64 +generic user-copy routine, namely when used for the user-to-user case +("copy_in_user()"). + +But in the process it broke an even more unusual case: using the user +copy routine for kernel-to-kernel copying. + +Now, normally kernel-kernel copies are obviously done using memcpy(), +but we have a couple of special cases when we use the user-copy +functions. One is when we pass a kernel buffer to a regular user-buffer +routine, using set_fs(KERNEL_DS). That's a "normal" case, and continued +to work fine, because it never takes any faults (with the possible +exception of a silent and successful vmalloc fault). + +But Jan Beulich pointed out another, very unusual, special case: when we +use the user-copy routines not because it's a path that expects a user +pointer, but for a couple of ftrace/kgdb cases that want to do a kernel +copy, but do so using "unsafe" buffers, and use the user-copy routine to +gracefully handle faults. IOW, for probe_kernel_write(). + +And that broke for the case of a faulting kernel destination, because we +saw the kernel destination and wanted to try to clear the tail of the +buffer. Which doesn't work, since that's what faults. + +This only triggers for things like kgdb and ftrace users (eg trying to +set a breakpoint on read-only memory), but it's definitely a bug. +The fix is to not compare against the kernel address start (TASK_SIZE), +but instead use the same limits "access_ok()" uses.
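+ +For orientation: __addr_ok() compares the pointer against the current +thread's addr_limit rather than the fixed TASK_SIZE_MAX, so under +set_fs(KERNEL_DS) -- which probe_kernel_write() uses -- a faulting kernel +destination no longer looks like a user buffer whose tail must be cleared. +Roughly (paraphrased from the x86 uaccess headers of this era, not part of +this patch): + + #define __addr_ok(addr) \ + ((unsigned long __force)(addr) < \ + (current_thread_info()->addr_limit.seg))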
+ +Reported-and-tested-by: Jan Beulich +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/lib/usercopy_64.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/lib/usercopy_64.c ++++ b/arch/x86/lib/usercopy_64.c +@@ -82,7 +82,7 @@ copy_user_handle_tail(char *to, char *fr + clac(); + + /* If the destination is a kernel buffer, we always clear the end */ +- if ((unsigned long)to >= TASK_SIZE_MAX) ++ if (!__addr_ok(to)) + memset(to, 0, len); + return len; + } diff --git a/queue-4.0/x86-kvm-revert-remove-sched-notifier-for-cross-cpu-migrations.patch b/queue-4.0/x86-kvm-revert-remove-sched-notifier-for-cross-cpu-migrations.patch new file mode 100644 index 00000000000..33e875407b1 --- /dev/null +++ b/queue-4.0/x86-kvm-revert-remove-sched-notifier-for-cross-cpu-migrations.patch @@ -0,0 +1,205 @@ +From 0a4e6be9ca17c54817cf814b4b5aa60478c6df27 Mon Sep 17 00:00:00 2001 +From: Marcelo Tosatti +Date: Mon, 23 Mar 2015 20:21:51 -0300 +Subject: x86: kvm: Revert "remove sched notifier for cross-cpu migrations" + +From: Marcelo Tosatti + +commit 0a4e6be9ca17c54817cf814b4b5aa60478c6df27 upstream. + +The following point: + + 2. per-CPU pvclock time info is updated if the + underlying CPU changes. + +Is not true anymore since "KVM: x86: update pvclock area conditionally, +on cpu migration". + +Add task migration notification back. + +Problem noticed by Andy Lutomirski. + +Signed-off-by: Marcelo Tosatti +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/pvclock.h | 1 + arch/x86/kernel/pvclock.c | 44 +++++++++++++++++++++++++++++++++++++++++ + arch/x86/vdso/vclock_gettime.c | 16 +++++++------- + include/linux/sched.h | 8 +++++++ + kernel/sched/core.c | 15 +++++++++++++ + 5 files changed, 76 insertions(+), 8 deletions(-) + +--- a/arch/x86/include/asm/pvclock.h ++++ b/arch/x86/include/asm/pvclock.h +@@ -95,6 +95,7 @@ unsigned __pvclock_read_cycles(const str + + struct pvclock_vsyscall_time_info { + struct pvclock_vcpu_time_info pvti; ++ u32 migrate_count; + } __attribute__((__aligned__(SMP_CACHE_BYTES))); + + #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) +--- a/arch/x86/kernel/pvclock.c ++++ b/arch/x86/kernel/pvclock.c +@@ -141,7 +141,46 @@ void pvclock_read_wallclock(struct pvclo + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); + } + ++static struct pvclock_vsyscall_time_info *pvclock_vdso_info; ++ ++static struct pvclock_vsyscall_time_info * ++pvclock_get_vsyscall_user_time_info(int cpu) ++{ ++ if (!pvclock_vdso_info) { ++ BUG(); ++ return NULL; ++ } ++ ++ return &pvclock_vdso_info[cpu]; ++} ++ ++struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) ++{ ++ return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; ++} ++ + #ifdef CONFIG_X86_64 ++static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, ++ void *v) ++{ ++ struct task_migration_notifier *mn = v; ++ struct pvclock_vsyscall_time_info *pvti; ++ ++ pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); ++ ++ /* this is NULL when pvclock vsyscall is not initialized */ ++ if (unlikely(pvti == NULL)) ++ return NOTIFY_DONE; ++ ++ pvti->migrate_count++; ++ ++ return NOTIFY_DONE; ++} ++ ++static struct notifier_block pvclock_migrate = { ++ .notifier_call = pvclock_task_migrate, ++}; ++ + /* + * Initialize the generic pvclock vsyscall state. 
This will allocate + * a/some page(s) for the per-vcpu pvclock information, set up a +@@ -155,12 +194,17 @@ int __init pvclock_init_vsyscall(struct + + WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); + ++ pvclock_vdso_info = i; ++ + for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { + __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, + __pa(i) + (idx*PAGE_SIZE), + PAGE_KERNEL_VVAR); + } + ++ ++ register_task_migration_notifier(&pvclock_migrate); ++ + return 0; + } + #endif +--- a/arch/x86/vdso/vclock_gettime.c ++++ b/arch/x86/vdso/vclock_gettime.c +@@ -82,18 +82,15 @@ static notrace cycle_t vread_pvclock(int + cycle_t ret; + u64 last; + u32 version; ++ u32 migrate_count; + u8 flags; + unsigned cpu, cpu1; + + + /* +- * Note: hypervisor must guarantee that: +- * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. +- * 2. that per-CPU pvclock time info is updated if the +- * underlying CPU changes. +- * 3. that version is increased whenever underlying CPU +- * changes. +- * ++ * When looping to get a consistent (time-info, tsc) pair, we ++ * also need to deal with the possibility we can switch vcpus, ++ * so make sure we always re-fetch time-info for the current vcpu. + */ + do { + cpu = __getcpu() & VGETCPU_CPU_MASK; +@@ -104,6 +101,8 @@ static notrace cycle_t vread_pvclock(int + + pvti = get_pvti(cpu); + ++ migrate_count = pvti->migrate_count; ++ + version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); + + /* +@@ -115,7 +114,8 @@ static notrace cycle_t vread_pvclock(int + cpu1 = __getcpu() & VGETCPU_CPU_MASK; + } while (unlikely(cpu != cpu1 || + (pvti->pvti.version & 1) || +- pvti->pvti.version != version)); ++ pvti->pvti.version != version || ++ pvti->migrate_count != migrate_count)); + + if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) + *mode = VCLOCK_NONE; +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -176,6 +176,14 @@ extern void get_iowait_load(unsigned lon + extern void calc_global_load(unsigned long ticks); + extern void update_cpu_load_nohz(void); + ++/* Notifier for when a task gets migrated to a new CPU */ ++struct task_migration_notifier { ++ struct task_struct *task; ++ int from_cpu; ++ int to_cpu; ++}; ++extern void register_task_migration_notifier(struct notifier_block *n); ++ + extern unsigned long get_parent_ip(unsigned long addr); + + extern void dump_cpu_task(int cpu); +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -996,6 +996,13 @@ void check_preempt_curr(struct rq *rq, s + rq_clock_skip_update(rq, true); + } + ++static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); ++ ++void register_task_migration_notifier(struct notifier_block *n) ++{ ++ atomic_notifier_chain_register(&task_migration_notifier, n); ++} ++ + #ifdef CONFIG_SMP + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) + { +@@ -1026,10 +1033,18 @@ void set_task_cpu(struct task_struct *p, + trace_sched_migrate_task(p, new_cpu); + + if (task_cpu(p) != new_cpu) { ++ struct task_migration_notifier tmn; ++ + if (p->sched_class->migrate_task_rq) + p->sched_class->migrate_task_rq(p, new_cpu); + p->se.nr_migrations++; + perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); ++ ++ tmn.task = p; ++ tmn.from_cpu = task_cpu(p); ++ tmn.to_cpu = new_cpu; ++ ++ atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); + } + + __set_task_cpu(p, new_cpu); diff --git a/queue-4.0/x86-vdso-fix-pvclock-races-with-task-migration.patch b/queue-4.0/x86-vdso-fix-pvclock-races-with-task-migration.patch new file mode 100644 index 00000000000..ca54c5f0172 --- 
/dev/null +++ b/queue-4.0/x86-vdso-fix-pvclock-races-with-task-migration.patch @@ -0,0 +1,65 @@ +From 80f7fdb1c7f0f9266421f823964fd1962681f6ce Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= +Date: Thu, 2 Apr 2015 20:44:23 +0200 +Subject: x86: vdso: fix pvclock races with task migration +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= + +commit 80f7fdb1c7f0f9266421f823964fd1962681f6ce upstream. + +If we were migrated right after __getcpu, but before reading the +migration_count, we wouldn't notice that we read TSC of a different +VCPU, nor that KVM's bug made pvti invalid, as only migration_count +on source VCPU is increased. + +Change vdso instead of updating migration_count on destination. + +Signed-off-by: Radim Krčmář +Fixes: 0a4e6be9ca17 ("x86: kvm: Revert "remove sched notifier for cross-cpu migrations"") +Message-Id: <1428000263-11892-1-git-send-email-rkrcmar@redhat.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/vdso/vclock_gettime.c | 20 ++++++++++++-------- + 1 file changed, 12 insertions(+), 8 deletions(-) + +--- a/arch/x86/vdso/vclock_gettime.c ++++ b/arch/x86/vdso/vclock_gettime.c +@@ -99,21 +99,25 @@ static notrace cycle_t vread_pvclock(int + * __getcpu() calls (Gleb). + */ + +- pvti = get_pvti(cpu); ++ /* Make sure migrate_count will change if we leave the VCPU. */ ++ do { ++ pvti = get_pvti(cpu); ++ migrate_count = pvti->migrate_count; + +- migrate_count = pvti->migrate_count; ++ cpu1 = cpu; ++ cpu = __getcpu() & VGETCPU_CPU_MASK; ++ } while (unlikely(cpu != cpu1)); + + version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); + + /* + * Test we're still on the cpu as well as the version. +- * We could have been migrated just after the first +- * vgetcpu but before fetching the version, so we +- * wouldn't notice a version change. ++ * - We must read TSC of pvti's VCPU. ++ * - KVM doesn't follow the versioning protocol, so data could ++ * change before version if we left the VCPU. + */ +- cpu1 = __getcpu() & VGETCPU_CPU_MASK; +- } while (unlikely(cpu != cpu1 || +- (pvti->pvti.version & 1) || ++ smp_rmb(); ++ } while (unlikely((pvti->pvti.version & 1) || + pvti->pvti.version != version || + pvti->migrate_count != migrate_count)); + -- 2.47.2