From 48e5314137f207a963dab3d72d76280abbdc4acc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 29 Apr 2015 14:06:35 +0200 Subject: [PATCH] 4.0-stable patches added patches: btrfs-don-t-accept-bare-namespace-as-a-valid-xattr.patch btrfs-fix-inode-eviction-infinite-loop-after-cloning-into-it.patch btrfs-fix-inode-eviction-infinite-loop-after-extent_same-ioctl.patch btrfs-fix-log-tree-corruption-when-fs-mounted-with-o-discard.patch kvm-x86-fix-msr_ia32_bndcfgs-in-msrs_to_save.patch md-fix-md-io-stats-accounting-broken.patch mm-hugetlb-use-pmd_page-in-follow_huge_pmd.patch perf-x86-intel-fix-core2-atom-nhm-wsm-cycles-pp-events.patch powerpc-hugetlb-call-mm_dec_nr_pmds-in-hugetlb_free_pmd_range.patch sched-idle-x86-optimize-unnecessary-mwait_idle-resched-ipis.patch sched-idle-x86-restore-mwait_idle-to-fix-boot-hangs-to-improve-power-savings-and-to-improve-performance.patch x86-asm-decoder-fix-and-enforce-max-instruction-size-in-the-insn-decoder.patch x86-fix-special-__probe_kernel_write-tail-zeroing-case.patch x86-kvm-revert-remove-sched-notifier-for-cross-cpu-migrations.patch x86-vdso-fix-pvclock-races-with-task-migration.patch --- ...cept-bare-namespace-as-a-valid-xattr.patch | 134 ++++++++++++ ...-infinite-loop-after-cloning-into-it.patch | 103 +++++++++ ...nfinite-loop-after-extent_same-ioctl.patch | 49 +++++ ...ption-when-fs-mounted-with-o-discard.patch | 57 +++++ ...fix-msr_ia32_bndcfgs-in-msrs_to_save.patch | 51 +++++ ...md-fix-md-io-stats-accounting-broken.patch | 59 +++++ ...etlb-use-pmd_page-in-follow_huge_pmd.patch | 44 ++++ ...-core2-atom-nhm-wsm-cycles-pp-events.patch | 70 ++++++ ...ec_nr_pmds-in-hugetlb_free_pmd_range.patch | 39 ++++ ...-unnecessary-mwait_idle-resched-ipis.patch | 72 ++++++ ...r-savings-and-to-improve-performance.patch | 152 +++++++++++++ queue-4.0/series | 15 ++ ...instruction-size-in-the-insn-decoder.patch | 62 ++++++ ...probe_kernel_write-tail-zeroing-case.patch | 58 +++++ ...ed-notifier-for-cross-cpu-migrations.patch | 205 ++++++++++++++++++ ...ix-pvclock-races-with-task-migration.patch | 65 ++++++ 16 files changed, 1235 insertions(+) create mode 100644 queue-4.0/btrfs-don-t-accept-bare-namespace-as-a-valid-xattr.patch create mode 100644 queue-4.0/btrfs-fix-inode-eviction-infinite-loop-after-cloning-into-it.patch create mode 100644 queue-4.0/btrfs-fix-inode-eviction-infinite-loop-after-extent_same-ioctl.patch create mode 100644 queue-4.0/btrfs-fix-log-tree-corruption-when-fs-mounted-with-o-discard.patch create mode 100644 queue-4.0/kvm-x86-fix-msr_ia32_bndcfgs-in-msrs_to_save.patch create mode 100644 queue-4.0/md-fix-md-io-stats-accounting-broken.patch create mode 100644 queue-4.0/mm-hugetlb-use-pmd_page-in-follow_huge_pmd.patch create mode 100644 queue-4.0/perf-x86-intel-fix-core2-atom-nhm-wsm-cycles-pp-events.patch create mode 100644 queue-4.0/powerpc-hugetlb-call-mm_dec_nr_pmds-in-hugetlb_free_pmd_range.patch create mode 100644 queue-4.0/sched-idle-x86-optimize-unnecessary-mwait_idle-resched-ipis.patch create mode 100644 queue-4.0/sched-idle-x86-restore-mwait_idle-to-fix-boot-hangs-to-improve-power-savings-and-to-improve-performance.patch create mode 100644 queue-4.0/x86-asm-decoder-fix-and-enforce-max-instruction-size-in-the-insn-decoder.patch create mode 100644 queue-4.0/x86-fix-special-__probe_kernel_write-tail-zeroing-case.patch create mode 100644 queue-4.0/x86-kvm-revert-remove-sched-notifier-for-cross-cpu-migrations.patch create mode 100644 queue-4.0/x86-vdso-fix-pvclock-races-with-task-migration.patch diff --git 
a/queue-4.0/btrfs-don-t-accept-bare-namespace-as-a-valid-xattr.patch b/queue-4.0/btrfs-don-t-accept-bare-namespace-as-a-valid-xattr.patch new file mode 100644 index 00000000000..df59c3300e1 --- /dev/null +++ b/queue-4.0/btrfs-don-t-accept-bare-namespace-as-a-valid-xattr.patch @@ -0,0 +1,134 @@ +From 3c3b04d10ff1811a27f86684ccd2f5ba6983211d Mon Sep 17 00:00:00 2001 +From: David Sterba +Date: Wed, 25 Mar 2015 19:26:41 +0100 +Subject: btrfs: don't accept bare namespace as a valid xattr + +From: David Sterba + +commit 3c3b04d10ff1811a27f86684ccd2f5ba6983211d upstream. + +Due to insufficient check in btrfs_is_valid_xattr, this unexpectedly +works: + + $ touch file + $ setfattr -n user. -v 1 file + $ getfattr -d file +user.="1" + +ie. the missing attribute name after the namespace. + +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=94291 +Reported-by: William Douglas +Signed-off-by: David Sterba +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/xattr.c | 53 +++++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 39 insertions(+), 14 deletions(-) + +--- a/fs/btrfs/xattr.c ++++ b/fs/btrfs/xattr.c +@@ -364,22 +364,42 @@ const struct xattr_handler *btrfs_xattr_ + /* + * Check if the attribute is in a supported namespace. + * +- * This applied after the check for the synthetic attributes in the system ++ * This is applied after the check for the synthetic attributes in the system + * namespace. + */ +-static bool btrfs_is_valid_xattr(const char *name) ++static int btrfs_is_valid_xattr(const char *name) + { +- return !strncmp(name, XATTR_SECURITY_PREFIX, +- XATTR_SECURITY_PREFIX_LEN) || +- !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || +- !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || +- !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) || +- !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN); ++ int len = strlen(name); ++ int prefixlen = 0; ++ ++ if (!strncmp(name, XATTR_SECURITY_PREFIX, ++ XATTR_SECURITY_PREFIX_LEN)) ++ prefixlen = XATTR_SECURITY_PREFIX_LEN; ++ else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) ++ prefixlen = XATTR_SYSTEM_PREFIX_LEN; ++ else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) ++ prefixlen = XATTR_TRUSTED_PREFIX_LEN; ++ else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) ++ prefixlen = XATTR_USER_PREFIX_LEN; ++ else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) ++ prefixlen = XATTR_BTRFS_PREFIX_LEN; ++ else ++ return -EOPNOTSUPP; ++ ++ /* ++ * The name cannot consist of just prefix ++ */ ++ if (len <= prefixlen) ++ return -EINVAL; ++ ++ return 0; + } + + ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) + { ++ int ret; ++ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler +@@ -388,8 +408,9 @@ ssize_t btrfs_getxattr(struct dentry *de + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, buffer, size); + +- if (!btrfs_is_valid_xattr(name)) +- return -EOPNOTSUPP; ++ ret = btrfs_is_valid_xattr(name); ++ if (ret) ++ return ret; + return __btrfs_getxattr(dentry->d_inode, name, buffer, size); + } + +@@ -397,6 +418,7 @@ int btrfs_setxattr(struct dentry *dentry + size_t size, int flags) + { + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; ++ int ret; + + /* + * The permission on security.* and system.* is not 
checked +@@ -413,8 +435,9 @@ int btrfs_setxattr(struct dentry + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + +- if (!btrfs_is_valid_xattr(name)) +- return -EOPNOTSUPP; ++ ret = btrfs_is_valid_xattr(name); ++ if (ret) ++ return ret; + + if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + return btrfs_set_prop(dentry->d_inode, name, +@@ -430,6 +453,7 @@ int btrfs_setxattr(struct dentry + int btrfs_removexattr(struct dentry *dentry, const char *name) + { + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; ++ int ret; + + /* + * The permission on security.* and system.* is not checked +@@ -446,8 +470,9 @@ int btrfs_removexattr(struct den + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + +- if (!btrfs_is_valid_xattr(name)) +- return -EOPNOTSUPP; ++ ret = btrfs_is_valid_xattr(name); ++ if (ret) ++ return ret; + + if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + return btrfs_set_prop(dentry->d_inode, name, diff --git a/queue-4.0/btrfs-fix-inode-eviction-infinite-loop-after-cloning-into-it.patch b/queue-4.0/btrfs-fix-inode-eviction-infinite-loop-after-cloning-into-it.patch new file mode 100644 index 00000000000..1804e0175b3 --- /dev/null +++ b/queue-4.0/btrfs-fix-inode-eviction-infinite-loop-after-cloning-into-it.patch @@ -0,0 +1,103 @@ +From ccccf3d67294714af2d72a6fd6fd7d73b01c9329 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 30 Mar 2015 18:23:59 +0100 +Subject: Btrfs: fix inode eviction infinite loop after cloning into it + +From: Filipe Manana + +commit ccccf3d67294714af2d72a6fd6fd7d73b01c9329 upstream. + +If we attempt to clone a 0 length region into a file we can end up +inserting a range in the inode's extent_io tree with a start offset +that is greater than the end offset, which immediately triggers the +following warning: + +[ 3914.619057] WARNING: CPU: 17 PID: 4199 at fs/btrfs/extent_io.c:435 insert_state+0x4b/0x10b [btrfs]() +[ 3914.620886] BTRFS: end < start 4095 4096 +(...) +[ 3914.638093] Call Trace: +[ 3914.638636] [] dump_stack+0x4c/0x65 +[ 3914.639620] [] warn_slowpath_common+0xa1/0xbb +[ 3914.640789] [] ? insert_state+0x4b/0x10b [btrfs] +[ 3914.642041] [] warn_slowpath_fmt+0x46/0x48 +[ 3914.643236] [] insert_state+0x4b/0x10b [btrfs] +[ 3914.644441] [] __set_extent_bit+0x107/0x3f4 [btrfs] +[ 3914.645711] [] lock_extent_bits+0x65/0x1bf [btrfs] +[ 3914.646914] [] ? _raw_spin_unlock+0x28/0x33 +[ 3914.648058] [] ? test_range_bit+0xcc/0xde [btrfs] +[ 3914.650105] [] lock_extent+0x13/0x15 [btrfs] +[ 3914.651361] [] lock_extent_range+0x3d/0xcd [btrfs] +[ 3914.652761] [] btrfs_ioctl_clone+0x278/0x388 [btrfs] +[ 3914.654128] [] ? might_fault+0x58/0xb5 +[ 3914.655320] [] btrfs_ioctl+0xb51/0x2195 [btrfs] +(...) +[ 3914.669271] ---[ end trace 14843d3e2e622fc1 ]--- + +This later makes the inode eviction handler enter an infinite loop that +keeps dumping the following warning over and over: + +[ 3915.117629] WARNING: CPU: 22 PID: 4228 at fs/btrfs/extent_io.c:435 insert_state+0x4b/0x10b [btrfs]() +[ 3915.119913] BTRFS: end < start 4095 4096 +(...) +[ 3915.137394] Call Trace: +[ 3915.137913] [] dump_stack+0x4c/0x65 +[ 3915.139154] [] warn_slowpath_common+0xa1/0xbb +[ 3915.140316] [] ?
insert_state+0x4b/0x10b [btrfs] +[ 3915.141505] [] warn_slowpath_fmt+0x46/0x48 +[ 3915.142709] [] insert_state+0x4b/0x10b [btrfs] +[ 3915.143849] [] __set_extent_bit+0x107/0x3f4 [btrfs] +[ 3915.145120] [] ? btrfs_kill_super+0x17/0x23 [btrfs] +[ 3915.146352] [] ? deactivate_locked_super+0x3b/0x50 +[ 3915.147565] [] lock_extent_bits+0x65/0x1bf [btrfs] +[ 3915.148785] [] ? _raw_write_unlock+0x28/0x33 +[ 3915.149931] [] btrfs_evict_inode+0x196/0x482 [btrfs] +[ 3915.151154] [] evict+0xa0/0x148 +[ 3915.152094] [] dispose_list+0x39/0x43 +[ 3915.153081] [] evict_inodes+0xdc/0xeb +[ 3915.154062] [] generic_shutdown_super+0x49/0xef +[ 3915.155193] [] kill_anon_super+0x13/0x1e +[ 3915.156274] [] btrfs_kill_super+0x17/0x23 [btrfs] +(...) +[ 3915.167404] ---[ end trace 14843d3e2e622fc2 ]--- + +So just bail out of the clone ioctl if the length of the region to clone +is zero, without locking any extent range, in order to prevent this issue +(same behaviour as a pwrite with a 0 length for example). + +This is trivial to reproduce. For example, the steps for the test I just +made for fstests: + + mkfs.btrfs -f SCRATCH_DEV + mount SCRATCH_DEV $SCRATCH_MNT + + touch $SCRATCH_MNT/foo + touch $SCRATCH_MNT/bar + + $CLONER_PROG -s 0 -d 4096 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar + umount $SCRATCH_MNT + +A test case for fstests follows soon. + +Signed-off-by: Filipe Manana +Reviewed-by: Omar Sandoval +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ioctl.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -3626,6 +3626,11 @@ static noinline long btrfs_ioctl_clone(s + if (off + len == src->i_size) + len = ALIGN(src->i_size, bs) - off; + ++ if (len == 0) { ++ ret = 0; ++ goto out_unlock; ++ } ++ + /* verify the end result is block aligned */ + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || + !IS_ALIGNED(destoff, bs)) diff --git a/queue-4.0/btrfs-fix-inode-eviction-infinite-loop-after-extent_same-ioctl.patch b/queue-4.0/btrfs-fix-inode-eviction-infinite-loop-after-extent_same-ioctl.patch new file mode 100644 index 00000000000..63c518dc96e --- /dev/null +++ b/queue-4.0/btrfs-fix-inode-eviction-infinite-loop-after-extent_same-ioctl.patch @@ -0,0 +1,49 @@ +From 113e8283869b9855c8b999796aadd506bbac155f Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 30 Mar 2015 18:26:47 +0100 +Subject: Btrfs: fix inode eviction infinite loop after extent_same ioctl + +From: Filipe Manana + +commit 113e8283869b9855c8b999796aadd506bbac155f upstream. + +If we pass a length of 0 to the extent_same ioctl, we end up locking an +extent range with a start offset greater than its end offset (if the +destination file's offset is greater than zero). This results in a warning +from extent_io.c:insert_state through the following call chain: + + btrfs_extent_same() + btrfs_double_lock() + lock_extent_range() + lock_extent(inode->io_tree, offset, offset + len - 1) + lock_extent_bits() + __set_extent_bit() + insert_state() + --> WARN_ON(end < start) + +This leads to an infinite loop when evicting the inode. This is the same +problem that my previous patch titled +"Btrfs: fix inode eviction infinite loop after cloning into it" addressed +but for the extent_same ioctl instead of the clone ioctl.
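+ +For illustration only (not part of the upstream patch), the inverted range +in the warning falls out of the lock arithmetic when len is 0: + + u64 dst_loff = 4096, len = 0; + u64 lock_start = dst_loff; /* 4096 */ + u64 lock_end = dst_loff + len - 1; /* 4095, i.e. end < start */ + +Returning 0 before btrfs_double_lock() means no such inverted range is +ever inserted into the inode's io_tree.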
+ +Signed-off-by: Filipe Manana +Reviewed-by: Omar Sandoval +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ioctl.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -2897,6 +2897,9 @@ static int btrfs_extent_same(struct inod + if (src == dst) + return -EINVAL; + ++ if (len == 0) ++ return 0; ++ + btrfs_double_lock(src, loff, dst, dst_loff, len); + + ret = extent_same_check_offsets(src, loff, len); diff --git a/queue-4.0/btrfs-fix-log-tree-corruption-when-fs-mounted-with-o-discard.patch b/queue-4.0/btrfs-fix-log-tree-corruption-when-fs-mounted-with-o-discard.patch new file mode 100644 index 00000000000..97be7a8cdbb --- /dev/null +++ b/queue-4.0/btrfs-fix-log-tree-corruption-when-fs-mounted-with-o-discard.patch @@ -0,0 +1,57 @@ +From dcc82f4783ad91d4ab654f89f37ae9291cdc846a Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 23 Mar 2015 14:07:40 +0000 +Subject: Btrfs: fix log tree corruption when fs mounted with -o discard + +From: Filipe Manana + +commit dcc82f4783ad91d4ab654f89f37ae9291cdc846a upstream. + +While committing a transaction we free the log roots before we write the +new super block. Freeing the log roots implies marking the disk location +of every node/leaf (metadata extent) as pinned before the new super block +is written. This is to prevent the disk location of log metadata extents +from being reused before the new super block is written, otherwise we +would have a corrupted log tree if before the new super block is written +a crash/reboot happens and the location of any log tree metadata extent +ended up being reused and rewritten. + +Even though we pinned the log tree's metadata extents, we were issuing a +discard against them if the fs was mounted with the -o discard option, +resulting in corruption of the log tree if a crash/reboot happened before +writing the new super block - the next time the fs was mounted, during +the log replay process we would find nodes/leafs of the log btree with +a content full of zeroes, causing the process to fail and require the +use of the tool btrfs-zero-log to wipe out the log tree (and all data +previously fsynced becoming lost forever). + +Fix this by not doing a discard when pinning an extent. The discard will +be done later when it's safe (after the new super block is committed) at +extent-tree.c:btrfs_finish_extent_commit().
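+ +For reference, the deferred discard happens in the unpin loop during the +transaction commit; a simplified sketch (not verbatim 4.0 code) of +extent-tree.c:btrfs_finish_extent_commit() looks roughly like: + + while (!find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, NULL)) { + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, + end + 1 - start, NULL); + clear_extent_dirty(unpin, start, end, GFP_NOFS); + unpin_extent_range(root, start, end, true); + } + +By that point the new super block is on disk, so reusing (or discarding) +the old log extents is safe.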
+ +Fixes: e688b7252f78 (Btrfs: fix extent pinning bugs in the tree log) +Signed-off-by: Filipe Manana +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -6956,12 +6956,11 @@ static int __btrfs_free_reserved_extent( + return -ENOSPC; + } + +- if (btrfs_test_opt(root, DISCARD)) +- ret = btrfs_discard_extent(root, start, len, NULL); +- + if (pin) + pin_down_extent(root, cache, start, len, 1); + else { ++ if (btrfs_test_opt(root, DISCARD)) ++ ret = btrfs_discard_extent(root, start, len, NULL); + btrfs_add_free_space(cache, start, len); + btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); + } diff --git a/queue-4.0/kvm-x86-fix-msr_ia32_bndcfgs-in-msrs_to_save.patch b/queue-4.0/kvm-x86-fix-msr_ia32_bndcfgs-in-msrs_to_save.patch new file mode 100644 index 00000000000..8f8d6f47dcf --- /dev/null +++ b/queue-4.0/kvm-x86-fix-msr_ia32_bndcfgs-in-msrs_to_save.patch @@ -0,0 +1,51 @@ +From 9e9c3fe40bcd28e3f98f0ad8408435f4503f2781 Mon Sep 17 00:00:00 2001 +From: Nadav Amit +Date: Sun, 12 Apr 2015 21:47:15 +0300 +Subject: KVM: x86: Fix MSR_IA32_BNDCFGS in msrs_to_save + +From: Nadav Amit + +commit 9e9c3fe40bcd28e3f98f0ad8408435f4503f2781 upstream. + +kvm_init_msr_list is currently called before hardware_setup. As a result, +vmx_mpx_supported always returns false when kvm_init_msr_list checks whether to +save MSR_IA32_BNDCFGS. + +Move kvm_init_msr_list after vmx_hardware_setup is called to fix this issue. + +Signed-off-by: Nadav Amit +Signed-off-by: Greg Kroah-Hartman + +Message-Id: <1428864435-4732-1-git-send-email-namit@cs.technion.ac.il> +Signed-off-by: Paolo Bonzini + +--- + arch/x86/kvm/x86.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -5775,7 +5775,6 @@ int kvm_arch_init(void *opaque) + kvm_set_mmio_spte_mask(); + + kvm_x86_ops = ops; +- kvm_init_msr_list(); + + kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, + PT_DIRTY_MASK, PT64_NX_MASK, 0); +@@ -7209,7 +7208,14 @@ void kvm_arch_hardware_disable(void) + + int kvm_arch_hardware_setup(void) + { +- return kvm_x86_ops->hardware_setup(); ++ int r; ++ ++ r = kvm_x86_ops->hardware_setup(); ++ if (r != 0) ++ return r; ++ ++ kvm_init_msr_list(); ++ return 0; + } + + void kvm_arch_hardware_unsetup(void) diff --git a/queue-4.0/md-fix-md-io-stats-accounting-broken.patch b/queue-4.0/md-fix-md-io-stats-accounting-broken.patch new file mode 100644 index 00000000000..e52934180f8 --- /dev/null +++ b/queue-4.0/md-fix-md-io-stats-accounting-broken.patch @@ -0,0 +1,59 @@ +From 74672d069b298b03e9f657fd70915e055739882e Mon Sep 17 00:00:00 2001 +From: Gu Zheng +Date: Fri, 3 Apr 2015 08:44:47 +0800 +Subject: md: fix md io stats accounting broken + +From: Gu Zheng + +commit 74672d069b298b03e9f657fd70915e055739882e upstream. + +Simon reported the md io stats accounting issue: +" +I'm seeing "iostat -x -k 1" print this after a RAID1 rebuild on 4.0-rc5. 
It's not abnormal other than it's 3-disk, with one being SSD (sdc) and +the other two being write-mostly: + +Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util +sda 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +sdb 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +sdc 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +md0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 345.00 0.00 0.00 0.00 0.00 100.00 +md2 0.00 0.00 0.00 0.00 0.00 0.00 0.00 58779.00 0.00 0.00 0.00 0.00 100.00 +md1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 12.00 0.00 0.00 0.00 0.00 100.00 +" +The cause is commit 18c0b223cf9901727ef3b02da6711ac930b4e5d4, which uses +generic_start_io_acct() to account the disk stats instead of the open-coded +version, but it also increments .in_flight[rw], which md does not need. So +we re-use the open code here to fix it. + +Reported-by: Simon Kirby +Signed-off-by: Gu Zheng +Signed-off-by: NeilBrown +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/md/md.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -249,6 +249,7 @@ static void md_make_request(struct reque + const int rw = bio_data_dir(bio); + struct mddev *mddev = q->queuedata; + unsigned int sectors; ++ int cpu; + + if (mddev == NULL || mddev->pers == NULL + || !mddev->ready) { +@@ -284,7 +285,10 @@ static void md_make_request(struct reque + sectors = bio_sectors(bio); + mddev->pers->make_request(mddev, bio); + +- generic_start_io_acct(rw, sectors, &mddev->gendisk->part0); ++ cpu = part_stat_lock(); ++ part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); ++ part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); ++ part_stat_unlock(); + + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + wake_up(&mddev->sb_wait); diff --git a/queue-4.0/mm-hugetlb-use-pmd_page-in-follow_huge_pmd.patch b/queue-4.0/mm-hugetlb-use-pmd_page-in-follow_huge_pmd.patch new file mode 100644 index 00000000000..bf6ef98b9c7 --- /dev/null +++ b/queue-4.0/mm-hugetlb-use-pmd_page-in-follow_huge_pmd.patch @@ -0,0 +1,44 @@ +From 97534127012f0e396eddea4691f4c9b170aed74b Mon Sep 17 00:00:00 2001 +From: Gerald Schaefer +Date: Tue, 14 Apr 2015 15:42:30 -0700 +Subject: mm/hugetlb: use pmd_page() in follow_huge_pmd() + +From: Gerald Schaefer + +commit 97534127012f0e396eddea4691f4c9b170aed74b upstream. + +Commit 61f77eda9bbf ("mm/hugetlb: reduce arch dependent code around +follow_huge_*") broke follow_huge_pmd() on s390, where pmd and pte +layout differ and using pte_page() on a huge pmd will return wrong +results. Using pmd_page() instead fixes this. + +All architectures that were touched by that commit have pmd_page() +defined, so this should not break anything on other architectures.
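+ +The gist of the change, shown here for orientation (the full hunk is in +the diff below): + + /* before: reinterprets the pmd as a pte, wrong where layouts differ */ + page = pte_page(*(pte_t *)pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); + /* after: derive the head page from the pmd itself */ + page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);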
+ +Fixes: 61f77eda "mm/hugetlb: reduce arch dependent code around follow_huge_*" +Signed-off-by: Gerald Schaefer +Acked-by: Naoya Horiguchi +Cc: Hugh Dickins +Cc: Michal Hocko , Andrea Arcangeli +Cc: Martin Schwidefsky +Acked-by: David Rientjes +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/hugetlb.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -3735,8 +3735,7 @@ retry: + if (!pmd_huge(*pmd)) + goto out; + if (pmd_present(*pmd)) { +- page = pte_page(*(pte_t *)pmd) + +- ((address & ~PMD_MASK) >> PAGE_SHIFT); ++ page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); + if (flags & FOLL_GET) + get_page(page); + } else { diff --git a/queue-4.0/perf-x86-intel-fix-core2-atom-nhm-wsm-cycles-pp-events.patch b/queue-4.0/perf-x86-intel-fix-core2-atom-nhm-wsm-cycles-pp-events.patch new file mode 100644 index 00000000000..15adbd01a21 --- /dev/null +++ b/queue-4.0/perf-x86-intel-fix-core2-atom-nhm-wsm-cycles-pp-events.patch @@ -0,0 +1,70 @@ +From 517e6341fa123ec3a2f9ea78ad547be910529881 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Sat, 11 Apr 2015 12:16:22 +0200 +Subject: perf/x86/intel: Fix Core2,Atom,NHM,WSM cycles:pp events + +From: Peter Zijlstra + +commit 517e6341fa123ec3a2f9ea78ad547be910529881 upstream. + +Ingo reported that cycles:pp didn't work for him on some machines. + +It turns out that in this commit: + + af4bdcf675cf perf/x86/intel: Disallow flags for most Core2/Atom/Nehalem/Westmere events + +Andi forgot to explicitly allow that event when he +disabled event flags for PEBS on those uarchs. + +Reported-by: Ingo Molnar +Signed-off-by: Peter Zijlstra (Intel) +Cc: Arnaldo Carvalho de Melo +Cc: Jiri Olsa +Cc: Linus Torvalds +Cc: Peter Zijlstra +Fixes: af4bdcf675cf ("perf/x86/intel: Disallow flags for most Core2/Atom/Nehalem/Westmere events") +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kernel/cpu/perf_event_intel_ds.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c ++++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c +@@ -557,6 +557,8 @@ struct event_constraint intel_core2_pebs + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ ++ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ ++ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), + EVENT_CONSTRAINT_END + }; + +@@ -564,6 +566,8 @@ struct event_constraint intel_atom_pebs_ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ ++ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ ++ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), + EVENT_CONSTRAINT_END + }; + +@@ -587,6 +591,8 @@ struct event_constraint intel_nehalem_pe + INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ ++ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). 
*/ ++ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), + EVENT_CONSTRAINT_END + }; + +@@ -602,6 +608,8 @@ struct event_constraint intel_westmere_p + INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ ++ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ ++ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), + EVENT_CONSTRAINT_END + }; + diff --git a/queue-4.0/powerpc-hugetlb-call-mm_dec_nr_pmds-in-hugetlb_free_pmd_range.patch b/queue-4.0/powerpc-hugetlb-call-mm_dec_nr_pmds-in-hugetlb_free_pmd_range.patch new file mode 100644 index 00000000000..1b85f8691dc --- /dev/null +++ b/queue-4.0/powerpc-hugetlb-call-mm_dec_nr_pmds-in-hugetlb_free_pmd_range.patch @@ -0,0 +1,39 @@ +From 50c6a665b383cb5839e45d04e36faeeefaffa052 Mon Sep 17 00:00:00 2001 +From: Scott Wood +Date: Fri, 10 Apr 2015 19:37:34 -0500 +Subject: powerpc/hugetlb: Call mm_dec_nr_pmds() in hugetlb_free_pmd_range() + +From: Scott Wood + +commit 50c6a665b383cb5839e45d04e36faeeefaffa052 upstream. + +Commit dc6c9a35b66b5 ("mm: account pmd page tables to the process") +added a counter that is incremented whenever a PMD is allocated and +decremented whenever a PMD is freed. For hugepages on PPC, common code +is used to allocate PMDs, but arch-specific code is used to free PMDs. + +This results in kernel output such as "BUG: non-zero nr_pmds on freeing +mm: 1" when using hugepages. + +Update the PPC hugepage PMD freeing code to decrement the count, just +as the above commit did for free_pmd_range(). + +Fixes: dc6c9a35b66b5 ("mm: account pmd page tables to the process") +Signed-off-by: Scott Wood +Reviewed-by: Aneesh Kumar K.V +Signed-off-by: Greg Kroah-Hartman + +--- + arch/powerpc/mm/hugetlbpage.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/powerpc/mm/hugetlbpage.c ++++ b/arch/powerpc/mm/hugetlbpage.c +@@ -581,6 +581,7 @@ static void hugetlb_free_pmd_range(struc + pmd = pmd_offset(pud, start); + pud_clear(pud); + pmd_free_tlb(tlb, pmd, start); ++ mm_dec_nr_pmds(tlb->mm); + } + + static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, diff --git a/queue-4.0/sched-idle-x86-optimize-unnecessary-mwait_idle-resched-ipis.patch b/queue-4.0/sched-idle-x86-optimize-unnecessary-mwait_idle-resched-ipis.patch new file mode 100644 index 00000000000..edb656be460 --- /dev/null +++ b/queue-4.0/sched-idle-x86-optimize-unnecessary-mwait_idle-resched-ipis.patch @@ -0,0 +1,72 @@ +From f8e617f4582995f7c25ef25b4167213120ad122b Mon Sep 17 00:00:00 2001 +From: Mike Galbraith +Date: Sat, 18 Jan 2014 17:14:44 +0100 +Subject: sched/idle/x86: Optimize unnecessary mwait_idle() resched IPIs + +From: Mike Galbraith + +commit f8e617f4582995f7c25ef25b4167213120ad122b upstream. + +To fully take advantage of MWAIT, apparently the CLFLUSH instruction needs +another quirk on certain CPUs: proper barriers around it on certain machines. + +On a Q6600 SMP system, pipe-test scheduling performance, cross core, +improves significantly: + + 3.8.13 487.2 KHz 1.000 + 3.13.0-master 415.5 KHz .852 + 3.13.0-master+ 415.2 KHz .852 + restore mwait_idle + 3.13.0-master++ 488.5 KHz 1.002 + restore mwait_idle + IPI fix + +Since X86_BUG_CLFLUSH_MONITOR is already a quirk, don't create a separate +quirk for the extra smp_mb()s. + +Signed-off-by: Mike Galbraith +Cc: Borislav Petkov +Cc: H.
Peter Anvin +Cc: Ian Malone +Cc: Josh Boyer +Cc: Len Brown +Cc: Len Brown +Cc: Linus Torvalds +Cc: Mike Galbraith +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/1390061684.5566.4.camel@marge.simpson.net +[ Ported to recent kernel, added comments about the quirk. ] +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kernel/process.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -429,18 +429,22 @@ static int prefer_mwait_c1_over_halt(con + + static void mwait_idle(void) + { +- if (!need_resched()) { +- if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) ++ if (!current_set_polling_and_test()) { ++ if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { ++ smp_mb(); /* quirk */ + clflush((void *)¤t_thread_info()->flags); ++ smp_mb(); /* quirk */ ++ } + + __monitor((void *)¤t_thread_info()->flags, 0, 0); +- smp_mb(); + if (!need_resched()) + __sti_mwait(0, 0); + else + local_irq_enable(); +- } else ++ } else { + local_irq_enable(); ++ } ++ __current_clr_polling(); + } + + void select_idle_routine(const struct cpuinfo_x86 *c) diff --git a/queue-4.0/sched-idle-x86-restore-mwait_idle-to-fix-boot-hangs-to-improve-power-savings-and-to-improve-performance.patch b/queue-4.0/sched-idle-x86-restore-mwait_idle-to-fix-boot-hangs-to-improve-power-savings-and-to-improve-performance.patch new file mode 100644 index 00000000000..fdfde7a89e1 --- /dev/null +++ b/queue-4.0/sched-idle-x86-restore-mwait_idle-to-fix-boot-hangs-to-improve-power-savings-and-to-improve-performance.patch @@ -0,0 +1,152 @@ +From b253149b843f89cd300cbdbea27ce1f847506f99 Mon Sep 17 00:00:00 2001 +From: Len Brown +Date: Wed, 15 Jan 2014 00:37:34 -0500 +Subject: sched/idle/x86: Restore mwait_idle() to fix boot hangs, to improve power savings and to improve performance + +From: Len Brown + +commit b253149b843f89cd300cbdbea27ce1f847506f99 upstream. + +In Linux-3.9 we removed the mwait_idle() loop: + + 69fb3676df33 ("x86 idle: remove mwait_idle() and "idle=mwait" cmdline param") + +The reasoning was that modern machines should be sufficiently +happy during the boot process using the default_idle() HALT +loop, until cpuidle loads and either acpi_idle or intel_idle +invoke the newer MWAIT-with-hints idle loop. + +But two machines reported problems: + + 1. Certain Core2-era machines support MWAIT-C1 and HALT only. + MWAIT-C1 is preferred for optimal power and performance. + But if they support just C1, cpuidle never loads and + so they use the boot-time default idle loop forever. + + 2. Some laptops will boot-hang if HALT is used, + but will boot successfully if MWAIT is used. + This appears to be a hidden assumption in BIOS SMI, + that is presumably valid on the proprietary OS + where the BIOS was validated. + + https://bugzilla.kernel.org/show_bug.cgi?id=60770 + +So here we effectively revert the patch above, restoring +the mwait_idle() loop. However, we don't bother restoring +the idle=mwait cmdline parameter, since it appears to add +no value. + +Maintainer notes: + + For 3.9, simply revert 69fb3676df + for 3.10, patch -F3 applies, fuzz needed due to __cpuinit use in + context For 3.11, 3.12, 3.13, this patch applies cleanly + +Tested-by: Mike Galbraith +Signed-off-by: Len Brown +Acked-by: Mike Galbraith +Cc: Borislav Petkov +Cc: H. 
Peter Anvin +Cc: Ian Malone +Cc: Josh Boyer +Cc: Linus Torvalds +Cc: Mike Galbraith +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/345254a551eb5a6a866e048d7ab570fd2193aca4.1389763084.git.len.brown@intel.com +[ Ported to recent kernels. ] +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/mwait.h | 8 +++++++ + arch/x86/kernel/process.c | 47 +++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 55 insertions(+) + +--- a/arch/x86/include/asm/mwait.h ++++ b/arch/x86/include/asm/mwait.h +@@ -30,6 +30,14 @@ static inline void __mwait(unsigned long + :: "a" (eax), "c" (ecx)); + } + ++static inline void __sti_mwait(unsigned long eax, unsigned long ecx) ++{ ++ trace_hardirqs_on(); ++ /* "mwait %eax, %ecx;" */ ++ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" ++ :: "a" (eax), "c" (ecx)); ++} ++ + /* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -399,6 +400,49 @@ static void amd_e400_idle(void) + default_idle(); + } + ++/* ++ * Intel Core2 and older machines prefer MWAIT over HALT for C1. ++ * We can't rely on cpuidle installing MWAIT, because it will not load ++ * on systems that support only C1 -- so the boot default must be MWAIT. ++ * ++ * Some AMD machines are the opposite, they depend on using HALT. ++ * ++ * So for default C1, which is used during boot until cpuidle loads, ++ * use MWAIT-C1 on Intel HW that has it, else use HALT. ++ */ ++static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) ++{ ++ if (c->x86_vendor != X86_VENDOR_INTEL) ++ return 0; ++ ++ if (!cpu_has(c, X86_FEATURE_MWAIT)) ++ return 0; ++ ++ return 1; ++} ++ ++/* ++ * MONITOR/MWAIT with no hints, used for default default C1 state. ++ * This invokes MWAIT with interrutps enabled and no flags, ++ * which is backwards compatible with the original MWAIT implementation. 
++ */ ++ ++static void mwait_idle(void) ++{ ++ if (!need_resched()) { ++ if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) ++ clflush((void *)¤t_thread_info()->flags); ++ ++ __monitor((void *)¤t_thread_info()->flags, 0, 0); ++ smp_mb(); ++ if (!need_resched()) ++ __sti_mwait(0, 0); ++ else ++ local_irq_enable(); ++ } else ++ local_irq_enable(); ++} ++ + void select_idle_routine(const struct cpuinfo_x86 *c) + { + #ifdef CONFIG_SMP +@@ -412,6 +456,9 @@ void select_idle_routine(const struct cp + /* E400: APIC timer interrupt does not wake up CPU from C1e */ + pr_info("using AMD E400 aware idle routine\n"); + x86_idle = amd_e400_idle; ++ } else if (prefer_mwait_c1_over_halt(c)) { ++ pr_info("using mwait in idle threads\n"); ++ x86_idle = mwait_idle; + } else + x86_idle = default_idle; + } diff --git a/queue-4.0/series b/queue-4.0/series index c9bbcbca25e..dd6e43010d6 100644 --- a/queue-4.0/series +++ b/queue-4.0/series @@ -8,3 +8,18 @@ net-fix-crash-in-build_skb.patch pxa168-fix-double-deallocation-of-managed-resources.patch net-rfs-fix-crash-in-get_rps_cpus.patch net-mlx4_en-prevent-setting-invalid-rss-hash-function.patch +md-fix-md-io-stats-accounting-broken.patch +x86-asm-decoder-fix-and-enforce-max-instruction-size-in-the-insn-decoder.patch +x86-kvm-revert-remove-sched-notifier-for-cross-cpu-migrations.patch +x86-vdso-fix-pvclock-races-with-task-migration.patch +sched-idle-x86-restore-mwait_idle-to-fix-boot-hangs-to-improve-power-savings-and-to-improve-performance.patch +sched-idle-x86-optimize-unnecessary-mwait_idle-resched-ipis.patch +perf-x86-intel-fix-core2-atom-nhm-wsm-cycles-pp-events.patch +x86-fix-special-__probe_kernel_write-tail-zeroing-case.patch +kvm-x86-fix-msr_ia32_bndcfgs-in-msrs_to_save.patch +btrfs-fix-log-tree-corruption-when-fs-mounted-with-o-discard.patch +btrfs-don-t-accept-bare-namespace-as-a-valid-xattr.patch +btrfs-fix-inode-eviction-infinite-loop-after-cloning-into-it.patch +btrfs-fix-inode-eviction-infinite-loop-after-extent_same-ioctl.patch +mm-hugetlb-use-pmd_page-in-follow_huge_pmd.patch +powerpc-hugetlb-call-mm_dec_nr_pmds-in-hugetlb_free_pmd_range.patch diff --git a/queue-4.0/x86-asm-decoder-fix-and-enforce-max-instruction-size-in-the-insn-decoder.patch b/queue-4.0/x86-asm-decoder-fix-and-enforce-max-instruction-size-in-the-insn-decoder.patch new file mode 100644 index 00000000000..1c1fc1cc94e --- /dev/null +++ b/queue-4.0/x86-asm-decoder-fix-and-enforce-max-instruction-size-in-the-insn-decoder.patch @@ -0,0 +1,62 @@ +From 91e5ed49fca09c2b83b262b9757d1376ee2b46c3 Mon Sep 17 00:00:00 2001 +From: Andy Lutomirski +Date: Tue, 27 Jan 2015 16:06:02 -0800 +Subject: x86/asm/decoder: Fix and enforce max instruction size in the insn decoder + +From: Andy Lutomirski + +commit 91e5ed49fca09c2b83b262b9757d1376ee2b46c3 upstream. + +x86 instructions cannot exceed 15 bytes, and the instruction +decoder should enforce that. Prior to 6ba48ff46f76, the +instruction length limit was implicitly set to 16, which was an +approximation of 15, but there is currently no limit at all. + +Fix MAX_INSN_SIZE (it should be 15, not 16), and fix the decoder +to reject instructions that exceed MAX_INSN_SIZE. + +Other than potentially confusing some of the decoder sanity +checks, I'm not aware of any actual problems that omitting this +check would cause, nor am I aware of any practical problems +caused by the MAX_INSN_SIZE error. + +Signed-off-by: Andy Lutomirski +Acked-by: Masami Hiramatsu +Cc: Dave Hansen +Fixes: 6ba48ff46f76 ("x86: Remove arbitrary instruction size limit ... 
+Link: http://lkml.kernel.org/r/f8f0bc9b8c58cfd6830f7d88400bf1396cbdcd0f.1422403511.git.luto@amacapital.net +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/insn.h | 2 +- + arch/x86/lib/insn.c | 7 +++++++ + 2 files changed, 8 insertions(+), 1 deletion(-) + +--- a/arch/x86/include/asm/insn.h ++++ b/arch/x86/include/asm/insn.h +@@ -69,7 +69,7 @@ struct insn { + const insn_byte_t *next_byte; + }; + +-#define MAX_INSN_SIZE 16 ++#define MAX_INSN_SIZE 15 + + #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) + #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) +--- a/arch/x86/lib/insn.c ++++ b/arch/x86/lib/insn.c +@@ -52,6 +52,13 @@ + */ + void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) + { ++ /* ++ * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid ++ * even if the input buffer is long enough to hold them. ++ */ ++ if (buf_len > MAX_INSN_SIZE) ++ buf_len = MAX_INSN_SIZE; ++ + memset(insn, 0, sizeof(*insn)); + insn->kaddr = kaddr; + insn->end_kaddr = kaddr + buf_len; diff --git a/queue-4.0/x86-fix-special-__probe_kernel_write-tail-zeroing-case.patch b/queue-4.0/x86-fix-special-__probe_kernel_write-tail-zeroing-case.patch new file mode 100644 index 00000000000..33e7f021d66 --- /dev/null +++ b/queue-4.0/x86-fix-special-__probe_kernel_write-tail-zeroing-case.patch @@ -0,0 +1,58 @@ +From d869844bd081081bf537e806a44811884230643e Mon Sep 17 00:00:00 2001 +From: Linus Torvalds +Date: Thu, 23 Apr 2015 08:33:59 -0700 +Subject: x86: fix special __probe_kernel_write() tail zeroing case + +From: Linus Torvalds + +commit d869844bd081081bf537e806a44811884230643e upstream. + +Commit cae2a173fe94 ("x86: clean up/fix 'copy_in_user()' tail zeroing") +fixed the failure case tail zeroing of one special case of the x86-64 +generic user-copy routine, namely when used for the user-to-user case +("copy_in_user()"). + +But in the process it broke an even more unusual case: using the user +copy routine for kernel-to-kernel copying. + +Now, normally kernel-kernel copies are obviously done using memcpy(), +but we have a couple of special cases when we use the user-copy +functions. One is when we pass a kernel buffer to a regular user-buffer +routine, using set_fs(KERNEL_DS). That's a "normal" case, and continued +to work fine, because it never takes any faults (with the possible +exception of a silent and successful vmalloc fault). + +But Jan Beulich pointed out another, very unusual, special case: when we +use the user-copy routines not because it's a path that expects a user +pointer, but for a couple of ftrace/kgdb cases that want to do a kernel +copy, but do so using "unsafe" buffers, and use the user-copy routine to +gracefully handle faults. IOW, for probe_kernel_write(). + +And that broke for the case of a faulting kernel destination, because we +saw the kernel destination and wanted to try to clear the tail of the +buffer. Which doesn't work, since that's what faults. + +This only triggers for things like kgdb and ftrace users (eg trying to +set a breakpoint on read-only memory), but it's definitely a bug. +The fix is to not compare against the kernel address start (TASK_SIZE), +but instead use the same limits "access_ok()" uses.
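+ +For orientation: __addr_ok() compares the pointer against the current +thread's addr_limit rather than the fixed TASK_SIZE_MAX, so under +set_fs(KERNEL_DS) -- which probe_kernel_write() uses -- a faulting kernel +destination no longer looks like a user buffer whose tail must be cleared. +Roughly (paraphrased from the x86 uaccess headers of this era, not part of +this patch): + + #define __addr_ok(addr) \ + ((unsigned long __force)(addr) < \ + (current_thread_info()->addr_limit.seg))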
+ +Reported-and-tested-by: Jan Beulich +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/lib/usercopy_64.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/lib/usercopy_64.c ++++ b/arch/x86/lib/usercopy_64.c +@@ -82,7 +82,7 @@ copy_user_handle_tail(char *to, char *fr + clac(); + + /* If the destination is a kernel buffer, we always clear the end */ +- if ((unsigned long)to >= TASK_SIZE_MAX) ++ if (!__addr_ok(to)) + memset(to, 0, len); + return len; + } diff --git a/queue-4.0/x86-kvm-revert-remove-sched-notifier-for-cross-cpu-migrations.patch b/queue-4.0/x86-kvm-revert-remove-sched-notifier-for-cross-cpu-migrations.patch new file mode 100644 index 00000000000..33e875407b1 --- /dev/null +++ b/queue-4.0/x86-kvm-revert-remove-sched-notifier-for-cross-cpu-migrations.patch @@ -0,0 +1,205 @@ +From 0a4e6be9ca17c54817cf814b4b5aa60478c6df27 Mon Sep 17 00:00:00 2001 +From: Marcelo Tosatti +Date: Mon, 23 Mar 2015 20:21:51 -0300 +Subject: x86: kvm: Revert "remove sched notifier for cross-cpu migrations" + +From: Marcelo Tosatti + +commit 0a4e6be9ca17c54817cf814b4b5aa60478c6df27 upstream. + +The following point: + + 2. per-CPU pvclock time info is updated if the + underlying CPU changes. + +Is not true anymore since "KVM: x86: update pvclock area conditionally, +on cpu migration". + +Add task migration notification back. + +Problem noticed by Andy Lutomirski. + +Signed-off-by: Marcelo Tosatti +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/pvclock.h | 1 + arch/x86/kernel/pvclock.c | 44 +++++++++++++++++++++++++++++++++++++++++ + arch/x86/vdso/vclock_gettime.c | 16 +++++++------- + include/linux/sched.h | 8 +++++++ + kernel/sched/core.c | 15 +++++++++++++ + 5 files changed, 76 insertions(+), 8 deletions(-) + +--- a/arch/x86/include/asm/pvclock.h ++++ b/arch/x86/include/asm/pvclock.h +@@ -95,6 +95,7 @@ unsigned __pvclock_read_cycles(const str + + struct pvclock_vsyscall_time_info { + struct pvclock_vcpu_time_info pvti; ++ u32 migrate_count; + } __attribute__((__aligned__(SMP_CACHE_BYTES))); + + #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) +--- a/arch/x86/kernel/pvclock.c ++++ b/arch/x86/kernel/pvclock.c +@@ -141,7 +141,46 @@ void pvclock_read_wallclock(struct pvclo + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); + } + ++static struct pvclock_vsyscall_time_info *pvclock_vdso_info; ++ ++static struct pvclock_vsyscall_time_info * ++pvclock_get_vsyscall_user_time_info(int cpu) ++{ ++ if (!pvclock_vdso_info) { ++ BUG(); ++ return NULL; ++ } ++ ++ return &pvclock_vdso_info[cpu]; ++} ++ ++struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) ++{ ++ return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; ++} ++ + #ifdef CONFIG_X86_64 ++static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, ++ void *v) ++{ ++ struct task_migration_notifier *mn = v; ++ struct pvclock_vsyscall_time_info *pvti; ++ ++ pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); ++ ++ /* this is NULL when pvclock vsyscall is not initialized */ ++ if (unlikely(pvti == NULL)) ++ return NOTIFY_DONE; ++ ++ pvti->migrate_count++; ++ ++ return NOTIFY_DONE; ++} ++ ++static struct notifier_block pvclock_migrate = { ++ .notifier_call = pvclock_task_migrate, ++}; ++ + /* + * Initialize the generic pvclock vsyscall state. 
This will allocate + * a/some page(s) for the per-vcpu pvclock information, set up a +@@ -155,12 +194,17 @@ int __init pvclock_init_vsyscall(struct + + WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); + ++ pvclock_vdso_info = i; ++ + for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { + __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, + __pa(i) + (idx*PAGE_SIZE), + PAGE_KERNEL_VVAR); + } + ++ ++ register_task_migration_notifier(&pvclock_migrate); ++ + return 0; + } + #endif +--- a/arch/x86/vdso/vclock_gettime.c ++++ b/arch/x86/vdso/vclock_gettime.c +@@ -82,18 +82,15 @@ static notrace cycle_t vread_pvclock(int + cycle_t ret; + u64 last; + u32 version; ++ u32 migrate_count; + u8 flags; + unsigned cpu, cpu1; + + + /* +- * Note: hypervisor must guarantee that: +- * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. +- * 2. that per-CPU pvclock time info is updated if the +- * underlying CPU changes. +- * 3. that version is increased whenever underlying CPU +- * changes. +- * ++ * When looping to get a consistent (time-info, tsc) pair, we ++ * also need to deal with the possibility we can switch vcpus, ++ * so make sure we always re-fetch time-info for the current vcpu. + */ + do { + cpu = __getcpu() & VGETCPU_CPU_MASK; +@@ -104,6 +101,8 @@ static notrace cycle_t vread_pvclock(int + + pvti = get_pvti(cpu); + ++ migrate_count = pvti->migrate_count; ++ + version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); + + /* +@@ -115,7 +114,8 @@ static notrace cycle_t vread_pvclock(int + cpu1 = __getcpu() & VGETCPU_CPU_MASK; + } while (unlikely(cpu != cpu1 || + (pvti->pvti.version & 1) || +- pvti->pvti.version != version)); ++ pvti->pvti.version != version || ++ pvti->migrate_count != migrate_count)); + + if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) + *mode = VCLOCK_NONE; +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -176,6 +176,14 @@ extern void get_iowait_load(unsigned lon + extern void calc_global_load(unsigned long ticks); + extern void update_cpu_load_nohz(void); + ++/* Notifier for when a task gets migrated to a new CPU */ ++struct task_migration_notifier { ++ struct task_struct *task; ++ int from_cpu; ++ int to_cpu; ++}; ++extern void register_task_migration_notifier(struct notifier_block *n); ++ + extern unsigned long get_parent_ip(unsigned long addr); + + extern void dump_cpu_task(int cpu); +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -996,6 +996,13 @@ void check_preempt_curr(struct rq *rq, s + rq_clock_skip_update(rq, true); + } + ++static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); ++ ++void register_task_migration_notifier(struct notifier_block *n) ++{ ++ atomic_notifier_chain_register(&task_migration_notifier, n); ++} ++ + #ifdef CONFIG_SMP + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) + { +@@ -1026,10 +1033,18 @@ void set_task_cpu(struct task_struct *p, + trace_sched_migrate_task(p, new_cpu); + + if (task_cpu(p) != new_cpu) { ++ struct task_migration_notifier tmn; ++ + if (p->sched_class->migrate_task_rq) + p->sched_class->migrate_task_rq(p, new_cpu); + p->se.nr_migrations++; + perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); ++ ++ tmn.task = p; ++ tmn.from_cpu = task_cpu(p); ++ tmn.to_cpu = new_cpu; ++ ++ atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); + } + + __set_task_cpu(p, new_cpu); diff --git a/queue-4.0/x86-vdso-fix-pvclock-races-with-task-migration.patch b/queue-4.0/x86-vdso-fix-pvclock-races-with-task-migration.patch new file mode 100644 index 00000000000..ca54c5f0172 --- 
/dev/null +++ b/queue-4.0/x86-vdso-fix-pvclock-races-with-task-migration.patch @@ -0,0 +1,65 @@ +From 80f7fdb1c7f0f9266421f823964fd1962681f6ce Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= +Date: Thu, 2 Apr 2015 20:44:23 +0200 +Subject: x86: vdso: fix pvclock races with task migration +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= + +commit 80f7fdb1c7f0f9266421f823964fd1962681f6ce upstream. + +If we were migrated right after __getcpu, but before reading the +migration_count, we wouldn't notice that we read TSC of a different +VCPU, nor that KVM's bug made pvti invalid, as only migration_count +on source VCPU is increased. + +Change vdso instead of updating migration_count on destination. + +Signed-off-by: Radim Krčmář +Fixes: 0a4e6be9ca17 ("x86: kvm: Revert "remove sched notifier for cross-cpu migrations"") +Message-Id: <1428000263-11892-1-git-send-email-rkrcmar@redhat.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/vdso/vclock_gettime.c | 20 ++++++++++++-------- + 1 file changed, 12 insertions(+), 8 deletions(-) + +--- a/arch/x86/vdso/vclock_gettime.c ++++ b/arch/x86/vdso/vclock_gettime.c +@@ -99,21 +99,25 @@ static notrace cycle_t vread_pvclock(int + * __getcpu() calls (Gleb). + */ + +- pvti = get_pvti(cpu); ++ /* Make sure migrate_count will change if we leave the VCPU. */ ++ do { ++ pvti = get_pvti(cpu); ++ migrate_count = pvti->migrate_count; + +- migrate_count = pvti->migrate_count; ++ cpu1 = cpu; ++ cpu = __getcpu() & VGETCPU_CPU_MASK; ++ } while (unlikely(cpu != cpu1)); + + version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); + + /* + * Test we're still on the cpu as well as the version. +- * We could have been migrated just after the first +- * vgetcpu but before fetching the version, so we +- * wouldn't notice a version change. ++ * - We must read TSC of pvti's VCPU. ++ * - KVM doesn't follow the versioning protocol, so data could ++ * change before version if we left the VCPU. + */ +- cpu1 = __getcpu() & VGETCPU_CPU_MASK; +- } while (unlikely(cpu != cpu1 || +- (pvti->pvti.version & 1) || ++ smp_rmb(); ++ } while (unlikely((pvti->pvti.version & 1) || + pvti->pvti.version != version || + pvti->migrate_count != migrate_count)); + -- 2.47.2