From 213b231e80e06828dbb9f38c92ac61d65b9976ee Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Mon, 18 Sep 2017 08:57:40 +0200
Subject: [PATCH] 4.13-stable patches

added patches:
	fuse-allow-server-to-run-in-different-pid_ns.patch
	idr-remove-warn_on_once-when-trying-to-replace-negative-id.patch
	ovl-fix-false-positive-estale-on-lookup.patch
	x86-fsgsbase-64-fully-initialize-fs-and-gs-state-in-start_thread_common.patch
	x86-fsgsbase-64-report-fsbase-and-gsbase-correctly-in-core-dumps.patch
	x86-mm-mm-hwpoison-clear-present-bit-for-kernel-1-1-mappings-of-poison-pages.patch
	x86-switch_to-64-rewrite-fs-gs-switching-yet-again-to-fix-amd-cpus.patch
---
 ...ow-server-to-run-in-different-pid_ns.patch |  95 ++++++
 ...e-when-trying-to-replace-negative-id.patch |  96 ++++++
 ...-fix-false-positive-estale-on-lookup.patch |  54 ++++
 queue-4.13/series                             |   7 +
 ...-and-gs-state-in-start_thread_common.patch |  58 ++++
 ...e-and-gsbase-correctly-in-core-dumps.patch |  56 ++++
 ...-kernel-1-1-mappings-of-poison-pages.patch | 150 +++++++++
 ...-switching-yet-again-to-fix-amd-cpus.patch | 297 ++++++++++++++++++
 8 files changed, 813 insertions(+)
 create mode 100644 queue-4.13/fuse-allow-server-to-run-in-different-pid_ns.patch
 create mode 100644 queue-4.13/idr-remove-warn_on_once-when-trying-to-replace-negative-id.patch
 create mode 100644 queue-4.13/ovl-fix-false-positive-estale-on-lookup.patch
 create mode 100644 queue-4.13/x86-fsgsbase-64-fully-initialize-fs-and-gs-state-in-start_thread_common.patch
 create mode 100644 queue-4.13/x86-fsgsbase-64-report-fsbase-and-gsbase-correctly-in-core-dumps.patch
 create mode 100644 queue-4.13/x86-mm-mm-hwpoison-clear-present-bit-for-kernel-1-1-mappings-of-poison-pages.patch
 create mode 100644 queue-4.13/x86-switch_to-64-rewrite-fs-gs-switching-yet-again-to-fix-amd-cpus.patch

diff --git a/queue-4.13/fuse-allow-server-to-run-in-different-pid_ns.patch b/queue-4.13/fuse-allow-server-to-run-in-different-pid_ns.patch
new file mode 100644
index 00000000000..b63c5b79c92
--- /dev/null
+++ b/queue-4.13/fuse-allow-server-to-run-in-different-pid_ns.patch
@@ -0,0 +1,95 @@
+From 5d6d3a301c4e749e04be6fcdcf4cb1ffa8bae524 Mon Sep 17 00:00:00 2001
+From: Miklos Szeredi
+Date: Tue, 12 Sep 2017 16:57:53 +0200
+Subject: fuse: allow server to run in different pid_ns
+
+From: Miklos Szeredi
+
+commit 5d6d3a301c4e749e04be6fcdcf4cb1ffa8bae524 upstream.
+
+Commit 0b6e9ea041e6 ("fuse: Add support for pid namespaces") broke
+Sandstorm.io development tools, which have been sending FUSE file
+descriptors across PID namespace boundaries since early 2014.
+
+The above patch added a check that prevented I/O on the fuse device file
+descriptor if the pid namespace of the reader/writer was different from the
+pid namespace of the mounter. With this change passing the device file
+descriptor to a different pid namespace simply doesn't work. The check was
+added because pids are transferred to/from the fuse userspace server in the
+namespace registered at mount time.
+
+To fix this regression, remove the checks and do the following:
+
+1) the pid in the request header (the pid of the task that initiated the
+filesystem operation) is translated to the reader's pid namespace. If a
+mapping doesn't exist for this pid, then a zero pid is used. Note: even if
+a mapping would exist between the initiator task's pid namespace and the
+reader's pid namespace the pid will be zero if either mapping from
+initiator's to mounter's namespace or mapping from mounter's to reader's
+namespace doesn't exist.
+
+2) The lk.pid value in setlk/setlkw requests and getlk reply is left alone.
+Userspace should not interpret this value anyway. Also allow the
+setlk/setlkw operations if the pid of the task cannot be represented in the
+mounter's namespace (pid being zero in that case).
+
+Reported-by: Kenton Varda
+Signed-off-by: Miklos Szeredi
+Fixes: 0b6e9ea041e6 ("fuse: Add support for pid namespaces")
+Cc: Eric W. Biederman
+Cc: Seth Forshee
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/fuse/dev.c  |   13 +++++++------
+ fs/fuse/file.c |    3 ---
+ 2 files changed, 7 insertions(+), 9 deletions(-)
+
+--- a/fs/fuse/dev.c
++++ b/fs/fuse/dev.c
+@@ -1222,9 +1222,6 @@ static ssize_t fuse_dev_do_read(struct f
+ 	struct fuse_in *in;
+ 	unsigned reqsize;
+ 
+-	if (task_active_pid_ns(current) != fc->pid_ns)
+-		return -EIO;
+-
+ restart:
+ 	spin_lock(&fiq->waitq.lock);
+ 	err = -EAGAIN;
+@@ -1262,6 +1259,13 @@ static ssize_t fuse_dev_do_read(struct f
+ 
+ 	in = &req->in;
+ 	reqsize = in->h.len;
++
++	if (task_active_pid_ns(current) != fc->pid_ns) {
++		rcu_read_lock();
++		in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns));
++		rcu_read_unlock();
++	}
++
+ 	/* If request is too large, reply with an error and restart the read */
+ 	if (nbytes < reqsize) {
+ 		req->out.h.error = -EIO;
+@@ -1823,9 +1827,6 @@ static ssize_t fuse_dev_do_write(struct
+ 	struct fuse_req *req;
+ 	struct fuse_out_header oh;
+ 
+-	if (task_active_pid_ns(current) != fc->pid_ns)
+-		return -EIO;
+-
+ 	if (nbytes < sizeof(struct fuse_out_header))
+ 		return -EINVAL;
+ 
+--- a/fs/fuse/file.c
++++ b/fs/fuse/file.c
+@@ -2181,9 +2181,6 @@ static int fuse_setlk(struct file *file,
+ 	if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
+ 		return 0;
+ 
+-	if (pid && pid_nr == 0)
+-		return -EOVERFLOW;
+-
+ 	fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
+ 	err = fuse_simple_request(fc, &args);
+ 
diff --git a/queue-4.13/idr-remove-warn_on_once-when-trying-to-replace-negative-id.patch b/queue-4.13/idr-remove-warn_on_once-when-trying-to-replace-negative-id.patch
new file mode 100644
index 00000000000..de0a28764b7
--- /dev/null
+++ b/queue-4.13/idr-remove-warn_on_once-when-trying-to-replace-negative-id.patch
@@ -0,0 +1,96 @@
+From a47f68d6a944113bdc8097db6f933c2e17c27bf9 Mon Sep 17 00:00:00 2001
+From: Eric Biggers
+Date: Wed, 13 Sep 2017 16:28:11 -0700
+Subject: idr: remove WARN_ON_ONCE() when trying to replace negative ID
+
+From: Eric Biggers
+
+commit a47f68d6a944113bdc8097db6f933c2e17c27bf9 upstream.
+
+IDR only supports non-negative IDs. There used to be a 'WARN_ON_ONCE(id <
+0)' in idr_replace(), but it was intentionally removed by commit
+2e1c9b286765 ("idr: remove WARN_ON_ONCE() on negative IDs").
+
+Then it was added back by commit 0a835c4f090a ("Reimplement IDR and IDA
+using the radix tree"). However it seems that adding it back was a
+mistake, given that some users such as drm_gem_handle_delete()
+(DRM_IOCTL_GEM_CLOSE) pass in a value from userspace to idr_replace(),
+allowing the WARN_ON_ONCE to be triggered. drm_gem_handle_delete()
+actually just wants idr_replace() to return an error code if the ID is
+not allocated, including in the case where the ID is invalid (negative).
+
+So once again remove the bogus WARN_ON_ONCE().
+
+This bug was found by syzkaller, which encountered the following
+warning:
+
+	WARNING: CPU: 3 PID: 3008 at lib/idr.c:157 idr_replace+0x1d8/0x240 lib/idr.c:157
+	Kernel panic - not syncing: panic_on_warn set ...
+
+	CPU: 3 PID: 3008 Comm: syzkaller218828 Not tainted 4.13.0-rc4-next-20170811 #2
+	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
+	Call Trace:
+	 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:190
+	 do_trap_no_signal arch/x86/kernel/traps.c:224 [inline]
+	 do_trap+0x260/0x390 arch/x86/kernel/traps.c:273
+	 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:310
+	 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:323
+	 invalid_op+0x1e/0x30 arch/x86/entry/entry_64.S:930
+	RIP: 0010:idr_replace+0x1d8/0x240 lib/idr.c:157
+	RSP: 0018:ffff8800394bf9f8 EFLAGS: 00010297
+	RAX: ffff88003c6c60c0 RBX: 1ffff10007297f43 RCX: 0000000000000000
+	RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8800394bfa78
+	RBP: ffff8800394bfae0 R08: ffffffff82856487 R09: 0000000000000000
+	R10: ffff8800394bf9a8 R11: ffff88006c8bae28 R12: ffffffffffffffff
+	R13: ffff8800394bfab8 R14: dffffc0000000000 R15: ffff8800394bfbc8
+	 drm_gem_handle_delete+0x33/0xa0 drivers/gpu/drm/drm_gem.c:297
+	 drm_gem_close_ioctl+0xa1/0xe0 drivers/gpu/drm/drm_gem.c:671
+	 drm_ioctl_kernel+0x1e7/0x2e0 drivers/gpu/drm/drm_ioctl.c:729
+	 drm_ioctl+0x72e/0xa50 drivers/gpu/drm/drm_ioctl.c:825
+	 vfs_ioctl fs/ioctl.c:45 [inline]
+	 do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685
+	 SYSC_ioctl fs/ioctl.c:700 [inline]
+	 SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
+	 entry_SYSCALL_64_fastpath+0x1f/0xbe
+
+Here is a C reproducer:
+
+	#include <fcntl.h>
+	#include <stddef.h>
+	#include <stdint.h>
+	#include <sys/ioctl.h>
+	#include <drm/drm.h>
+
+	int main(void)
+	{
+		int cardfd = open("/dev/dri/card0", O_RDONLY);
+
+		ioctl(cardfd, DRM_IOCTL_GEM_CLOSE,
+		      &(struct drm_gem_close) { .handle = -1 } );
+	}
+
+Link: http://lkml.kernel.org/r/20170906235306.20534-1-ebiggers3@gmail.com
+Fixes: 0a835c4f090a ("Reimplement IDR and IDA using the radix tree")
+Signed-off-by: Eric Biggers
+Acked-by: Tejun Heo
+Cc: Dmitry Vyukov
+Cc: Matthew Wilcox
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ lib/idr.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/lib/idr.c
++++ b/lib/idr.c
+@@ -154,7 +154,7 @@ void *idr_replace(struct idr *idr, void
+ 	void __rcu **slot = NULL;
+ 	void *entry;
+ 
+-	if (WARN_ON_ONCE(id < 0))
++	if (id < 0)
+ 		return ERR_PTR(-EINVAL);
+ 	if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
+ 		return ERR_PTR(-EINVAL);
diff --git a/queue-4.13/ovl-fix-false-positive-estale-on-lookup.patch b/queue-4.13/ovl-fix-false-positive-estale-on-lookup.patch
new file mode 100644
index 00000000000..b5583aebdd4
--- /dev/null
+++ b/queue-4.13/ovl-fix-false-positive-estale-on-lookup.patch
@@ -0,0 +1,54 @@
+From 939ae4efd51c627da270af74ef069db5124cb5b0 Mon Sep 17 00:00:00 2001
+From: Amir Goldstein
+Date: Mon, 11 Sep 2017 16:30:15 +0300
+Subject: ovl: fix false positive ESTALE on lookup
+
+From: Amir Goldstein
+
+commit 939ae4efd51c627da270af74ef069db5124cb5b0 upstream.
+
+Commit b9ac5c274b8c ("ovl: hash overlay non-dir inodes by copy up origin")
+verifies that the origin lower inode stored in the overlayfs inode matched
+the inode of a copy up origin dentry found by lookup.
+
+There is a false positive result in that check when lower fs does not
+support file handles and copy up origin cannot be followed by file handle
+at lookup time.
+
+The false negative happens when finding an overlay inode in cache on a
+copied up overlay dentry lookup. The overlay inode still 'remembers' the
+copy up origin inode, but the copy up origin dentry is not available for
+verification.
+
+Relax the check in case copy up origin dentry is not available.
+
+Fixes: b9ac5c274b8c ("ovl: hash overlay non-dir inodes by copy up...")
+Reported-by: Jordi Pujol
+Signed-off-by: Amir Goldstein
+Signed-off-by: Miklos Szeredi
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/overlayfs/inode.c |   11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+--- a/fs/overlayfs/inode.c
++++ b/fs/overlayfs/inode.c
+@@ -576,10 +576,13 @@ static int ovl_inode_set(struct inode *i
+ static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
+ 			     struct dentry *upperdentry)
+ {
+-	struct inode *lowerinode = lowerdentry ? d_inode(lowerdentry) : NULL;
+-
+-	/* Lower (origin) inode must match, even if NULL */
+-	if (ovl_inode_lower(inode) != lowerinode)
++	/*
++	 * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL.
++	 * This happens when finding a copied up overlay inode for a renamed
++	 * or hardlinked overlay dentry and lower dentry cannot be followed
++	 * by origin because lower fs does not support file handles.
++	 */
++	if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry))
+ 		return false;
+ 
+ 	/*
diff --git a/queue-4.13/series b/queue-4.13/series
index 1f46a1b0109..1c93d6a5d32 100644
--- a/queue-4.13/series
+++ b/queue-4.13/series
@@ -14,3 +14,10 @@ f2fs-check-hot_data-for-roll-forward-recovery.patch
 thunderbolt-remove-superfluous-check.patch
 thunderbolt-make-key-root-only-accessible.patch
 thunderbolt-allow-clearing-the-key.patch
+x86-fsgsbase-64-fully-initialize-fs-and-gs-state-in-start_thread_common.patch
+x86-fsgsbase-64-report-fsbase-and-gsbase-correctly-in-core-dumps.patch
+x86-switch_to-64-rewrite-fs-gs-switching-yet-again-to-fix-amd-cpus.patch
+x86-mm-mm-hwpoison-clear-present-bit-for-kernel-1-1-mappings-of-poison-pages.patch
+ovl-fix-false-positive-estale-on-lookup.patch
+fuse-allow-server-to-run-in-different-pid_ns.patch
+idr-remove-warn_on_once-when-trying-to-replace-negative-id.patch
diff --git a/queue-4.13/x86-fsgsbase-64-fully-initialize-fs-and-gs-state-in-start_thread_common.patch b/queue-4.13/x86-fsgsbase-64-fully-initialize-fs-and-gs-state-in-start_thread_common.patch
new file mode 100644
index 00000000000..0493522bc3f
--- /dev/null
+++ b/queue-4.13/x86-fsgsbase-64-fully-initialize-fs-and-gs-state-in-start_thread_common.patch
@@ -0,0 +1,58 @@
+From 767d035d838f4fd6b5a5bbd7a3f6d293b7f65a49 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski
+Date: Tue, 1 Aug 2017 07:11:34 -0700
+Subject: x86/fsgsbase/64: Fully initialize FS and GS state in start_thread_common
+
+From: Andy Lutomirski
+
+commit 767d035d838f4fd6b5a5bbd7a3f6d293b7f65a49 upstream.
+
+execve used to leak FSBASE and GSBASE on AMD CPUs. Fix it.
+
+The security impact of this bug is small but not quite zero -- it
+could weaken ASLR when a privileged task execs a less privileged
+program, but only if program changed bitness across the exec, or the
+child binary was highly unusual or actively malicious. A child
+program that was compromised after the exec would not have access to
+the leaked base.
+
+Signed-off-by: Andy Lutomirski
+Cc: Borislav Petkov
+Cc: Borislav Petkov
+Cc: Brian Gerst
+Cc: Chang Seok
+Cc: Denys Vlasenko
+Cc: H. Peter Anvin
+Cc: Josh Poimboeuf
+Cc: Linus Torvalds
+Cc: Peter Zijlstra
+Cc: Thomas Gleixner
+Signed-off-by: Ingo Molnar
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/x86/kernel/process_64.c |    9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/arch/x86/kernel/process_64.c
++++ b/arch/x86/kernel/process_64.c
+@@ -229,10 +229,19 @@ start_thread_common(struct pt_regs *regs
+ 		    unsigned long new_sp,
+ 		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
+ {
++	WARN_ON_ONCE(regs != current_pt_regs());
++
++	if (static_cpu_has(X86_BUG_NULL_SEG)) {
++		/* Loading zero below won't clear the base. */
++		loadsegment(fs, __USER_DS);
++		load_gs_index(__USER_DS);
++	}
++
+ 	loadsegment(fs, 0);
+ 	loadsegment(es, _ds);
+ 	loadsegment(ds, _ds);
+ 	load_gs_index(0);
++
+ 	regs->ip		= new_ip;
+ 	regs->sp		= new_sp;
+ 	regs->cs		= _cs;
diff --git a/queue-4.13/x86-fsgsbase-64-report-fsbase-and-gsbase-correctly-in-core-dumps.patch b/queue-4.13/x86-fsgsbase-64-report-fsbase-and-gsbase-correctly-in-core-dumps.patch
new file mode 100644
index 00000000000..95e32d8f9ca
--- /dev/null
+++ b/queue-4.13/x86-fsgsbase-64-report-fsbase-and-gsbase-correctly-in-core-dumps.patch
@@ -0,0 +1,56 @@
+From 9584d98bed7a7a904d0702ad06bbcc94703cb5b4 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski
+Date: Tue, 1 Aug 2017 07:11:35 -0700
+Subject: x86/fsgsbase/64: Report FSBASE and GSBASE correctly in core dumps
+
+From: Andy Lutomirski
+
+commit 9584d98bed7a7a904d0702ad06bbcc94703cb5b4 upstream.
+
+In ELF_CORE_COPY_REGS, we're copying from the current task, so
+accessing thread.fsbase and thread.gsbase makes no sense. Just read
+the values from the CPU registers.
+
+In practice, the old code would have been correct most of the time
+simply because thread.fsbase and thread.gsbase usually matched the
+CPU registers.
+
+Signed-off-by: Andy Lutomirski
+Cc: Borislav Petkov
+Cc: Borislav Petkov
+Cc: Brian Gerst
+Cc: Chang Seok
+Cc: Denys Vlasenko
+Cc: H. Peter Anvin
+Cc: Josh Poimboeuf
+Cc: Linus Torvalds
+Cc: Peter Zijlstra
+Cc: Thomas Gleixner
+Signed-off-by: Ingo Molnar
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/x86/include/asm/elf.h |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/include/asm/elf.h
++++ b/arch/x86/include/asm/elf.h
+@@ -204,6 +204,7 @@ void set_personality_ia32(bool);
+ 
+ #define ELF_CORE_COPY_REGS(pr_reg, regs) \
+ do { \
++	unsigned long base; \
+ 	unsigned v; \
+ 	(pr_reg)[0] = (regs)->r15; \
+ 	(pr_reg)[1] = (regs)->r14; \
+@@ -226,8 +227,8 @@ do { \
+ 	(pr_reg)[18] = (regs)->flags; \
+ 	(pr_reg)[19] = (regs)->sp; \
+ 	(pr_reg)[20] = (regs)->ss; \
+-	(pr_reg)[21] = current->thread.fsbase; \
+-	(pr_reg)[22] = current->thread.gsbase; \
++	rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \
++	rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \
+ 	asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
+ 	asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
+ 	asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
diff --git a/queue-4.13/x86-mm-mm-hwpoison-clear-present-bit-for-kernel-1-1-mappings-of-poison-pages.patch b/queue-4.13/x86-mm-mm-hwpoison-clear-present-bit-for-kernel-1-1-mappings-of-poison-pages.patch
new file mode 100644
index 00000000000..712d237b897
--- /dev/null
+++ b/queue-4.13/x86-mm-mm-hwpoison-clear-present-bit-for-kernel-1-1-mappings-of-poison-pages.patch
@@ -0,0 +1,150 @@
+From ce0fa3e56ad20f04d8252353dcd24e924abdafca Mon Sep 17 00:00:00 2001
+From: Tony Luck
+Date: Wed, 16 Aug 2017 10:18:03 -0700
+Subject: x86/mm, mm/hwpoison: Clear PRESENT bit for kernel 1:1 mappings of poison pages
+
+From: Tony Luck
+
+commit ce0fa3e56ad20f04d8252353dcd24e924abdafca upstream.
+
+Speculative processor accesses may reference any memory that has a
+valid page table entry. While a speculative access won't generate
+a machine check, it will log the error in a machine check bank. That
+could cause escalation of a subsequent error since the overflow bit
+will be then set in the machine check bank status register.
+
+Code has to be double-plus-tricky to avoid mentioning the 1:1 virtual
+address of the page we want to map out otherwise we may trigger the
+very problem we are trying to avoid. We use a non-canonical address
+that passes through the usual Linux table walking code to get to the
+same "pte".
+
+Thanks to Dave Hansen for reviewing several iterations of this.
+
+Also see:
+
+  http://marc.info/?l=linux-mm&m=149860136413338&w=2
+
+Signed-off-by: Tony Luck
+Cc: Andrew Morton
+Cc: Andy Lutomirski
+Cc: Borislav Petkov
+Cc: Borislav Petkov
+Cc: Brian Gerst
+Cc: Dave Hansen
+Cc: Denys Vlasenko
+Cc: Elliott, Robert (Persistent Memory)
+Cc: H. Peter Anvin
+Cc: Josh Poimboeuf
+Cc: Linus Torvalds
+Cc: Naoya Horiguchi
+Cc: Peter Zijlstra
+Cc: Thomas Gleixner
+Cc: linux-mm@kvack.org
+Link: http://lkml.kernel.org/r/20170816171803.28342-1-tony.luck@intel.com
+Signed-off-by: Ingo Molnar
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/x86/include/asm/page_64.h   |    4 +++
+ arch/x86/kernel/cpu/mcheck/mce.c |   43 +++++++++++++++++++++++++++++++++++++++
+ include/linux/mm_inline.h        |    6 +++++
+ mm/memory-failure.c              |    2 +
+ 4 files changed, 55 insertions(+)
+
+--- a/arch/x86/include/asm/page_64.h
++++ b/arch/x86/include/asm/page_64.h
+@@ -51,6 +51,10 @@ static inline void clear_page(void *page
+ 
+ void copy_page(void *to, void *from);
+ 
++#ifdef CONFIG_X86_MCE
++#define arch_unmap_kpfn arch_unmap_kpfn
++#endif
++
+ #endif	/* !__ASSEMBLY__ */
+ 
+ #ifdef CONFIG_X86_VSYSCALL_EMULATION
+--- a/arch/x86/kernel/cpu/mcheck/mce.c
++++ b/arch/x86/kernel/cpu/mcheck/mce.c
+@@ -51,6 +51,7 @@
+ #include <asm/mce.h>
+ #include <asm/msr.h>
+ #include <asm/reboot.h>
++#include <asm/set_memory.h>
+ 
+ #include "mce-internal.h"
+ 
+@@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce
+ 	return ret;
+ }
+ 
++#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE)
++
++void arch_unmap_kpfn(unsigned long pfn)
++{
++	unsigned long decoy_addr;
++
++	/*
++	 * Unmap this page from the kernel 1:1 mappings to make sure
++	 * we don't log more errors because of speculative access to
++	 * the page.
++	 * We would like to just call:
++	 *	set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
++	 * but doing that would radically increase the odds of a
++	 * speculative access to the poison page because we'd have
++	 * the virtual address of the kernel 1:1 mapping sitting
++	 * around in registers.
++	 * Instead we get tricky. We create a non-canonical address
++	 * that looks just like the one we want, but has bit 63 flipped.
++	 * This relies on set_memory_np() not checking whether we passed
++	 * a legal address.
++	 */
++
++/*
++ * Build time check to see if we have a spare virtual bit. Don't want
++ * to leave this until run time because most developers don't have a
++ * system that can exercise this code path. This will only become a
++ * problem if/when we move beyond 5-level page tables.
++ *
++ * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
++ */
++#if PGDIR_SHIFT + 9 < 63
++	decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
++#else
++#error "no unused virtual bit available"
++#endif
++
++	if (set_memory_np(decoy_addr, 1))
++		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
++
++}
++#endif
++
+ /*
+  * The actual machine check handler. This only handles real
+  * exceptions when something got corrupted coming in through int 18.
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -126,4 +126,10 @@ static __always_inline enum lru_list pag
+ 
+ #define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
+ 
++#ifdef arch_unmap_kpfn
++extern void arch_unmap_kpfn(unsigned long pfn);
++#else
++static __always_inline void arch_unmap_kpfn(unsigned long pfn) { }
++#endif
++
+ #endif
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -1146,6 +1146,8 @@ int memory_failure(unsigned long pfn, in
+ 		return 0;
+ 	}
+ 
++	arch_unmap_kpfn(pfn);
++
+ 	orig_head = hpage = compound_head(p);
+ 	num_poisoned_pages_inc();
+ 
diff --git a/queue-4.13/x86-switch_to-64-rewrite-fs-gs-switching-yet-again-to-fix-amd-cpus.patch b/queue-4.13/x86-switch_to-64-rewrite-fs-gs-switching-yet-again-to-fix-amd-cpus.patch
new file mode 100644
index 00000000000..1d9bb933633
--- /dev/null
+++ b/queue-4.13/x86-switch_to-64-rewrite-fs-gs-switching-yet-again-to-fix-amd-cpus.patch
@@ -0,0 +1,297 @@
+From e137a4d8f4dd2e277e355495b6b2cb241a8693c3 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski
+Date: Tue, 1 Aug 2017 07:11:37 -0700
+Subject: x86/switch_to/64: Rewrite FS/GS switching yet again to fix AMD CPUs
+
+From: Andy Lutomirski
+
+commit e137a4d8f4dd2e277e355495b6b2cb241a8693c3 upstream.
+
+Switching FS and GS is a mess, and the current code is still subtly
+wrong: it assumes that "Loading a nonzero value into FS sets the
+index and base", which is false on AMD CPUs if the value being
+loaded is 1, 2, or 3.
+
+(The current code came from commit 3e2b68d752c9 ("x86/asm,
+sched/x86: Rewrite the FS and GS context switch code"), which made
+it better but didn't fully fix it.)
+
+Rewrite it to be much simpler and more obviously correct. This
+should fix it fully on AMD CPUs and shouldn't adversely affect
+performance.
+
+Signed-off-by: Andy Lutomirski
+Cc: Borislav Petkov
+Cc: Borislav Petkov
+Cc: Brian Gerst
+Cc: Chang Seok
+Cc: Denys Vlasenko
+Cc: H. Peter Anvin
+Cc: Josh Poimboeuf
+Cc: Linus Torvalds
+Cc: Peter Zijlstra
+Cc: Thomas Gleixner
+Signed-off-by: Ingo Molnar
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/x86/kernel/process_64.c |  227 +++++++++++++++++++++++--------------------
+ 1 file changed, 122 insertions(+), 105 deletions(-)
+
+--- a/arch/x86/kernel/process_64.c
++++ b/arch/x86/kernel/process_64.c
+@@ -149,6 +149,123 @@ void release_thread(struct task_struct *
+ 	}
+ }
+ 
++enum which_selector {
++	FS,
++	GS
++};
++
++/*
++ * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
++ * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
++ * It's forcibly inlined because it'll generate better code and this function
++ * is hot.
++ */
++static __always_inline void save_base_legacy(struct task_struct *prev_p,
++					     unsigned short selector,
++					     enum which_selector which)
++{
++	if (likely(selector == 0)) {
++		/*
++		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
++		 * be the pre-existing saved base or it could be zero. On AMD
++		 * (with X86_BUG_NULL_SEG), the segment base could be almost
++		 * anything.
++		 *
++		 * This branch is very hot (it's hit twice on almost every
++		 * context switch between 64-bit programs), and avoiding
++		 * the RDMSR helps a lot, so we just assume that whatever
++		 * value is already saved is correct. This matches historical
++		 * Linux behavior, so it won't break existing applications.
++		 *
++		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
++		 * report that the base is zero, it needs to actually be zero:
++		 * see the corresponding logic in load_seg_legacy.
++		 */
++	} else {
++		/*
++		 * If the selector is 1, 2, or 3, then the base is zero on
++		 * !X86_BUG_NULL_SEG CPUs and could be anything on
++		 * X86_BUG_NULL_SEG CPUs. In the latter case, Linux
++		 * has never attempted to preserve the base across context
++		 * switches.
++		 *
++		 * If selector > 3, then it refers to a real segment, and
++		 * saving the base isn't necessary.
++		 */
++		if (which == FS)
++			prev_p->thread.fsbase = 0;
++		else
++			prev_p->thread.gsbase = 0;
++	}
++}
++
++static __always_inline void save_fsgs(struct task_struct *task)
++{
++	savesegment(fs, task->thread.fsindex);
++	savesegment(gs, task->thread.gsindex);
++	save_base_legacy(task, task->thread.fsindex, FS);
++	save_base_legacy(task, task->thread.gsindex, GS);
++}
++
++static __always_inline void loadseg(enum which_selector which,
++				    unsigned short sel)
++{
++	if (which == FS)
++		loadsegment(fs, sel);
++	else
++		load_gs_index(sel);
++}
++
++static __always_inline void load_seg_legacy(unsigned short prev_index,
++					    unsigned long prev_base,
++					    unsigned short next_index,
++					    unsigned long next_base,
++					    enum which_selector which)
++{
++	if (likely(next_index <= 3)) {
++		/*
++		 * The next task is using 64-bit TLS, is not using this
++		 * segment at all, or is having fun with arcane CPU features.
++		 */
++		if (next_base == 0) {
++			/*
++			 * Nasty case: on AMD CPUs, we need to forcibly zero
++			 * the base.
++			 */
++			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
++				loadseg(which, __USER_DS);
++				loadseg(which, next_index);
++			} else {
++				/*
++				 * We could try to exhaustively detect cases
++				 * under which we can skip the segment load,
++				 * but there's really only one case that matters
++				 * for performance: if both the previous and
++				 * next states are fully zeroed, we can skip
++				 * the load.
++				 *
++				 * (This assumes that prev_base == 0 has no
++				 * false positives. This is the case on
++				 * Intel-style CPUs.)
++				 */
++				if (likely(prev_index | next_index | prev_base))
++					loadseg(which, next_index);
++			}
++		} else {
++			if (prev_index != next_index)
++				loadseg(which, next_index);
++			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
++			       next_base);
++		}
++	} else {
++		/*
++		 * The next task is using a real segment. Loading the selector
++		 * is sufficient.
++		 */
++		loadseg(which, next_index);
++	}
++}
++
+ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+ 		unsigned long arg, struct task_struct *p, unsigned long tls)
+ {
+@@ -286,7 +403,6 @@ __switch_to(struct task_struct *prev_p,
+ 	struct fpu *next_fpu = &next->fpu;
+ 	int cpu = smp_processor_id();
+ 	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+-	unsigned prev_fsindex, prev_gsindex;
+ 
+ 	switch_fpu_prepare(prev_fpu, cpu);
+ 
+@@ -295,8 +411,7 @@ __switch_to(struct task_struct *prev_p,
+ 	 *
+ 	 * (e.g. xen_load_tls())
+ 	 */
+-	savesegment(fs, prev_fsindex);
+-	savesegment(gs, prev_gsindex);
++	save_fsgs(prev_p);
+ 
+ 	/*
+ 	 * Load TLS before restoring any segments so that segment loads
+@@ -335,108 +450,10 @@ __switch_to(struct task_struct *prev_p,
+ 	if (unlikely(next->ds | prev->ds))
+ 		loadsegment(ds, next->ds);
+ 
+-	/*
+-	 * Switch FS and GS.
+-	 *
+-	 * These are even more complicated than DS and ES: they have
+-	 * 64-bit bases are that controlled by arch_prctl. The bases
+-	 * don't necessarily match the selectors, as user code can do
+-	 * any number of things to cause them to be inconsistent.
+-	 *
+-	 * We don't promise to preserve the bases if the selectors are
+-	 * nonzero. We also don't promise to preserve the base if the
+-	 * selector is zero and the base doesn't match whatever was
+-	 * most recently passed to ARCH_SET_FS/GS. (If/when the
+-	 * FSGSBASE instructions are enabled, we'll need to offer
+-	 * stronger guarantees.)
+-	 *
+-	 * As an invariant,
+-	 * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
+-	 * impossible.
+-	 */
+-	if (next->fsindex) {
+-		/* Loading a nonzero value into FS sets the index and base. */
+-		loadsegment(fs, next->fsindex);
+-	} else {
+-		if (next->fsbase) {
+-			/* Next index is zero but next base is nonzero. */
+-			if (prev_fsindex)
+-				loadsegment(fs, 0);
+-			wrmsrl(MSR_FS_BASE, next->fsbase);
+-		} else {
+-			/* Next base and index are both zero. */
+-			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+-				/*
+-				 * We don't know the previous base and can't
+-				 * find out without RDMSR. Forcibly clear it.
+-				 */
+-				loadsegment(fs, __USER_DS);
+-				loadsegment(fs, 0);
+-			} else {
+-				/*
+-				 * If the previous index is zero and ARCH_SET_FS
+-				 * didn't change the base, then the base is
+-				 * also zero and we don't need to do anything.
+-				 */
+-				if (prev->fsbase || prev_fsindex)
+-					loadsegment(fs, 0);
+-			}
+-		}
+-	}
+-	/*
+-	 * Save the old state and preserve the invariant.
+-	 * NB: if prev_fsindex == 0, then we can't reliably learn the base
+-	 * without RDMSR because Intel user code can zero it without telling
+-	 * us and AMD user code can program any 32-bit value without telling
+-	 * us.
+-	 */
+-	if (prev_fsindex)
+-		prev->fsbase = 0;
+-	prev->fsindex = prev_fsindex;
+-
+-	if (next->gsindex) {
+-		/* Loading a nonzero value into GS sets the index and base. */
+-		load_gs_index(next->gsindex);
+-	} else {
+-		if (next->gsbase) {
+-			/* Next index is zero but next base is nonzero. */
+-			if (prev_gsindex)
+-				load_gs_index(0);
+-			wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
+-		} else {
+-			/* Next base and index are both zero. */
+-			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+-				/*
+-				 * We don't know the previous base and can't
+-				 * find out without RDMSR. Forcibly clear it.
+-				 *
+-				 * This contains a pointless SWAPGS pair.
+-				 * Fixing it would involve an explicit check
+-				 * for Xen or a new pvop.
+-				 */
+-				load_gs_index(__USER_DS);
+-				load_gs_index(0);
+-			} else {
+-				/*
+-				 * If the previous index is zero and ARCH_SET_GS
+-				 * didn't change the base, then the base is
+-				 * also zero and we don't need to do anything.
+-				 */
+-				if (prev->gsbase || prev_gsindex)
+-					load_gs_index(0);
+-			}
+-		}
+-	}
+-	/*
+-	 * Save the old state and preserve the invariant.
+-	 * NB: if prev_gsindex == 0, then we can't reliably learn the base
+-	 * without RDMSR because Intel user code can zero it without telling
+-	 * us and AMD user code can program any 32-bit value without telling
+-	 * us.
+-	 */
+-	if (prev_gsindex)
+-		prev->gsbase = 0;
+-	prev->gsindex = prev_gsindex;
++	load_seg_legacy(prev->fsindex, prev->fsbase,
++			next->fsindex, next->fsbase, FS);
++	load_seg_legacy(prev->gsindex, prev->gsbase,
++			next->gsindex, next->gsbase, GS);
+ 
+ 	switch_fpu_finish(next_fpu, cpu);
+ 
-- 
2.47.3
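
For anyone who wants to see the leak that the two fsgsbase patches close, here is a rough
userspace sketch. It is not part of the queue: the file name gsleak.c, the raw arch_prctl()
syscall usage, and the 0xdead0000 marker value are illustrative choices, and it assumes a
4.13-era kernel, where ARCH_GET_GS for the current task appears to read MSR_KERNEL_GS_BASE
directly, which is what makes the leak observable this way. On an AMD CPU with the
null-segment behavior and without the start_thread_common fix, the re-exec'd child can
report the parent's GS base; with the fix applied it should read back 0.

	/* gsleak.c - illustrative sketch only; build with: gcc -o gsleak gsleak.c */
	#include <asm/prctl.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		unsigned long base = 0;

		if (argc > 1) {
			/* Child after execve: a fixed kernel resets the base to 0. */
			syscall(SYS_arch_prctl, ARCH_GET_GS, &base);
			printf("gsbase after execve: 0x%lx\n", base);
			return 0;
		}

		/* Parent: plant a recognizable GS base, then re-exec ourselves. */
		syscall(SYS_arch_prctl, ARCH_SET_GS, 0xdead0000UL);
		execl("/proc/self/exe", argv[0], "child", (char *)NULL);
		perror("execl");
		return 1;
	}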