From 213b231e80e06828dbb9f38c92ac61d65b9976ee Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Mon, 18 Sep 2017 08:57:40 +0200
Subject: [PATCH] 4.13-stable patches

added patches:
	fuse-allow-server-to-run-in-different-pid_ns.patch
	idr-remove-warn_on_once-when-trying-to-replace-negative-id.patch
	ovl-fix-false-positive-estale-on-lookup.patch
	x86-fsgsbase-64-fully-initialize-fs-and-gs-state-in-start_thread_common.patch
	x86-fsgsbase-64-report-fsbase-and-gsbase-correctly-in-core-dumps.patch
	x86-mm-mm-hwpoison-clear-present-bit-for-kernel-1-1-mappings-of-poison-pages.patch
	x86-switch_to-64-rewrite-fs-gs-switching-yet-again-to-fix-amd-cpus.patch
---
 ...ow-server-to-run-in-different-pid_ns.patch |  95 ++++++
 ...e-when-trying-to-replace-negative-id.patch |  96 ++++++
 ...-fix-false-positive-estale-on-lookup.patch |  54 ++++
 queue-4.13/series                             |   7 +
 ...-and-gs-state-in-start_thread_common.patch |  58 ++++
 ...e-and-gsbase-correctly-in-core-dumps.patch |  56 ++++
 ...-kernel-1-1-mappings-of-poison-pages.patch | 150 +++++++++
 ...-switching-yet-again-to-fix-amd-cpus.patch | 297 ++++++++++++++++++
 8 files changed, 813 insertions(+)
 create mode 100644 queue-4.13/fuse-allow-server-to-run-in-different-pid_ns.patch
 create mode 100644 queue-4.13/idr-remove-warn_on_once-when-trying-to-replace-negative-id.patch
 create mode 100644 queue-4.13/ovl-fix-false-positive-estale-on-lookup.patch
 create mode 100644 queue-4.13/x86-fsgsbase-64-fully-initialize-fs-and-gs-state-in-start_thread_common.patch
 create mode 100644 queue-4.13/x86-fsgsbase-64-report-fsbase-and-gsbase-correctly-in-core-dumps.patch
 create mode 100644 queue-4.13/x86-mm-mm-hwpoison-clear-present-bit-for-kernel-1-1-mappings-of-poison-pages.patch
 create mode 100644 queue-4.13/x86-switch_to-64-rewrite-fs-gs-switching-yet-again-to-fix-amd-cpus.patch

diff --git a/queue-4.13/fuse-allow-server-to-run-in-different-pid_ns.patch b/queue-4.13/fuse-allow-server-to-run-in-different-pid_ns.patch
new file mode 100644
index 00000000000..b63c5b79c92
--- /dev/null
+++ b/queue-4.13/fuse-allow-server-to-run-in-different-pid_ns.patch
@@ -0,0 +1,95 @@
+From 5d6d3a301c4e749e04be6fcdcf4cb1ffa8bae524 Mon Sep 17 00:00:00 2001
+From: Miklos Szeredi
+Date: Tue, 12 Sep 2017 16:57:53 +0200
+Subject: fuse: allow server to run in different pid_ns
+
+From: Miklos Szeredi
+
+commit 5d6d3a301c4e749e04be6fcdcf4cb1ffa8bae524 upstream.
+
+Commit 0b6e9ea041e6 ("fuse: Add support for pid namespaces") broke
+Sandstorm.io development tools, which have been sending FUSE file
+descriptors across PID namespace boundaries since early 2014.
+
+The above patch added a check that prevented I/O on the fuse device file
+descriptor if the pid namespace of the reader/writer was different from the
+pid namespace of the mounter. With this change passing the device file
+descriptor to a different pid namespace simply doesn't work. The check was
+added because pids are transferred to/from the fuse userspace server in the
+namespace registered at mount time.
+
+To fix this regression, remove the checks and do the following:
+
+1) the pid in the request header (the pid of the task that initiated the
+filesystem operation) is translated to the reader's pid namespace. If a
+mapping doesn't exist for this pid, then a zero pid is used. Note: even if
+a mapping would exist between the initiator task's pid namespace and the
+reader's pid namespace the pid will be zero if either mapping from
+initiator's to mounter's namespace or mapping from mounter's to reader's
+namespace doesn't exist.
+
+2) The lk.pid value in setlk/setlkw requests and getlk reply is left alone.
+Userspace should not interpret this value anyway. Also allow the
+setlk/setlkw operations if the pid of the task cannot be represented in the
+mounter's namespace (pid being zero in that case).
+
+Reported-by: Kenton Varda
+Signed-off-by: Miklos Szeredi
+Fixes: 0b6e9ea041e6 ("fuse: Add support for pid namespaces")
+Cc: Eric W. Biederman
+Cc: Seth Forshee
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/fuse/dev.c  |   13 +++++++------
+ fs/fuse/file.c |    3 ---
+ 2 files changed, 7 insertions(+), 9 deletions(-)
+
+--- a/fs/fuse/dev.c
++++ b/fs/fuse/dev.c
+@@ -1222,9 +1222,6 @@ static ssize_t fuse_dev_do_read(struct f
+ 	struct fuse_in *in;
+ 	unsigned reqsize;
+ 
+-	if (task_active_pid_ns(current) != fc->pid_ns)
+-		return -EIO;
+-
+ restart:
+ 	spin_lock(&fiq->waitq.lock);
+ 	err = -EAGAIN;
+@@ -1262,6 +1259,13 @@ static ssize_t fuse_dev_do_read(struct f
+ 
+ 	in = &req->in;
+ 	reqsize = in->h.len;
++
++	if (task_active_pid_ns(current) != fc->pid_ns) {
++		rcu_read_lock();
++		in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns));
++		rcu_read_unlock();
++	}
++
+ 	/* If request is too large, reply with an error and restart the read */
+ 	if (nbytes < reqsize) {
+ 		req->out.h.error = -EIO;
+@@ -1823,9 +1827,6 @@ static ssize_t fuse_dev_do_write(struct
+ 	struct fuse_req *req;
+ 	struct fuse_out_header oh;
+ 
+-	if (task_active_pid_ns(current) != fc->pid_ns)
+-		return -EIO;
+-
+ 	if (nbytes < sizeof(struct fuse_out_header))
+ 		return -EINVAL;
+ 
+--- a/fs/fuse/file.c
++++ b/fs/fuse/file.c
+@@ -2181,9 +2181,6 @@ static int fuse_setlk(struct file *file,
+ 	if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
+ 		return 0;
+ 
+-	if (pid && pid_nr == 0)
+-		return -EOVERFLOW;
+-
+ 	fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
+ 	err = fuse_simple_request(fc, &args);
+ 
diff --git a/queue-4.13/idr-remove-warn_on_once-when-trying-to-replace-negative-id.patch b/queue-4.13/idr-remove-warn_on_once-when-trying-to-replace-negative-id.patch
new file mode 100644
index 00000000000..de0a28764b7
--- /dev/null
+++ b/queue-4.13/idr-remove-warn_on_once-when-trying-to-replace-negative-id.patch
@@ -0,0 +1,96 @@
+From a47f68d6a944113bdc8097db6f933c2e17c27bf9 Mon Sep 17 00:00:00 2001
+From: Eric Biggers
+Date: Wed, 13 Sep 2017 16:28:11 -0700
+Subject: idr: remove WARN_ON_ONCE() when trying to replace negative ID
+
+From: Eric Biggers
+
+commit a47f68d6a944113bdc8097db6f933c2e17c27bf9 upstream.
+
+IDR only supports non-negative IDs. There used to be a 'WARN_ON_ONCE(id <
+0)' in idr_replace(), but it was intentionally removed by commit
+2e1c9b286765 ("idr: remove WARN_ON_ONCE() on negative IDs").
+
+Then it was added back by commit 0a835c4f090a ("Reimplement IDR and IDA
+using the radix tree"). However it seems that adding it back was a
+mistake, given that some users such as drm_gem_handle_delete()
+(DRM_IOCTL_GEM_CLOSE) pass in a value from userspace to idr_replace(),
+allowing the WARN_ON_ONCE to be triggered. drm_gem_handle_delete()
+actually just wants idr_replace() to return an error code if the ID is
+not allocated, including in the case where the ID is invalid (negative).
+
+So once again remove the bogus WARN_ON_ONCE().
+
+This bug was found by syzkaller, which encountered the following
+warning:
+
+	WARNING: CPU: 3 PID: 3008 at lib/idr.c:157 idr_replace+0x1d8/0x240 lib/idr.c:157
+	Kernel panic - not syncing: panic_on_warn set ...
+
+	CPU: 3 PID: 3008 Comm: syzkaller218828 Not tainted 4.13.0-rc4-next-20170811 #2
+	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
+	Call Trace:
+	 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:190
+	 do_trap_no_signal arch/x86/kernel/traps.c:224 [inline]
+	 do_trap+0x260/0x390 arch/x86/kernel/traps.c:273
+	 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:310
+	 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:323
+	 invalid_op+0x1e/0x30 arch/x86/entry/entry_64.S:930
+	RIP: 0010:idr_replace+0x1d8/0x240 lib/idr.c:157
+	RSP: 0018:ffff8800394bf9f8 EFLAGS: 00010297
+	RAX: ffff88003c6c60c0 RBX: 1ffff10007297f43 RCX: 0000000000000000
+	RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8800394bfa78
+	RBP: ffff8800394bfae0 R08: ffffffff82856487 R09: 0000000000000000
+	R10: ffff8800394bf9a8 R11: ffff88006c8bae28 R12: ffffffffffffffff
+	R13: ffff8800394bfab8 R14: dffffc0000000000 R15: ffff8800394bfbc8
+	 drm_gem_handle_delete+0x33/0xa0 drivers/gpu/drm/drm_gem.c:297
+	 drm_gem_close_ioctl+0xa1/0xe0 drivers/gpu/drm/drm_gem.c:671
+	 drm_ioctl_kernel+0x1e7/0x2e0 drivers/gpu/drm/drm_ioctl.c:729
+	 drm_ioctl+0x72e/0xa50 drivers/gpu/drm/drm_ioctl.c:825
+	 vfs_ioctl fs/ioctl.c:45 [inline]
+	 do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685
+	 SYSC_ioctl fs/ioctl.c:700 [inline]
+	 SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
+	 entry_SYSCALL_64_fastpath+0x1f/0xbe
+
+Here is a C reproducer:
+
+	#include <fcntl.h>
+	#include <stddef.h>
+	#include <stdint.h>
+	#include <sys/ioctl.h>
+	#include <drm/drm.h>
+
+	int main(void)
+	{
+		int cardfd = open("/dev/dri/card0", O_RDONLY);
+
+		ioctl(cardfd, DRM_IOCTL_GEM_CLOSE,
+		      &(struct drm_gem_close) { .handle = -1 } );
+	}
+
+Link: http://lkml.kernel.org/r/20170906235306.20534-1-ebiggers3@gmail.com
+Fixes: 0a835c4f090a ("Reimplement IDR and IDA using the radix tree")
+Signed-off-by: Eric Biggers
+Acked-by: Tejun Heo
+Cc: Dmitry Vyukov
+Cc: Matthew Wilcox
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ lib/idr.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/lib/idr.c
++++ b/lib/idr.c
+@@ -154,7 +154,7 @@ void *idr_replace(struct idr *idr, void
+ 	void __rcu **slot = NULL;
+ 	void *entry;
+ 
+-	if (WARN_ON_ONCE(id < 0))
++	if (id < 0)
+ 		return ERR_PTR(-EINVAL);
+ 	if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
+ 		return ERR_PTR(-EINVAL);
diff --git a/queue-4.13/ovl-fix-false-positive-estale-on-lookup.patch b/queue-4.13/ovl-fix-false-positive-estale-on-lookup.patch
new file mode 100644
index 00000000000..b5583aebdd4
--- /dev/null
+++ b/queue-4.13/ovl-fix-false-positive-estale-on-lookup.patch
@@ -0,0 +1,54 @@
+From 939ae4efd51c627da270af74ef069db5124cb5b0 Mon Sep 17 00:00:00 2001
+From: Amir Goldstein
+Date: Mon, 11 Sep 2017 16:30:15 +0300
+Subject: ovl: fix false positive ESTALE on lookup
+
+From: Amir Goldstein
+
+commit 939ae4efd51c627da270af74ef069db5124cb5b0 upstream.
+
+Commit b9ac5c274b8c ("ovl: hash overlay non-dir inodes by copy up origin")
+verifies that the origin lower inode stored in the overlayfs inode matched
+the inode of a copy up origin dentry found by lookup.
+
+There is a false positive result in that check when lower fs does not
+support file handles and copy up origin cannot be followed by file handle
+at lookup time.
+
+The false negative happens when finding an overlay inode in cache on a
+copied up overlay dentry lookup. The overlay inode still 'remembers' the
+copy up origin inode, but the copy up origin dentry is not available for
+verification.
+
+Relax the check in case copy up origin dentry is not available.
+
+Fixes: b9ac5c274b8c ("ovl: hash overlay non-dir inodes by copy up...")
+Reported-by: Jordi Pujol
+Signed-off-by: Amir Goldstein
+Signed-off-by: Miklos Szeredi
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/overlayfs/inode.c |   11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+--- a/fs/overlayfs/inode.c
++++ b/fs/overlayfs/inode.c
+@@ -576,10 +576,13 @@ static int ovl_inode_set(struct inode *i
+ static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
+ 			     struct dentry *upperdentry)
+ {
+-	struct inode *lowerinode = lowerdentry ? d_inode(lowerdentry) : NULL;
+-
+-	/* Lower (origin) inode must match, even if NULL */
+-	if (ovl_inode_lower(inode) != lowerinode)
++	/*
++	 * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL.
++	 * This happens when finding a copied up overlay inode for a renamed
++	 * or hardlinked overlay dentry and lower dentry cannot be followed
++	 * by origin because lower fs does not support file handles.
++	 */
++	if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry))
+ 		return false;
+ 
+ 	/*
diff --git a/queue-4.13/series b/queue-4.13/series
index 1f46a1b0109..1c93d6a5d32 100644
--- a/queue-4.13/series
+++ b/queue-4.13/series
@@ -14,3 +14,10 @@ f2fs-check-hot_data-for-roll-forward-recovery.patch
 thunderbolt-remove-superfluous-check.patch
 thunderbolt-make-key-root-only-accessible.patch
 thunderbolt-allow-clearing-the-key.patch
+x86-fsgsbase-64-fully-initialize-fs-and-gs-state-in-start_thread_common.patch
+x86-fsgsbase-64-report-fsbase-and-gsbase-correctly-in-core-dumps.patch
+x86-switch_to-64-rewrite-fs-gs-switching-yet-again-to-fix-amd-cpus.patch
+x86-mm-mm-hwpoison-clear-present-bit-for-kernel-1-1-mappings-of-poison-pages.patch
+ovl-fix-false-positive-estale-on-lookup.patch
+fuse-allow-server-to-run-in-different-pid_ns.patch
+idr-remove-warn_on_once-when-trying-to-replace-negative-id.patch
diff --git a/queue-4.13/x86-fsgsbase-64-fully-initialize-fs-and-gs-state-in-start_thread_common.patch b/queue-4.13/x86-fsgsbase-64-fully-initialize-fs-and-gs-state-in-start_thread_common.patch
new file mode 100644
index 00000000000..0493522bc3f
--- /dev/null
+++ b/queue-4.13/x86-fsgsbase-64-fully-initialize-fs-and-gs-state-in-start_thread_common.patch
@@ -0,0 +1,58 @@
+From 767d035d838f4fd6b5a5bbd7a3f6d293b7f65a49 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski
+Date: Tue, 1 Aug 2017 07:11:34 -0700
+Subject: x86/fsgsbase/64: Fully initialize FS and GS state in start_thread_common
+
+From: Andy Lutomirski
+
+commit 767d035d838f4fd6b5a5bbd7a3f6d293b7f65a49 upstream.
+
+execve used to leak FSBASE and GSBASE on AMD CPUs. Fix it.
+
+The security impact of this bug is small but not quite zero -- it
+could weaken ASLR when a privileged task execs a less privileged
+program, but only if program changed bitness across the exec, or the
+child binary was highly unusual or actively malicious. A child
+program that was compromised after the exec would not have access to
+the leaked base.
+
+Signed-off-by: Andy Lutomirski
+Cc: Borislav Petkov
+Cc: Borislav Petkov
+Cc: Brian Gerst
+Cc: Chang Seok
+Cc: Denys Vlasenko
+Cc: H. Peter Anvin
+Cc: Josh Poimboeuf
+Cc: Linus Torvalds
+Cc: Peter Zijlstra
+Cc: Thomas Gleixner
+Signed-off-by: Ingo Molnar
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/x86/kernel/process_64.c |    9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/arch/x86/kernel/process_64.c
++++ b/arch/x86/kernel/process_64.c
+@@ -229,10 +229,19 @@ start_thread_common(struct pt_regs *regs
+ 		    unsigned long new_sp,
+ 		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
+ {
++	WARN_ON_ONCE(regs != current_pt_regs());
++
++	if (static_cpu_has(X86_BUG_NULL_SEG)) {
++		/* Loading zero below won't clear the base. */
++		loadsegment(fs, __USER_DS);
++		load_gs_index(__USER_DS);
++	}
++
+ 	loadsegment(fs, 0);
+ 	loadsegment(es, _ds);
+ 	loadsegment(ds, _ds);
+ 	load_gs_index(0);
++
+ 	regs->ip		= new_ip;
+ 	regs->sp		= new_sp;
+ 	regs->cs		= _cs;
diff --git a/queue-4.13/x86-fsgsbase-64-report-fsbase-and-gsbase-correctly-in-core-dumps.patch b/queue-4.13/x86-fsgsbase-64-report-fsbase-and-gsbase-correctly-in-core-dumps.patch
new file mode 100644
index 00000000000..95e32d8f9ca
--- /dev/null
+++ b/queue-4.13/x86-fsgsbase-64-report-fsbase-and-gsbase-correctly-in-core-dumps.patch
@@ -0,0 +1,56 @@
+From 9584d98bed7a7a904d0702ad06bbcc94703cb5b4 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski
+Date: Tue, 1 Aug 2017 07:11:35 -0700
+Subject: x86/fsgsbase/64: Report FSBASE and GSBASE correctly in core dumps
+
+From: Andy Lutomirski
+
+commit 9584d98bed7a7a904d0702ad06bbcc94703cb5b4 upstream.
+
+In ELF_CORE_COPY_REGS, we're copying from the current task, so
+accessing thread.fsbase and thread.gsbase makes no sense. Just read
+the values from the CPU registers.
+
+In practice, the old code would have been correct most of the time
+simply because thread.fsbase and thread.gsbase usually matched the
+CPU registers.
+
+Signed-off-by: Andy Lutomirski
+Cc: Borislav Petkov
+Cc: Borislav Petkov
+Cc: Brian Gerst
+Cc: Chang Seok
+Cc: Denys Vlasenko
+Cc: H. Peter Anvin
+Cc: Josh Poimboeuf
+Cc: Linus Torvalds
+Cc: Peter Zijlstra
+Cc: Thomas Gleixner
+Signed-off-by: Ingo Molnar
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/x86/include/asm/elf.h |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/include/asm/elf.h
++++ b/arch/x86/include/asm/elf.h
+@@ -204,6 +204,7 @@ void set_personality_ia32(bool);
+ 
+ #define ELF_CORE_COPY_REGS(pr_reg, regs) \
+ do { \
++	unsigned long base; \
+ 	unsigned v; \
+ 	(pr_reg)[0] = (regs)->r15; \
+ 	(pr_reg)[1] = (regs)->r14; \
+@@ -226,8 +227,8 @@ do { \
+ 	(pr_reg)[18] = (regs)->flags; \
+ 	(pr_reg)[19] = (regs)->sp; \
+ 	(pr_reg)[20] = (regs)->ss; \
+-	(pr_reg)[21] = current->thread.fsbase; \
+-	(pr_reg)[22] = current->thread.gsbase; \
++	rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \
++	rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \
+ 	asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
+ 	asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
+ 	asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
diff --git a/queue-4.13/x86-mm-mm-hwpoison-clear-present-bit-for-kernel-1-1-mappings-of-poison-pages.patch b/queue-4.13/x86-mm-mm-hwpoison-clear-present-bit-for-kernel-1-1-mappings-of-poison-pages.patch
new file mode 100644
index 00000000000..712d237b897
--- /dev/null
+++ b/queue-4.13/x86-mm-mm-hwpoison-clear-present-bit-for-kernel-1-1-mappings-of-poison-pages.patch
@@ -0,0 +1,150 @@
+From ce0fa3e56ad20f04d8252353dcd24e924abdafca Mon Sep 17 00:00:00 2001
+From: Tony Luck
+Date: Wed, 16 Aug 2017 10:18:03 -0700
+Subject: x86/mm, mm/hwpoison: Clear PRESENT bit for kernel 1:1 mappings of poison pages
+
+From: Tony Luck
+
+commit ce0fa3e56ad20f04d8252353dcd24e924abdafca upstream.
+
+Speculative processor accesses may reference any memory that has a
+valid page table entry. While a speculative access won't generate
+a machine check, it will log the error in a machine check bank. That
+could cause escalation of a subsequent error since the overflow bit
+will be then set in the machine check bank status register.
+
+Code has to be double-plus-tricky to avoid mentioning the 1:1 virtual
+address of the page we want to map out otherwise we may trigger the
+very problem we are trying to avoid. We use a non-canonical address
+that passes through the usual Linux table walking code to get to the
+same "pte".
+
+Thanks to Dave Hansen for reviewing several iterations of this.
+
+Also see:
+
+  http://marc.info/?l=linux-mm&m=149860136413338&w=2
+
+Signed-off-by: Tony Luck
+Cc: Andrew Morton
+Cc: Andy Lutomirski
+Cc: Borislav Petkov
+Cc: Borislav Petkov
+Cc: Brian Gerst
+Cc: Dave Hansen
+Cc: Denys Vlasenko
+Cc: Elliott, Robert (Persistent Memory)
+Cc: H. Peter Anvin
+Cc: Josh Poimboeuf
+Cc: Linus Torvalds
+Cc: Naoya Horiguchi
+Cc: Peter Zijlstra
+Cc: Thomas Gleixner
+Cc: linux-mm@kvack.org
+Link: http://lkml.kernel.org/r/20170816171803.28342-1-tony.luck@intel.com
+Signed-off-by: Ingo Molnar
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/x86/include/asm/page_64.h   |    4 +++
+ arch/x86/kernel/cpu/mcheck/mce.c |   43 +++++++++++++++++++++++++++++++++++++++
+ include/linux/mm_inline.h        |    6 +++++
+ mm/memory-failure.c              |    2 +
+ 4 files changed, 55 insertions(+)
+
+--- a/arch/x86/include/asm/page_64.h
++++ b/arch/x86/include/asm/page_64.h
+@@ -51,6 +51,10 @@ static inline void clear_page(void *page
+ 
+ void copy_page(void *to, void *from);
+ 
++#ifdef CONFIG_X86_MCE
++#define arch_unmap_kpfn arch_unmap_kpfn
++#endif
++
+ #endif	/* !__ASSEMBLY__ */
+ 
+ #ifdef CONFIG_X86_VSYSCALL_EMULATION
+--- a/arch/x86/kernel/cpu/mcheck/mce.c
++++ b/arch/x86/kernel/cpu/mcheck/mce.c
+@@ -51,6 +51,7 @@
+ #include <asm/mce.h>
+ #include <asm/msr.h>
+ #include <asm/reboot.h>
++#include <asm/set_memory.h>
+ 
+ #include "mce-internal.h"
+ 
+@@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce
+ 	return ret;
+ }
+ 
++#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE)
++
++void arch_unmap_kpfn(unsigned long pfn)
++{
++	unsigned long decoy_addr;
++
++	/*
++	 * Unmap this page from the kernel 1:1 mappings to make sure
++	 * we don't log more errors because of speculative access to
++	 * the page.
++	 * We would like to just call:
++	 *	set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
++	 * but doing that would radically increase the odds of a
++	 * speculative access to the poison page because we'd have
++	 * the virtual address of the kernel 1:1 mapping sitting
++	 * around in registers.
++	 * Instead we get tricky. We create a non-canonical address
++	 * that looks just like the one we want, but has bit 63 flipped.
++	 * This relies on set_memory_np() not checking whether we passed
++	 * a legal address.
++	 */
++
++/*
++ * Build time check to see if we have a spare virtual bit. Don't want
++ * to leave this until run time because most developers don't have a
++ * system that can exercise this code path. This will only become a
++ * problem if/when we move beyond 5-level page tables.
++ *
++ * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
++ */
++#if PGDIR_SHIFT + 9 < 63
++	decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
++#else
++#error "no unused virtual bit available"
++#endif
++
++	if (set_memory_np(decoy_addr, 1))
++		pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
++
++}
++#endif
++
+ /*
+  * The actual machine check handler. This only handles real
+  * exceptions when something got corrupted coming in through int 18.
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -126,4 +126,10 @@ static __always_inline enum lru_list pag
+ 
+ #define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
+ 
++#ifdef arch_unmap_kpfn
++extern void arch_unmap_kpfn(unsigned long pfn);
++#else
++static __always_inline void arch_unmap_kpfn(unsigned long pfn) { }
++#endif
++
+ #endif
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -1146,6 +1146,8 @@ int memory_failure(unsigned long pfn, in
+ 		return 0;
+ 	}
+ 
++	arch_unmap_kpfn(pfn);
++
+ 	orig_head = hpage = compound_head(p);
+ 	num_poisoned_pages_inc();
+ 
diff --git a/queue-4.13/x86-switch_to-64-rewrite-fs-gs-switching-yet-again-to-fix-amd-cpus.patch b/queue-4.13/x86-switch_to-64-rewrite-fs-gs-switching-yet-again-to-fix-amd-cpus.patch
new file mode 100644
index 00000000000..1d9bb933633
--- /dev/null
+++ b/queue-4.13/x86-switch_to-64-rewrite-fs-gs-switching-yet-again-to-fix-amd-cpus.patch
@@ -0,0 +1,297 @@
+From e137a4d8f4dd2e277e355495b6b2cb241a8693c3 Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski
+Date: Tue, 1 Aug 2017 07:11:37 -0700
+Subject: x86/switch_to/64: Rewrite FS/GS switching yet again to fix AMD CPUs
+
+From: Andy Lutomirski
+
+commit e137a4d8f4dd2e277e355495b6b2cb241a8693c3 upstream.
+
+Switching FS and GS is a mess, and the current code is still subtly
+wrong: it assumes that "Loading a nonzero value into FS sets the
+index and base", which is false on AMD CPUs if the value being
+loaded is 1, 2, or 3.
+
+(The current code came from commit 3e2b68d752c9 ("x86/asm,
+sched/x86: Rewrite the FS and GS context switch code"), which made
+it better but didn't fully fix it.)
+
+Rewrite it to be much simpler and more obviously correct. This
+should fix it fully on AMD CPUs and shouldn't adversely affect
+performance.
+
+Signed-off-by: Andy Lutomirski
+Cc: Borislav Petkov
+Cc: Borislav Petkov
+Cc: Brian Gerst
+Cc: Chang Seok
+Cc: Denys Vlasenko
+Cc: H. Peter Anvin
+Cc: Josh Poimboeuf
+Cc: Linus Torvalds
+Cc: Peter Zijlstra
+Cc: Thomas Gleixner
+Signed-off-by: Ingo Molnar
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/x86/kernel/process_64.c |  227 +++++++++++++++++++++++--------------------
+ 1 file changed, 122 insertions(+), 105 deletions(-)
+
+--- a/arch/x86/kernel/process_64.c
++++ b/arch/x86/kernel/process_64.c
+@@ -149,6 +149,123 @@ void release_thread(struct task_struct *
+ 	}
+ }
+ 
++enum which_selector {
++	FS,
++	GS
++};
++
++/*
++ * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
++ * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
++ * It's forcibly inlined because it'll generate better code and this function
++ * is hot.
++ */
++static __always_inline void save_base_legacy(struct task_struct *prev_p,
++					     unsigned short selector,
++					     enum which_selector which)
++{
++	if (likely(selector == 0)) {
++		/*
++		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
++		 * be the pre-existing saved base or it could be zero. On AMD
++		 * (with X86_BUG_NULL_SEG), the segment base could be almost
++		 * anything.
++		 *
++		 * This branch is very hot (it's hit twice on almost every
++		 * context switch between 64-bit programs), and avoiding
++		 * the RDMSR helps a lot, so we just assume that whatever
++		 * value is already saved is correct. This matches historical
++		 * Linux behavior, so it won't break existing applications.
++		 *
++		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
++		 * report that the base is zero, it needs to actually be zero:
++		 * see the corresponding logic in load_seg_legacy.
++		 */
++	} else {
++		/*
++		 * If the selector is 1, 2, or 3, then the base is zero on
++		 * !X86_BUG_NULL_SEG CPUs and could be anything on
++		 * X86_BUG_NULL_SEG CPUs. In the latter case, Linux
++		 * has never attempted to preserve the base across context
++		 * switches.
++		 *
++		 * If selector > 3, then it refers to a real segment, and
++		 * saving the base isn't necessary.
++		 */
++		if (which == FS)
++			prev_p->thread.fsbase = 0;
++		else
++			prev_p->thread.gsbase = 0;
++	}
++}
++
++static __always_inline void save_fsgs(struct task_struct *task)
++{
++	savesegment(fs, task->thread.fsindex);
++	savesegment(gs, task->thread.gsindex);
++	save_base_legacy(task, task->thread.fsindex, FS);
++	save_base_legacy(task, task->thread.gsindex, GS);
++}
++
++static __always_inline void loadseg(enum which_selector which,
++				    unsigned short sel)
++{
++	if (which == FS)
++		loadsegment(fs, sel);
++	else
++		load_gs_index(sel);
++}
++
++static __always_inline void load_seg_legacy(unsigned short prev_index,
++					    unsigned long prev_base,
++					    unsigned short next_index,
++					    unsigned long next_base,
++					    enum which_selector which)
++{
++	if (likely(next_index <= 3)) {
++		/*
++		 * The next task is using 64-bit TLS, is not using this
++		 * segment at all, or is having fun with arcane CPU features.
++		 */
++		if (next_base == 0) {
++			/*
++			 * Nasty case: on AMD CPUs, we need to forcibly zero
++			 * the base.
++			 */
++			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
++				loadseg(which, __USER_DS);
++				loadseg(which, next_index);
++			} else {
++				/*
++				 * We could try to exhaustively detect cases
++				 * under which we can skip the segment load,
++				 * but there's really only one case that matters
++				 * for performance: if both the previous and
++				 * next states are fully zeroed, we can skip
++				 * the load.
++				 *
++				 * (This assumes that prev_base == 0 has no
++				 * false positives. This is the case on
++				 * Intel-style CPUs.)
++				 */
++				if (likely(prev_index | next_index | prev_base))
++					loadseg(which, next_index);
++			}
++		} else {
++			if (prev_index != next_index)
++				loadseg(which, next_index);
++			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
++			       next_base);
++		}
++	} else {
++		/*
++		 * The next task is using a real segment. Loading the selector
++		 * is sufficient.
++		 */
++		loadseg(which, next_index);
++	}
++}
++
+ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+ 		unsigned long arg, struct task_struct *p, unsigned long tls)
+ {
+@@ -286,7 +403,6 @@ __switch_to(struct task_struct *prev_p,
+ 	struct fpu *next_fpu = &next->fpu;
+ 	int cpu = smp_processor_id();
+ 	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+-	unsigned prev_fsindex, prev_gsindex;
+ 
+ 	switch_fpu_prepare(prev_fpu, cpu);
+ 
+@@ -295,8 +411,7 @@ __switch_to(struct task_struct *prev_p,
+ 	 *
+ 	 * (e.g. xen_load_tls())
+ 	 */
+-	savesegment(fs, prev_fsindex);
+-	savesegment(gs, prev_gsindex);
++	save_fsgs(prev_p);
+ 
+ 	/*
+ 	 * Load TLS before restoring any segments so that segment loads
+@@ -335,108 +450,10 @@ __switch_to(struct task_struct *prev_p,
+ 	if (unlikely(next->ds | prev->ds))
+ 		loadsegment(ds, next->ds);
+ 
+-	/*
+-	 * Switch FS and GS.
+-	 *
+-	 * These are even more complicated than DS and ES: they have
+-	 * 64-bit bases are that controlled by arch_prctl. The bases
+-	 * don't necessarily match the selectors, as user code can do
+-	 * any number of things to cause them to be inconsistent.
+-	 *
+-	 * We don't promise to preserve the bases if the selectors are
+-	 * nonzero. We also don't promise to preserve the base if the
+-	 * selector is zero and the base doesn't match whatever was
+-	 * most recently passed to ARCH_SET_FS/GS. (If/when the
+-	 * FSGSBASE instructions are enabled, we'll need to offer
+-	 * stronger guarantees.)
+-	 *
+-	 * As an invariant,
+-	 * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
+-	 * impossible.
+-	 */
+-	if (next->fsindex) {
+-		/* Loading a nonzero value into FS sets the index and base. */
+-		loadsegment(fs, next->fsindex);
+-	} else {
+-		if (next->fsbase) {
+-			/* Next index is zero but next base is nonzero. */
+-			if (prev_fsindex)
+-				loadsegment(fs, 0);
+-			wrmsrl(MSR_FS_BASE, next->fsbase);
+-		} else {
+-			/* Next base and index are both zero. */
+-			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+-				/*
+-				 * We don't know the previous base and can't
+-				 * find out without RDMSR. Forcibly clear it.
+-				 */
+-				loadsegment(fs, __USER_DS);
+-				loadsegment(fs, 0);
+-			} else {
+-				/*
+-				 * If the previous index is zero and ARCH_SET_FS
+-				 * didn't change the base, then the base is
+-				 * also zero and we don't need to do anything.
+-				 */
+-				if (prev->fsbase || prev_fsindex)
+-					loadsegment(fs, 0);
+-			}
+-		}
+-	}
+-	/*
+-	 * Save the old state and preserve the invariant.
+-	 * NB: if prev_fsindex == 0, then we can't reliably learn the base
+-	 * without RDMSR because Intel user code can zero it without telling
+-	 * us and AMD user code can program any 32-bit value without telling
+-	 * us.
+-	 */
+-	if (prev_fsindex)
+-		prev->fsbase = 0;
+-	prev->fsindex = prev_fsindex;
+-
+-	if (next->gsindex) {
+-		/* Loading a nonzero value into GS sets the index and base. */
+-		load_gs_index(next->gsindex);
+-	} else {
+-		if (next->gsbase) {
+-			/* Next index is zero but next base is nonzero. */
+-			if (prev_gsindex)
+-				load_gs_index(0);
+-			wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
+-		} else {
+-			/* Next base and index are both zero. */
+-			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+-				/*
+-				 * We don't know the previous base and can't
+-				 * find out without RDMSR. Forcibly clear it.
+-				 *
+-				 * This contains a pointless SWAPGS pair.
+-				 * Fixing it would involve an explicit check
+-				 * for Xen or a new pvop.
+-				 */
+-				load_gs_index(__USER_DS);
+-				load_gs_index(0);
+-			} else {
+-				/*
+-				 * If the previous index is zero and ARCH_SET_GS
+-				 * didn't change the base, then the base is
+-				 * also zero and we don't need to do anything.
+-				 */
+-				if (prev->gsbase || prev_gsindex)
+-					load_gs_index(0);
+-			}
+-		}
+-	}
+-	/*
+-	 * Save the old state and preserve the invariant.
+-	 * NB: if prev_gsindex == 0, then we can't reliably learn the base
+-	 * without RDMSR because Intel user code can zero it without telling
+-	 * us and AMD user code can program any 32-bit value without telling
+-	 * us.
+-	 */
+-	if (prev_gsindex)
+-		prev->gsbase = 0;
+-	prev->gsindex = prev_gsindex;
++	load_seg_legacy(prev->fsindex, prev->fsbase,
++			next->fsindex, next->fsbase, FS);
++	load_seg_legacy(prev->gsindex, prev->gsbase,
++			next->gsindex, next->gsbase, GS);
+ 
+ 	switch_fpu_finish(next_fpu, cpu);
+ 
-- 
2.47.3
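
For anyone who wants to see the leak that the two fsgsbase patches close, here is a rough
userspace sketch. It is not part of the queue: the file name gsleak.c, the raw arch_prctl()
syscall usage, and the 0xdead0000 marker value are illustrative choices, and it assumes a
4.13-era kernel, where ARCH_GET_GS for the current task appears to read MSR_KERNEL_GS_BASE
directly, which is what makes the leak observable this way. On an AMD CPU with the
null-segment behavior and without the start_thread_common fix, the re-exec'd child can
report the parent's GS base; with the fix applied it should read back 0.

	/* gsleak.c - illustrative sketch only; build with: gcc -o gsleak gsleak.c */
	#include <asm/prctl.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		unsigned long base = 0;

		if (argc > 1) {
			/* Child after execve: a fixed kernel resets the base to 0. */
			syscall(SYS_arch_prctl, ARCH_GET_GS, &base);
			printf("gsbase after execve: 0x%lx\n", base);
			return 0;
		}

		/* Parent: plant a recognizable GS base, then re-exec ourselves. */
		syscall(SYS_arch_prctl, ARCH_SET_GS, 0xdead0000UL);
		execl("/proc/self/exe", argv[0], "child", (char *)NULL);
		perror("execl");
		return 1;
	}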