From: Greg Kroah-Hartman Date: Mon, 13 Mar 2017 08:29:51 +0000 (+0800) Subject: 4.9-stable patches X-Git-Tag: v4.4.54~1 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=4f317be673e406bbacb5969b8262323f0970c0c3;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: drivers-hv-turn-off-write-permission-on-the-hypercall-page.patch fat-fix-using-uninitialized-fields-of-fat_inode-fsinfo_inode.patch mm-do-not-call-mem_cgroup_free-from-within-mem_cgroup_alloc.patch thp-fix-another-corner-case-of-munlock-vs.-thps.patch x86-mm-fix-gup_pte_range-vs-dax-mappings.patch x86-tlb-fix-tlb-flushing-when-lguest-clears-pge.patch --- diff --git a/queue-4.9/drivers-hv-turn-off-write-permission-on-the-hypercall-page.patch b/queue-4.9/drivers-hv-turn-off-write-permission-on-the-hypercall-page.patch new file mode 100644 index 00000000000..cc33c7b196d --- /dev/null +++ b/queue-4.9/drivers-hv-turn-off-write-permission-on-the-hypercall-page.patch @@ -0,0 +1,33 @@ +From 372b1e91343e657a7cc5e2e2bcecd5140ac28119 Mon Sep 17 00:00:00 2001 +From: "K. Y. Srinivasan" +Date: Wed, 8 Feb 2017 18:30:56 -0700 +Subject: drivers: hv: Turn off write permission on the hypercall page + +From: K. Y. Srinivasan + +commit 372b1e91343e657a7cc5e2e2bcecd5140ac28119 upstream. + +The hypercall page only needs to be executable but currently it is setup to +be writable as well. Fix the issue. + +Signed-off-by: K. Y. 
Srinivasan +Acked-by: Kees Cook +Reported-by: Stephen Hemminger +Tested-by: Stephen Hemminger +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/hv/hv.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/hv/hv.c ++++ b/drivers/hv/hv.c +@@ -220,7 +220,7 @@ int hv_init(void) + /* See if the hypercall page is already set */ + rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + +- virtaddr = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_EXEC); ++ virtaddr = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); + + if (!virtaddr) + goto cleanup; diff --git a/queue-4.9/fat-fix-using-uninitialized-fields-of-fat_inode-fsinfo_inode.patch b/queue-4.9/fat-fix-using-uninitialized-fields-of-fat_inode-fsinfo_inode.patch new file mode 100644 index 00000000000..06c18730794 --- /dev/null +++ b/queue-4.9/fat-fix-using-uninitialized-fields-of-fat_inode-fsinfo_inode.patch @@ -0,0 +1,63 @@ +From c0d0e351285161a515396b7b1ee53ec9ffd97e3c Mon Sep 17 00:00:00 2001 +From: OGAWA Hirofumi +Date: Thu, 9 Mar 2017 16:17:37 -0800 +Subject: fat: fix using uninitialized fields of fat_inode/fsinfo_inode + +From: OGAWA Hirofumi + +commit c0d0e351285161a515396b7b1ee53ec9ffd97e3c upstream. + +Recently fallocate patch was merged and it uses +MSDOS_I(inode)->mmu_private at fat_evict_inode(). However, +fat_inode/fsinfo_inode that was introduced in past didn't initialize +MSDOS_I(inode) properly. + +With those combinations, it became the cause of accessing random entry +in FAT area. 
+ +Link: http://lkml.kernel.org/r/87pohrj4i8.fsf@mail.parknet.co.jp +Signed-off-by: OGAWA Hirofumi +Reported-by: Moreno Bartalucci +Tested-by: Moreno Bartalucci +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/fat/inode.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +--- a/fs/fat/inode.c ++++ b/fs/fat/inode.c +@@ -1359,6 +1359,16 @@ out: + return 0; + } + ++static void fat_dummy_inode_init(struct inode *inode) ++{ ++ /* Initialize this dummy inode to work as no-op. */ ++ MSDOS_I(inode)->mmu_private = 0; ++ MSDOS_I(inode)->i_start = 0; ++ MSDOS_I(inode)->i_logstart = 0; ++ MSDOS_I(inode)->i_attrs = 0; ++ MSDOS_I(inode)->i_pos = 0; ++} ++ + static int fat_read_root(struct inode *inode) + { + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); +@@ -1803,12 +1813,13 @@ int fat_fill_super(struct super_block *s + fat_inode = new_inode(sb); + if (!fat_inode) + goto out_fail; +- MSDOS_I(fat_inode)->i_pos = 0; ++ fat_dummy_inode_init(fat_inode); + sbi->fat_inode = fat_inode; + + fsinfo_inode = new_inode(sb); + if (!fsinfo_inode) + goto out_fail; ++ fat_dummy_inode_init(fsinfo_inode); + fsinfo_inode->i_ino = MSDOS_FSINFO_INO; + sbi->fsinfo_inode = fsinfo_inode; + insert_inode_hash(fsinfo_inode); diff --git a/queue-4.9/mm-do-not-call-mem_cgroup_free-from-within-mem_cgroup_alloc.patch b/queue-4.9/mm-do-not-call-mem_cgroup_free-from-within-mem_cgroup_alloc.patch new file mode 100644 index 00000000000..b903937349d --- /dev/null +++ b/queue-4.9/mm-do-not-call-mem_cgroup_free-from-within-mem_cgroup_alloc.patch @@ -0,0 +1,88 @@ +From 40e952f9d687928b32db20226f085ae660a7237c Mon Sep 17 00:00:00 2001 +From: Tahsin Erdogan +Date: Thu, 9 Mar 2017 16:17:26 -0800 +Subject: mm: do not call mem_cgroup_free() from within mem_cgroup_alloc() + +From: Tahsin Erdogan + +commit 40e952f9d687928b32db20226f085ae660a7237c upstream. 
+ +mem_cgroup_free() indirectly calls wb_domain_exit() which is not +prepared to deal with a struct wb_domain object that hasn't executed +wb_domain_init(). For instance, the following warning message is +printed by lockdep if alloc_percpu() fails in mem_cgroup_alloc(): + + INFO: trying to register non-static key. + the code is fine but needs lockdep annotation. + turning off the locking correctness validator. + CPU: 1 PID: 1950 Comm: mkdir Not tainted 4.10.0+ #151 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 + Call Trace: + dump_stack+0x67/0x99 + register_lock_class+0x36d/0x540 + __lock_acquire+0x7f/0x1a30 + lock_acquire+0xcc/0x200 + del_timer_sync+0x3c/0xc0 + wb_domain_exit+0x14/0x20 + mem_cgroup_free+0x14/0x40 + mem_cgroup_css_alloc+0x3f9/0x620 + cgroup_apply_control_enable+0x190/0x390 + cgroup_mkdir+0x290/0x3d0 + kernfs_iop_mkdir+0x58/0x80 + vfs_mkdir+0x10e/0x1a0 + SyS_mkdirat+0xa8/0xd0 + SyS_mkdir+0x14/0x20 + entry_SYSCALL_64_fastpath+0x18/0xad + +Add __mem_cgroup_free() which skips wb_domain_exit(). This is used by +both mem_cgroup_free() and mem_cgroup_alloc() clean up. 
+ +Fixes: 0b8f73e104285 ("mm: memcontrol: clean up alloc, online, offline, free functions") +Link: http://lkml.kernel.org/r/20170306192122.24262-1-tahsin@google.com +Signed-off-by: Tahsin Erdogan +Acked-by: Michal Hocko +Cc: Johannes Weiner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memcontrol.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -4139,17 +4139,22 @@ static void free_mem_cgroup_per_node_inf + kfree(memcg->nodeinfo[node]); + } + +-static void mem_cgroup_free(struct mem_cgroup *memcg) ++static void __mem_cgroup_free(struct mem_cgroup *memcg) + { + int node; + +- memcg_wb_domain_exit(memcg); + for_each_node(node) + free_mem_cgroup_per_node_info(memcg, node); + free_percpu(memcg->stat); + kfree(memcg); + } + ++static void mem_cgroup_free(struct mem_cgroup *memcg) ++{ ++ memcg_wb_domain_exit(memcg); ++ __mem_cgroup_free(memcg); ++} ++ + static struct mem_cgroup *mem_cgroup_alloc(void) + { + struct mem_cgroup *memcg; +@@ -4200,7 +4205,7 @@ static struct mem_cgroup *mem_cgroup_all + fail: + if (memcg->id.id > 0) + idr_remove(&mem_cgroup_idr, memcg->id.id); +- mem_cgroup_free(memcg); ++ __mem_cgroup_free(memcg); + return NULL; + } + diff --git a/queue-4.9/series b/queue-4.9/series index da85c161255..72aaaed528c 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -55,3 +55,9 @@ drm-cancel-drm_fb_helper_resume_work-on-unload.patch drm-i915-avoid-spurious-warns-about-the-wrong-pipe-in-the-pps-code.patch drm-i915-fix-not-finding-the-vbt-when-it-overlaps-with-opregion_asle_ext.patch libceph-use-bug-instead-of-bug_on-1.patch +x86-mm-fix-gup_pte_range-vs-dax-mappings.patch +x86-tlb-fix-tlb-flushing-when-lguest-clears-pge.patch +thp-fix-another-corner-case-of-munlock-vs.-thps.patch +mm-do-not-call-mem_cgroup_free-from-within-mem_cgroup_alloc.patch +fat-fix-using-uninitialized-fields-of-fat_inode-fsinfo_inode.patch 
+drivers-hv-turn-off-write-permission-on-the-hypercall-page.patch diff --git a/queue-4.9/thp-fix-another-corner-case-of-munlock-vs.-thps.patch b/queue-4.9/thp-fix-another-corner-case-of-munlock-vs.-thps.patch new file mode 100644 index 00000000000..cf490db1957 --- /dev/null +++ b/queue-4.9/thp-fix-another-corner-case-of-munlock-vs.-thps.patch @@ -0,0 +1,82 @@ +From 6ebb4a1b848fe75323135f93e72c78f8780fd268 Mon Sep 17 00:00:00 2001 +From: "Kirill A. Shutemov" +Date: Thu, 9 Mar 2017 16:17:23 -0800 +Subject: thp: fix another corner case of munlock() vs. THPs + +From: Kirill A. Shutemov + +commit 6ebb4a1b848fe75323135f93e72c78f8780fd268 upstream. + +The following test case triggers BUG() in munlock_vma_pages_range(): + + int main(int argc, char *argv[]) + { + int fd; + + system("mount -t tmpfs -o huge=always none /mnt"); + fd = open("/mnt/test", O_CREAT | O_RDWR); + ftruncate(fd, 4UL << 20); + mmap(NULL, 4UL << 20, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED | MAP_LOCKED, fd, 0); + mmap(NULL, 4096, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED, fd, 0); + munlockall(); + return 0; + } + +The second mmap() create PTE-mapping of the first huge page in file. It +makes kernel munlock the page as we never keep PTE-mapped page mlocked. + +On munlockall() when we handle vma created by the first mmap(), +munlock_vma_page() returns page_mask == 0, as the page is not mlocked +anymore. On next iteration follow_page_mask() return tail page, but +page_mask is HPAGE_NR_PAGES - 1. It makes us skip to the first tail +page of the next huge page and step on +VM_BUG_ON_PAGE(PageMlocked(page)). + +The fix is not use the page_mask from follow_page_mask() at all. It has +no use for us. + +Link: http://lkml.kernel.org/r/20170302150252.34120-1-kirill.shutemov@linux.intel.com +Signed-off-by: Kirill A. 
Shutemov +Cc: Andrea Arcangeli +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/mlock.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +--- a/mm/mlock.c ++++ b/mm/mlock.c +@@ -441,7 +441,7 @@ void munlock_vma_pages_range(struct vm_a + + while (start < end) { + struct page *page; +- unsigned int page_mask; ++ unsigned int page_mask = 0; + unsigned long page_increm; + struct pagevec pvec; + struct zone *zone; +@@ -455,8 +455,7 @@ void munlock_vma_pages_range(struct vm_a + * suits munlock very well (and if somehow an abnormal page + * has sneaked into the range, we won't oops here: great). + */ +- page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, +- &page_mask); ++ page = follow_page(vma, start, FOLL_GET | FOLL_DUMP); + + if (page && !IS_ERR(page)) { + if (PageTransTail(page)) { +@@ -467,8 +466,8 @@ void munlock_vma_pages_range(struct vm_a + /* + * Any THP page found by follow_page_mask() may + * have gotten split before reaching +- * munlock_vma_page(), so we need to recompute +- * the page_mask here. ++ * munlock_vma_page(), so we need to compute ++ * the page_mask here instead. + */ + page_mask = munlock_vma_page(page); + unlock_page(page); diff --git a/queue-4.9/x86-mm-fix-gup_pte_range-vs-dax-mappings.patch b/queue-4.9/x86-mm-fix-gup_pte_range-vs-dax-mappings.patch new file mode 100644 index 00000000000..5ac67b47290 --- /dev/null +++ b/queue-4.9/x86-mm-fix-gup_pte_range-vs-dax-mappings.patch @@ -0,0 +1,57 @@ +From ef947b2529f918d9606533eb9c32b187ed6a5ede Mon Sep 17 00:00:00 2001 +From: Dan Williams +Date: Thu, 9 Mar 2017 16:16:42 -0800 +Subject: x86, mm: fix gup_pte_range() vs DAX mappings + +From: Dan Williams + +commit ef947b2529f918d9606533eb9c32b187ed6a5ede upstream. + +gup_pte_range() fails to check pte_allows_gup() before translating a DAX +pte entry, pte_devmap(), to a page. 
This allows writes to read-only +mappings, and bypasses the DAX cacheline dirty tracking due to missed +'mkwrite' faults. The gup_huge_pmd() path and the gup_huge_pud() path +correctly check pte_allows_gup() before checking for _devmap() entries. + +Fixes: 3565fce3a659 ("mm, x86: get_user_pages() for dax mappings") +Link: http://lkml.kernel.org/r/148804251312.36605.12665024794196605053.stgit@dwillia2-desk3.amr.corp.intel.com +Signed-off-by: Ross Zwisler +Signed-off-by: Dan Williams +Reported-by: Dave Hansen +Reported-by: Ross Zwisler +Cc: Xiong Zhou +Cc: Ingo Molnar +Cc: "H. Peter Anvin" +Cc: Thomas Gleixner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/mm/gup.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/arch/x86/mm/gup.c ++++ b/arch/x86/mm/gup.c +@@ -120,6 +120,11 @@ static noinline int gup_pte_range(pmd_t + return 0; + } + ++ if (!pte_allows_gup(pte_val(pte), write)) { ++ pte_unmap(ptep); ++ return 0; ++ } ++ + if (pte_devmap(pte)) { + pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); + if (unlikely(!pgmap)) { +@@ -127,8 +132,7 @@ static noinline int gup_pte_range(pmd_t + pte_unmap(ptep); + return 0; + } +- } else if (!pte_allows_gup(pte_val(pte), write) || +- pte_special(pte)) { ++ } else if (pte_special(pte)) { + pte_unmap(ptep); + return 0; + } diff --git a/queue-4.9/x86-tlb-fix-tlb-flushing-when-lguest-clears-pge.patch b/queue-4.9/x86-tlb-fix-tlb-flushing-when-lguest-clears-pge.patch new file mode 100644 index 00000000000..4bb33d749e0 --- /dev/null +++ b/queue-4.9/x86-tlb-fix-tlb-flushing-when-lguest-clears-pge.patch @@ -0,0 +1,92 @@ +From 2c4ea6e28dbf15ab93632c5c189f3948366b8885 Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann +Date: Sat, 11 Mar 2017 01:31:19 +0100 +Subject: x86/tlb: Fix tlb flushing when lguest clears PGE + +From: Daniel Borkmann + +commit 2c4ea6e28dbf15ab93632c5c189f3948366b8885 upstream. 
+ +Fengguang reported random corruptions from various locations on x86-32 +after commits d2852a224050 ("arch: add ARCH_HAS_SET_MEMORY config") and +9d876e79df6a ("bpf: fix unlocking of jited image when module ronx not set") +that uses the former. While x86-32 doesn't have a JIT like x86_64, the +bpf_prog_lock_ro() and bpf_prog_unlock_ro() got enabled due to +ARCH_HAS_SET_MEMORY, whereas Fengguang's test kernel doesn't have module +support built in and therefore never had the DEBUG_SET_MODULE_RONX setting +enabled. + +After investigating the crashes further, it turned out that using +set_memory_ro() and set_memory_rw() didn't have the desired effect, for +example, setting the pages as read-only on x86-32 would still let +probe_kernel_write() succeed without error. This behavior would manifest +itself in situations where the vmalloc'ed buffer was accessed prior to +set_memory_*() such as in case of bpf_prog_alloc(). In cases where it +wasn't, the page attribute changes seemed to have taken effect, leading to +the conclusion that a TLB invalidate didn't happen. Moreover, it turned out +that this issue reproduced with qemu in "-cpu kvm64" mode, but not for +"-cpu host". When the issue occurs, change_page_attr_set_clr() did trigger +a TLB flush as expected via __flush_tlb_all() through cpa_flush_range(), +though. + +There are 3 variants for issuing a TLB flush: invpcid_flush_all() (depends +on CPU feature bits X86_FEATURE_INVPCID, X86_FEATURE_PGE), cr4 based flush +(depends on X86_FEATURE_PGE), and cr3 based flush. For "-cpu host" case in +my setup, the flush used invpcid_flush_all() variant, whereas for "-cpu +kvm64", the flush was cr4 based. Switching the kvm64 case to cr3 manually +worked fine, and further investigating the cr4 one turned out that +X86_CR4_PGE bit was not set in cr4 register, meaning the +__native_flush_tlb_global_irq_disabled() wrote cr4 twice with the same +value instead of clearing X86_CR4_PGE in the first write to trigger the +flush. 
+ +It turned out that X86_CR4_PGE was cleared from cr4 during init from +lguest_arch_host_init() via adjust_pge(). The X86_FEATURE_PGE bit is also +cleared from there due to concerns of using PGE in guest kernel that can +lead to hard to trace bugs (see bff672e630a0 ("lguest: documentation V: +Host") in init()). The CPU feature bits are cleared in dynamic +boot_cpu_data, but they never propagated to __flush_tlb_all() as it uses +static_cpu_has() instead of boot_cpu_has() for testing which variant of TLB +flushing to use, meaning they still used the old setting of the host +kernel. + +Clearing via setup_clear_cpu_cap(X86_FEATURE_PGE) so this would propagate +to static_cpu_has() checks is too late at this point as sections have been +patched already, so for now, it seems reasonable to switch back to +boot_cpu_has(X86_FEATURE_PGE) as it was prior to commit c109bf95992b +("x86/cpufeature: Remove cpu_has_pge"). This lets the TLB flush trigger via +cr3 as originally intended, properly makes the new page attributes visible +and thus fixes the crashes seen by Fengguang. + +Fixes: c109bf95992b ("x86/cpufeature: Remove cpu_has_pge") +Reported-by: Fengguang Wu +Signed-off-by: Daniel Borkmann +Cc: bp@suse.de +Cc: Kees Cook +Cc: "David S. 
Miller" +Cc: netdev@vger.kernel.org +Cc: Rusty Russell +Cc: Alexei Starovoitov +Cc: Linus Torvalds +Cc: lkp@01.org +Cc: Laura Abbott +Link: http://lkml.kernel.org/r/20170301125426.l4nf65rx4wahohyl@wfg-t540p.sh.intel.com +Link: http://lkml.kernel.org/r/25c41ad9eca164be4db9ad84f768965b7eb19d9e.1489191673.git.daniel@iogearbox.net +Signed-off-by: Thomas Gleixner +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/tlbflush.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -188,7 +188,7 @@ static inline void __native_flush_tlb_si + + static inline void __flush_tlb_all(void) + { +- if (static_cpu_has(X86_FEATURE_PGE)) ++ if (boot_cpu_has(X86_FEATURE_PGE)) + __flush_tlb_global(); + else + __flush_tlb();