From f9578d5644104a1e18475972a5f556773a7915b9 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 2 Oct 2017 14:30:00 +0200 Subject: [PATCH] 4.13-stable patches added patches: arm64-fault-route-pte-translation-faults-via-do_translation_fault.patch arm64-make-sure-spsel-is-always-set.patch arm64-mm-use-read_once-when-dereferencing-pointer-to-pte-table.patch btrfs-clear-ordered-flag-on-cleaning-up-ordered-extents.patch btrfs-finish-ordered-extent-cleaning-if-no-progress-is-found.patch btrfs-fix-null-pointer-dereference-from-free_reloc_roots.patch btrfs-prevent-to-set-invalid-default-subvolid.patch btrfs-propagate-error-to-btrfs_cmp_data_prepare-caller.patch etnaviv-fix-gem-object-list-corruption.patch etnaviv-fix-submit-error-path.patch fix-infoleak-in-waitid-2.patch futex-fix-pi_state-owner-serialization.patch irq-generic-chip-don-t-replace-domain-s-name.patch kvm-nvmx-don-t-allow-l2-to-access-the-hardware-cr8.patch kvm-nvmx-fix-host_cr3-host_cr4-cache.patch kvm-vmx-avoid-double-list-add-with-vt-d-posted-interrupts.patch kvm-vmx-do-not-bug-on-out-of-bounds-guest-irq.patch kvm-vmx-extract-__pi_post_block.patch kvm-vmx-simplify-and-fix-vmx_vcpu_pi_load.patch kvm-x86-handle-async-pf-in-rcu-read-side-critical-sections.patch md-fix-a-race-condition-for-flush-request-handling.patch md-separate-request-handling.patch mtd-fix-partition-alignment-check-on-multi-erasesize-devices.patch mtd-nand-atmel-fix-buffer-overflow-in-atmel_pmecc_user.patch pci-fix-race-condition-with-driver_override.patch platform-x86-fujitsu-laptop-don-t-oops-when-fuj02e3-is-not-presnt.patch pm-opp-call-notifier-without-holding-opp_table-lock.patch sched-sysctl-check-user-input-value-of-sysctl_sched_time_avg.patch x86-mm-fix-fault-error-path-using-unsafe-vma-pointer.patch xfs-validate-bdev-support-for-dax-inode-flag.patch --- ...tion-faults-via-do_translation_fault.patch | 65 ++++++ .../arm64-make-sure-spsel-is-always-set.patch | 40 ++++ ...n-dereferencing-pointer-to-pte-table.patch | 77 
+++++++ ...-flag-on-cleaning-up-ordered-extents.patch | 58 +++++ ...ent-cleaning-if-no-progress-is-found.patch | 64 ++++++ ...er-dereference-from-free_reloc_roots.patch | 39 ++++ ...vent-to-set-invalid-default-subvolid.patch | 37 +++ ...ror-to-btrfs_cmp_data_prepare-caller.patch | 38 ++++ ...naviv-fix-gem-object-list-corruption.patch | 38 ++++ .../etnaviv-fix-submit-error-path.patch | 34 +++ queue-4.13/fix-infoleak-in-waitid-2.patch | 65 ++++++ ...tex-fix-pi_state-owner-serialization.patch | 124 ++++++++++ ...ric-chip-don-t-replace-domain-s-name.patch | 39 ++++ ...-allow-l2-to-access-the-hardware-cr8.patch | 39 ++++ ...kvm-nvmx-fix-host_cr3-host_cr4-cache.patch | 83 +++++++ ...list-add-with-vt-d-posted-interrupts.patch | 157 +++++++++++++ ...o-not-bug-on-out-of-bounds-guest-irq.patch | 57 +++++ .../kvm-vmx-extract-__pi_post_block.patch | 118 ++++++++++ ...mx-simplify-and-fix-vmx_vcpu_pi_load.patch | 130 +++++++++++ ...f-in-rcu-read-side-critical-sections.patch | 81 +++++++ ...condition-for-flush-request-handling.patch | 52 +++++ queue-4.13/md-separate-request-handling.patch | 122 ++++++++++ ...ent-check-on-multi-erasesize-devices.patch | 47 ++++ ...-buffer-overflow-in-atmel_pmecc_user.patch | 37 +++ ...-race-condition-with-driver_override.patch | 66 ++++++ ...on-t-oops-when-fuj02e3-is-not-presnt.patch | 48 ++++ ...ifier-without-holding-opp_table-lock.patch | 58 +++++ ...input-value-of-sysctl_sched_time_avg.patch | 83 +++++++ queue-4.13/series | 30 +++ ...-error-path-using-unsafe-vma-pointer.patch | 211 ++++++++++++++++++ ...date-bdev-support-for-dax-inode-flag.patch | 50 +++++ 31 files changed, 2187 insertions(+) create mode 100644 queue-4.13/arm64-fault-route-pte-translation-faults-via-do_translation_fault.patch create mode 100644 queue-4.13/arm64-make-sure-spsel-is-always-set.patch create mode 100644 queue-4.13/arm64-mm-use-read_once-when-dereferencing-pointer-to-pte-table.patch create mode 100644 
queue-4.13/btrfs-clear-ordered-flag-on-cleaning-up-ordered-extents.patch create mode 100644 queue-4.13/btrfs-finish-ordered-extent-cleaning-if-no-progress-is-found.patch create mode 100644 queue-4.13/btrfs-fix-null-pointer-dereference-from-free_reloc_roots.patch create mode 100644 queue-4.13/btrfs-prevent-to-set-invalid-default-subvolid.patch create mode 100644 queue-4.13/btrfs-propagate-error-to-btrfs_cmp_data_prepare-caller.patch create mode 100644 queue-4.13/etnaviv-fix-gem-object-list-corruption.patch create mode 100644 queue-4.13/etnaviv-fix-submit-error-path.patch create mode 100644 queue-4.13/fix-infoleak-in-waitid-2.patch create mode 100644 queue-4.13/futex-fix-pi_state-owner-serialization.patch create mode 100644 queue-4.13/irq-generic-chip-don-t-replace-domain-s-name.patch create mode 100644 queue-4.13/kvm-nvmx-don-t-allow-l2-to-access-the-hardware-cr8.patch create mode 100644 queue-4.13/kvm-nvmx-fix-host_cr3-host_cr4-cache.patch create mode 100644 queue-4.13/kvm-vmx-avoid-double-list-add-with-vt-d-posted-interrupts.patch create mode 100644 queue-4.13/kvm-vmx-do-not-bug-on-out-of-bounds-guest-irq.patch create mode 100644 queue-4.13/kvm-vmx-extract-__pi_post_block.patch create mode 100644 queue-4.13/kvm-vmx-simplify-and-fix-vmx_vcpu_pi_load.patch create mode 100644 queue-4.13/kvm-x86-handle-async-pf-in-rcu-read-side-critical-sections.patch create mode 100644 queue-4.13/md-fix-a-race-condition-for-flush-request-handling.patch create mode 100644 queue-4.13/md-separate-request-handling.patch create mode 100644 queue-4.13/mtd-fix-partition-alignment-check-on-multi-erasesize-devices.patch create mode 100644 queue-4.13/mtd-nand-atmel-fix-buffer-overflow-in-atmel_pmecc_user.patch create mode 100644 queue-4.13/pci-fix-race-condition-with-driver_override.patch create mode 100644 queue-4.13/platform-x86-fujitsu-laptop-don-t-oops-when-fuj02e3-is-not-presnt.patch create mode 100644 queue-4.13/pm-opp-call-notifier-without-holding-opp_table-lock.patch create mode 100644 
queue-4.13/sched-sysctl-check-user-input-value-of-sysctl_sched_time_avg.patch create mode 100644 queue-4.13/x86-mm-fix-fault-error-path-using-unsafe-vma-pointer.patch create mode 100644 queue-4.13/xfs-validate-bdev-support-for-dax-inode-flag.patch diff --git a/queue-4.13/arm64-fault-route-pte-translation-faults-via-do_translation_fault.patch b/queue-4.13/arm64-fault-route-pte-translation-faults-via-do_translation_fault.patch new file mode 100644 index 00000000000..bede0c894f9 --- /dev/null +++ b/queue-4.13/arm64-fault-route-pte-translation-faults-via-do_translation_fault.patch @@ -0,0 +1,65 @@ +From 760bfb47c36a07741a089bf6a28e854ffbee7dc9 Mon Sep 17 00:00:00 2001 +From: Will Deacon +Date: Fri, 29 Sep 2017 12:27:41 +0100 +Subject: arm64: fault: Route pte translation faults via do_translation_fault + +From: Will Deacon + +commit 760bfb47c36a07741a089bf6a28e854ffbee7dc9 upstream. + +We currently route pte translation faults via do_page_fault, which elides +the address check against TASK_SIZE before invoking the mm fault handling +code. However, this can cause issues with the path walking code in +conjunction with our word-at-a-time implementation because +load_unaligned_zeropad can end up faulting in kernel space if it reads +across a page boundary and runs into a page fault (e.g. by attempting to +read from a guard region). + +In the case of such a fault, load_unaligned_zeropad has registered a +fixup to shift the valid data and pad with zeroes, however the abort is +reported as a level 3 translation fault and we dispatch it straight to +do_page_fault, despite it being a kernel address. This results in calling +a sleeping function from atomic context: + + BUG: sleeping function called from invalid context at arch/arm64/mm/fault.c:313 + in_atomic(): 0, irqs_disabled(): 0, pid: 10290 + Internal error: Oops - BUG: 0 [#1] PREEMPT SMP + [...] 
+ [] ___might_sleep+0x134/0x144 + [] __might_sleep+0x7c/0x8c + [] do_page_fault+0x140/0x330 + [] do_mem_abort+0x54/0xb0 + Exception stack(0xfffffffb20247a70 to 0xfffffffb20247ba0) + [...] + [] el1_da+0x18/0x78 + [] path_parentat+0x44/0x88 + [] filename_parentat+0x5c/0xd8 + [] filename_create+0x4c/0x128 + [] SyS_mkdirat+0x50/0xc8 + [] el0_svc_naked+0x24/0x28 + Code: 36380080 d5384100 f9400800 9402566d (d4210000) + ---[ end trace 2d01889f2bca9b9f ]--- + +Fix this by dispatching all translation faults to do_translation_faults, +which avoids invoking the page fault logic for faults on kernel addresses. + +Reported-by: Ankit Jain +Signed-off-by: Will Deacon +Signed-off-by: Catalin Marinas +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm64/mm/fault.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/arm64/mm/fault.c ++++ b/arch/arm64/mm/fault.c +@@ -614,7 +614,7 @@ static const struct fault_info fault_inf + { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" }, + { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" }, + { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" }, +- { do_page_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, ++ { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, + { do_bad, SIGBUS, 0, "unknown 8" }, + { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" }, + { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" }, diff --git a/queue-4.13/arm64-make-sure-spsel-is-always-set.patch b/queue-4.13/arm64-make-sure-spsel-is-always-set.patch new file mode 100644 index 00000000000..9aa07d7a523 --- /dev/null +++ b/queue-4.13/arm64-make-sure-spsel-is-always-set.patch @@ -0,0 +1,40 @@ +From 5371513fb338fb9989c569dc071326d369d6ade8 Mon Sep 17 00:00:00 2001 +From: Marc Zyngier +Date: Tue, 26 Sep 2017 15:57:16 +0100 +Subject: arm64: Make sure SPsel is always set + +From: Marc Zyngier + +commit 
5371513fb338fb9989c569dc071326d369d6ade8 upstream. + +When the kernel is entered at EL2 on an ARMv8.0 system, we construct +the EL1 pstate and make sure this uses the EL1 stack pointer +(we perform an exception return to EL1h). + +But if the kernel is either entered at EL1 or stays at EL2 (because +we're on a VHE-capable system), we fail to set SPsel, and use whatever +stack selection the higher exception level has chosen for us. + +Let's not take any chance, and make sure that SPsel is set to one +before we decide the mode we're going to run in. + +Acked-by: Mark Rutland +Signed-off-by: Marc Zyngier +Signed-off-by: Will Deacon +Signed-off-by: Catalin Marinas +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm64/kernel/head.S | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/arm64/kernel/head.S ++++ b/arch/arm64/kernel/head.S +@@ -381,6 +381,7 @@ ENTRY(kimage_vaddr) + * booted in EL1 or EL2 respectively. + */ + ENTRY(el2_setup) ++ msr SPsel, #1 // We want to use SP_EL{1,2} + mrs x0, CurrentEL + cmp x0, #CurrentEL_EL2 + b.eq 1f diff --git a/queue-4.13/arm64-mm-use-read_once-when-dereferencing-pointer-to-pte-table.patch b/queue-4.13/arm64-mm-use-read_once-when-dereferencing-pointer-to-pte-table.patch new file mode 100644 index 00000000000..13b2e91617a --- /dev/null +++ b/queue-4.13/arm64-mm-use-read_once-when-dereferencing-pointer-to-pte-table.patch @@ -0,0 +1,77 @@ +From f069faba688701c4d56b6c3452a130f97bf02e95 Mon Sep 17 00:00:00 2001 +From: Will Deacon +Date: Fri, 29 Sep 2017 11:29:55 +0100 +Subject: arm64: mm: Use READ_ONCE when dereferencing pointer to pte table + +From: Will Deacon + +commit f069faba688701c4d56b6c3452a130f97bf02e95 upstream. + +On kernels built with support for transparent huge pages, different CPUs +can access the PMD concurrently due to e.g. fast GUP or page_vma_mapped_walk +and they must take care to use READ_ONCE to avoid value tearing or caching +of stale values by the compiler. 
Unfortunately, these functions call into +our pgtable macros, which don't use READ_ONCE, and compiler caching has +been observed to cause the following crash during ext4 writeback: + +PC is at check_pte+0x20/0x170 +LR is at page_vma_mapped_walk+0x2e0/0x540 +[...] +Process doio (pid: 2463, stack limit = 0xffff00000f2e8000) +Call trace: +[] check_pte+0x20/0x170 +[] page_vma_mapped_walk+0x2e0/0x540 +[] page_mkclean_one+0xac/0x278 +[] rmap_walk_file+0xf0/0x238 +[] rmap_walk+0x64/0xa0 +[] page_mkclean+0x90/0xa8 +[] clear_page_dirty_for_io+0x84/0x2a8 +[] mpage_submit_page+0x34/0x98 +[] mpage_process_page_bufs+0x164/0x170 +[] mpage_prepare_extent_to_map+0x134/0x2b8 +[] ext4_writepages+0x484/0xe30 +[] do_writepages+0x44/0xe8 +[] __filemap_fdatawrite_range+0xbc/0x110 +[] file_write_and_wait_range+0x48/0xd8 +[] ext4_sync_file+0x80/0x4b8 +[] vfs_fsync_range+0x64/0xc0 +[] SyS_msync+0x194/0x1e8 + +This is because page_vma_mapped_walk loads the PMD twice before calling +pte_offset_map: the first time without READ_ONCE (where it gets all zeroes +due to a concurrent pmdp_invalidate) and the second time with READ_ONCE +(where it sees a valid table pointer due to a concurrent pmd_populate). +However, the compiler inlines everything and caches the first value in +a register, which is subsequently used in pte_offset_phys which returns +a junk pointer that is later dereferenced when attempting to access the +relevant pte. + +This patch fixes the issue by using READ_ONCE in pte_offset_phys to ensure +that a stale value is not used. Whilst this is a point fix for a known +failure (and simple to backport), a full fix moving all of our page table +accessors over to {READ,WRITE}_ONCE and consistently using READ_ONCE in +page_vma_mapped_walk is in the works for a future kernel release. 
+ +Cc: Jon Masters +Cc: Timur Tabi +Fixes: f27176cfc363 ("mm: convert page_mkclean_one() to use page_vma_mapped_walk()") +Tested-by: Richard Ruigrok +Signed-off-by: Will Deacon +Signed-off-by: Catalin Marinas +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm64/include/asm/pgtable.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/arm64/include/asm/pgtable.h ++++ b/arch/arm64/include/asm/pgtable.h +@@ -412,7 +412,7 @@ static inline phys_addr_t pmd_page_paddr + /* Find an entry in the third-level page table. */ + #define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + +-#define pte_offset_phys(dir,addr) (pmd_page_paddr(*(dir)) + pte_index(addr) * sizeof(pte_t)) ++#define pte_offset_phys(dir,addr) (pmd_page_paddr(READ_ONCE(*(dir))) + pte_index(addr) * sizeof(pte_t)) + #define pte_offset_kernel(dir,addr) ((pte_t *)__va(pte_offset_phys((dir), (addr)))) + + #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) diff --git a/queue-4.13/btrfs-clear-ordered-flag-on-cleaning-up-ordered-extents.patch b/queue-4.13/btrfs-clear-ordered-flag-on-cleaning-up-ordered-extents.patch new file mode 100644 index 00000000000..b3e6a61c312 --- /dev/null +++ b/queue-4.13/btrfs-clear-ordered-flag-on-cleaning-up-ordered-extents.patch @@ -0,0 +1,58 @@ +From 63d71450c8d817649a79e37d685523f988b9cc98 Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Fri, 1 Sep 2017 17:58:47 +0900 +Subject: btrfs: clear ordered flag on cleaning up ordered extents + +From: Naohiro Aota + +commit 63d71450c8d817649a79e37d685523f988b9cc98 upstream. + +Commit 524272607e88 ("btrfs: Handle delalloc error correctly to avoid +ordered extent hang") introduced btrfs_cleanup_ordered_extents() to cleanup +submitted ordered extents. However, it does not clear the ordered bit +(Private2) of corresponding pages. Thus, the following BUG occurs from +free_pages_check_bad() (on btrfs/125 with nospace_cache). 
+ +BUG: Bad page state in process btrfs pfn:3fa787 +page:ffffdf2acfe9e1c0 count:0 mapcount:0 mapping: (null) index:0xd +flags: 0x8000000000002008(uptodate|private_2) +raw: 8000000000002008 0000000000000000 000000000000000d 00000000ffffffff +raw: ffffdf2acf5c1b20 ffffb443802238b0 0000000000000000 0000000000000000 +page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set +bad because of flags: 0x2000(private_2) + +This patch clears the flag same as other places calling +btrfs_dec_test_ordered_pending() for every page in the specified range. + +Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang") +Signed-off-by: Naohiro Aota +Reviewed-by: Qu Wenruo +Reviewed-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -135,6 +135,18 @@ static inline void btrfs_cleanup_ordered + const u64 offset, + const u64 bytes) + { ++ unsigned long index = offset >> PAGE_SHIFT; ++ unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; ++ struct page *page; ++ ++ while (index <= end_index) { ++ page = find_get_page(inode->i_mapping, index); ++ index++; ++ if (!page) ++ continue; ++ ClearPagePrivate2(page); ++ put_page(page); ++ } + return __endio_write_update_ordered(inode, offset + PAGE_SIZE, + bytes - PAGE_SIZE, false); + } diff --git a/queue-4.13/btrfs-finish-ordered-extent-cleaning-if-no-progress-is-found.patch b/queue-4.13/btrfs-finish-ordered-extent-cleaning-if-no-progress-is-found.patch new file mode 100644 index 00000000000..90355381414 --- /dev/null +++ b/queue-4.13/btrfs-finish-ordered-extent-cleaning-if-no-progress-is-found.patch @@ -0,0 +1,64 @@ +From 67c003f90fd68062d92a7ffade36f9b2a9098bd8 Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Fri, 1 Sep 2017 17:59:07 +0900 +Subject: btrfs: finish ordered extent cleaning if no progress is found + +From: Naohiro Aota + +commit 
67c003f90fd68062d92a7ffade36f9b2a9098bd8 upstream. + +__endio_write_update_ordered() repeats the search until it reaches the end +of the specified range. This works well with direct IO path, because before +the function is called, it's ensured that there are ordered extents filling +whole the range. It's not the case, however, when it's called from +run_delalloc_range(): it is possible to have error in the middle of the loop +in e.g. run_delalloc_nocow(), so that there exists the range not covered +by any ordered extents. By cleaning such "incomplete" range, +__endio_write_update_ordered() gets stuck at offset where there're no ordered +extents. + +Since the ordered extents are created from head to tail, we can stop the +search if there are no offset progress. + +Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang") +Signed-off-by: Naohiro Aota +Reviewed-by: Qu Wenruo +Reviewed-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -8309,6 +8309,7 @@ static void __endio_write_update_ordered + btrfs_work_func_t func; + u64 ordered_offset = offset; + u64 ordered_bytes = bytes; ++ u64 last_offset; + int ret; + + if (btrfs_is_free_space_inode(BTRFS_I(inode))) { +@@ -8320,6 +8321,7 @@ static void __endio_write_update_ordered + } + + again: ++ last_offset = ordered_offset; + ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, + &ordered_offset, + ordered_bytes, +@@ -8331,6 +8333,12 @@ again: + btrfs_queue_work(wq, &ordered->work); + out_test: + /* ++ * If btrfs_dec_test_ordered_pending does not find any ordered extent ++ * in the range, we can exit. ++ */ ++ if (ordered_offset == last_offset) ++ return; ++ /* + * our bio might span multiple ordered extents. 
If we haven't + * completed the accounting for the whole dio, go back and try again + */ diff --git a/queue-4.13/btrfs-fix-null-pointer-dereference-from-free_reloc_roots.patch b/queue-4.13/btrfs-fix-null-pointer-dereference-from-free_reloc_roots.patch new file mode 100644 index 00000000000..a3dab80fab2 --- /dev/null +++ b/queue-4.13/btrfs-fix-null-pointer-dereference-from-free_reloc_roots.patch @@ -0,0 +1,39 @@ +From bb166d7207432d3c7d10c45dc052f12ba3a2121d Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Fri, 25 Aug 2017 14:15:14 +0900 +Subject: btrfs: fix NULL pointer dereference from free_reloc_roots() + +From: Naohiro Aota + +commit bb166d7207432d3c7d10c45dc052f12ba3a2121d upstream. + +__del_reloc_root should be called before freeing up reloc_root->node. +If not, calling __del_reloc_root() dereference reloc_root->node, causing +the system BUG. + +Fixes: 6bdf131fac23 ("Btrfs: don't leak reloc root nodes on error") +Signed-off-by: Naohiro Aota +Reviewed-by: Nikolay Borisov +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/relocation.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -2393,11 +2393,11 @@ void free_reloc_roots(struct list_head * + while (!list_empty(list)) { + reloc_root = list_entry(list->next, struct btrfs_root, + root_list); ++ __del_reloc_root(reloc_root); + free_extent_buffer(reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + reloc_root->node = NULL; + reloc_root->commit_root = NULL; +- __del_reloc_root(reloc_root); + } + } + diff --git a/queue-4.13/btrfs-prevent-to-set-invalid-default-subvolid.patch b/queue-4.13/btrfs-prevent-to-set-invalid-default-subvolid.patch new file mode 100644 index 00000000000..b1cfc8bdcc8 --- /dev/null +++ b/queue-4.13/btrfs-prevent-to-set-invalid-default-subvolid.patch @@ -0,0 +1,37 @@ +From 6d6d282932d1a609e60dc4467677e0e863682f57 Mon Sep 17 00:00:00 2001 +From: satoru 
takeuchi +Date: Tue, 12 Sep 2017 22:42:52 +0900 +Subject: btrfs: prevent to set invalid default subvolid + +From: satoru takeuchi + +commit 6d6d282932d1a609e60dc4467677e0e863682f57 upstream. + +`btrfs sub set-default` succeeds to set an ID which isn't corresponding to any +fs/file tree. If such the bad ID is set to a filesystem, we can't mount this +filesystem without specifying `subvol` or `subvolid` mount options. + +Fixes: 6ef5ed0d386b ("Btrfs: add ioctl and incompat flag to set the default mount subvol") +Signed-off-by: Satoru Takeuchi +Reviewed-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ioctl.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -4072,6 +4072,10 @@ static long btrfs_ioctl_default_subvol(s + ret = PTR_ERR(new_root); + goto out; + } ++ if (!is_fstree(new_root->objectid)) { ++ ret = -ENOENT; ++ goto out; ++ } + + path = btrfs_alloc_path(); + if (!path) { diff --git a/queue-4.13/btrfs-propagate-error-to-btrfs_cmp_data_prepare-caller.patch b/queue-4.13/btrfs-propagate-error-to-btrfs_cmp_data_prepare-caller.patch new file mode 100644 index 00000000000..ed4b66595e7 --- /dev/null +++ b/queue-4.13/btrfs-propagate-error-to-btrfs_cmp_data_prepare-caller.patch @@ -0,0 +1,38 @@ +From 78ad4ce014d025f41b8dde3a81876832ead643cf Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Fri, 8 Sep 2017 17:48:55 +0900 +Subject: btrfs: propagate error to btrfs_cmp_data_prepare caller + +From: Naohiro Aota + +commit 78ad4ce014d025f41b8dde3a81876832ead643cf upstream. + +btrfs_cmp_data_prepare() (almost) always returns 0 i.e. ignoring errors +from gather_extent_pages(). While the pages are freed by +btrfs_cmp_data_free(), cmp->num_pages still has > 0. Then, +btrfs_extent_same() try to access the already freed pages causing faults +(or violates PageLocked assertion). + +This patch just return the error as is so that the caller stop the process. 
+ +Signed-off-by: Naohiro Aota +Fixes: f441460202cb ("btrfs: fix deadlock with extent-same and readpage") +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ioctl.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -3063,7 +3063,7 @@ static int btrfs_cmp_data_prepare(struct + out: + if (ret) + btrfs_cmp_data_free(cmp); +- return 0; ++ return ret; + } + + static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp) diff --git a/queue-4.13/etnaviv-fix-gem-object-list-corruption.patch b/queue-4.13/etnaviv-fix-gem-object-list-corruption.patch new file mode 100644 index 00000000000..9cafa938fbd --- /dev/null +++ b/queue-4.13/etnaviv-fix-gem-object-list-corruption.patch @@ -0,0 +1,38 @@ +From 518417525f3652c12fb5fad6da4ade66c0072fa3 Mon Sep 17 00:00:00 2001 +From: Lucas Stach +Date: Mon, 11 Sep 2017 15:29:31 +0200 +Subject: etnaviv: fix gem object list corruption + +From: Lucas Stach + +commit 518417525f3652c12fb5fad6da4ade66c0072fa3 upstream. + +All manipulations of the gem_object list need to be protected by +the list mutex, as GEM objects can be created and freed in parallel. +This fixes a kernel memory corruption. 
+ +Signed-off-by: Lucas Stach +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/gpu/drm/etnaviv/etnaviv_gem.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c ++++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c +@@ -551,12 +551,15 @@ static const struct etnaviv_gem_ops etna + void etnaviv_gem_free_object(struct drm_gem_object *obj) + { + struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj); ++ struct etnaviv_drm_private *priv = obj->dev->dev_private; + struct etnaviv_vram_mapping *mapping, *tmp; + + /* object should not be active */ + WARN_ON(is_active(etnaviv_obj)); + ++ mutex_lock(&priv->gem_lock); + list_del(&etnaviv_obj->gem_node); ++ mutex_unlock(&priv->gem_lock); + + list_for_each_entry_safe(mapping, tmp, &etnaviv_obj->vram_list, + obj_node) { diff --git a/queue-4.13/etnaviv-fix-submit-error-path.patch b/queue-4.13/etnaviv-fix-submit-error-path.patch new file mode 100644 index 00000000000..29256dfdba8 --- /dev/null +++ b/queue-4.13/etnaviv-fix-submit-error-path.patch @@ -0,0 +1,34 @@ +From 5a642e6bc49f59922e19ebd639e74f72753fc77b Mon Sep 17 00:00:00 2001 +From: Lucas Stach +Date: Fri, 8 Sep 2017 16:24:32 +0200 +Subject: etnaviv: fix submit error path + +From: Lucas Stach + +commit 5a642e6bc49f59922e19ebd639e74f72753fc77b upstream. + +If the gpu submit fails, bail out to avoid accessing a potentially +uninitialized fence. 
+ +Signed-off-by: Lucas Stach +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c ++++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c +@@ -445,8 +445,10 @@ int etnaviv_ioctl_gem_submit(struct drm_ + cmdbuf->user_size = ALIGN(args->stream_size, 8); + + ret = etnaviv_gpu_submit(gpu, submit, cmdbuf); +- if (ret == 0) +- cmdbuf = NULL; ++ if (ret) ++ goto out; ++ ++ cmdbuf = NULL; + + if (args->flags & ETNA_SUBMIT_FENCE_FD_OUT) { + /* diff --git a/queue-4.13/fix-infoleak-in-waitid-2.patch b/queue-4.13/fix-infoleak-in-waitid-2.patch new file mode 100644 index 00000000000..e18e71c5c63 --- /dev/null +++ b/queue-4.13/fix-infoleak-in-waitid-2.patch @@ -0,0 +1,65 @@ +From 6c85501f2fabcfc4fc6ed976543d252c4eaf4be9 Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Fri, 29 Sep 2017 13:43:15 -0400 +Subject: fix infoleak in waitid(2) + +From: Al Viro + +commit 6c85501f2fabcfc4fc6ed976543d252c4eaf4be9 upstream. + +kernel_waitid() can return a PID, an error or 0. rusage is filled in the first +case and waitid(2) rusage should've been copied out exactly in that case, *not* +whenever kernel_waitid() has not returned an error. Compat variant shares that +braino; none of kernel_wait4() callers do, so the below ought to fix it. + +Reported-and-tested-by: Alexander Potapenko +Fixes: ce72a16fa705 ("wait4(2)/waitid(2): separate copying rusage to userland") +Signed-off-by: Al Viro +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/exit.c | 23 ++++++++++------------- + 1 file changed, 10 insertions(+), 13 deletions(-) + +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -1601,12 +1601,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_ + struct waitid_info info = {.status = 0}; + long err = kernel_waitid(which, upid, &info, options, ru ? 
&r : NULL); + int signo = 0; ++ + if (err > 0) { + signo = SIGCHLD; + err = 0; +- } +- +- if (!err) { + if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) + return -EFAULT; + } +@@ -1724,16 +1722,15 @@ COMPAT_SYSCALL_DEFINE5(waitid, + if (err > 0) { + signo = SIGCHLD; + err = 0; +- } +- +- if (!err && uru) { +- /* kernel_waitid() overwrites everything in ru */ +- if (COMPAT_USE_64BIT_TIME) +- err = copy_to_user(uru, &ru, sizeof(ru)); +- else +- err = put_compat_rusage(&ru, uru); +- if (err) +- return -EFAULT; ++ if (uru) { ++ /* kernel_waitid() overwrites everything in ru */ ++ if (COMPAT_USE_64BIT_TIME) ++ err = copy_to_user(uru, &ru, sizeof(ru)); ++ else ++ err = put_compat_rusage(&ru, uru); ++ if (err) ++ return -EFAULT; ++ } + } + + if (!infop) diff --git a/queue-4.13/futex-fix-pi_state-owner-serialization.patch b/queue-4.13/futex-fix-pi_state-owner-serialization.patch new file mode 100644 index 00000000000..7ca4c3426c8 --- /dev/null +++ b/queue-4.13/futex-fix-pi_state-owner-serialization.patch @@ -0,0 +1,124 @@ +From c74aef2d06a9f59cece89093eecc552933cba72a Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Fri, 22 Sep 2017 17:48:06 +0200 +Subject: futex: Fix pi_state->owner serialization + +From: Peter Zijlstra + +commit c74aef2d06a9f59cece89093eecc552933cba72a upstream. + +There was a reported suspicion about a race between exit_pi_state_list() +and put_pi_state(). The same report mentioned the comment with +put_pi_state() said it should be called with hb->lock held, and it no +longer is in all places. + +As it turns out, the pi_state->owner serialization is indeed broken. As per +the new rules: + + 734009e96d19 ("futex: Change locking rules") + +pi_state->owner should be serialized by pi_state->pi_mutex.wait_lock. +For the sites setting pi_state->owner we already hold wait_lock (where +required) but exit_pi_state_list() and put_pi_state() were not and +raced on clearing it. 
+ +Fixes: 734009e96d19 ("futex: Change locking rules") +Reported-by: Gratian Crisan +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Thomas Gleixner +Cc: dvhart@infradead.org +Link: https://lkml.kernel.org/r/20170922154806.jd3ffltfk24m4o4y@hirez.programming.kicks-ass.net +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/futex.c | 33 ++++++++++++++++++++++----------- + 1 file changed, 22 insertions(+), 11 deletions(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi + /* + * Drops a reference to the pi_state object and frees or caches it + * when the last reference is gone. +- * +- * Must be called with the hb lock held. + */ + static void put_pi_state(struct futex_pi_state *pi_state) + { +@@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi + * and has cleaned up the pi_state already + */ + if (pi_state->owner) { +- raw_spin_lock_irq(&pi_state->owner->pi_lock); +- list_del_init(&pi_state->list); +- raw_spin_unlock_irq(&pi_state->owner->pi_lock); ++ struct task_struct *owner; + +- rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); ++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); ++ owner = pi_state->owner; ++ if (owner) { ++ raw_spin_lock(&owner->pi_lock); ++ list_del_init(&pi_state->list); ++ raw_spin_unlock(&owner->pi_lock); ++ } ++ rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); ++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + } + +- if (current->pi_state_cache) ++ if (current->pi_state_cache) { + kfree(pi_state); +- else { ++ } else { + /* + * pi_state->list is already empty. + * clear pi_state->owner. 
+@@ -905,13 +909,14 @@ void exit_pi_state_list(struct task_stru + raw_spin_unlock_irq(&curr->pi_lock); + + spin_lock(&hb->lock); +- +- raw_spin_lock_irq(&curr->pi_lock); ++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); ++ raw_spin_lock(&curr->pi_lock); + /* + * We dropped the pi-lock, so re-check whether this + * task still owns the PI-state: + */ + if (head->next != next) { ++ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + continue; + } +@@ -920,9 +925,10 @@ void exit_pi_state_list(struct task_stru + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + pi_state->owner = NULL; +- raw_spin_unlock_irq(&curr->pi_lock); ++ raw_spin_unlock(&curr->pi_lock); + + get_pi_state(pi_state); ++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + + rt_mutex_futex_unlock(&pi_state->pi_mutex); +@@ -1204,6 +1210,10 @@ static int attach_to_pi_owner(u32 uval, + + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &p->pi_state_list); ++ /* ++ * Assignment without holding pi_state->pi_mutex.wait_lock is safe ++ * because there is no concurrency as the object is not published yet. 
++ */ + pi_state->owner = p; + raw_spin_unlock_irq(&p->pi_lock); + +@@ -2820,6 +2830,7 @@ retry: + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + ++ /* drops pi_state->pi_mutex.wait_lock */ + ret = wake_futex_pi(uaddr, uval, pi_state); + + put_pi_state(pi_state); diff --git a/queue-4.13/irq-generic-chip-don-t-replace-domain-s-name.patch b/queue-4.13/irq-generic-chip-don-t-replace-domain-s-name.patch new file mode 100644 index 00000000000..96dbe054825 --- /dev/null +++ b/queue-4.13/irq-generic-chip-don-t-replace-domain-s-name.patch @@ -0,0 +1,39 @@ +From 72364d320644c12948786962673772f271039a4a Mon Sep 17 00:00:00 2001 +From: Jeffy Chen +Date: Thu, 28 Sep 2017 12:37:31 +0800 +Subject: irq/generic-chip: Don't replace domain's name + +From: Jeffy Chen + +commit 72364d320644c12948786962673772f271039a4a upstream. + +When generic irq chips are allocated for an irq domain the domain name is +set to the irq chip name. That was done to have named domains before the +recent changes which enforce domain naming were done. + +Since then the overwrite causes a memory leak when the domain name is +dynamically allocated and even worse it would cause the domain free code to +free the wrong name pointer, which might point to a constant. + +Remove the name assignment to prevent this. 
+ +Fixes: d59f6617eef0 ("genirq: Allow fwnode to carry name information only") +Signed-off-by: Jeffy Chen +Signed-off-by: Thomas Gleixner +Link: https://lkml.kernel.org/r/20170928043731.4764-1-jeffy.chen@rock-chips.com +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/irq/generic-chip.c | 1 - + 1 file changed, 1 deletion(-) + +--- a/kernel/irq/generic-chip.c ++++ b/kernel/irq/generic-chip.c +@@ -322,7 +322,6 @@ int __irq_alloc_domain_generic_chips(str + /* Calc pointer to the next generic chip */ + tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); + } +- d->name = name; + return 0; + } + EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips); diff --git a/queue-4.13/kvm-nvmx-don-t-allow-l2-to-access-the-hardware-cr8.patch b/queue-4.13/kvm-nvmx-don-t-allow-l2-to-access-the-hardware-cr8.patch new file mode 100644 index 00000000000..26e4191311e --- /dev/null +++ b/queue-4.13/kvm-nvmx-don-t-allow-l2-to-access-the-hardware-cr8.patch @@ -0,0 +1,39 @@ +From 51aa68e7d57e3217192d88ce90fd5b8ef29ec94f Mon Sep 17 00:00:00 2001 +From: Jim Mattson +Date: Tue, 12 Sep 2017 13:02:54 -0700 +Subject: kvm: nVMX: Don't allow L2 to access the hardware CR8 + +From: Jim Mattson + +commit 51aa68e7d57e3217192d88ce90fd5b8ef29ec94f upstream. + +If L1 does not specify the "use TPR shadow" VM-execution control in +vmcs12, then L0 must specify the "CR8-load exiting" and "CR8-store +exiting" VM-execution controls in vmcs02. Failure to do so will give +the L2 VM unrestricted read/write access to the hardware CR8. + +This fixes CVE-2017-12154. 
+ +Signed-off-by: Jim Mattson +Reviewed-by: David Hildenbrand +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/vmx.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -10271,6 +10271,11 @@ static int prepare_vmcs02(struct kvm_vcp + if (exec_control & CPU_BASED_TPR_SHADOW) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); + vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); ++ } else { ++#ifdef CONFIG_X86_64 ++ exec_control |= CPU_BASED_CR8_LOAD_EXITING | ++ CPU_BASED_CR8_STORE_EXITING; ++#endif + } + + /* diff --git a/queue-4.13/kvm-nvmx-fix-host_cr3-host_cr4-cache.patch b/queue-4.13/kvm-nvmx-fix-host_cr3-host_cr4-cache.patch new file mode 100644 index 00000000000..482ec795afe --- /dev/null +++ b/queue-4.13/kvm-nvmx-fix-host_cr3-host_cr4-cache.patch @@ -0,0 +1,83 @@ +From 44889942b6eb356eab27ce25fe10701adfec7776 Mon Sep 17 00:00:00 2001 +From: Ladi Prosek +Date: Fri, 22 Sep 2017 07:53:15 +0200 +Subject: KVM: nVMX: fix HOST_CR3/HOST_CR4 cache + +From: Ladi Prosek + +commit 44889942b6eb356eab27ce25fe10701adfec7776 upstream. + +For nested virt we maintain multiple VMCS that can run on a vCPU. So it is +incorrect to keep vmcs_host_cr3 and vmcs_host_cr4, whose purpose is caching +the value of the rarely changing HOST_CR3 and HOST_CR4 VMCS fields, in +vCPU-wide data structures. + +Hyper-V nested on KVM runs into this consistently for me with PCID enabled. +CR3 is updated with a new value, unlikely(cr3 != vmx->host_state.vmcs_host_cr3) +fires, and the currently loaded VMCS is updated. Then we switch from L2 to +L1 and the next exit reverts CR3 to its old value. 
+ +Fixes: d6e41f1151fe ("x86/mm, KVM: Teach KVM's VMX code that CR3 isn't a constant") +Signed-off-by: Ladi Prosek +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/vmx.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -200,6 +200,8 @@ struct loaded_vmcs { + int cpu; + bool launched; + bool nmi_known_unmasked; ++ unsigned long vmcs_host_cr3; /* May not match real cr3 */ ++ unsigned long vmcs_host_cr4; /* May not match real cr4 */ + struct list_head loaded_vmcss_on_cpu_link; + }; + +@@ -595,8 +597,6 @@ struct vcpu_vmx { + int gs_ldt_reload_needed; + int fs_reload_needed; + u64 msr_host_bndcfgs; +- unsigned long vmcs_host_cr3; /* May not match real cr3 */ +- unsigned long vmcs_host_cr4; /* May not match real cr4 */ + } host_state; + struct { + int vm86_active; +@@ -5138,12 +5138,12 @@ static void vmx_set_constant_host_state( + */ + cr3 = __read_cr3(); + vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ +- vmx->host_state.vmcs_host_cr3 = cr3; ++ vmx->loaded_vmcs->vmcs_host_cr3 = cr3; + + /* Save the most likely value for this task's CR4 in the VMCS. 
*/ + cr4 = cr4_read_shadow(); + vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ +- vmx->host_state.vmcs_host_cr4 = cr4; ++ vmx->loaded_vmcs->vmcs_host_cr4 = cr4; + + vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ + #ifdef CONFIG_X86_64 +@@ -8992,15 +8992,15 @@ static void __noclone vmx_vcpu_run(struc + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + + cr3 = __get_current_cr3_fast(); +- if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) { ++ if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) { + vmcs_writel(HOST_CR3, cr3); +- vmx->host_state.vmcs_host_cr3 = cr3; ++ vmx->loaded_vmcs->vmcs_host_cr3 = cr3; + } + + cr4 = cr4_read_shadow(); +- if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { ++ if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) { + vmcs_writel(HOST_CR4, cr4); +- vmx->host_state.vmcs_host_cr4 = cr4; ++ vmx->loaded_vmcs->vmcs_host_cr4 = cr4; + } + + /* When single-stepping over STI and MOV SS, we must clear the diff --git a/queue-4.13/kvm-vmx-avoid-double-list-add-with-vt-d-posted-interrupts.patch b/queue-4.13/kvm-vmx-avoid-double-list-add-with-vt-d-posted-interrupts.patch new file mode 100644 index 00000000000..9adfcbcbe2d --- /dev/null +++ b/queue-4.13/kvm-vmx-avoid-double-list-add-with-vt-d-posted-interrupts.patch @@ -0,0 +1,157 @@ +From 8b306e2f3c41939ea528e6174c88cfbfff893ce1 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Tue, 6 Jun 2017 12:57:05 +0200 +Subject: KVM: VMX: avoid double list add with VT-d posted interrupts +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Paolo Bonzini + +commit 8b306e2f3c41939ea528e6174c88cfbfff893ce1 upstream. + +In some cases, for example involving hot-unplug of assigned +devices, pi_post_block can forget to remove the vCPU from the +blocked_vcpu_list. When this happens, the next call to +pi_pre_block corrupts the list. + +Fix this in two ways. 
First, check vcpu->pre_pcpu in pi_pre_block +and WARN instead of adding the element twice in the list. Second, +always do the list removal in pi_post_block if vcpu->pre_pcpu is +set (not -1). + +The new code keeps interrupts disabled for the whole duration of +pi_pre_block/pi_post_block. This is not strictly necessary, but +easier to follow. For the same reason, PI.ON is checked only +after the cmpxchg, and to handle it we just call the post-block +code. This removes duplication of the list removal code. + +Cc: Huangweidong +Cc: Gonglei +Cc: wangxin +Cc: Radim Krčmář +Tested-by: Longpeng (Mike) +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/vmx.c | 62 +++++++++++++++++++++-------------------------------- + 1 file changed, 25 insertions(+), 37 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -11394,10 +11394,11 @@ static void __pi_post_block(struct kvm_v + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; +- unsigned long flags; + + do { + old.control = new.control = pi_desc->control; ++ WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, ++ "Wakeup handler not enabled while the VCPU is blocked\n"); + + dest = cpu_physical_id(vcpu->cpu); + +@@ -11414,14 +11415,10 @@ static void __pi_post_block(struct kvm_v + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + +- if(vcpu->pre_pcpu != -1) { +- spin_lock_irqsave( +- &per_cpu(blocked_vcpu_on_cpu_lock, +- vcpu->pre_pcpu), flags); ++ if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { ++ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + list_del(&vcpu->blocked_vcpu_list); +- spin_unlock_irqrestore( +- &per_cpu(blocked_vcpu_on_cpu_lock, +- vcpu->pre_pcpu), flags); ++ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + vcpu->pre_pcpu = -1; + } + } +@@ -11441,7 +11438,6 @@ static void __pi_post_block(struct kvm_v + */ + static int pi_pre_block(struct kvm_vcpu *vcpu) + { +- 
unsigned long flags; + unsigned int dest; + struct pi_desc old, new; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); +@@ -11451,34 +11447,20 @@ static int pi_pre_block(struct kvm_vcpu + !kvm_vcpu_apicv_active(vcpu)) + return 0; + +- vcpu->pre_pcpu = vcpu->cpu; +- spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, +- vcpu->pre_pcpu), flags); +- list_add_tail(&vcpu->blocked_vcpu_list, +- &per_cpu(blocked_vcpu_on_cpu, +- vcpu->pre_pcpu)); +- spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, +- vcpu->pre_pcpu), flags); ++ WARN_ON(irqs_disabled()); ++ local_irq_disable(); ++ if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { ++ vcpu->pre_pcpu = vcpu->cpu; ++ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); ++ list_add_tail(&vcpu->blocked_vcpu_list, ++ &per_cpu(blocked_vcpu_on_cpu, ++ vcpu->pre_pcpu)); ++ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); ++ } + + do { + old.control = new.control = pi_desc->control; + +- /* +- * We should not block the vCPU if +- * an interrupt is posted for it. +- */ +- if (pi_test_on(pi_desc) == 1) { +- spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, +- vcpu->pre_pcpu), flags); +- list_del(&vcpu->blocked_vcpu_list); +- spin_unlock_irqrestore( +- &per_cpu(blocked_vcpu_on_cpu_lock, +- vcpu->pre_pcpu), flags); +- vcpu->pre_pcpu = -1; +- +- return 1; +- } +- + WARN((pi_desc->sn == 1), + "Warning: SN field of posted-interrupts " + "is set before blocking\n"); +@@ -11503,7 +11485,12 @@ static int pi_pre_block(struct kvm_vcpu + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + +- return 0; ++ /* We should not block the vCPU if an interrupt is posted for it. 
*/ ++ if (pi_test_on(pi_desc) == 1) ++ __pi_post_block(vcpu); ++ ++ local_irq_enable(); ++ return (vcpu->pre_pcpu == -1); + } + + static int vmx_pre_block(struct kvm_vcpu *vcpu) +@@ -11519,12 +11506,13 @@ static int vmx_pre_block(struct kvm_vcpu + + static void pi_post_block(struct kvm_vcpu *vcpu) + { +- if (!kvm_arch_has_assigned_device(vcpu->kvm) || +- !irq_remapping_cap(IRQ_POSTING_CAP) || +- !kvm_vcpu_apicv_active(vcpu)) ++ if (vcpu->pre_pcpu == -1) + return; + ++ WARN_ON(irqs_disabled()); ++ local_irq_disable(); + __pi_post_block(vcpu); ++ local_irq_enable(); + } + + static void vmx_post_block(struct kvm_vcpu *vcpu) diff --git a/queue-4.13/kvm-vmx-do-not-bug-on-out-of-bounds-guest-irq.patch b/queue-4.13/kvm-vmx-do-not-bug-on-out-of-bounds-guest-irq.patch new file mode 100644 index 00000000000..d2dde79d463 --- /dev/null +++ b/queue-4.13/kvm-vmx-do-not-bug-on-out-of-bounds-guest-irq.patch @@ -0,0 +1,57 @@ +From 3a8b0677fc6180a467e26cc32ce6b0c09a32f9bb Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= +Date: Thu, 7 Sep 2017 19:02:30 +0100 +Subject: KVM: VMX: Do not BUG() on out-of-bounds guest IRQ +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Jan H. Schönherr + +commit 3a8b0677fc6180a467e26cc32ce6b0c09a32f9bb upstream. + +The value of the guest_irq argument to vmx_update_pi_irte() is +ultimately coming from a KVM_IRQFD API call. Do not BUG() in +vmx_update_pi_irte() if the value is out-of bounds. (Especially, +since KVM as a whole seems to hang after that.) + +Instead, print a message only once if we find that we don't have a +route for a certain IRQ (which can be out-of-bounds or within the +array). + +This fixes CVE-2017-1000252. + +Fixes: efc644048ecde54 ("KVM: x86: Update IRTE for posted-interrupts") +Signed-off-by: Jan H. 
Schönherr +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/vmx.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -11542,7 +11542,7 @@ static int vmx_update_pi_irte(struct kvm + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu; + struct vcpu_data vcpu_info; +- int idx, ret = -EINVAL; ++ int idx, ret = 0; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || +@@ -11551,7 +11551,12 @@ static int vmx_update_pi_irte(struct kvm + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); +- BUG_ON(guest_irq >= irq_rt->nr_rt_entries); ++ if (guest_irq >= irq_rt->nr_rt_entries || ++ hlist_empty(&irq_rt->map[guest_irq])) { ++ pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", ++ guest_irq, irq_rt->nr_rt_entries); ++ goto out; ++ } + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + if (e->type != KVM_IRQ_ROUTING_MSI) diff --git a/queue-4.13/kvm-vmx-extract-__pi_post_block.patch b/queue-4.13/kvm-vmx-extract-__pi_post_block.patch new file mode 100644 index 00000000000..eb0d9cbb4e2 --- /dev/null +++ b/queue-4.13/kvm-vmx-extract-__pi_post_block.patch @@ -0,0 +1,118 @@ +From cd39e1176d320157831ce030b4c869bd2d5eb142 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Tue, 6 Jun 2017 12:57:04 +0200 +Subject: KVM: VMX: extract __pi_post_block +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Paolo Bonzini + +commit cd39e1176d320157831ce030b4c869bd2d5eb142 upstream. + +Simple code movement patch, preparing for the next one. 
+ +Cc: Huangweidong +Cc: Gonglei +Cc: wangxin +Cc: Radim Krčmář +Tested-by: Longpeng (Mike) +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/vmx.c | 71 ++++++++++++++++++++++++++++------------------------- + 1 file changed, 38 insertions(+), 33 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -11389,6 +11389,43 @@ static void vmx_enable_log_dirty_pt_mask + kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); + } + ++static void __pi_post_block(struct kvm_vcpu *vcpu) ++{ ++ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); ++ struct pi_desc old, new; ++ unsigned int dest; ++ unsigned long flags; ++ ++ do { ++ old.control = new.control = pi_desc->control; ++ ++ dest = cpu_physical_id(vcpu->cpu); ++ ++ if (x2apic_enabled()) ++ new.ndst = dest; ++ else ++ new.ndst = (dest << 8) & 0xFF00; ++ ++ /* Allow posting non-urgent interrupts */ ++ new.sn = 0; ++ ++ /* set 'NV' to 'notification vector' */ ++ new.nv = POSTED_INTR_VECTOR; ++ } while (cmpxchg(&pi_desc->control, old.control, ++ new.control) != old.control); ++ ++ if(vcpu->pre_pcpu != -1) { ++ spin_lock_irqsave( ++ &per_cpu(blocked_vcpu_on_cpu_lock, ++ vcpu->pre_pcpu), flags); ++ list_del(&vcpu->blocked_vcpu_list); ++ spin_unlock_irqrestore( ++ &per_cpu(blocked_vcpu_on_cpu_lock, ++ vcpu->pre_pcpu), flags); ++ vcpu->pre_pcpu = -1; ++ } ++} ++ + /* + * This routine does the following things for vCPU which is going + * to be blocked if VT-d PI is enabled. 
+@@ -11482,44 +11519,12 @@ static int vmx_pre_block(struct kvm_vcpu + + static void pi_post_block(struct kvm_vcpu *vcpu) + { +- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); +- struct pi_desc old, new; +- unsigned int dest; +- unsigned long flags; +- + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) + return; + +- do { +- old.control = new.control = pi_desc->control; +- +- dest = cpu_physical_id(vcpu->cpu); +- +- if (x2apic_enabled()) +- new.ndst = dest; +- else +- new.ndst = (dest << 8) & 0xFF00; +- +- /* Allow posting non-urgent interrupts */ +- new.sn = 0; +- +- /* set 'NV' to 'notification vector' */ +- new.nv = POSTED_INTR_VECTOR; +- } while (cmpxchg(&pi_desc->control, old.control, +- new.control) != old.control); +- +- if(vcpu->pre_pcpu != -1) { +- spin_lock_irqsave( +- &per_cpu(blocked_vcpu_on_cpu_lock, +- vcpu->pre_pcpu), flags); +- list_del(&vcpu->blocked_vcpu_list); +- spin_unlock_irqrestore( +- &per_cpu(blocked_vcpu_on_cpu_lock, +- vcpu->pre_pcpu), flags); +- vcpu->pre_pcpu = -1; +- } ++ __pi_post_block(vcpu); + } + + static void vmx_post_block(struct kvm_vcpu *vcpu) diff --git a/queue-4.13/kvm-vmx-simplify-and-fix-vmx_vcpu_pi_load.patch b/queue-4.13/kvm-vmx-simplify-and-fix-vmx_vcpu_pi_load.patch new file mode 100644 index 00000000000..ecd19c6f826 --- /dev/null +++ b/queue-4.13/kvm-vmx-simplify-and-fix-vmx_vcpu_pi_load.patch @@ -0,0 +1,130 @@ +From 31afb2ea2b10a7d17ce3db4cdb0a12b63b2fe08a Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Tue, 6 Jun 2017 12:57:06 +0200 +Subject: KVM: VMX: simplify and fix vmx_vcpu_pi_load +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Paolo Bonzini + +commit 31afb2ea2b10a7d17ce3db4cdb0a12b63b2fe08a upstream. + +The simplify part: do not touch pi_desc.nv, we can set it when the +VCPU is first created. 
Likewise, pi_desc.sn is only handled by +vmx_vcpu_pi_load, do not touch it in __pi_post_block. + +The fix part: do not check kvm_arch_has_assigned_device, instead +check the SN bit to figure out whether vmx_vcpu_pi_put ran before. +This matches what the previous patch did in pi_post_block. + +Cc: Huangweidong +Cc: Gonglei +Cc: wangxin +Cc: Radim Krčmář +Tested-by: Longpeng (Mike) +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/vmx.c | 68 +++++++++++++++++++++++++++-------------------------- + 1 file changed, 35 insertions(+), 33 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -2187,43 +2187,41 @@ static void vmx_vcpu_pi_load(struct kvm_ + struct pi_desc old, new; + unsigned int dest; + +- if (!kvm_arch_has_assigned_device(vcpu->kvm) || +- !irq_remapping_cap(IRQ_POSTING_CAP) || +- !kvm_vcpu_apicv_active(vcpu)) ++ /* ++ * In case of hot-plug or hot-unplug, we may have to undo ++ * vmx_vcpu_pi_put even if there is no assigned device. And we ++ * always keep PI.NDST up to date for simplicity: it makes the ++ * code easier, and CPU migration is not a fast path. ++ */ ++ if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) ++ return; ++ ++ /* ++ * First handle the simple case where no cmpxchg is necessary; just ++ * allow posting non-urgent interrupts. ++ * ++ * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change ++ * PI.NDST: pi_post_block will do it for us and the wakeup_handler ++ * expects the VCPU to be on the blocked_vcpu_list that matches ++ * PI.NDST. ++ */ ++ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || ++ vcpu->cpu == cpu) { ++ pi_clear_sn(pi_desc); + return; ++ } + ++ /* The full case. */ + do { + old.control = new.control = pi_desc->control; + +- /* +- * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there +- * are two possible cases: +- * 1. After running 'pre_block', context switch +- * happened. For this case, 'sn' was set in +- * vmx_vcpu_put(), so we need to clear it here. +- * 2. 
After running 'pre_block', we were blocked, +- * and woken up by some other guy. For this case, +- * we don't need to do anything, 'pi_post_block' +- * will do everything for us. However, we cannot +- * check whether it is case #1 or case #2 here +- * (maybe, not needed), so we also clear sn here, +- * I think it is not a big deal. +- */ +- if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { +- if (vcpu->cpu != cpu) { +- dest = cpu_physical_id(cpu); +- +- if (x2apic_enabled()) +- new.ndst = dest; +- else +- new.ndst = (dest << 8) & 0xFF00; +- } ++ dest = cpu_physical_id(cpu); + +- /* set 'NV' to 'notification vector' */ +- new.nv = POSTED_INTR_VECTOR; +- } ++ if (x2apic_enabled()) ++ new.ndst = dest; ++ else ++ new.ndst = (dest << 8) & 0xFF00; + +- /* Allow posting non-urgent interrupts */ + new.sn = 0; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); +@@ -9310,6 +9308,13 @@ static struct kvm_vcpu *vmx_create_vcpu( + + vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; + ++ /* ++ * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR ++ * or POSTED_INTR_WAKEUP_VECTOR. 
++ */ ++ vmx->pi_desc.nv = POSTED_INTR_VECTOR; ++ vmx->pi_desc.sn = 1; ++ + return &vmx->vcpu; + + free_vmcs: +@@ -11407,9 +11412,6 @@ static void __pi_post_block(struct kvm_v + else + new.ndst = (dest << 8) & 0xFF00; + +- /* Allow posting non-urgent interrupts */ +- new.sn = 0; +- + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, diff --git a/queue-4.13/kvm-x86-handle-async-pf-in-rcu-read-side-critical-sections.patch b/queue-4.13/kvm-x86-handle-async-pf-in-rcu-read-side-critical-sections.patch new file mode 100644 index 00000000000..307113ab564 --- /dev/null +++ b/queue-4.13/kvm-x86-handle-async-pf-in-rcu-read-side-critical-sections.patch @@ -0,0 +1,81 @@ +From b862789aa5186d5ea3a024b7cfe0f80c3a38b980 Mon Sep 17 00:00:00 2001 +From: Boqun Feng +Date: Fri, 29 Sep 2017 19:01:45 +0800 +Subject: kvm/x86: Handle async PF in RCU read-side critical sections + +From: Boqun Feng + +commit b862789aa5186d5ea3a024b7cfe0f80c3a38b980 upstream. + +Sasha Levin reported a WARNING: + +| WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329 +| rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline] +| WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329 +| rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458 +... +| CPU: 0 PID: 6974 Comm: syz-fuzzer Not tainted 4.13.0-next-20170908+ #246 +| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS +| 1.10.1-1ubuntu1 04/01/2014 +| Call Trace: +... 
+| RIP: 0010:rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline] +| RIP: 0010:rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458 +| RSP: 0018:ffff88003b2debc8 EFLAGS: 00010002 +| RAX: 0000000000000001 RBX: 1ffff1000765bd85 RCX: 0000000000000000 +| RDX: 1ffff100075d7882 RSI: ffffffffb5c7da20 RDI: ffff88003aebc410 +| RBP: ffff88003b2def30 R08: dffffc0000000000 R09: 0000000000000001 +| R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003b2def08 +| R13: 0000000000000000 R14: ffff88003aebc040 R15: ffff88003aebc040 +| __schedule+0x201/0x2240 kernel/sched/core.c:3292 +| schedule+0x113/0x460 kernel/sched/core.c:3421 +| kvm_async_pf_task_wait+0x43f/0x940 arch/x86/kernel/kvm.c:158 +| do_async_page_fault+0x72/0x90 arch/x86/kernel/kvm.c:271 +| async_page_fault+0x22/0x30 arch/x86/entry/entry_64.S:1069 +| RIP: 0010:format_decode+0x240/0x830 lib/vsprintf.c:1996 +| RSP: 0018:ffff88003b2df520 EFLAGS: 00010283 +| RAX: 000000000000003f RBX: ffffffffb5d1e141 RCX: ffff88003b2df670 +| RDX: 0000000000000001 RSI: dffffc0000000000 RDI: ffffffffb5d1e140 +| RBP: ffff88003b2df560 R08: dffffc0000000000 R09: 0000000000000000 +| R10: ffff88003b2df718 R11: 0000000000000000 R12: ffff88003b2df5d8 +| R13: 0000000000000064 R14: ffffffffb5d1e140 R15: 0000000000000000 +| vsnprintf+0x173/0x1700 lib/vsprintf.c:2136 +| sprintf+0xbe/0xf0 lib/vsprintf.c:2386 +| proc_self_get_link+0xfb/0x1c0 fs/proc/self.c:23 +| get_link fs/namei.c:1047 [inline] +| link_path_walk+0x1041/0x1490 fs/namei.c:2127 +... + +This happened when the host hit a page fault, and delivered it as in an +async page fault, while the guest was in an RCU read-side critical +section. The guest then tries to reschedule in kvm_async_pf_task_wait(), +but rcu_preempt_note_context_switch() would treat the reschedule as a +sleep in RCU read-side critical section, which is not allowed (even in +preemptible RCU). Thus the WARN. 
+ +To cure this, make kvm_async_pf_task_wait() go to the halt path if the +PF happens in a RCU read-side critical section. + +Reported-by: Sasha Levin +Cc: "Paul E. McKenney" +Cc: Peter Zijlstra +Signed-off-by: Boqun Feng +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kernel/kvm.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/arch/x86/kernel/kvm.c ++++ b/arch/x86/kernel/kvm.c +@@ -140,7 +140,8 @@ void kvm_async_pf_task_wait(u32 token) + + n.token = token; + n.cpu = smp_processor_id(); +- n.halted = is_idle_task(current) || preempt_count() > 1; ++ n.halted = is_idle_task(current) || preempt_count() > 1 || ++ rcu_preempt_depth(); + init_swait_queue_head(&n.wq); + hlist_add_head(&n.link, &b->list); + raw_spin_unlock(&b->lock); diff --git a/queue-4.13/md-fix-a-race-condition-for-flush-request-handling.patch b/queue-4.13/md-fix-a-race-condition-for-flush-request-handling.patch new file mode 100644 index 00000000000..753568e4d39 --- /dev/null +++ b/queue-4.13/md-fix-a-race-condition-for-flush-request-handling.patch @@ -0,0 +1,52 @@ +From 79bf31a3b2a7ca467cfec8ff97d359a77065d01f Mon Sep 17 00:00:00 2001 +From: Shaohua Li +Date: Thu, 21 Sep 2017 09:55:28 -0700 +Subject: md: fix a race condition for flush request handling + +From: Shaohua Li + +commit 79bf31a3b2a7ca467cfec8ff97d359a77065d01f upstream. + +md_submit_flush_data calls pers->make_request, which missed the suspend check. +Fix it with the new md_handle_request API. 
+ +Reported-by: Nate Dailey +Tested-by: Nate Dailey +Fix: cc27b0c78c79(md: fix deadlock between mddev_suspend() and md_write_start()) +Reviewed-by: NeilBrown +Signed-off-by: Shaohua Li +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/md/md.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -439,16 +439,22 @@ static void md_submit_flush_data(struct + struct mddev *mddev = container_of(ws, struct mddev, flush_work); + struct bio *bio = mddev->flush_bio; + ++ /* ++ * must reset flush_bio before calling into md_handle_request to avoid a ++ * deadlock, because other bios passed md_handle_request suspend check ++ * could wait for this and below md_handle_request could wait for those ++ * bios because of suspend check ++ */ ++ mddev->flush_bio = NULL; ++ wake_up(&mddev->sb_wait); ++ + if (bio->bi_iter.bi_size == 0) + /* an empty barrier - all done */ + bio_endio(bio); + else { + bio->bi_opf &= ~REQ_PREFLUSH; +- mddev->pers->make_request(mddev, bio); ++ md_handle_request(mddev, bio); + } +- +- mddev->flush_bio = NULL; +- wake_up(&mddev->sb_wait); + } + + void md_flush_request(struct mddev *mddev, struct bio *bio) diff --git a/queue-4.13/md-separate-request-handling.patch b/queue-4.13/md-separate-request-handling.patch new file mode 100644 index 00000000000..c1f65bf3d53 --- /dev/null +++ b/queue-4.13/md-separate-request-handling.patch @@ -0,0 +1,122 @@ +From 393debc23c7820211d1c8253dd6a8408a7628fe7 Mon Sep 17 00:00:00 2001 +From: Shaohua Li +Date: Thu, 21 Sep 2017 10:23:35 -0700 +Subject: md: separate request handling + +From: Shaohua Li + +commit 393debc23c7820211d1c8253dd6a8408a7628fe7 upstream. + +With commit cc27b0c78c79, pers->make_request could bail out without handling +the bio. If that happens, we should retry. The commit fixes md_make_request +but not other call sites. Separate the request handling part, so other call +sites can use it. 
+ +Reported-by: Nate Dailey +Fix: cc27b0c78c79(md: fix deadlock between mddev_suspend() and md_write_start()) +Reviewed-by: NeilBrown +Signed-off-by: Shaohua Li +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/md/md.c | 58 +++++++++++++++++++++++++++++++------------------------- + drivers/md/md.h | 1 + 2 files changed, 34 insertions(+), 25 deletions(-) + +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -266,6 +266,37 @@ static DEFINE_SPINLOCK(all_mddevs_lock); + * call has finished, the bio has been linked into some internal structure + * and so is visible to ->quiesce(), so we don't need the refcount any more. + */ ++void md_handle_request(struct mddev *mddev, struct bio *bio) ++{ ++check_suspended: ++ rcu_read_lock(); ++ if (mddev->suspended) { ++ DEFINE_WAIT(__wait); ++ for (;;) { ++ prepare_to_wait(&mddev->sb_wait, &__wait, ++ TASK_UNINTERRUPTIBLE); ++ if (!mddev->suspended) ++ break; ++ rcu_read_unlock(); ++ schedule(); ++ rcu_read_lock(); ++ } ++ finish_wait(&mddev->sb_wait, &__wait); ++ } ++ atomic_inc(&mddev->active_io); ++ rcu_read_unlock(); ++ ++ if (!mddev->pers->make_request(mddev, bio)) { ++ atomic_dec(&mddev->active_io); ++ wake_up(&mddev->sb_wait); ++ goto check_suspended; ++ } ++ ++ if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) ++ wake_up(&mddev->sb_wait); ++} ++EXPORT_SYMBOL(md_handle_request); ++ + static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) + { + const int rw = bio_data_dir(bio); +@@ -285,23 +316,6 @@ static blk_qc_t md_make_request(struct r + bio_endio(bio); + return BLK_QC_T_NONE; + } +-check_suspended: +- rcu_read_lock(); +- if (mddev->suspended) { +- DEFINE_WAIT(__wait); +- for (;;) { +- prepare_to_wait(&mddev->sb_wait, &__wait, +- TASK_UNINTERRUPTIBLE); +- if (!mddev->suspended) +- break; +- rcu_read_unlock(); +- schedule(); +- rcu_read_lock(); +- } +- finish_wait(&mddev->sb_wait, &__wait); +- } +- atomic_inc(&mddev->active_io); +- rcu_read_unlock(); + + /* + * save the sectors now since 
our bio can +@@ -310,20 +324,14 @@ check_suspended: + sectors = bio_sectors(bio); + /* bio could be mergeable after passing to underlayer */ + bio->bi_opf &= ~REQ_NOMERGE; +- if (!mddev->pers->make_request(mddev, bio)) { +- atomic_dec(&mddev->active_io); +- wake_up(&mddev->sb_wait); +- goto check_suspended; +- } ++ ++ md_handle_request(mddev, bio); + + cpu = part_stat_lock(); + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); + part_stat_unlock(); + +- if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) +- wake_up(&mddev->sb_wait); +- + return BLK_QC_T_NONE; + } + +--- a/drivers/md/md.h ++++ b/drivers/md/md.h +@@ -686,6 +686,7 @@ extern void md_stop_writes(struct mddev + extern int md_rdev_init(struct md_rdev *rdev); + extern void md_rdev_clear(struct md_rdev *rdev); + ++extern void md_handle_request(struct mddev *mddev, struct bio *bio); + extern void mddev_suspend(struct mddev *mddev); + extern void mddev_resume(struct mddev *mddev); + extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, diff --git a/queue-4.13/mtd-fix-partition-alignment-check-on-multi-erasesize-devices.patch b/queue-4.13/mtd-fix-partition-alignment-check-on-multi-erasesize-devices.patch new file mode 100644 index 00000000000..a77be0ff342 --- /dev/null +++ b/queue-4.13/mtd-fix-partition-alignment-check-on-multi-erasesize-devices.patch @@ -0,0 +1,47 @@ +From 7e439681af82984045efc215437ebb2ca8d33a4c Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Mon, 25 Sep 2017 10:19:57 +0200 +Subject: mtd: Fix partition alignment check on multi-erasesize devices + +From: Boris Brezillon + +commit 7e439681af82984045efc215437ebb2ca8d33a4c upstream. + +Commit 1eeef2d7483a ("mtd: handle partitioning on devices with 0 +erasesize") introduced a regression on heterogeneous erase region +devices. 
Alignment of the partition was tested against the master +eraseblock size which can be bigger than the slave one, thus leading +to some partitions being marked as read-only. + +Update wr_alignment to match this slave erasesize after this erasesize +has been determined by picking the biggest erasesize of all the regions +embedded in the MTD partition. + +Reported-by: Mathias Thore +Fixes: 1eeef2d7483a ("mtd: handle partitioning on devices with 0 erasesize") +Signed-off-by: Boris Brezillon +Tested-by: Mathias Thore +Reviewed-by: Mathias Thore +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/mtd/mtdpart.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/drivers/mtd/mtdpart.c ++++ b/drivers/mtd/mtdpart.c +@@ -581,6 +581,14 @@ static struct mtd_part *allocate_partiti + slave->mtd.erasesize = parent->erasesize; + } + ++ /* ++ * Slave erasesize might differ from the master one if the master ++ * exposes several regions with different erasesize. Adjust ++ * wr_alignment accordingly. ++ */ ++ if (!(slave->mtd.flags & MTD_NO_ERASE)) ++ wr_alignment = slave->mtd.erasesize; ++ + tmp = slave->offset; + remainder = do_div(tmp, wr_alignment); + if ((slave->mtd.flags & MTD_WRITEABLE) && remainder) { diff --git a/queue-4.13/mtd-nand-atmel-fix-buffer-overflow-in-atmel_pmecc_user.patch b/queue-4.13/mtd-nand-atmel-fix-buffer-overflow-in-atmel_pmecc_user.patch new file mode 100644 index 00000000000..e7f5d1378d2 --- /dev/null +++ b/queue-4.13/mtd-nand-atmel-fix-buffer-overflow-in-atmel_pmecc_user.patch @@ -0,0 +1,37 @@ +From 36de80740008e6a4a55115b4a92e2059e47c1cba Mon Sep 17 00:00:00 2001 +From: Richard Genoud +Date: Wed, 27 Sep 2017 14:49:17 +0200 +Subject: mtd: nand: atmel: fix buffer overflow in atmel_pmecc_user + +From: Richard Genoud + +commit 36de80740008e6a4a55115b4a92e2059e47c1cba upstream. + +When calculating the size needed by struct atmel_pmecc_user *user, +the dmu and delta buffer sizes were forgotten. 
+This led to a memory corruption (especially with a large ecc_strength). + +Link: http://lkml.kernel.org/r/1506503157.3016.5.camel@gmail.com +Fixes: f88fc122cc34 ("mtd: nand: Cleanup/rework the atmel_nand driver") +Reported-by: Richard Genoud +Pointed-at-by: Boris Brezillon +Signed-off-by: Richard Genoud +Reviewed-by: Nicolas Ferre +Signed-off-by: Boris Brezillon +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/mtd/nand/atmel/pmecc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/mtd/nand/atmel/pmecc.c ++++ b/drivers/mtd/nand/atmel/pmecc.c +@@ -363,7 +363,7 @@ atmel_pmecc_create_user(struct atmel_pme + size += (req->ecc.strength + 1) * sizeof(u16); + /* Reserve space for mu, dmu and delta. */ + size = ALIGN(size, sizeof(s32)); +- size += (req->ecc.strength + 1) * sizeof(s32); ++ size += (req->ecc.strength + 1) * sizeof(s32) * 3; + + user = kzalloc(size, GFP_KERNEL); + if (!user) diff --git a/queue-4.13/pci-fix-race-condition-with-driver_override.patch b/queue-4.13/pci-fix-race-condition-with-driver_override.patch new file mode 100644 index 00000000000..08a35f9d9fb --- /dev/null +++ b/queue-4.13/pci-fix-race-condition-with-driver_override.patch @@ -0,0 +1,66 @@ +From 9561475db680f7144d2223a409dd3d7e322aca03 Mon Sep 17 00:00:00 2001 +From: Nicolai Stange +Date: Mon, 11 Sep 2017 09:45:40 +0200 +Subject: PCI: Fix race condition with driver_override + +From: Nicolai Stange + +commit 9561475db680f7144d2223a409dd3d7e322aca03 upstream. + +The driver_override implementation is susceptible to a race condition when +different threads are reading vs. storing a different driver override. Add +locking to avoid the race condition. + +This is in close analogy to commit 6265539776a0 ("driver core: platform: +fix race condition with driver_override") from Adrian Salido.
+ +Fixes: 782a985d7af2 ("PCI: Introduce new device binding path using pci_dev.driver_override") +Signed-off-by: Nicolai Stange +Signed-off-by: Bjorn Helgaas +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/pci/pci-sysfs.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +--- a/drivers/pci/pci-sysfs.c ++++ b/drivers/pci/pci-sysfs.c +@@ -686,7 +686,7 @@ static ssize_t driver_override_store(str + const char *buf, size_t count) + { + struct pci_dev *pdev = to_pci_dev(dev); +- char *driver_override, *old = pdev->driver_override, *cp; ++ char *driver_override, *old, *cp; + + /* We need to keep extra room for a newline */ + if (count >= (PAGE_SIZE - 1)) +@@ -700,12 +700,15 @@ static ssize_t driver_override_store(str + if (cp) + *cp = '\0'; + ++ device_lock(dev); ++ old = pdev->driver_override; + if (strlen(driver_override)) { + pdev->driver_override = driver_override; + } else { + kfree(driver_override); + pdev->driver_override = NULL; + } ++ device_unlock(dev); + + kfree(old); + +@@ -716,8 +719,12 @@ static ssize_t driver_override_show(stru + struct device_attribute *attr, char *buf) + { + struct pci_dev *pdev = to_pci_dev(dev); ++ ssize_t len; + +- return snprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override); ++ device_lock(dev); ++ len = snprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override); ++ device_unlock(dev); ++ return len; + } + static DEVICE_ATTR_RW(driver_override); + diff --git a/queue-4.13/platform-x86-fujitsu-laptop-don-t-oops-when-fuj02e3-is-not-presnt.patch b/queue-4.13/platform-x86-fujitsu-laptop-don-t-oops-when-fuj02e3-is-not-presnt.patch new file mode 100644 index 00000000000..31d6e7cf7fd --- /dev/null +++ b/queue-4.13/platform-x86-fujitsu-laptop-don-t-oops-when-fuj02e3-is-not-presnt.patch @@ -0,0 +1,48 @@ +From ce7c47d60bda6c7f09ccf16e978d971c8fa16ff0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= +Date: Mon, 18 Sep 2017 23:00:59 +0300 +Subject: platform/x86: fujitsu-laptop: Don't oops when FUJ02E3 is 
not presnt +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Ville Syrjälä + +commit ce7c47d60bda6c7f09ccf16e978d971c8fa16ff0 upstream. + +My Fujitsu-Siemens Lifebook S6120 doesn't have the FUJ02E3 device, +but it does have FUJ02B1. That means we do register the backlight +device (and it even seems to work), but the code will oops as soon +as we try to set the backlight brightness because it's trying to +call call_fext_func() with a NULL device. Let's just skip those +function calls when the FUJ02E3 device is not present. + +Cc: Jonathan Woithe +Cc: Andy Shevchenko +Signed-off-by: Ville Syrjälä +Signed-off-by: Darren Hart (VMware) +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/platform/x86/fujitsu-laptop.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +--- a/drivers/platform/x86/fujitsu-laptop.c ++++ b/drivers/platform/x86/fujitsu-laptop.c +@@ -254,10 +254,12 @@ static int bl_update_status(struct backl + { + struct acpi_device *device = bl_get_data(b); + +- if (b->props.power == FB_BLANK_POWERDOWN) +- call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x3); +- else +- call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x0); ++ if (fext) { ++ if (b->props.power == FB_BLANK_POWERDOWN) ++ call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x3); ++ else ++ call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x0); ++ } + + return set_lcd_level(device, b->props.brightness); + } diff --git a/queue-4.13/pm-opp-call-notifier-without-holding-opp_table-lock.patch b/queue-4.13/pm-opp-call-notifier-without-holding-opp_table-lock.patch new file mode 100644 index 00000000000..4d505627dcc --- /dev/null +++ b/queue-4.13/pm-opp-call-notifier-without-holding-opp_table-lock.patch @@ -0,0 +1,58 @@ +From e4d8ae00169f7686e1da5a62e5cf797d12bf8822 Mon Sep 17 00:00:00 2001 +From: Viresh Kumar +Date: Thu, 21 Sep 2017 10:44:36 -0700 +Subject: PM / OPP: Call notifier without holding opp_table->lock + +From: Viresh Kumar + +commit 
e4d8ae00169f7686e1da5a62e5cf797d12bf8822 upstream. + +The notifier callbacks may want to call some OPP helper routines which +may try to take the same opp_table->lock again and cause a deadlock. One +such usecase was reported by Chanwoo Choi, where calling +dev_pm_opp_disable() leads us to the devfreq's OPP notifier handler, +which further calls dev_pm_opp_find_freq_floor() and it deadlocks. + +We don't really need the opp_table->lock to be held across the notifier +call though, all we want to make sure is that the 'opp' doesn't get +freed while being used from within the notifier chain. We can do it with +help of dev_pm_opp_get/put() as well. Let's do it. + +Fixes: 5b650b388844 "PM / OPP: Take kref from _find_opp_table()" +Reported-by: Chanwoo Choi +Tested-by: Chanwoo Choi +Reviewed-by: Stephen Boyd +Reviewed-by: Chanwoo Choi +Signed-off-by: Viresh Kumar +Signed-off-by: Rafael J. Wysocki +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/base/power/opp/core.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/drivers/base/power/opp/core.c ++++ b/drivers/base/power/opp/core.c +@@ -1581,6 +1581,9 @@ static int _opp_set_availability(struct + + opp->available = availability_req; + ++ dev_pm_opp_get(opp); ++ mutex_unlock(&opp_table->lock); ++ + /* Notify the change of the OPP availability */ + if (availability_req) + blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ENABLE, +@@ -1589,8 +1592,12 @@ static int _opp_set_availability(struct + blocking_notifier_call_chain(&opp_table->head, + OPP_EVENT_DISABLE, opp); + ++ dev_pm_opp_put(opp); ++ goto put_table; ++ + unlock: + mutex_unlock(&opp_table->lock); ++put_table: + dev_pm_opp_put_opp_table(opp_table); + return r; + } diff --git a/queue-4.13/sched-sysctl-check-user-input-value-of-sysctl_sched_time_avg.patch b/queue-4.13/sched-sysctl-check-user-input-value-of-sysctl_sched_time_avg.patch new file mode 100644 index 00000000000..85f06a76500 --- /dev/null +++ 
b/queue-4.13/sched-sysctl-check-user-input-value-of-sysctl_sched_time_avg.patch @@ -0,0 +1,83 @@ +From 5ccba44ba118a5000cccc50076b0344632459779 Mon Sep 17 00:00:00 2001 +From: Ethan Zhao +Date: Mon, 4 Sep 2017 13:59:34 +0800 +Subject: sched/sysctl: Check user input value of sysctl_sched_time_avg + +From: Ethan Zhao + +commit 5ccba44ba118a5000cccc50076b0344632459779 upstream. + +System will hang if user set sysctl_sched_time_avg to 0: + + [root@XXX ~]# sysctl kernel.sched_time_avg_ms=0 + + Stack traceback for pid 0 + 0xffff883f6406c600 0 0 1 3 R 0xffff883f6406cf50 *swapper/3 + ffff883f7ccc3ae8 0000000000000018 ffffffff810c4dd0 0000000000000000 + 0000000000017800 ffff883f7ccc3d78 0000000000000003 ffff883f7ccc3bf8 + ffffffff810c4fc9 ffff883f7ccc3c08 00000000810c5043 ffff883f7ccc3c08 + Call Trace: + [] ? update_group_capacity+0x110/0x200 + [] ? update_sd_lb_stats+0x109/0x600 + [] ? find_busiest_group+0x47/0x530 + [] ? load_balance+0x194/0x900 + [] ? update_rq_clock.part.83+0x1a/0xe0 + [] ? rebalance_domains+0x152/0x290 + [] ? run_rebalance_domains+0xdc/0x1d0 + [] ? __do_softirq+0xfb/0x320 + [] ? irq_exit+0x125/0x130 + [] ? scheduler_ipi+0x97/0x160 + [] ? smp_reschedule_interrupt+0x29/0x30 + [] ? reschedule_interrupt+0x6e/0x80 + [] ? cpuidle_enter_state+0xcc/0x230 + [] ? cpuidle_enter_state+0x9c/0x230 + [] ? cpuidle_enter+0x17/0x20 + [] ? cpu_startup_entry+0x38c/0x420 + [] ? start_secondary+0x173/0x1e0 + +Because divide-by-zero error happens in function: + +update_group_capacity() + update_cpu_capacity() + scale_rt_capacity() + { + ... + total = sched_avg_period() + delta; + used = div_u64(avg, total); + ... + } + +To fix this issue, check user input value of sysctl_sched_time_avg, keep +it unchanged when hitting invalid input, and set the minimum limit of +sysctl_sched_time_avg to 1 ms. 
+ +Reported-by: James Puthukattukaran +Signed-off-by: Ethan Zhao +Signed-off-by: Peter Zijlstra (Intel) +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: efault@gmx.de +Cc: ethan.kernel@gmail.com +Cc: keescook@chromium.org +Cc: mcgrof@kernel.org +Link: http://lkml.kernel.org/r/1504504774-18253-1-git-send-email-ethan.zhao@oracle.com +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sysctl.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -367,7 +367,8 @@ static struct ctl_table kern_table[] = { + .data = &sysctl_sched_time_avg, + .maxlen = sizeof(unsigned int), + .mode = 0644, +- .proc_handler = proc_dointvec, ++ .proc_handler = proc_dointvec_minmax, ++ .extra1 = &one, + }, + #ifdef CONFIG_SCHEDSTATS + { diff --git a/queue-4.13/series b/queue-4.13/series index 44184652780..5c7ecc85572 100644 --- a/queue-4.13/series +++ b/queue-4.13/series @@ -73,3 +73,33 @@ extable-consolidate-kernel_text_address-functions.patch extable-enable-rcu-if-it-is-not-watching-in-kernel_text_address.patch selftests-seccomp-support-glibc-2.26-siginfo_t.h.patch seccomp-fix-the-usage-of-get-put_seccomp_filter-in-seccomp_get_filter.patch +arm64-make-sure-spsel-is-always-set.patch +arm64-mm-use-read_once-when-dereferencing-pointer-to-pte-table.patch +arm64-fault-route-pte-translation-faults-via-do_translation_fault.patch +kvm-vmx-extract-__pi_post_block.patch +kvm-vmx-avoid-double-list-add-with-vt-d-posted-interrupts.patch +kvm-vmx-simplify-and-fix-vmx_vcpu_pi_load.patch +kvm-nvmx-fix-host_cr3-host_cr4-cache.patch +kvm-x86-handle-async-pf-in-rcu-read-side-critical-sections.patch +kvm-vmx-do-not-bug-on-out-of-bounds-guest-irq.patch +kvm-nvmx-don-t-allow-l2-to-access-the-hardware-cr8.patch +xfs-validate-bdev-support-for-dax-inode-flag.patch +fix-infoleak-in-waitid-2.patch +sched-sysctl-check-user-input-value-of-sysctl_sched_time_avg.patch +irq-generic-chip-don-t-replace-domain-s-name.patch 
+mtd-fix-partition-alignment-check-on-multi-erasesize-devices.patch +mtd-nand-atmel-fix-buffer-overflow-in-atmel_pmecc_user.patch +etnaviv-fix-submit-error-path.patch +etnaviv-fix-gem-object-list-corruption.patch +futex-fix-pi_state-owner-serialization.patch +md-fix-a-race-condition-for-flush-request-handling.patch +md-separate-request-handling.patch +pci-fix-race-condition-with-driver_override.patch +btrfs-fix-null-pointer-dereference-from-free_reloc_roots.patch +btrfs-clear-ordered-flag-on-cleaning-up-ordered-extents.patch +btrfs-finish-ordered-extent-cleaning-if-no-progress-is-found.patch +btrfs-propagate-error-to-btrfs_cmp_data_prepare-caller.patch +btrfs-prevent-to-set-invalid-default-subvolid.patch +platform-x86-fujitsu-laptop-don-t-oops-when-fuj02e3-is-not-presnt.patch +pm-opp-call-notifier-without-holding-opp_table-lock.patch +x86-mm-fix-fault-error-path-using-unsafe-vma-pointer.patch diff --git a/queue-4.13/x86-mm-fix-fault-error-path-using-unsafe-vma-pointer.patch b/queue-4.13/x86-mm-fix-fault-error-path-using-unsafe-vma-pointer.patch new file mode 100644 index 00000000000..0c7db6b89b5 --- /dev/null +++ b/queue-4.13/x86-mm-fix-fault-error-path-using-unsafe-vma-pointer.patch @@ -0,0 +1,211 @@ +From a3c4fb7c9c2ebfd50b8c60f6c069932bb319bc37 Mon Sep 17 00:00:00 2001 +From: Laurent Dufour +Date: Mon, 4 Sep 2017 10:32:15 +0200 +Subject: x86/mm: Fix fault error path using unsafe vma pointer + +From: Laurent Dufour + +commit a3c4fb7c9c2ebfd50b8c60f6c069932bb319bc37 upstream. + +commit 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal +generation code") passes down a vma pointer to the error path, but that is +done once the mmap_sem is released when calling mm_fault_error() from +__do_page_fault(). + +This is dangerous as the vma structure is no more safe to be used once the +mmap_sem has been released. As only the protection key value is required in +the error processing, we could just pass down this value. 
+ +Fix it by passing a pointer to a protection key value down to the fault +signal generation code. The use of a pointer allows to keep the check +generating a warning message in fill_sig_info_pkey() when the vma was not +known. If the pointer is valid, the protection value can be accessed by +dereferencing the pointer. + +[ tglx: Made *pkey u32 as that's the type which is passed in siginfo ] + +Fixes: 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal generation code") +Signed-off-by: Laurent Dufour +Signed-off-by: Thomas Gleixner +Cc: linux-mm@kvack.org +Cc: Dave Hansen +Link: http://lkml.kernel.org/r/1504513935-12742-1-git-send-email-ldufour@linux.vnet.ibm.com +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/mm/fault.c | 47 ++++++++++++++++++++++++----------------------- + 1 file changed, 24 insertions(+), 23 deletions(-) + +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -192,8 +192,7 @@ is_prefetch(struct pt_regs *regs, unsign + * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really + * faulted on a pte with its pkey=4. + */ +-static void fill_sig_info_pkey(int si_code, siginfo_t *info, +- struct vm_area_struct *vma) ++static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey) + { + /* This is effectively an #ifdef */ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) +@@ -209,7 +208,7 @@ static void fill_sig_info_pkey(int si_co + * valid VMA, so we should never reach this without a + * valid VMA. + */ +- if (!vma) { ++ if (!pkey) { + WARN_ONCE(1, "PKU fault with no VMA passed in"); + info->si_pkey = 0; + return; +@@ -219,13 +218,12 @@ static void fill_sig_info_pkey(int si_co + * absolutely guranteed to be 100% accurate because of + * the race explained above.
+ */ +- info->si_pkey = vma_pkey(vma); ++ info->si_pkey = *pkey; + } + + static void + force_sig_info_fault(int si_signo, int si_code, unsigned long address, +- struct task_struct *tsk, struct vm_area_struct *vma, +- int fault) ++ struct task_struct *tsk, u32 *pkey, int fault) + { + unsigned lsb = 0; + siginfo_t info; +@@ -240,7 +238,7 @@ force_sig_info_fault(int si_signo, int s + lsb = PAGE_SHIFT; + info.si_addr_lsb = lsb; + +- fill_sig_info_pkey(si_code, &info, vma); ++ fill_sig_info_pkey(si_code, &info, pkey); + + force_sig_info(si_signo, &info, tsk); + } +@@ -758,8 +756,6 @@ no_context(struct pt_regs *regs, unsigne + struct task_struct *tsk = current; + unsigned long flags; + int sig; +- /* No context means no VMA to pass down */ +- struct vm_area_struct *vma = NULL; + + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs, X86_TRAP_PF)) { +@@ -784,7 +780,7 @@ no_context(struct pt_regs *regs, unsigne + + /* XXX: hwpoison faults will set the wrong code. */ + force_sig_info_fault(signal, si_code, address, +- tsk, vma, 0); ++ tsk, NULL, 0); + } + + /* +@@ -893,8 +889,7 @@ show_signal_msg(struct pt_regs *regs, un + + static void + __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, +- unsigned long address, struct vm_area_struct *vma, +- int si_code) ++ unsigned long address, u32 *pkey, int si_code) + { + struct task_struct *tsk = current; + +@@ -942,7 +937,7 @@ __bad_area_nosemaphore(struct pt_regs *r + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_PF; + +- force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0); ++ force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0); + + return; + } +@@ -955,9 +950,9 @@ __bad_area_nosemaphore(struct pt_regs *r + + static noinline void + bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, +- unsigned long address, struct vm_area_struct *vma) ++ unsigned long address, u32 *pkey) + { +- __bad_area_nosemaphore(regs, error_code, 
address, vma, SEGV_MAPERR); ++ __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR); + } + + static void +@@ -965,6 +960,10 @@ __bad_area(struct pt_regs *regs, unsigne + unsigned long address, struct vm_area_struct *vma, int si_code) + { + struct mm_struct *mm = current->mm; ++ u32 pkey; ++ ++ if (vma) ++ pkey = vma_pkey(vma); + + /* + * Something tried to access memory that isn't in our memory map.. +@@ -972,7 +971,8 @@ __bad_area(struct pt_regs *regs, unsigne + */ + up_read(&mm->mmap_sem); + +- __bad_area_nosemaphore(regs, error_code, address, vma, si_code); ++ __bad_area_nosemaphore(regs, error_code, address, ++ (vma) ? &pkey : NULL, si_code); + } + + static noinline void +@@ -1015,7 +1015,7 @@ bad_area_access_error(struct pt_regs *re + + static void + do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, +- struct vm_area_struct *vma, unsigned int fault) ++ u32 *pkey, unsigned int fault) + { + struct task_struct *tsk = current; + int code = BUS_ADRERR; +@@ -1042,13 +1042,12 @@ do_sigbus(struct pt_regs *regs, unsigned + code = BUS_MCEERR_AR; + } + #endif +- force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault); ++ force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault); + } + + static noinline void + mm_fault_error(struct pt_regs *regs, unsigned long error_code, +- unsigned long address, struct vm_area_struct *vma, +- unsigned int fault) ++ unsigned long address, u32 *pkey, unsigned int fault) + { + if (fatal_signal_pending(current) && !(error_code & PF_USER)) { + no_context(regs, error_code, address, 0, 0); +@@ -1072,9 +1071,9 @@ mm_fault_error(struct pt_regs *regs, uns + } else { + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) +- do_sigbus(regs, error_code, address, vma, fault); ++ do_sigbus(regs, error_code, address, pkey, fault); + else if (fault & VM_FAULT_SIGSEGV) +- bad_area_nosemaphore(regs, error_code, address, vma); ++ bad_area_nosemaphore(regs, error_code, address, 
pkey); + else + BUG(); + } +@@ -1268,6 +1267,7 @@ __do_page_fault(struct pt_regs *regs, un + struct mm_struct *mm; + int fault, major = 0; + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; ++ u32 pkey; + + tsk = current; + mm = tsk->mm; +@@ -1468,9 +1468,10 @@ good_area: + return; + } + ++ pkey = vma_pkey(vma); + up_read(&mm->mmap_sem); + if (unlikely(fault & VM_FAULT_ERROR)) { +- mm_fault_error(regs, error_code, address, vma, fault); ++ mm_fault_error(regs, error_code, address, &pkey, fault); + return; + } + diff --git a/queue-4.13/xfs-validate-bdev-support-for-dax-inode-flag.patch b/queue-4.13/xfs-validate-bdev-support-for-dax-inode-flag.patch new file mode 100644 index 00000000000..513d66724f8 --- /dev/null +++ b/queue-4.13/xfs-validate-bdev-support-for-dax-inode-flag.patch @@ -0,0 +1,50 @@ +From 6851a3db7e224bbb85e23b3c64a506c9e0904382 Mon Sep 17 00:00:00 2001 +From: Ross Zwisler +Date: Mon, 18 Sep 2017 14:46:03 -0700 +Subject: xfs: validate bdev support for DAX inode flag + +From: Ross Zwisler + +commit 6851a3db7e224bbb85e23b3c64a506c9e0904382 upstream. + +Currently only the blocksize is checked, but we should really be calling +bdev_dax_supported() which also tests to make sure we can get a +struct dax_device and that the dax_direct_access() path is working. + +This is the same check that we do for the "-o dax" mount option in +xfs_fs_fill_super(). + +This does not fix the race issues that caused the XFS DAX inode option to +be disabled, so that option will still be disabled. If/when we re-enable +it, though, I think we will want this issue to have been fixed. I also do +think that we want to fix this in stable kernels. + +Signed-off-by: Ross Zwisler +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_ioctl.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_ioctl.c ++++ b/fs/xfs/xfs_ioctl.c +@@ -1088,6 +1088,7 @@ xfs_ioctl_setattr_dax_invalidate( + int *join_flags) + { + struct inode *inode = VFS_I(ip); ++ struct super_block *sb = inode->i_sb; + int error; + + *join_flags = 0; +@@ -1100,7 +1101,7 @@ xfs_ioctl_setattr_dax_invalidate( + if (fa->fsx_xflags & FS_XFLAG_DAX) { + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) + return -EINVAL; +- if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE) ++ if (bdev_dax_supported(sb, sb->s_blocksize) < 0) + return -EINVAL; + } + -- 2.47.3