]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.13-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 2 Oct 2017 12:30:00 +0000 (14:30 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 2 Oct 2017 12:30:00 +0000 (14:30 +0200)
added patches:
arm64-fault-route-pte-translation-faults-via-do_translation_fault.patch
arm64-make-sure-spsel-is-always-set.patch
arm64-mm-use-read_once-when-dereferencing-pointer-to-pte-table.patch
btrfs-clear-ordered-flag-on-cleaning-up-ordered-extents.patch
btrfs-finish-ordered-extent-cleaning-if-no-progress-is-found.patch
btrfs-fix-null-pointer-dereference-from-free_reloc_roots.patch
btrfs-prevent-to-set-invalid-default-subvolid.patch
btrfs-propagate-error-to-btrfs_cmp_data_prepare-caller.patch
etnaviv-fix-gem-object-list-corruption.patch
etnaviv-fix-submit-error-path.patch
fix-infoleak-in-waitid-2.patch
futex-fix-pi_state-owner-serialization.patch
irq-generic-chip-don-t-replace-domain-s-name.patch
kvm-nvmx-don-t-allow-l2-to-access-the-hardware-cr8.patch
kvm-nvmx-fix-host_cr3-host_cr4-cache.patch
kvm-vmx-avoid-double-list-add-with-vt-d-posted-interrupts.patch
kvm-vmx-do-not-bug-on-out-of-bounds-guest-irq.patch
kvm-vmx-extract-__pi_post_block.patch
kvm-vmx-simplify-and-fix-vmx_vcpu_pi_load.patch
kvm-x86-handle-async-pf-in-rcu-read-side-critical-sections.patch
md-fix-a-race-condition-for-flush-request-handling.patch
md-separate-request-handling.patch
mtd-fix-partition-alignment-check-on-multi-erasesize-devices.patch
mtd-nand-atmel-fix-buffer-overflow-in-atmel_pmecc_user.patch
pci-fix-race-condition-with-driver_override.patch
platform-x86-fujitsu-laptop-don-t-oops-when-fuj02e3-is-not-presnt.patch
pm-opp-call-notifier-without-holding-opp_table-lock.patch
sched-sysctl-check-user-input-value-of-sysctl_sched_time_avg.patch
x86-mm-fix-fault-error-path-using-unsafe-vma-pointer.patch
xfs-validate-bdev-support-for-dax-inode-flag.patch

31 files changed:
queue-4.13/arm64-fault-route-pte-translation-faults-via-do_translation_fault.patch [new file with mode: 0644]
queue-4.13/arm64-make-sure-spsel-is-always-set.patch [new file with mode: 0644]
queue-4.13/arm64-mm-use-read_once-when-dereferencing-pointer-to-pte-table.patch [new file with mode: 0644]
queue-4.13/btrfs-clear-ordered-flag-on-cleaning-up-ordered-extents.patch [new file with mode: 0644]
queue-4.13/btrfs-finish-ordered-extent-cleaning-if-no-progress-is-found.patch [new file with mode: 0644]
queue-4.13/btrfs-fix-null-pointer-dereference-from-free_reloc_roots.patch [new file with mode: 0644]
queue-4.13/btrfs-prevent-to-set-invalid-default-subvolid.patch [new file with mode: 0644]
queue-4.13/btrfs-propagate-error-to-btrfs_cmp_data_prepare-caller.patch [new file with mode: 0644]
queue-4.13/etnaviv-fix-gem-object-list-corruption.patch [new file with mode: 0644]
queue-4.13/etnaviv-fix-submit-error-path.patch [new file with mode: 0644]
queue-4.13/fix-infoleak-in-waitid-2.patch [new file with mode: 0644]
queue-4.13/futex-fix-pi_state-owner-serialization.patch [new file with mode: 0644]
queue-4.13/irq-generic-chip-don-t-replace-domain-s-name.patch [new file with mode: 0644]
queue-4.13/kvm-nvmx-don-t-allow-l2-to-access-the-hardware-cr8.patch [new file with mode: 0644]
queue-4.13/kvm-nvmx-fix-host_cr3-host_cr4-cache.patch [new file with mode: 0644]
queue-4.13/kvm-vmx-avoid-double-list-add-with-vt-d-posted-interrupts.patch [new file with mode: 0644]
queue-4.13/kvm-vmx-do-not-bug-on-out-of-bounds-guest-irq.patch [new file with mode: 0644]
queue-4.13/kvm-vmx-extract-__pi_post_block.patch [new file with mode: 0644]
queue-4.13/kvm-vmx-simplify-and-fix-vmx_vcpu_pi_load.patch [new file with mode: 0644]
queue-4.13/kvm-x86-handle-async-pf-in-rcu-read-side-critical-sections.patch [new file with mode: 0644]
queue-4.13/md-fix-a-race-condition-for-flush-request-handling.patch [new file with mode: 0644]
queue-4.13/md-separate-request-handling.patch [new file with mode: 0644]
queue-4.13/mtd-fix-partition-alignment-check-on-multi-erasesize-devices.patch [new file with mode: 0644]
queue-4.13/mtd-nand-atmel-fix-buffer-overflow-in-atmel_pmecc_user.patch [new file with mode: 0644]
queue-4.13/pci-fix-race-condition-with-driver_override.patch [new file with mode: 0644]
queue-4.13/platform-x86-fujitsu-laptop-don-t-oops-when-fuj02e3-is-not-presnt.patch [new file with mode: 0644]
queue-4.13/pm-opp-call-notifier-without-holding-opp_table-lock.patch [new file with mode: 0644]
queue-4.13/sched-sysctl-check-user-input-value-of-sysctl_sched_time_avg.patch [new file with mode: 0644]
queue-4.13/series
queue-4.13/x86-mm-fix-fault-error-path-using-unsafe-vma-pointer.patch [new file with mode: 0644]
queue-4.13/xfs-validate-bdev-support-for-dax-inode-flag.patch [new file with mode: 0644]

diff --git a/queue-4.13/arm64-fault-route-pte-translation-faults-via-do_translation_fault.patch b/queue-4.13/arm64-fault-route-pte-translation-faults-via-do_translation_fault.patch
new file mode 100644 (file)
index 0000000..bede0c8
--- /dev/null
@@ -0,0 +1,65 @@
+From 760bfb47c36a07741a089bf6a28e854ffbee7dc9 Mon Sep 17 00:00:00 2001
+From: Will Deacon <will.deacon@arm.com>
+Date: Fri, 29 Sep 2017 12:27:41 +0100
+Subject: arm64: fault: Route pte translation faults via do_translation_fault
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 760bfb47c36a07741a089bf6a28e854ffbee7dc9 upstream.
+
+We currently route pte translation faults via do_page_fault, which elides
+the address check against TASK_SIZE before invoking the mm fault handling
+code. However, this can cause issues with the path walking code in
+conjunction with our word-at-a-time implementation because
+load_unaligned_zeropad can end up faulting in kernel space if it reads
+across a page boundary and runs into a page fault (e.g. by attempting to
+read from a guard region).
+
+In the case of such a fault, load_unaligned_zeropad has registered a
+fixup to shift the valid data and pad with zeroes, however the abort is
+reported as a level 3 translation fault and we dispatch it straight to
+do_page_fault, despite it being a kernel address. This results in calling
+a sleeping function from atomic context:
+
+  BUG: sleeping function called from invalid context at arch/arm64/mm/fault.c:313
+  in_atomic(): 0, irqs_disabled(): 0, pid: 10290
+  Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
+  [...]
+  [<ffffff8e016cd0cc>] ___might_sleep+0x134/0x144
+  [<ffffff8e016cd158>] __might_sleep+0x7c/0x8c
+  [<ffffff8e016977f0>] do_page_fault+0x140/0x330
+  [<ffffff8e01681328>] do_mem_abort+0x54/0xb0
+  Exception stack(0xfffffffb20247a70 to 0xfffffffb20247ba0)
+  [...]
+  [<ffffff8e016844fc>] el1_da+0x18/0x78
+  [<ffffff8e017f399c>] path_parentat+0x44/0x88
+  [<ffffff8e017f4c9c>] filename_parentat+0x5c/0xd8
+  [<ffffff8e017f5044>] filename_create+0x4c/0x128
+  [<ffffff8e017f59e4>] SyS_mkdirat+0x50/0xc8
+  [<ffffff8e01684e30>] el0_svc_naked+0x24/0x28
+  Code: 36380080 d5384100 f9400800 9402566d (d4210000)
+  ---[ end trace 2d01889f2bca9b9f ]---
+
+Fix this by dispatching all translation faults to do_translation_fault,
+which avoids invoking the page fault logic for faults on kernel addresses.
+
+Reported-by: Ankit Jain <ankijain@codeaurora.org>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/mm/fault.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/arm64/mm/fault.c
++++ b/arch/arm64/mm/fault.c
+@@ -614,7 +614,7 @@ static const struct fault_info fault_inf
+       { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 0 translation fault"     },
+       { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 1 translation fault"     },
+       { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 2 translation fault"     },
+-      { do_page_fault,        SIGSEGV, SEGV_MAPERR,   "level 3 translation fault"     },
++      { do_translation_fault, SIGSEGV, SEGV_MAPERR,   "level 3 translation fault"     },
+       { do_bad,               SIGBUS,  0,             "unknown 8"                     },
+       { do_page_fault,        SIGSEGV, SEGV_ACCERR,   "level 1 access flag fault"     },
+       { do_page_fault,        SIGSEGV, SEGV_ACCERR,   "level 2 access flag fault"     },
diff --git a/queue-4.13/arm64-make-sure-spsel-is-always-set.patch b/queue-4.13/arm64-make-sure-spsel-is-always-set.patch
new file mode 100644 (file)
index 0000000..9aa07d7
--- /dev/null
@@ -0,0 +1,40 @@
+From 5371513fb338fb9989c569dc071326d369d6ade8 Mon Sep 17 00:00:00 2001
+From: Marc Zyngier <marc.zyngier@arm.com>
+Date: Tue, 26 Sep 2017 15:57:16 +0100
+Subject: arm64: Make sure SPsel is always set
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit 5371513fb338fb9989c569dc071326d369d6ade8 upstream.
+
+When the kernel is entered at EL2 on an ARMv8.0 system, we construct
+the EL1 pstate and make sure this uses the EL1 stack pointer
+(we perform an exception return to EL1h).
+
+But if the kernel is either entered at EL1 or stays at EL2 (because
+we're on a VHE-capable system), we fail to set SPsel, and use whatever
+stack selection the higher exception level has chosen for us.
+
+Let's not take any chance, and make sure that SPsel is set to one
+before we decide the mode we're going to run in.
+
+Acked-by: Mark Rutland <mark.rutland@arm.com>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/kernel/head.S |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/arm64/kernel/head.S
++++ b/arch/arm64/kernel/head.S
+@@ -381,6 +381,7 @@ ENTRY(kimage_vaddr)
+  * booted in EL1 or EL2 respectively.
+  */
+ ENTRY(el2_setup)
++      msr     SPsel, #1                       // We want to use SP_EL{1,2}
+       mrs     x0, CurrentEL
+       cmp     x0, #CurrentEL_EL2
+       b.eq    1f
diff --git a/queue-4.13/arm64-mm-use-read_once-when-dereferencing-pointer-to-pte-table.patch b/queue-4.13/arm64-mm-use-read_once-when-dereferencing-pointer-to-pte-table.patch
new file mode 100644 (file)
index 0000000..13b2e91
--- /dev/null
@@ -0,0 +1,77 @@
+From f069faba688701c4d56b6c3452a130f97bf02e95 Mon Sep 17 00:00:00 2001
+From: Will Deacon <will.deacon@arm.com>
+Date: Fri, 29 Sep 2017 11:29:55 +0100
+Subject: arm64: mm: Use READ_ONCE when dereferencing pointer to pte table
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit f069faba688701c4d56b6c3452a130f97bf02e95 upstream.
+
+On kernels built with support for transparent huge pages, different CPUs
+can access the PMD concurrently due to e.g. fast GUP or page_vma_mapped_walk
+and they must take care to use READ_ONCE to avoid value tearing or caching
+of stale values by the compiler. Unfortunately, these functions call into
+our pgtable macros, which don't use READ_ONCE, and compiler caching has
+been observed to cause the following crash during ext4 writeback:
+
+PC is at check_pte+0x20/0x170
+LR is at page_vma_mapped_walk+0x2e0/0x540
+[...]
+Process doio (pid: 2463, stack limit = 0xffff00000f2e8000)
+Call trace:
+[<ffff000008233328>] check_pte+0x20/0x170
+[<ffff000008233758>] page_vma_mapped_walk+0x2e0/0x540
+[<ffff000008234adc>] page_mkclean_one+0xac/0x278
+[<ffff000008234d98>] rmap_walk_file+0xf0/0x238
+[<ffff000008236e74>] rmap_walk+0x64/0xa0
+[<ffff0000082370c8>] page_mkclean+0x90/0xa8
+[<ffff0000081f3c64>] clear_page_dirty_for_io+0x84/0x2a8
+[<ffff00000832f984>] mpage_submit_page+0x34/0x98
+[<ffff00000832fb4c>] mpage_process_page_bufs+0x164/0x170
+[<ffff00000832fc8c>] mpage_prepare_extent_to_map+0x134/0x2b8
+[<ffff00000833530c>] ext4_writepages+0x484/0xe30
+[<ffff0000081f6ab4>] do_writepages+0x44/0xe8
+[<ffff0000081e5bd4>] __filemap_fdatawrite_range+0xbc/0x110
+[<ffff0000081e5e68>] file_write_and_wait_range+0x48/0xd8
+[<ffff000008324310>] ext4_sync_file+0x80/0x4b8
+[<ffff0000082bd434>] vfs_fsync_range+0x64/0xc0
+[<ffff0000082332b4>] SyS_msync+0x194/0x1e8
+
+This is because page_vma_mapped_walk loads the PMD twice before calling
+pte_offset_map: the first time without READ_ONCE (where it gets all zeroes
+due to a concurrent pmdp_invalidate) and the second time with READ_ONCE
+(where it sees a valid table pointer due to a concurrent pmd_populate).
+However, the compiler inlines everything and caches the first value in
+a register, which is subsequently used in pte_offset_phys which returns
+a junk pointer that is later dereferenced when attempting to access the
+relevant pte.
+
+This patch fixes the issue by using READ_ONCE in pte_offset_phys to ensure
+that a stale value is not used. Whilst this is a point fix for a known
+failure (and simple to backport), a full fix moving all of our page table
+accessors over to {READ,WRITE}_ONCE and consistently using READ_ONCE in
+page_vma_mapped_walk is in the works for a future kernel release.
+
+Cc: Jon Masters <jcm@redhat.com>
+Cc: Timur Tabi <timur@codeaurora.org>
+Fixes: f27176cfc363 ("mm: convert page_mkclean_one() to use page_vma_mapped_walk()")
+Tested-by: Richard Ruigrok <rruigrok@codeaurora.org>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/include/asm/pgtable.h |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/arm64/include/asm/pgtable.h
++++ b/arch/arm64/include/asm/pgtable.h
+@@ -412,7 +412,7 @@ static inline phys_addr_t pmd_page_paddr
+ /* Find an entry in the third-level page table. */
+ #define pte_index(addr)               (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+-#define pte_offset_phys(dir,addr)     (pmd_page_paddr(*(dir)) + pte_index(addr) * sizeof(pte_t))
++#define pte_offset_phys(dir,addr)     (pmd_page_paddr(READ_ONCE(*(dir))) + pte_index(addr) * sizeof(pte_t))
+ #define pte_offset_kernel(dir,addr)   ((pte_t *)__va(pte_offset_phys((dir), (addr))))
+ #define pte_offset_map(dir,addr)      pte_offset_kernel((dir), (addr))
diff --git a/queue-4.13/btrfs-clear-ordered-flag-on-cleaning-up-ordered-extents.patch b/queue-4.13/btrfs-clear-ordered-flag-on-cleaning-up-ordered-extents.patch
new file mode 100644 (file)
index 0000000..b3e6a61
--- /dev/null
@@ -0,0 +1,58 @@
+From 63d71450c8d817649a79e37d685523f988b9cc98 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Fri, 1 Sep 2017 17:58:47 +0900
+Subject: btrfs: clear ordered flag on cleaning up ordered extents
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 63d71450c8d817649a79e37d685523f988b9cc98 upstream.
+
+Commit 524272607e88 ("btrfs: Handle delalloc error correctly to avoid
+ordered extent hang") introduced btrfs_cleanup_ordered_extents() to cleanup
+submitted ordered extents. However, it does not clear the ordered bit
+(Private2) of corresponding pages. Thus, the following BUG occurs from
+free_pages_check_bad() (on btrfs/125 with nospace_cache).
+
+BUG: Bad page state in process btrfs  pfn:3fa787
+page:ffffdf2acfe9e1c0 count:0 mapcount:0 mapping:          (null) index:0xd
+flags: 0x8000000000002008(uptodate|private_2)
+raw: 8000000000002008 0000000000000000 000000000000000d 00000000ffffffff
+raw: ffffdf2acf5c1b20 ffffb443802238b0 0000000000000000 0000000000000000
+page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
+bad because of flags: 0x2000(private_2)
+
+This patch clears the flag, as other places calling
+btrfs_dec_test_ordered_pending() do, for every page in the specified range.
+
+Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang")
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Reviewed-by: Qu Wenruo <quwenruo.btrfs@gmx.com>
+Reviewed-by: Josef Bacik <jbacik@fb.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c |   12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -135,6 +135,18 @@ static inline void btrfs_cleanup_ordered
+                                                const u64 offset,
+                                                const u64 bytes)
+ {
++      unsigned long index = offset >> PAGE_SHIFT;
++      unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
++      struct page *page;
++
++      while (index <= end_index) {
++              page = find_get_page(inode->i_mapping, index);
++              index++;
++              if (!page)
++                      continue;
++              ClearPagePrivate2(page);
++              put_page(page);
++      }
+       return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
+                                           bytes - PAGE_SIZE, false);
+ }
diff --git a/queue-4.13/btrfs-finish-ordered-extent-cleaning-if-no-progress-is-found.patch b/queue-4.13/btrfs-finish-ordered-extent-cleaning-if-no-progress-is-found.patch
new file mode 100644 (file)
index 0000000..9035538
--- /dev/null
@@ -0,0 +1,64 @@
+From 67c003f90fd68062d92a7ffade36f9b2a9098bd8 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Fri, 1 Sep 2017 17:59:07 +0900
+Subject: btrfs: finish ordered extent cleaning if no progress is found
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 67c003f90fd68062d92a7ffade36f9b2a9098bd8 upstream.
+
+__endio_write_update_ordered() repeats the search until it reaches the end
+of the specified range. This works well with the direct IO path, because
+before the function is called, it's ensured that there are ordered extents
+filling the whole range. It's not the case, however, when it's called from
+run_delalloc_range(): it is possible to have an error in the middle of the
+loop in e.g. run_delalloc_nocow(), so that there exists a range not covered
+by any ordered extents. When cleaning such an "incomplete" range,
+__endio_write_update_ordered() gets stuck at the offset where there are no
+ordered extents.
+
+Since the ordered extents are created from head to tail, we can stop the
+search if there is no offset progress.
+
+Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang")
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Reviewed-by: Qu Wenruo <quwenruo.btrfs@gmx.com>
+Reviewed-by: Josef Bacik <jbacik@fb.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -8309,6 +8309,7 @@ static void __endio_write_update_ordered
+       btrfs_work_func_t func;
+       u64 ordered_offset = offset;
+       u64 ordered_bytes = bytes;
++      u64 last_offset;
+       int ret;
+       if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+@@ -8320,6 +8321,7 @@ static void __endio_write_update_ordered
+       }
+ again:
++      last_offset = ordered_offset;
+       ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
+                                                  &ordered_offset,
+                                                  ordered_bytes,
+@@ -8331,6 +8333,12 @@ again:
+       btrfs_queue_work(wq, &ordered->work);
+ out_test:
+       /*
++       * If btrfs_dec_test_ordered_pending does not find any ordered extent
++       * in the range, we can exit.
++       */
++      if (ordered_offset == last_offset)
++              return;
++      /*
+        * our bio might span multiple ordered extents.  If we haven't
+        * completed the accounting for the whole dio, go back and try again
+        */
diff --git a/queue-4.13/btrfs-fix-null-pointer-dereference-from-free_reloc_roots.patch b/queue-4.13/btrfs-fix-null-pointer-dereference-from-free_reloc_roots.patch
new file mode 100644 (file)
index 0000000..a3dab80
--- /dev/null
@@ -0,0 +1,39 @@
+From bb166d7207432d3c7d10c45dc052f12ba3a2121d Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Fri, 25 Aug 2017 14:15:14 +0900
+Subject: btrfs: fix NULL pointer dereference from free_reloc_roots()
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit bb166d7207432d3c7d10c45dc052f12ba3a2121d upstream.
+
+__del_reloc_root() should be called before freeing up reloc_root->node.
+If not, __del_reloc_root() dereferences reloc_root->node, causing
+a system BUG.
+
+Fixes: 6bdf131fac23 ("Btrfs: don't leak reloc root nodes on error")
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/relocation.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -2393,11 +2393,11 @@ void free_reloc_roots(struct list_head *
+       while (!list_empty(list)) {
+               reloc_root = list_entry(list->next, struct btrfs_root,
+                                       root_list);
++              __del_reloc_root(reloc_root);
+               free_extent_buffer(reloc_root->node);
+               free_extent_buffer(reloc_root->commit_root);
+               reloc_root->node = NULL;
+               reloc_root->commit_root = NULL;
+-              __del_reloc_root(reloc_root);
+       }
+ }
diff --git a/queue-4.13/btrfs-prevent-to-set-invalid-default-subvolid.patch b/queue-4.13/btrfs-prevent-to-set-invalid-default-subvolid.patch
new file mode 100644 (file)
index 0000000..b1cfc8b
--- /dev/null
@@ -0,0 +1,37 @@
+From 6d6d282932d1a609e60dc4467677e0e863682f57 Mon Sep 17 00:00:00 2001
+From: satoru takeuchi <satoru.takeuchi@gmail.com>
+Date: Tue, 12 Sep 2017 22:42:52 +0900
+Subject: btrfs: prevent to set invalid default subvolid
+
+From: satoru takeuchi <satoru.takeuchi@gmail.com>
+
+commit 6d6d282932d1a609e60dc4467677e0e863682f57 upstream.
+
+`btrfs sub set-default` succeeds in setting an ID which doesn't correspond to
+any fs/file tree. If such a bad ID is set on a filesystem, we can't mount this
+filesystem without specifying the `subvol` or `subvolid` mount options.
+
+Fixes: 6ef5ed0d386b ("Btrfs: add ioctl and incompat flag to set the default mount subvol")
+Signed-off-by: Satoru Takeuchi <satoru.takeuchi@gmail.com>
+Reviewed-by: Qu Wenruo <quwenruo.btrfs@gmx.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ioctl.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -4072,6 +4072,10 @@ static long btrfs_ioctl_default_subvol(s
+               ret = PTR_ERR(new_root);
+               goto out;
+       }
++      if (!is_fstree(new_root->objectid)) {
++              ret = -ENOENT;
++              goto out;
++      }
+       path = btrfs_alloc_path();
+       if (!path) {
diff --git a/queue-4.13/btrfs-propagate-error-to-btrfs_cmp_data_prepare-caller.patch b/queue-4.13/btrfs-propagate-error-to-btrfs_cmp_data_prepare-caller.patch
new file mode 100644 (file)
index 0000000..ed4b665
--- /dev/null
@@ -0,0 +1,38 @@
+From 78ad4ce014d025f41b8dde3a81876832ead643cf Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Fri, 8 Sep 2017 17:48:55 +0900
+Subject: btrfs: propagate error to btrfs_cmp_data_prepare caller
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 78ad4ce014d025f41b8dde3a81876832ead643cf upstream.
+
+btrfs_cmp_data_prepare() (almost) always returns 0, i.e. it ignores errors
+from gather_extent_pages(). While the pages are freed by
+btrfs_cmp_data_free(), cmp->num_pages is still > 0. Then,
+btrfs_extent_same() tries to access the already freed pages, causing faults
+(or violating the PageLocked assertion).
+
+This patch just returns the error as is so that the caller stops the process.
+
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Fixes: f441460202cb ("btrfs: fix deadlock with extent-same and readpage")
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ioctl.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -3063,7 +3063,7 @@ static int btrfs_cmp_data_prepare(struct
+ out:
+       if (ret)
+               btrfs_cmp_data_free(cmp);
+-      return 0;
++      return ret;
+ }
+ static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp)
diff --git a/queue-4.13/etnaviv-fix-gem-object-list-corruption.patch b/queue-4.13/etnaviv-fix-gem-object-list-corruption.patch
new file mode 100644 (file)
index 0000000..9cafa93
--- /dev/null
@@ -0,0 +1,38 @@
+From 518417525f3652c12fb5fad6da4ade66c0072fa3 Mon Sep 17 00:00:00 2001
+From: Lucas Stach <l.stach@pengutronix.de>
+Date: Mon, 11 Sep 2017 15:29:31 +0200
+Subject: etnaviv: fix gem object list corruption
+
+From: Lucas Stach <l.stach@pengutronix.de>
+
+commit 518417525f3652c12fb5fad6da4ade66c0072fa3 upstream.
+
+All manipulations of the gem_object list need to be protected by
+the list mutex, as GEM objects can be created and freed in parallel.
+This fixes a kernel memory corruption.
+
+Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/gpu/drm/etnaviv/etnaviv_gem.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
++++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+@@ -551,12 +551,15 @@ static const struct etnaviv_gem_ops etna
+ void etnaviv_gem_free_object(struct drm_gem_object *obj)
+ {
+       struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj);
++      struct etnaviv_drm_private *priv = obj->dev->dev_private;
+       struct etnaviv_vram_mapping *mapping, *tmp;
+       /* object should not be active */
+       WARN_ON(is_active(etnaviv_obj));
++      mutex_lock(&priv->gem_lock);
+       list_del(&etnaviv_obj->gem_node);
++      mutex_unlock(&priv->gem_lock);
+       list_for_each_entry_safe(mapping, tmp, &etnaviv_obj->vram_list,
+                                obj_node) {
diff --git a/queue-4.13/etnaviv-fix-submit-error-path.patch b/queue-4.13/etnaviv-fix-submit-error-path.patch
new file mode 100644 (file)
index 0000000..29256df
--- /dev/null
@@ -0,0 +1,34 @@
+From 5a642e6bc49f59922e19ebd639e74f72753fc77b Mon Sep 17 00:00:00 2001
+From: Lucas Stach <l.stach@pengutronix.de>
+Date: Fri, 8 Sep 2017 16:24:32 +0200
+Subject: etnaviv: fix submit error path
+
+From: Lucas Stach <l.stach@pengutronix.de>
+
+commit 5a642e6bc49f59922e19ebd639e74f72753fc77b upstream.
+
+If the gpu submit fails, bail out to avoid accessing a potentially
+uninitialized fence.
+
+Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
++++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
+@@ -445,8 +445,10 @@ int etnaviv_ioctl_gem_submit(struct drm_
+       cmdbuf->user_size = ALIGN(args->stream_size, 8);
+       ret = etnaviv_gpu_submit(gpu, submit, cmdbuf);
+-      if (ret == 0)
+-              cmdbuf = NULL;
++      if (ret)
++              goto out;
++
++      cmdbuf = NULL;
+       if (args->flags & ETNA_SUBMIT_FENCE_FD_OUT) {
+               /*
diff --git a/queue-4.13/fix-infoleak-in-waitid-2.patch b/queue-4.13/fix-infoleak-in-waitid-2.patch
new file mode 100644 (file)
index 0000000..e18e71c
--- /dev/null
@@ -0,0 +1,65 @@
+From 6c85501f2fabcfc4fc6ed976543d252c4eaf4be9 Mon Sep 17 00:00:00 2001
+From: Al Viro <viro@zeniv.linux.org.uk>
+Date: Fri, 29 Sep 2017 13:43:15 -0400
+Subject: fix infoleak in waitid(2)
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+commit 6c85501f2fabcfc4fc6ed976543d252c4eaf4be9 upstream.
+
+kernel_waitid() can return a PID, an error or 0.  rusage is filled in the first
+case and waitid(2) rusage should've been copied out exactly in that case, *not*
+whenever kernel_waitid() has not returned an error.  Compat variant shares that
+braino; none of kernel_wait4() callers do, so the below ought to fix it.
+
+Reported-and-tested-by: Alexander Potapenko <glider@google.com>
+Fixes: ce72a16fa705 ("wait4(2)/waitid(2): separate copying rusage to userland")
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/exit.c |   23 ++++++++++-------------
+ 1 file changed, 10 insertions(+), 13 deletions(-)
+
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -1601,12 +1601,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_
+       struct waitid_info info = {.status = 0};
+       long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
+       int signo = 0;
++
+       if (err > 0) {
+               signo = SIGCHLD;
+               err = 0;
+-      }
+-
+-      if (!err) {
+               if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
+                       return -EFAULT;
+       }
+@@ -1724,16 +1722,15 @@ COMPAT_SYSCALL_DEFINE5(waitid,
+       if (err > 0) {
+               signo = SIGCHLD;
+               err = 0;
+-      }
+-
+-      if (!err && uru) {
+-              /* kernel_waitid() overwrites everything in ru */
+-              if (COMPAT_USE_64BIT_TIME)
+-                      err = copy_to_user(uru, &ru, sizeof(ru));
+-              else
+-                      err = put_compat_rusage(&ru, uru);
+-              if (err)
+-                      return -EFAULT;
++              if (uru) {
++                      /* kernel_waitid() overwrites everything in ru */
++                      if (COMPAT_USE_64BIT_TIME)
++                              err = copy_to_user(uru, &ru, sizeof(ru));
++                      else
++                              err = put_compat_rusage(&ru, uru);
++                      if (err)
++                              return -EFAULT;
++              }
+       }
+       if (!infop)
diff --git a/queue-4.13/futex-fix-pi_state-owner-serialization.patch b/queue-4.13/futex-fix-pi_state-owner-serialization.patch
new file mode 100644 (file)
index 0000000..7ca4c34
--- /dev/null
@@ -0,0 +1,124 @@
+From c74aef2d06a9f59cece89093eecc552933cba72a Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Fri, 22 Sep 2017 17:48:06 +0200
+Subject: futex: Fix pi_state->owner serialization
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit c74aef2d06a9f59cece89093eecc552933cba72a upstream.
+
+There was a reported suspicion about a race between exit_pi_state_list()
+and put_pi_state(). The same report mentioned that the comment on
+put_pi_state() says it must be called with hb->lock held, and that
+this is no longer true in all places.
+
+As it turns out, the pi_state->owner serialization is indeed broken. As per
+the new rules:
+
+  734009e96d19 ("futex: Change locking rules")
+
+pi_state->owner should be serialized by pi_state->pi_mutex.wait_lock.
+For the sites setting pi_state->owner we already hold wait_lock (where
+required), but exit_pi_state_list() and put_pi_state() did not take it
+and raced on clearing it.
+
+Fixes: 734009e96d19 ("futex: Change locking rules")
+Reported-by: Gratian Crisan <gratian.crisan@ni.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: dvhart@infradead.org
+Link: https://lkml.kernel.org/r/20170922154806.jd3ffltfk24m4o4y@hirez.programming.kicks-ass.net
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/futex.c |   33 ++++++++++++++++++++++-----------
+ 1 file changed, 22 insertions(+), 11 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi
+ /*
+  * Drops a reference to the pi_state object and frees or caches it
+  * when the last reference is gone.
+- *
+- * Must be called with the hb lock held.
+  */
+ static void put_pi_state(struct futex_pi_state *pi_state)
+ {
+@@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi
+        * and has cleaned up the pi_state already
+        */
+       if (pi_state->owner) {
+-              raw_spin_lock_irq(&pi_state->owner->pi_lock);
+-              list_del_init(&pi_state->list);
+-              raw_spin_unlock_irq(&pi_state->owner->pi_lock);
++              struct task_struct *owner;
+-              rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
++              raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
++              owner = pi_state->owner;
++              if (owner) {
++                      raw_spin_lock(&owner->pi_lock);
++                      list_del_init(&pi_state->list);
++                      raw_spin_unlock(&owner->pi_lock);
++              }
++              rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
++              raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+       }
+-      if (current->pi_state_cache)
++      if (current->pi_state_cache) {
+               kfree(pi_state);
+-      else {
++      } else {
+               /*
+                * pi_state->list is already empty.
+                * clear pi_state->owner.
+@@ -905,13 +909,14 @@ void exit_pi_state_list(struct task_stru
+               raw_spin_unlock_irq(&curr->pi_lock);
+               spin_lock(&hb->lock);
+-
+-              raw_spin_lock_irq(&curr->pi_lock);
++              raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
++              raw_spin_lock(&curr->pi_lock);
+               /*
+                * We dropped the pi-lock, so re-check whether this
+                * task still owns the PI-state:
+                */
+               if (head->next != next) {
++                      raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+                       spin_unlock(&hb->lock);
+                       continue;
+               }
+@@ -920,9 +925,10 @@ void exit_pi_state_list(struct task_stru
+               WARN_ON(list_empty(&pi_state->list));
+               list_del_init(&pi_state->list);
+               pi_state->owner = NULL;
+-              raw_spin_unlock_irq(&curr->pi_lock);
++              raw_spin_unlock(&curr->pi_lock);
+               get_pi_state(pi_state);
++              raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+               spin_unlock(&hb->lock);
+               rt_mutex_futex_unlock(&pi_state->pi_mutex);
+@@ -1204,6 +1210,10 @@ static int attach_to_pi_owner(u32 uval,
+       WARN_ON(!list_empty(&pi_state->list));
+       list_add(&pi_state->list, &p->pi_state_list);
++      /*
++       * Assignment without holding pi_state->pi_mutex.wait_lock is safe
++       * because there is no concurrency as the object is not published yet.
++       */
+       pi_state->owner = p;
+       raw_spin_unlock_irq(&p->pi_lock);
+@@ -2820,6 +2830,7 @@ retry:
+               raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+               spin_unlock(&hb->lock);
++              /* drops pi_state->pi_mutex.wait_lock */
+               ret = wake_futex_pi(uaddr, uval, pi_state);
+               put_pi_state(pi_state);
diff --git a/queue-4.13/irq-generic-chip-don-t-replace-domain-s-name.patch b/queue-4.13/irq-generic-chip-don-t-replace-domain-s-name.patch
new file mode 100644 (file)
index 0000000..96dbe05
--- /dev/null
@@ -0,0 +1,39 @@
+From 72364d320644c12948786962673772f271039a4a Mon Sep 17 00:00:00 2001
+From: Jeffy Chen <jeffy.chen@rock-chips.com>
+Date: Thu, 28 Sep 2017 12:37:31 +0800
+Subject: irq/generic-chip: Don't replace domain's name
+
+From: Jeffy Chen <jeffy.chen@rock-chips.com>
+
+commit 72364d320644c12948786962673772f271039a4a upstream.
+
+When generic irq chips are allocated for an irq domain the domain name is
+set to the irq chip name. That was done to have named domains before the
+recent changes which enforce domain naming were done.
+
+Since then the overwrite causes a memory leak when the domain name is
+dynamically allocated and even worse it would cause the domain free code to
+free the wrong name pointer, which might point to a constant.
+
+Remove the name assignment to prevent this.
+
+Fixes: d59f6617eef0 ("genirq: Allow fwnode to carry name information only")
+Signed-off-by: Jeffy Chen <jeffy.chen@rock-chips.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Link: https://lkml.kernel.org/r/20170928043731.4764-1-jeffy.chen@rock-chips.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/irq/generic-chip.c |    1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/kernel/irq/generic-chip.c
++++ b/kernel/irq/generic-chip.c
+@@ -322,7 +322,6 @@ int __irq_alloc_domain_generic_chips(str
+               /* Calc pointer to the next generic chip */
+               tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
+       }
+-      d->name = name;
+       return 0;
+ }
+ EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
diff --git a/queue-4.13/kvm-nvmx-don-t-allow-l2-to-access-the-hardware-cr8.patch b/queue-4.13/kvm-nvmx-don-t-allow-l2-to-access-the-hardware-cr8.patch
new file mode 100644 (file)
index 0000000..26e4191
--- /dev/null
@@ -0,0 +1,39 @@
+From 51aa68e7d57e3217192d88ce90fd5b8ef29ec94f Mon Sep 17 00:00:00 2001
+From: Jim Mattson <jmattson@google.com>
+Date: Tue, 12 Sep 2017 13:02:54 -0700
+Subject: kvm: nVMX: Don't allow L2 to access the hardware CR8
+
+From: Jim Mattson <jmattson@google.com>
+
+commit 51aa68e7d57e3217192d88ce90fd5b8ef29ec94f upstream.
+
+If L1 does not specify the "use TPR shadow" VM-execution control in
+vmcs12, then L0 must specify the "CR8-load exiting" and "CR8-store
+exiting" VM-execution controls in vmcs02. Failure to do so will give
+the L2 VM unrestricted read/write access to the hardware CR8.
+
+This fixes CVE-2017-12154.
+
+Signed-off-by: Jim Mattson <jmattson@google.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx.c |    5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -10271,6 +10271,11 @@ static int prepare_vmcs02(struct kvm_vcp
+       if (exec_control & CPU_BASED_TPR_SHADOW) {
+               vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
+               vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
++      } else {
++#ifdef CONFIG_X86_64
++              exec_control |= CPU_BASED_CR8_LOAD_EXITING |
++                              CPU_BASED_CR8_STORE_EXITING;
++#endif
+       }
+       /*
diff --git a/queue-4.13/kvm-nvmx-fix-host_cr3-host_cr4-cache.patch b/queue-4.13/kvm-nvmx-fix-host_cr3-host_cr4-cache.patch
new file mode 100644 (file)
index 0000000..482ec79
--- /dev/null
@@ -0,0 +1,83 @@
+From 44889942b6eb356eab27ce25fe10701adfec7776 Mon Sep 17 00:00:00 2001
+From: Ladi Prosek <lprosek@redhat.com>
+Date: Fri, 22 Sep 2017 07:53:15 +0200
+Subject: KVM: nVMX: fix HOST_CR3/HOST_CR4 cache
+
+From: Ladi Prosek <lprosek@redhat.com>
+
+commit 44889942b6eb356eab27ce25fe10701adfec7776 upstream.
+
+For nested virt we maintain multiple VMCS that can run on a vCPU. So it is
+incorrect to keep vmcs_host_cr3 and vmcs_host_cr4, whose purpose is caching
+the value of the rarely changing HOST_CR3 and HOST_CR4 VMCS fields, in
+vCPU-wide data structures.
+
+Hyper-V nested on KVM runs into this consistently for me with PCID enabled.
+CR3 is updated with a new value, unlikely(cr3 != vmx->host_state.vmcs_host_cr3)
+fires, and the currently loaded VMCS is updated. Then we switch from L2 to
+L1 and the next exit reverts CR3 to its old value.
+
+Fixes: d6e41f1151fe ("x86/mm, KVM: Teach KVM's VMX code that CR3 isn't a constant")
+Signed-off-by: Ladi Prosek <lprosek@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx.c |   16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -200,6 +200,8 @@ struct loaded_vmcs {
+       int cpu;
+       bool launched;
+       bool nmi_known_unmasked;
++      unsigned long vmcs_host_cr3;    /* May not match real cr3 */
++      unsigned long vmcs_host_cr4;    /* May not match real cr4 */
+       struct list_head loaded_vmcss_on_cpu_link;
+ };
+@@ -595,8 +597,6 @@ struct vcpu_vmx {
+               int           gs_ldt_reload_needed;
+               int           fs_reload_needed;
+               u64           msr_host_bndcfgs;
+-              unsigned long vmcs_host_cr3;    /* May not match real cr3 */
+-              unsigned long vmcs_host_cr4;    /* May not match real cr4 */
+       } host_state;
+       struct {
+               int vm86_active;
+@@ -5138,12 +5138,12 @@ static void vmx_set_constant_host_state(
+        */
+       cr3 = __read_cr3();
+       vmcs_writel(HOST_CR3, cr3);             /* 22.2.3  FIXME: shadow tables */
+-      vmx->host_state.vmcs_host_cr3 = cr3;
++      vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
+       /* Save the most likely value for this task's CR4 in the VMCS. */
+       cr4 = cr4_read_shadow();
+       vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
+-      vmx->host_state.vmcs_host_cr4 = cr4;
++      vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
+       vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
+ #ifdef CONFIG_X86_64
+@@ -8992,15 +8992,15 @@ static void __noclone vmx_vcpu_run(struc
+               vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+       cr3 = __get_current_cr3_fast();
+-      if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
++      if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) {
+               vmcs_writel(HOST_CR3, cr3);
+-              vmx->host_state.vmcs_host_cr3 = cr3;
++              vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
+       }
+       cr4 = cr4_read_shadow();
+-      if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
++      if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) {
+               vmcs_writel(HOST_CR4, cr4);
+-              vmx->host_state.vmcs_host_cr4 = cr4;
++              vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
+       }
+       /* When single-stepping over STI and MOV SS, we must clear the
diff --git a/queue-4.13/kvm-vmx-avoid-double-list-add-with-vt-d-posted-interrupts.patch b/queue-4.13/kvm-vmx-avoid-double-list-add-with-vt-d-posted-interrupts.patch
new file mode 100644 (file)
index 0000000..9adfcbc
--- /dev/null
@@ -0,0 +1,157 @@
+From 8b306e2f3c41939ea528e6174c88cfbfff893ce1 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Tue, 6 Jun 2017 12:57:05 +0200
+Subject: KVM: VMX: avoid double list add with VT-d posted interrupts
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 8b306e2f3c41939ea528e6174c88cfbfff893ce1 upstream.
+
+In some cases, for example involving hot-unplug of assigned
+devices, pi_post_block can forget to remove the vCPU from the
+blocked_vcpu_list.  When this happens, the next call to
+pi_pre_block corrupts the list.
+
+Fix this in two ways.  First, check vcpu->pre_pcpu in pi_pre_block
+and WARN instead of adding the element twice in the list.  Second,
+always do the list removal in pi_post_block if vcpu->pre_pcpu is
+set (not -1).
+
+The new code keeps interrupts disabled for the whole duration of
+pi_pre_block/pi_post_block.  This is not strictly necessary, but
+easier to follow.  For the same reason, PI.ON is checked only
+after the cmpxchg, and to handle it we just call the post-block
+code.  This removes duplication of the list removal code.
+
+Cc: Huangweidong <weidong.huang@huawei.com>
+Cc: Gonglei <arei.gonglei@huawei.com>
+Cc: wangxin <wangxinxin.wang@huawei.com>
+Cc: Radim Krčmář <rkrcmar@redhat.com>
+Tested-by: Longpeng (Mike) <longpeng2@huawei.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx.c |   62 +++++++++++++++++++++--------------------------------
+ 1 file changed, 25 insertions(+), 37 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -11394,10 +11394,11 @@ static void __pi_post_block(struct kvm_v
+       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+       struct pi_desc old, new;
+       unsigned int dest;
+-      unsigned long flags;
+       do {
+               old.control = new.control = pi_desc->control;
++              WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
++                   "Wakeup handler not enabled while the VCPU is blocked\n");
+               dest = cpu_physical_id(vcpu->cpu);
+@@ -11414,14 +11415,10 @@ static void __pi_post_block(struct kvm_v
+       } while (cmpxchg(&pi_desc->control, old.control,
+                       new.control) != old.control);
+-      if(vcpu->pre_pcpu != -1) {
+-              spin_lock_irqsave(
+-                      &per_cpu(blocked_vcpu_on_cpu_lock,
+-                      vcpu->pre_pcpu), flags);
++      if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
++              spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+               list_del(&vcpu->blocked_vcpu_list);
+-              spin_unlock_irqrestore(
+-                      &per_cpu(blocked_vcpu_on_cpu_lock,
+-                      vcpu->pre_pcpu), flags);
++              spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+               vcpu->pre_pcpu = -1;
+       }
+ }
+@@ -11441,7 +11438,6 @@ static void __pi_post_block(struct kvm_v
+  */
+ static int pi_pre_block(struct kvm_vcpu *vcpu)
+ {
+-      unsigned long flags;
+       unsigned int dest;
+       struct pi_desc old, new;
+       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+@@ -11451,34 +11447,20 @@ static int pi_pre_block(struct kvm_vcpu
+               !kvm_vcpu_apicv_active(vcpu))
+               return 0;
+-      vcpu->pre_pcpu = vcpu->cpu;
+-      spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+-                        vcpu->pre_pcpu), flags);
+-      list_add_tail(&vcpu->blocked_vcpu_list,
+-                    &per_cpu(blocked_vcpu_on_cpu,
+-                    vcpu->pre_pcpu));
+-      spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
+-                             vcpu->pre_pcpu), flags);
++      WARN_ON(irqs_disabled());
++      local_irq_disable();
++      if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
++              vcpu->pre_pcpu = vcpu->cpu;
++              spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
++              list_add_tail(&vcpu->blocked_vcpu_list,
++                            &per_cpu(blocked_vcpu_on_cpu,
++                                     vcpu->pre_pcpu));
++              spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
++      }
+       do {
+               old.control = new.control = pi_desc->control;
+-              /*
+-               * We should not block the vCPU if
+-               * an interrupt is posted for it.
+-               */
+-              if (pi_test_on(pi_desc) == 1) {
+-                      spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+-                                        vcpu->pre_pcpu), flags);
+-                      list_del(&vcpu->blocked_vcpu_list);
+-                      spin_unlock_irqrestore(
+-                                      &per_cpu(blocked_vcpu_on_cpu_lock,
+-                                      vcpu->pre_pcpu), flags);
+-                      vcpu->pre_pcpu = -1;
+-
+-                      return 1;
+-              }
+-
+               WARN((pi_desc->sn == 1),
+                    "Warning: SN field of posted-interrupts "
+                    "is set before blocking\n");
+@@ -11503,7 +11485,12 @@ static int pi_pre_block(struct kvm_vcpu
+       } while (cmpxchg(&pi_desc->control, old.control,
+                       new.control) != old.control);
+-      return 0;
++      /* We should not block the vCPU if an interrupt is posted for it.  */
++      if (pi_test_on(pi_desc) == 1)
++              __pi_post_block(vcpu);
++
++      local_irq_enable();
++      return (vcpu->pre_pcpu == -1);
+ }
+ static int vmx_pre_block(struct kvm_vcpu *vcpu)
+@@ -11519,12 +11506,13 @@ static int vmx_pre_block(struct kvm_vcpu
+ static void pi_post_block(struct kvm_vcpu *vcpu)
+ {
+-      if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+-              !irq_remapping_cap(IRQ_POSTING_CAP)  ||
+-              !kvm_vcpu_apicv_active(vcpu))
++      if (vcpu->pre_pcpu == -1)
+               return;
++      WARN_ON(irqs_disabled());
++      local_irq_disable();
+       __pi_post_block(vcpu);
++      local_irq_enable();
+ }
+ static void vmx_post_block(struct kvm_vcpu *vcpu)
diff --git a/queue-4.13/kvm-vmx-do-not-bug-on-out-of-bounds-guest-irq.patch b/queue-4.13/kvm-vmx-do-not-bug-on-out-of-bounds-guest-irq.patch
new file mode 100644 (file)
index 0000000..d2dde79
--- /dev/null
@@ -0,0 +1,57 @@
+From 3a8b0677fc6180a467e26cc32ce6b0c09a32f9bb Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= <jschoenh@amazon.de>
+Date: Thu, 7 Sep 2017 19:02:30 +0100
+Subject: KVM: VMX: Do not BUG() on out-of-bounds guest IRQ
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Jan H. Schönherr <jschoenh@amazon.de>
+
+commit 3a8b0677fc6180a467e26cc32ce6b0c09a32f9bb upstream.
+
+The value of the guest_irq argument to vmx_update_pi_irte() is
+ultimately coming from a KVM_IRQFD API call. Do not BUG() in
+vmx_update_pi_irte() if the value is out-of-bounds. (Especially,
+since KVM as a whole seems to hang after that.)
+
+Instead, print a message only once if we find that we don't have a
+route for a certain IRQ (which can be out-of-bounds or within the
+array).
+
+This fixes CVE-2017-1000252.
+
+Fixes: efc644048ecde54 ("KVM: x86: Update IRTE for posted-interrupts")
+Signed-off-by: Jan H. Schönherr <jschoenh@amazon.de>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx.c |    9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -11542,7 +11542,7 @@ static int vmx_update_pi_irte(struct kvm
+       struct kvm_lapic_irq irq;
+       struct kvm_vcpu *vcpu;
+       struct vcpu_data vcpu_info;
+-      int idx, ret = -EINVAL;
++      int idx, ret = 0;
+       if (!kvm_arch_has_assigned_device(kvm) ||
+               !irq_remapping_cap(IRQ_POSTING_CAP) ||
+@@ -11551,7 +11551,12 @@ static int vmx_update_pi_irte(struct kvm
+       idx = srcu_read_lock(&kvm->irq_srcu);
+       irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+-      BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
++      if (guest_irq >= irq_rt->nr_rt_entries ||
++          hlist_empty(&irq_rt->map[guest_irq])) {
++              pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
++                           guest_irq, irq_rt->nr_rt_entries);
++              goto out;
++      }
+       hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+               if (e->type != KVM_IRQ_ROUTING_MSI)
diff --git a/queue-4.13/kvm-vmx-extract-__pi_post_block.patch b/queue-4.13/kvm-vmx-extract-__pi_post_block.patch
new file mode 100644 (file)
index 0000000..eb0d9cb
--- /dev/null
@@ -0,0 +1,118 @@
+From cd39e1176d320157831ce030b4c869bd2d5eb142 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Tue, 6 Jun 2017 12:57:04 +0200
+Subject: KVM: VMX: extract __pi_post_block
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit cd39e1176d320157831ce030b4c869bd2d5eb142 upstream.
+
+Simple code movement patch, preparing for the next one.
+
+Cc: Huangweidong <weidong.huang@huawei.com>
+Cc: Gonglei <arei.gonglei@huawei.com>
+Cc: wangxin <wangxinxin.wang@huawei.com>
+Cc: Radim Krčmář <rkrcmar@redhat.com>
+Tested-by: Longpeng (Mike) <longpeng2@huawei.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx.c |   71 ++++++++++++++++++++++++++++-------------------------
+ 1 file changed, 38 insertions(+), 33 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -11389,6 +11389,43 @@ static void vmx_enable_log_dirty_pt_mask
+       kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
+ }
++static void __pi_post_block(struct kvm_vcpu *vcpu)
++{
++      struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
++      struct pi_desc old, new;
++      unsigned int dest;
++      unsigned long flags;
++
++      do {
++              old.control = new.control = pi_desc->control;
++
++              dest = cpu_physical_id(vcpu->cpu);
++
++              if (x2apic_enabled())
++                      new.ndst = dest;
++              else
++                      new.ndst = (dest << 8) & 0xFF00;
++
++              /* Allow posting non-urgent interrupts */
++              new.sn = 0;
++
++              /* set 'NV' to 'notification vector' */
++              new.nv = POSTED_INTR_VECTOR;
++      } while (cmpxchg(&pi_desc->control, old.control,
++                      new.control) != old.control);
++
++      if(vcpu->pre_pcpu != -1) {
++              spin_lock_irqsave(
++                      &per_cpu(blocked_vcpu_on_cpu_lock,
++                      vcpu->pre_pcpu), flags);
++              list_del(&vcpu->blocked_vcpu_list);
++              spin_unlock_irqrestore(
++                      &per_cpu(blocked_vcpu_on_cpu_lock,
++                      vcpu->pre_pcpu), flags);
++              vcpu->pre_pcpu = -1;
++      }
++}
++
+ /*
+  * This routine does the following things for vCPU which is going
+  * to be blocked if VT-d PI is enabled.
+@@ -11482,44 +11519,12 @@ static int vmx_pre_block(struct kvm_vcpu
+ static void pi_post_block(struct kvm_vcpu *vcpu)
+ {
+-      struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+-      struct pi_desc old, new;
+-      unsigned int dest;
+-      unsigned long flags;
+-
+       if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+               !irq_remapping_cap(IRQ_POSTING_CAP)  ||
+               !kvm_vcpu_apicv_active(vcpu))
+               return;
+-      do {
+-              old.control = new.control = pi_desc->control;
+-
+-              dest = cpu_physical_id(vcpu->cpu);
+-
+-              if (x2apic_enabled())
+-                      new.ndst = dest;
+-              else
+-                      new.ndst = (dest << 8) & 0xFF00;
+-
+-              /* Allow posting non-urgent interrupts */
+-              new.sn = 0;
+-
+-              /* set 'NV' to 'notification vector' */
+-              new.nv = POSTED_INTR_VECTOR;
+-      } while (cmpxchg(&pi_desc->control, old.control,
+-                      new.control) != old.control);
+-
+-      if(vcpu->pre_pcpu != -1) {
+-              spin_lock_irqsave(
+-                      &per_cpu(blocked_vcpu_on_cpu_lock,
+-                      vcpu->pre_pcpu), flags);
+-              list_del(&vcpu->blocked_vcpu_list);
+-              spin_unlock_irqrestore(
+-                      &per_cpu(blocked_vcpu_on_cpu_lock,
+-                      vcpu->pre_pcpu), flags);
+-              vcpu->pre_pcpu = -1;
+-      }
++      __pi_post_block(vcpu);
+ }
+ static void vmx_post_block(struct kvm_vcpu *vcpu)
diff --git a/queue-4.13/kvm-vmx-simplify-and-fix-vmx_vcpu_pi_load.patch b/queue-4.13/kvm-vmx-simplify-and-fix-vmx_vcpu_pi_load.patch
new file mode 100644 (file)
index 0000000..ecd19c6
--- /dev/null
@@ -0,0 +1,130 @@
+From 31afb2ea2b10a7d17ce3db4cdb0a12b63b2fe08a Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Tue, 6 Jun 2017 12:57:06 +0200
+Subject: KVM: VMX: simplify and fix vmx_vcpu_pi_load
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 31afb2ea2b10a7d17ce3db4cdb0a12b63b2fe08a upstream.
+
+The simplify part: do not touch pi_desc.nv, we can set it when the
+VCPU is first created.  Likewise, pi_desc.sn is only handled by
+vmx_vcpu_pi_load, do not touch it in __pi_post_block.
+
+The fix part: do not check kvm_arch_has_assigned_device, instead
+check the SN bit to figure out whether vmx_vcpu_pi_put ran before.
+This matches what the previous patch did in pi_post_block.
+
+Cc: Huangweidong <weidong.huang@huawei.com>
+Cc: Gonglei <arei.gonglei@huawei.com>
+Cc: wangxin <wangxinxin.wang@huawei.com>
+Cc: Radim Krčmář <rkrcmar@redhat.com>
+Tested-by: Longpeng (Mike) <longpeng2@huawei.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx.c |   68 +++++++++++++++++++++++++++--------------------------
+ 1 file changed, 35 insertions(+), 33 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -2187,43 +2187,41 @@ static void vmx_vcpu_pi_load(struct kvm_
+       struct pi_desc old, new;
+       unsigned int dest;
+-      if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+-              !irq_remapping_cap(IRQ_POSTING_CAP)  ||
+-              !kvm_vcpu_apicv_active(vcpu))
++      /*
++       * In case of hot-plug or hot-unplug, we may have to undo
++       * vmx_vcpu_pi_put even if there is no assigned device.  And we
++       * always keep PI.NDST up to date for simplicity: it makes the
++       * code easier, and CPU migration is not a fast path.
++       */
++      if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
++              return;
++
++      /*
++       * First handle the simple case where no cmpxchg is necessary; just
++       * allow posting non-urgent interrupts.
++       *
++       * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
++       * PI.NDST: pi_post_block will do it for us and the wakeup_handler
++       * expects the VCPU to be on the blocked_vcpu_list that matches
++       * PI.NDST.
++       */
++      if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
++          vcpu->cpu == cpu) {
++              pi_clear_sn(pi_desc);
+               return;
++      }
++      /* The full case.  */
+       do {
+               old.control = new.control = pi_desc->control;
+-              /*
+-               * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
+-               * are two possible cases:
+-               * 1. After running 'pre_block', context switch
+-               *    happened. For this case, 'sn' was set in
+-               *    vmx_vcpu_put(), so we need to clear it here.
+-               * 2. After running 'pre_block', we were blocked,
+-               *    and woken up by some other guy. For this case,
+-               *    we don't need to do anything, 'pi_post_block'
+-               *    will do everything for us. However, we cannot
+-               *    check whether it is case #1 or case #2 here
+-               *    (maybe, not needed), so we also clear sn here,
+-               *    I think it is not a big deal.
+-               */
+-              if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
+-                      if (vcpu->cpu != cpu) {
+-                              dest = cpu_physical_id(cpu);
+-
+-                              if (x2apic_enabled())
+-                                      new.ndst = dest;
+-                              else
+-                                      new.ndst = (dest << 8) & 0xFF00;
+-                      }
++              dest = cpu_physical_id(cpu);
+-                      /* set 'NV' to 'notification vector' */
+-                      new.nv = POSTED_INTR_VECTOR;
+-              }
++              if (x2apic_enabled())
++                      new.ndst = dest;
++              else
++                      new.ndst = (dest << 8) & 0xFF00;
+-              /* Allow posting non-urgent interrupts */
+               new.sn = 0;
+       } while (cmpxchg(&pi_desc->control, old.control,
+                       new.control) != old.control);
+@@ -9310,6 +9308,13 @@ static struct kvm_vcpu *vmx_create_vcpu(
+       vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
++      /*
++       * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
++       * or POSTED_INTR_WAKEUP_VECTOR.
++       */
++      vmx->pi_desc.nv = POSTED_INTR_VECTOR;
++      vmx->pi_desc.sn = 1;
++
+       return &vmx->vcpu;
+ free_vmcs:
+@@ -11407,9 +11412,6 @@ static void __pi_post_block(struct kvm_v
+               else
+                       new.ndst = (dest << 8) & 0xFF00;
+-              /* Allow posting non-urgent interrupts */
+-              new.sn = 0;
+-
+               /* set 'NV' to 'notification vector' */
+               new.nv = POSTED_INTR_VECTOR;
+       } while (cmpxchg(&pi_desc->control, old.control,
diff --git a/queue-4.13/kvm-x86-handle-async-pf-in-rcu-read-side-critical-sections.patch b/queue-4.13/kvm-x86-handle-async-pf-in-rcu-read-side-critical-sections.patch
new file mode 100644 (file)
index 0000000..307113a
--- /dev/null
@@ -0,0 +1,81 @@
+From b862789aa5186d5ea3a024b7cfe0f80c3a38b980 Mon Sep 17 00:00:00 2001
+From: Boqun Feng <boqun.feng@gmail.com>
+Date: Fri, 29 Sep 2017 19:01:45 +0800
+Subject: kvm/x86: Handle async PF in RCU read-side critical sections
+
+From: Boqun Feng <boqun.feng@gmail.com>
+
+commit b862789aa5186d5ea3a024b7cfe0f80c3a38b980 upstream.
+
+Sasha Levin reported a WARNING:
+
+| WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329
+| rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline]
+| WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329
+| rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458
+...
+| CPU: 0 PID: 6974 Comm: syz-fuzzer Not tainted 4.13.0-next-20170908+ #246
+| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
+| 1.10.1-1ubuntu1 04/01/2014
+| Call Trace:
+...
+| RIP: 0010:rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline]
+| RIP: 0010:rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458
+| RSP: 0018:ffff88003b2debc8 EFLAGS: 00010002
+| RAX: 0000000000000001 RBX: 1ffff1000765bd85 RCX: 0000000000000000
+| RDX: 1ffff100075d7882 RSI: ffffffffb5c7da20 RDI: ffff88003aebc410
+| RBP: ffff88003b2def30 R08: dffffc0000000000 R09: 0000000000000001
+| R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003b2def08
+| R13: 0000000000000000 R14: ffff88003aebc040 R15: ffff88003aebc040
+| __schedule+0x201/0x2240 kernel/sched/core.c:3292
+| schedule+0x113/0x460 kernel/sched/core.c:3421
+| kvm_async_pf_task_wait+0x43f/0x940 arch/x86/kernel/kvm.c:158
+| do_async_page_fault+0x72/0x90 arch/x86/kernel/kvm.c:271
+| async_page_fault+0x22/0x30 arch/x86/entry/entry_64.S:1069
+| RIP: 0010:format_decode+0x240/0x830 lib/vsprintf.c:1996
+| RSP: 0018:ffff88003b2df520 EFLAGS: 00010283
+| RAX: 000000000000003f RBX: ffffffffb5d1e141 RCX: ffff88003b2df670
+| RDX: 0000000000000001 RSI: dffffc0000000000 RDI: ffffffffb5d1e140
+| RBP: ffff88003b2df560 R08: dffffc0000000000 R09: 0000000000000000
+| R10: ffff88003b2df718 R11: 0000000000000000 R12: ffff88003b2df5d8
+| R13: 0000000000000064 R14: ffffffffb5d1e140 R15: 0000000000000000
+| vsnprintf+0x173/0x1700 lib/vsprintf.c:2136
+| sprintf+0xbe/0xf0 lib/vsprintf.c:2386
+| proc_self_get_link+0xfb/0x1c0 fs/proc/self.c:23
+| get_link fs/namei.c:1047 [inline]
+| link_path_walk+0x1041/0x1490 fs/namei.c:2127
+...
+
+This happened when the host hit a page fault, and delivered it as an
+async page fault, while the guest was in an RCU read-side critical
+section.  The guest then tries to reschedule in kvm_async_pf_task_wait(),
+but rcu_preempt_note_context_switch() would treat the reschedule as a
+sleep in RCU read-side critical section, which is not allowed (even in
+preemptible RCU).  Thus the WARN.
+
+To cure this, make kvm_async_pf_task_wait() go to the halt path if the
+PF happens in an RCU read-side critical section.
+
+Reported-by: Sasha Levin <levinsasha928@gmail.com>
+Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/kvm.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kernel/kvm.c
++++ b/arch/x86/kernel/kvm.c
+@@ -140,7 +140,8 @@ void kvm_async_pf_task_wait(u32 token)
+       n.token = token;
+       n.cpu = smp_processor_id();
+-      n.halted = is_idle_task(current) || preempt_count() > 1;
++      n.halted = is_idle_task(current) || preempt_count() > 1 ||
++                 rcu_preempt_depth();
+       init_swait_queue_head(&n.wq);
+       hlist_add_head(&n.link, &b->list);
+       raw_spin_unlock(&b->lock);
diff --git a/queue-4.13/md-fix-a-race-condition-for-flush-request-handling.patch b/queue-4.13/md-fix-a-race-condition-for-flush-request-handling.patch
new file mode 100644 (file)
index 0000000..753568e
--- /dev/null
@@ -0,0 +1,52 @@
+From 79bf31a3b2a7ca467cfec8ff97d359a77065d01f Mon Sep 17 00:00:00 2001
+From: Shaohua Li <shli@fb.com>
+Date: Thu, 21 Sep 2017 09:55:28 -0700
+Subject: md: fix a race condition for flush request handling
+
+From: Shaohua Li <shli@fb.com>
+
+commit 79bf31a3b2a7ca467cfec8ff97d359a77065d01f upstream.
+
+md_submit_flush_data calls pers->make_request, which missed the suspend check.
+Fix it with the new md_handle_request API.
+
+Reported-by: Nate Dailey <nate.dailey@stratus.com>
+Tested-by: Nate Dailey <nate.dailey@stratus.com>
+Fix: cc27b0c78c79(md: fix deadlock between mddev_suspend() and md_write_start())
+Reviewed-by: NeilBrown <neilb@suse.com>
+Signed-off-by: Shaohua Li <shli@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/md.c |   14 ++++++++++----
+ 1 file changed, 10 insertions(+), 4 deletions(-)
+
+--- a/drivers/md/md.c
++++ b/drivers/md/md.c
+@@ -439,16 +439,22 @@ static void md_submit_flush_data(struct
+       struct mddev *mddev = container_of(ws, struct mddev, flush_work);
+       struct bio *bio = mddev->flush_bio;
++      /*
++       * must reset flush_bio before calling into md_handle_request to avoid a
++       * deadlock, because other bios passed md_handle_request suspend check
++       * could wait for this and below md_handle_request could wait for those
++       * bios because of suspend check
++       */
++      mddev->flush_bio = NULL;
++      wake_up(&mddev->sb_wait);
++
+       if (bio->bi_iter.bi_size == 0)
+               /* an empty barrier - all done */
+               bio_endio(bio);
+       else {
+               bio->bi_opf &= ~REQ_PREFLUSH;
+-              mddev->pers->make_request(mddev, bio);
++              md_handle_request(mddev, bio);
+       }
+-
+-      mddev->flush_bio = NULL;
+-      wake_up(&mddev->sb_wait);
+ }
+ void md_flush_request(struct mddev *mddev, struct bio *bio)
diff --git a/queue-4.13/md-separate-request-handling.patch b/queue-4.13/md-separate-request-handling.patch
new file mode 100644 (file)
index 0000000..c1f65bf
--- /dev/null
@@ -0,0 +1,122 @@
+From 393debc23c7820211d1c8253dd6a8408a7628fe7 Mon Sep 17 00:00:00 2001
+From: Shaohua Li <shli@fb.com>
+Date: Thu, 21 Sep 2017 10:23:35 -0700
+Subject: md: separate request handling
+
+From: Shaohua Li <shli@fb.com>
+
+commit 393debc23c7820211d1c8253dd6a8408a7628fe7 upstream.
+
+With commit cc27b0c78c79, pers->make_request could bail out without handling
+the bio. If that happens, we should retry.  The commit fixes md_make_request
+but not other call sites. Separate the request handling part, so other call
+sites can use it.
+
+Reported-by: Nate Dailey <nate.dailey@stratus.com>
+Fix: cc27b0c78c79(md: fix deadlock between mddev_suspend() and md_write_start())
+Reviewed-by: NeilBrown <neilb@suse.com>
+Signed-off-by: Shaohua Li <shli@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/md.c |   58 +++++++++++++++++++++++++++++++-------------------------
+ drivers/md/md.h |    1 
+ 2 files changed, 34 insertions(+), 25 deletions(-)
+
+--- a/drivers/md/md.c
++++ b/drivers/md/md.c
+@@ -266,6 +266,37 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
+  * call has finished, the bio has been linked into some internal structure
+  * and so is visible to ->quiesce(), so we don't need the refcount any more.
+  */
++void md_handle_request(struct mddev *mddev, struct bio *bio)
++{
++check_suspended:
++      rcu_read_lock();
++      if (mddev->suspended) {
++              DEFINE_WAIT(__wait);
++              for (;;) {
++                      prepare_to_wait(&mddev->sb_wait, &__wait,
++                                      TASK_UNINTERRUPTIBLE);
++                      if (!mddev->suspended)
++                              break;
++                      rcu_read_unlock();
++                      schedule();
++                      rcu_read_lock();
++              }
++              finish_wait(&mddev->sb_wait, &__wait);
++      }
++      atomic_inc(&mddev->active_io);
++      rcu_read_unlock();
++
++      if (!mddev->pers->make_request(mddev, bio)) {
++              atomic_dec(&mddev->active_io);
++              wake_up(&mddev->sb_wait);
++              goto check_suspended;
++      }
++
++      if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
++              wake_up(&mddev->sb_wait);
++}
++EXPORT_SYMBOL(md_handle_request);
++
+ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
+ {
+       const int rw = bio_data_dir(bio);
+@@ -285,23 +316,6 @@ static blk_qc_t md_make_request(struct r
+               bio_endio(bio);
+               return BLK_QC_T_NONE;
+       }
+-check_suspended:
+-      rcu_read_lock();
+-      if (mddev->suspended) {
+-              DEFINE_WAIT(__wait);
+-              for (;;) {
+-                      prepare_to_wait(&mddev->sb_wait, &__wait,
+-                                      TASK_UNINTERRUPTIBLE);
+-                      if (!mddev->suspended)
+-                              break;
+-                      rcu_read_unlock();
+-                      schedule();
+-                      rcu_read_lock();
+-              }
+-              finish_wait(&mddev->sb_wait, &__wait);
+-      }
+-      atomic_inc(&mddev->active_io);
+-      rcu_read_unlock();
+       /*
+        * save the sectors now since our bio can
+@@ -310,20 +324,14 @@ check_suspended:
+       sectors = bio_sectors(bio);
+       /* bio could be mergeable after passing to underlayer */
+       bio->bi_opf &= ~REQ_NOMERGE;
+-      if (!mddev->pers->make_request(mddev, bio)) {
+-              atomic_dec(&mddev->active_io);
+-              wake_up(&mddev->sb_wait);
+-              goto check_suspended;
+-      }
++
++      md_handle_request(mddev, bio);
+       cpu = part_stat_lock();
+       part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+       part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
+       part_stat_unlock();
+-      if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
+-              wake_up(&mddev->sb_wait);
+-
+       return BLK_QC_T_NONE;
+ }
+--- a/drivers/md/md.h
++++ b/drivers/md/md.h
+@@ -686,6 +686,7 @@ extern void md_stop_writes(struct mddev
+ extern int md_rdev_init(struct md_rdev *rdev);
+ extern void md_rdev_clear(struct md_rdev *rdev);
++extern void md_handle_request(struct mddev *mddev, struct bio *bio);
+ extern void mddev_suspend(struct mddev *mddev);
+ extern void mddev_resume(struct mddev *mddev);
+ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
diff --git a/queue-4.13/mtd-fix-partition-alignment-check-on-multi-erasesize-devices.patch b/queue-4.13/mtd-fix-partition-alignment-check-on-multi-erasesize-devices.patch
new file mode 100644 (file)
index 0000000..a77be0f
--- /dev/null
@@ -0,0 +1,47 @@
+From 7e439681af82984045efc215437ebb2ca8d33a4c Mon Sep 17 00:00:00 2001
+From: Boris Brezillon <boris.brezillon@free-electrons.com>
+Date: Mon, 25 Sep 2017 10:19:57 +0200
+Subject: mtd: Fix partition alignment check on multi-erasesize devices
+
+From: Boris Brezillon <boris.brezillon@free-electrons.com>
+
+commit 7e439681af82984045efc215437ebb2ca8d33a4c upstream.
+
+Commit 1eeef2d7483a ("mtd: handle partitioning on devices with 0
+erasesize") introduced a regression on heterogeneous erase region
+devices. Alignment of the partition was tested against the master
+eraseblock size which can be bigger than the slave one, thus leading
+to some partitions being marked as read-only.
+
+Update wr_alignment to match this slave erasesize after this erasesize
+has been determined by picking the biggest erasesize of all the regions
+embedded in the MTD partition.
+
+Reported-by: Mathias Thore <Mathias.Thore@infinera.com>
+Fixes: 1eeef2d7483a ("mtd: handle partitioning on devices with 0 erasesize")
+Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
+Tested-by: Mathias Thore <Mathias.Thore@infinera.com>
+Reviewed-by: Mathias Thore <Mathias.Thore@infinera.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/mtd/mtdpart.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/drivers/mtd/mtdpart.c
++++ b/drivers/mtd/mtdpart.c
+@@ -581,6 +581,14 @@ static struct mtd_part *allocate_partiti
+               slave->mtd.erasesize = parent->erasesize;
+       }
++      /*
++       * Slave erasesize might differ from the master one if the master
++       * exposes several regions with different erasesize. Adjust
++       * wr_alignment accordingly.
++       */
++      if (!(slave->mtd.flags & MTD_NO_ERASE))
++              wr_alignment = slave->mtd.erasesize;
++
+       tmp = slave->offset;
+       remainder = do_div(tmp, wr_alignment);
+       if ((slave->mtd.flags & MTD_WRITEABLE) && remainder) {
diff --git a/queue-4.13/mtd-nand-atmel-fix-buffer-overflow-in-atmel_pmecc_user.patch b/queue-4.13/mtd-nand-atmel-fix-buffer-overflow-in-atmel_pmecc_user.patch
new file mode 100644 (file)
index 0000000..e7f5d13
--- /dev/null
@@ -0,0 +1,37 @@
+From 36de80740008e6a4a55115b4a92e2059e47c1cba Mon Sep 17 00:00:00 2001
+From: Richard Genoud <richard.genoud@gmail.com>
+Date: Wed, 27 Sep 2017 14:49:17 +0200
+Subject: mtd: nand: atmel: fix buffer overflow in atmel_pmecc_user
+
+From: Richard Genoud <richard.genoud@gmail.com>
+
+commit 36de80740008e6a4a55115b4a92e2059e47c1cba upstream.
+
+When calculating the size needed by struct atmel_pmecc_user *user,
+the dmu and delta buffer sizes were forgotten.
+This led to memory corruption (especially with a large ecc_strength).
+
+Link: http://lkml.kernel.org/r/1506503157.3016.5.camel@gmail.com
+Fixes: f88fc122cc34 ("mtd: nand: Cleanup/rework the atmel_nand driver")
+Reported-by: Richard Genoud <richard.genoud@gmail.com>
+Pointed-at-by: Boris Brezillon <boris.brezillon@free-electrons.com>
+Signed-off-by: Richard Genoud <richard.genoud@gmail.com>
+Reviewed-by: Nicolas Ferre <nicolas.ferre@microchip.com>
+Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/mtd/nand/atmel/pmecc.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/mtd/nand/atmel/pmecc.c
++++ b/drivers/mtd/nand/atmel/pmecc.c
+@@ -363,7 +363,7 @@ atmel_pmecc_create_user(struct atmel_pme
+       size += (req->ecc.strength + 1) * sizeof(u16);
+       /* Reserve space for mu, dmu and delta. */
+       size = ALIGN(size, sizeof(s32));
+-      size += (req->ecc.strength + 1) * sizeof(s32);
++      size += (req->ecc.strength + 1) * sizeof(s32) * 3;
+       user = kzalloc(size, GFP_KERNEL);
+       if (!user)
diff --git a/queue-4.13/pci-fix-race-condition-with-driver_override.patch b/queue-4.13/pci-fix-race-condition-with-driver_override.patch
new file mode 100644 (file)
index 0000000..08a35f9
--- /dev/null
@@ -0,0 +1,66 @@
+From 9561475db680f7144d2223a409dd3d7e322aca03 Mon Sep 17 00:00:00 2001
+From: Nicolai Stange <nstange@suse.de>
+Date: Mon, 11 Sep 2017 09:45:40 +0200
+Subject: PCI: Fix race condition with driver_override
+
+From: Nicolai Stange <nstange@suse.de>
+
+commit 9561475db680f7144d2223a409dd3d7e322aca03 upstream.
+
+The driver_override implementation is susceptible to a race condition when
+different threads are reading vs. storing a different driver override.  Add
+locking to avoid the race condition.
+
+This is in close analogy to commit 6265539776a0 ("driver core: platform:
+fix race condition with driver_override") from Adrian Salido.
+
+Fixes: 782a985d7af2 ("PCI: Introduce new device binding path using pci_dev.driver_override")
+Signed-off-by: Nicolai Stange <nstange@suse.de>
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/pci/pci-sysfs.c |   11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+--- a/drivers/pci/pci-sysfs.c
++++ b/drivers/pci/pci-sysfs.c
+@@ -686,7 +686,7 @@ static ssize_t driver_override_store(str
+                                    const char *buf, size_t count)
+ {
+       struct pci_dev *pdev = to_pci_dev(dev);
+-      char *driver_override, *old = pdev->driver_override, *cp;
++      char *driver_override, *old, *cp;
+       /* We need to keep extra room for a newline */
+       if (count >= (PAGE_SIZE - 1))
+@@ -700,12 +700,15 @@ static ssize_t driver_override_store(str
+       if (cp)
+               *cp = '\0';
++      device_lock(dev);
++      old = pdev->driver_override;
+       if (strlen(driver_override)) {
+               pdev->driver_override = driver_override;
+       } else {
+               kfree(driver_override);
+               pdev->driver_override = NULL;
+       }
++      device_unlock(dev);
+       kfree(old);
+@@ -716,8 +719,12 @@ static ssize_t driver_override_show(stru
+                                   struct device_attribute *attr, char *buf)
+ {
+       struct pci_dev *pdev = to_pci_dev(dev);
++      ssize_t len;
+-      return snprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override);
++      device_lock(dev);
++      len = snprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override);
++      device_unlock(dev);
++      return len;
+ }
+ static DEVICE_ATTR_RW(driver_override);
diff --git a/queue-4.13/platform-x86-fujitsu-laptop-don-t-oops-when-fuj02e3-is-not-presnt.patch b/queue-4.13/platform-x86-fujitsu-laptop-don-t-oops-when-fuj02e3-is-not-presnt.patch
new file mode 100644 (file)
index 0000000..31d6e7c
--- /dev/null
@@ -0,0 +1,48 @@
+From ce7c47d60bda6c7f09ccf16e978d971c8fa16ff0 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala@linux.intel.com>
+Date: Mon, 18 Sep 2017 23:00:59 +0300
+Subject: platform/x86: fujitsu-laptop: Don't oops when FUJ02E3 is not presnt
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Ville Syrjälä <ville.syrjala@linux.intel.com>
+
+commit ce7c47d60bda6c7f09ccf16e978d971c8fa16ff0 upstream.
+
+My Fujitsu-Siemens Lifebook S6120 doesn't have the FUJ02E3 device,
+but it does have FUJ02B1. That means we do register the backlight
+device (and it even seems to work), but the code will oops as soon
+as we try to set the backlight brightness because it's trying to
+call call_fext_func() with a NULL device. Let's just skip those
+function calls when the FUJ02E3 device is not present.
+
+Cc: Jonathan Woithe <jwoithe@just42.net>
+Cc: Andy Shevchenko <andy@infradead.org>
+Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
+Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/platform/x86/fujitsu-laptop.c |   10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+--- a/drivers/platform/x86/fujitsu-laptop.c
++++ b/drivers/platform/x86/fujitsu-laptop.c
+@@ -254,10 +254,12 @@ static int bl_update_status(struct backl
+ {
+       struct acpi_device *device = bl_get_data(b);
+-      if (b->props.power == FB_BLANK_POWERDOWN)
+-              call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x3);
+-      else
+-              call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x0);
++      if (fext) {
++              if (b->props.power == FB_BLANK_POWERDOWN)
++                      call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x3);
++              else
++                      call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x0);
++      }
+       return set_lcd_level(device, b->props.brightness);
+ }
diff --git a/queue-4.13/pm-opp-call-notifier-without-holding-opp_table-lock.patch b/queue-4.13/pm-opp-call-notifier-without-holding-opp_table-lock.patch
new file mode 100644 (file)
index 0000000..4d50562
--- /dev/null
@@ -0,0 +1,58 @@
+From e4d8ae00169f7686e1da5a62e5cf797d12bf8822 Mon Sep 17 00:00:00 2001
+From: Viresh Kumar <viresh.kumar@linaro.org>
+Date: Thu, 21 Sep 2017 10:44:36 -0700
+Subject: PM / OPP: Call notifier without holding opp_table->lock
+
+From: Viresh Kumar <viresh.kumar@linaro.org>
+
+commit e4d8ae00169f7686e1da5a62e5cf797d12bf8822 upstream.
+
+The notifier callbacks may want to call some OPP helper routines which
+may try to take the same opp_table->lock again and cause a deadlock. One
+such usecase was reported by Chanwoo Choi, where calling
+dev_pm_opp_disable() leads us to the devfreq's OPP notifier handler,
+which further calls dev_pm_opp_find_freq_floor() and it deadlocks.
+
+We don't really need the opp_table->lock to be held across the notifier
+call though, all we want to make sure is that the 'opp' doesn't get
+freed while being used from within the notifier chain. We can do it with
+help of dev_pm_opp_get/put() as well. Let's do it.
+
+Fixes: 5b650b388844 "PM / OPP: Take kref from _find_opp_table()"
+Reported-by: Chanwoo Choi <cw00.choi@samsung.com>
+Tested-by: Chanwoo Choi <cw00.choi@samsung.com>
+Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
+Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
+Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/base/power/opp/core.c |    7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/drivers/base/power/opp/core.c
++++ b/drivers/base/power/opp/core.c
+@@ -1581,6 +1581,9 @@ static int _opp_set_availability(struct
+       opp->available = availability_req;
++      dev_pm_opp_get(opp);
++      mutex_unlock(&opp_table->lock);
++
+       /* Notify the change of the OPP availability */
+       if (availability_req)
+               blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ENABLE,
+@@ -1589,8 +1592,12 @@ static int _opp_set_availability(struct
+               blocking_notifier_call_chain(&opp_table->head,
+                                            OPP_EVENT_DISABLE, opp);
++      dev_pm_opp_put(opp);
++      goto put_table;
++
+ unlock:
+       mutex_unlock(&opp_table->lock);
++put_table:
+       dev_pm_opp_put_opp_table(opp_table);
+       return r;
+ }
diff --git a/queue-4.13/sched-sysctl-check-user-input-value-of-sysctl_sched_time_avg.patch b/queue-4.13/sched-sysctl-check-user-input-value-of-sysctl_sched_time_avg.patch
new file mode 100644 (file)
index 0000000..85f06a7
--- /dev/null
@@ -0,0 +1,83 @@
+From 5ccba44ba118a5000cccc50076b0344632459779 Mon Sep 17 00:00:00 2001
+From: Ethan Zhao <ethan.zhao@oracle.com>
+Date: Mon, 4 Sep 2017 13:59:34 +0800
+Subject: sched/sysctl: Check user input value of sysctl_sched_time_avg
+
+From: Ethan Zhao <ethan.zhao@oracle.com>
+
+commit 5ccba44ba118a5000cccc50076b0344632459779 upstream.
+
+The system will hang if the user sets sysctl_sched_time_avg to 0:
+
+  [root@XXX ~]# sysctl kernel.sched_time_avg_ms=0
+
+  Stack traceback for pid 0
+  0xffff883f6406c600 0 0 1 3 R 0xffff883f6406cf50 *swapper/3
+  ffff883f7ccc3ae8 0000000000000018 ffffffff810c4dd0 0000000000000000
+  0000000000017800 ffff883f7ccc3d78 0000000000000003 ffff883f7ccc3bf8
+  ffffffff810c4fc9 ffff883f7ccc3c08 00000000810c5043 ffff883f7ccc3c08
+  Call Trace:
+  <IRQ> [<ffffffff810c4dd0>] ? update_group_capacity+0x110/0x200
+  [<ffffffff810c4fc9>] ? update_sd_lb_stats+0x109/0x600
+  [<ffffffff810c5507>] ? find_busiest_group+0x47/0x530
+  [<ffffffff810c5b84>] ? load_balance+0x194/0x900
+  [<ffffffff810ad5ca>] ? update_rq_clock.part.83+0x1a/0xe0
+  [<ffffffff810c6d42>] ? rebalance_domains+0x152/0x290
+  [<ffffffff810c6f5c>] ? run_rebalance_domains+0xdc/0x1d0
+  [<ffffffff8108a75b>] ? __do_softirq+0xfb/0x320
+  [<ffffffff8108ac85>] ? irq_exit+0x125/0x130
+  [<ffffffff810b3a17>] ? scheduler_ipi+0x97/0x160
+  [<ffffffff81052709>] ? smp_reschedule_interrupt+0x29/0x30
+  [<ffffffff8173a1be>] ? reschedule_interrupt+0x6e/0x80
+   <EOI> [<ffffffff815bc83c>] ? cpuidle_enter_state+0xcc/0x230
+  [<ffffffff815bc80c>] ? cpuidle_enter_state+0x9c/0x230
+  [<ffffffff815bc9d7>] ? cpuidle_enter+0x17/0x20
+  [<ffffffff810cd6dc>] ? cpu_startup_entry+0x38c/0x420
+  [<ffffffff81053373>] ? start_secondary+0x173/0x1e0
+
+This is because a divide-by-zero error happens in the following call chain:
+
+update_group_capacity()
+  update_cpu_capacity()
+    scale_rt_capacity()
+     {
+          ...
+          total = sched_avg_period() + delta;
+          used = div_u64(avg, total);
+          ...
+     }
+
+To fix this issue, check user input value of sysctl_sched_time_avg, keep
+it unchanged when hitting invalid input, and set the minimum limit of
+sysctl_sched_time_avg to 1 ms.
+
+Reported-by: James Puthukattukaran <james.puthukattukaran@oracle.com>
+Signed-off-by: Ethan Zhao <ethan.zhao@oracle.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: efault@gmx.de
+Cc: ethan.kernel@gmail.com
+Cc: keescook@chromium.org
+Cc: mcgrof@kernel.org
+Link: http://lkml.kernel.org/r/1504504774-18253-1-git-send-email-ethan.zhao@oracle.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sysctl.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/kernel/sysctl.c
++++ b/kernel/sysctl.c
+@@ -367,7 +367,8 @@ static struct ctl_table kern_table[] = {
+               .data           = &sysctl_sched_time_avg,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+-              .proc_handler   = proc_dointvec,
++              .proc_handler   = proc_dointvec_minmax,
++              .extra1         = &one,
+       },
+ #ifdef CONFIG_SCHEDSTATS
+       {
index 44184652780419235f5b88b1ea10e83ca5cde309..5c7ecc8557261fa9e0e59e2aaf3cb5cec5ae6400 100644 (file)
@@ -73,3 +73,33 @@ extable-consolidate-kernel_text_address-functions.patch
 extable-enable-rcu-if-it-is-not-watching-in-kernel_text_address.patch
 selftests-seccomp-support-glibc-2.26-siginfo_t.h.patch
 seccomp-fix-the-usage-of-get-put_seccomp_filter-in-seccomp_get_filter.patch
+arm64-make-sure-spsel-is-always-set.patch
+arm64-mm-use-read_once-when-dereferencing-pointer-to-pte-table.patch
+arm64-fault-route-pte-translation-faults-via-do_translation_fault.patch
+kvm-vmx-extract-__pi_post_block.patch
+kvm-vmx-avoid-double-list-add-with-vt-d-posted-interrupts.patch
+kvm-vmx-simplify-and-fix-vmx_vcpu_pi_load.patch
+kvm-nvmx-fix-host_cr3-host_cr4-cache.patch
+kvm-x86-handle-async-pf-in-rcu-read-side-critical-sections.patch
+kvm-vmx-do-not-bug-on-out-of-bounds-guest-irq.patch
+kvm-nvmx-don-t-allow-l2-to-access-the-hardware-cr8.patch
+xfs-validate-bdev-support-for-dax-inode-flag.patch
+fix-infoleak-in-waitid-2.patch
+sched-sysctl-check-user-input-value-of-sysctl_sched_time_avg.patch
+irq-generic-chip-don-t-replace-domain-s-name.patch
+mtd-fix-partition-alignment-check-on-multi-erasesize-devices.patch
+mtd-nand-atmel-fix-buffer-overflow-in-atmel_pmecc_user.patch
+etnaviv-fix-submit-error-path.patch
+etnaviv-fix-gem-object-list-corruption.patch
+futex-fix-pi_state-owner-serialization.patch
+md-fix-a-race-condition-for-flush-request-handling.patch
+md-separate-request-handling.patch
+pci-fix-race-condition-with-driver_override.patch
+btrfs-fix-null-pointer-dereference-from-free_reloc_roots.patch
+btrfs-clear-ordered-flag-on-cleaning-up-ordered-extents.patch
+btrfs-finish-ordered-extent-cleaning-if-no-progress-is-found.patch
+btrfs-propagate-error-to-btrfs_cmp_data_prepare-caller.patch
+btrfs-prevent-to-set-invalid-default-subvolid.patch
+platform-x86-fujitsu-laptop-don-t-oops-when-fuj02e3-is-not-presnt.patch
+pm-opp-call-notifier-without-holding-opp_table-lock.patch
+x86-mm-fix-fault-error-path-using-unsafe-vma-pointer.patch
diff --git a/queue-4.13/x86-mm-fix-fault-error-path-using-unsafe-vma-pointer.patch b/queue-4.13/x86-mm-fix-fault-error-path-using-unsafe-vma-pointer.patch
new file mode 100644 (file)
index 0000000..0c7db6b
--- /dev/null
@@ -0,0 +1,211 @@
+From a3c4fb7c9c2ebfd50b8c60f6c069932bb319bc37 Mon Sep 17 00:00:00 2001
+From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+Date: Mon, 4 Sep 2017 10:32:15 +0200
+Subject: x86/mm: Fix fault error path using unsafe vma pointer
+
+From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+
+commit a3c4fb7c9c2ebfd50b8c60f6c069932bb319bc37 upstream.
+
+commit 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal
+generation code") passes down a vma pointer to the error path, but that is
+done once the mmap_sem is released when calling mm_fault_error() from
+__do_page_fault().
+
+This is dangerous as the vma structure is no longer safe to use once the
+mmap_sem has been released. As only the protection key value is required in
+the error processing, we could just pass down this value.
+
+Fix it by passing a pointer to a protection key value down to the fault
+signal generation code. The use of a pointer allows keeping the check that
+generates a warning message in fill_sig_info_pkey() when the vma was not
+known. If the pointer is valid, the protection value can be accessed by
+dereferencing the pointer.
+
+[ tglx: Made *pkey u32 as that's the type which is passed in siginfo ]
+
+Fixes: 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal generation code")
+Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: linux-mm@kvack.org
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Link: http://lkml.kernel.org/r/1504513935-12742-1-git-send-email-ldufour@linux.vnet.ibm.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/mm/fault.c |   47 ++++++++++++++++++++++++-----------------------
+ 1 file changed, 24 insertions(+), 23 deletions(-)
+
+--- a/arch/x86/mm/fault.c
++++ b/arch/x86/mm/fault.c
+@@ -192,8 +192,7 @@ is_prefetch(struct pt_regs *regs, unsign
+  * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
+  *         faulted on a pte with its pkey=4.
+  */
+-static void fill_sig_info_pkey(int si_code, siginfo_t *info,
+-              struct vm_area_struct *vma)
++static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey)
+ {
+       /* This is effectively an #ifdef */
+       if (!boot_cpu_has(X86_FEATURE_OSPKE))
+@@ -209,7 +208,7 @@ static void fill_sig_info_pkey(int si_co
+        * valid VMA, so we should never reach this without a
+        * valid VMA.
+        */
+-      if (!vma) {
++      if (!pkey) {
+               WARN_ONCE(1, "PKU fault with no VMA passed in");
+               info->si_pkey = 0;
+               return;
+@@ -219,13 +218,12 @@ static void fill_sig_info_pkey(int si_co
+        * absolutely guranteed to be 100% accurate because of
+        * the race explained above.
+        */
+-      info->si_pkey = vma_pkey(vma);
++      info->si_pkey = *pkey;
+ }
+ static void
+ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
+-                   struct task_struct *tsk, struct vm_area_struct *vma,
+-                   int fault)
++                   struct task_struct *tsk, u32 *pkey, int fault)
+ {
+       unsigned lsb = 0;
+       siginfo_t info;
+@@ -240,7 +238,7 @@ force_sig_info_fault(int si_signo, int s
+               lsb = PAGE_SHIFT;
+       info.si_addr_lsb = lsb;
+-      fill_sig_info_pkey(si_code, &info, vma);
++      fill_sig_info_pkey(si_code, &info, pkey);
+       force_sig_info(si_signo, &info, tsk);
+ }
+@@ -758,8 +756,6 @@ no_context(struct pt_regs *regs, unsigne
+       struct task_struct *tsk = current;
+       unsigned long flags;
+       int sig;
+-      /* No context means no VMA to pass down */
+-      struct vm_area_struct *vma = NULL;
+       /* Are we prepared to handle this kernel fault? */
+       if (fixup_exception(regs, X86_TRAP_PF)) {
+@@ -784,7 +780,7 @@ no_context(struct pt_regs *regs, unsigne
+                       /* XXX: hwpoison faults will set the wrong code. */
+                       force_sig_info_fault(signal, si_code, address,
+-                                           tsk, vma, 0);
++                                           tsk, NULL, 0);
+               }
+               /*
+@@ -893,8 +889,7 @@ show_signal_msg(struct pt_regs *regs, un
+ static void
+ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+-                     unsigned long address, struct vm_area_struct *vma,
+-                     int si_code)
++                     unsigned long address, u32 *pkey, int si_code)
+ {
+       struct task_struct *tsk = current;
+@@ -942,7 +937,7 @@ __bad_area_nosemaphore(struct pt_regs *r
+               tsk->thread.error_code  = error_code;
+               tsk->thread.trap_nr     = X86_TRAP_PF;
+-              force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);
++              force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0);
+               return;
+       }
+@@ -955,9 +950,9 @@ __bad_area_nosemaphore(struct pt_regs *r
+ static noinline void
+ bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+-                   unsigned long address, struct vm_area_struct *vma)
++                   unsigned long address, u32 *pkey)
+ {
+-      __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
++      __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR);
+ }
+ static void
+@@ -965,6 +960,10 @@ __bad_area(struct pt_regs *regs, unsigne
+          unsigned long address,  struct vm_area_struct *vma, int si_code)
+ {
+       struct mm_struct *mm = current->mm;
++      u32 pkey;
++
++      if (vma)
++              pkey = vma_pkey(vma);
+       /*
+        * Something tried to access memory that isn't in our memory map..
+@@ -972,7 +971,8 @@ __bad_area(struct pt_regs *regs, unsigne
+        */
+       up_read(&mm->mmap_sem);
+-      __bad_area_nosemaphore(regs, error_code, address, vma, si_code);
++      __bad_area_nosemaphore(regs, error_code, address,
++                             (vma) ? &pkey : NULL, si_code);
+ }
+ static noinline void
+@@ -1015,7 +1015,7 @@ bad_area_access_error(struct pt_regs *re
+ static void
+ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
+-        struct vm_area_struct *vma, unsigned int fault)
++        u32 *pkey, unsigned int fault)
+ {
+       struct task_struct *tsk = current;
+       int code = BUS_ADRERR;
+@@ -1042,13 +1042,12 @@ do_sigbus(struct pt_regs *regs, unsigned
+               code = BUS_MCEERR_AR;
+       }
+ #endif
+-      force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault);
++      force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault);
+ }
+ static noinline void
+ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
+-             unsigned long address, struct vm_area_struct *vma,
+-             unsigned int fault)
++             unsigned long address, u32 *pkey, unsigned int fault)
+ {
+       if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+               no_context(regs, error_code, address, 0, 0);
+@@ -1072,9 +1071,9 @@ mm_fault_error(struct pt_regs *regs, uns
+       } else {
+               if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+                            VM_FAULT_HWPOISON_LARGE))
+-                      do_sigbus(regs, error_code, address, vma, fault);
++                      do_sigbus(regs, error_code, address, pkey, fault);
+               else if (fault & VM_FAULT_SIGSEGV)
+-                      bad_area_nosemaphore(regs, error_code, address, vma);
++                      bad_area_nosemaphore(regs, error_code, address, pkey);
+               else
+                       BUG();
+       }
+@@ -1268,6 +1267,7 @@ __do_page_fault(struct pt_regs *regs, un
+       struct mm_struct *mm;
+       int fault, major = 0;
+       unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
++      u32 pkey;
+       tsk = current;
+       mm = tsk->mm;
+@@ -1468,9 +1468,10 @@ good_area:
+               return;
+       }
++      pkey = vma_pkey(vma);
+       up_read(&mm->mmap_sem);
+       if (unlikely(fault & VM_FAULT_ERROR)) {
+-              mm_fault_error(regs, error_code, address, vma, fault);
++              mm_fault_error(regs, error_code, address, &pkey, fault);
+               return;
+       }
diff --git a/queue-4.13/xfs-validate-bdev-support-for-dax-inode-flag.patch b/queue-4.13/xfs-validate-bdev-support-for-dax-inode-flag.patch
new file mode 100644 (file)
index 0000000..513d667
--- /dev/null
@@ -0,0 +1,50 @@
+From 6851a3db7e224bbb85e23b3c64a506c9e0904382 Mon Sep 17 00:00:00 2001
+From: Ross Zwisler <ross.zwisler@linux.intel.com>
+Date: Mon, 18 Sep 2017 14:46:03 -0700
+Subject: xfs: validate bdev support for DAX inode flag
+
+From: Ross Zwisler <ross.zwisler@linux.intel.com>
+
+commit 6851a3db7e224bbb85e23b3c64a506c9e0904382 upstream.
+
+Currently only the blocksize is checked, but we should really be calling
+bdev_dax_supported() which also tests to make sure we can get a
+struct dax_device and that the dax_direct_access() path is working.
+
+This is the same check that we do for the "-o dax" mount option in
+xfs_fs_fill_super().
+
+This does not fix the race issues that caused the XFS DAX inode option to
+be disabled, so that option will still be disabled.  If/when we re-enable
+it, though, I think we will want this issue to have been fixed.  I also do
+think that we want to fix this in stable kernels.
+
+Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_ioctl.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_ioctl.c
++++ b/fs/xfs/xfs_ioctl.c
+@@ -1088,6 +1088,7 @@ xfs_ioctl_setattr_dax_invalidate(
+       int                     *join_flags)
+ {
+       struct inode            *inode = VFS_I(ip);
++      struct super_block      *sb = inode->i_sb;
+       int                     error;
+       *join_flags = 0;
+@@ -1100,7 +1101,7 @@ xfs_ioctl_setattr_dax_invalidate(
+       if (fa->fsx_xflags & FS_XFLAG_DAX) {
+               if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+                       return -EINVAL;
+-              if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE)
++              if (bdev_dax_supported(sb, sb->s_blocksize) < 0)
+                       return -EINVAL;
+       }