--- /dev/null
+From 760bfb47c36a07741a089bf6a28e854ffbee7dc9 Mon Sep 17 00:00:00 2001
+From: Will Deacon <will.deacon@arm.com>
+Date: Fri, 29 Sep 2017 12:27:41 +0100
+Subject: arm64: fault: Route pte translation faults via do_translation_fault
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 760bfb47c36a07741a089bf6a28e854ffbee7dc9 upstream.
+
+We currently route pte translation faults via do_page_fault, which elides
+the address check against TASK_SIZE before invoking the mm fault handling
+code. However, this can cause issues with the path walking code in
+conjunction with our word-at-a-time implementation, because
+load_unaligned_zeropad can end up faulting in kernel space if it reads
+across a page boundary into an unmapped page (e.g. by attempting to
+read from a guard region).
+
+In the case of such a fault, load_unaligned_zeropad has registered a
+fixup to shift the valid data and pad with zeroes, however the abort is
+reported as a level 3 translation fault and we dispatch it straight to
+do_page_fault, despite it being a kernel address. This results in calling
+a sleeping function from atomic context:
+
+ BUG: sleeping function called from invalid context at arch/arm64/mm/fault.c:313
+ in_atomic(): 0, irqs_disabled(): 0, pid: 10290
+ Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
+ [...]
+ [<ffffff8e016cd0cc>] ___might_sleep+0x134/0x144
+ [<ffffff8e016cd158>] __might_sleep+0x7c/0x8c
+ [<ffffff8e016977f0>] do_page_fault+0x140/0x330
+ [<ffffff8e01681328>] do_mem_abort+0x54/0xb0
+ Exception stack(0xfffffffb20247a70 to 0xfffffffb20247ba0)
+ [...]
+ [<ffffff8e016844fc>] el1_da+0x18/0x78
+ [<ffffff8e017f399c>] path_parentat+0x44/0x88
+ [<ffffff8e017f4c9c>] filename_parentat+0x5c/0xd8
+ [<ffffff8e017f5044>] filename_create+0x4c/0x128
+ [<ffffff8e017f59e4>] SyS_mkdirat+0x50/0xc8
+ [<ffffff8e01684e30>] el0_svc_naked+0x24/0x28
+ Code: 36380080 d5384100 f9400800 9402566d (d4210000)
+ ---[ end trace 2d01889f2bca9b9f ]---
+
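+For context, the hazard is easy to model in userspace: a word-sized read
+that begins in a mapped page and crosses into an unmapped guard page
+faults, which is exactly the case that load_unaligned_zeropad's exception
+fixup exists to handle. A minimal sketch (illustrative only, not kernel
+code; in the kernel the fault is caught by the fixup and the missing
+bytes are padded with zeroes, here it is a plain SIGSEGV):
+
+  #include <stdint.h>
+  #include <stdio.h>
+  #include <string.h>
+  #include <sys/mman.h>
+  #include <unistd.h>
+
+  int main(void)
+  {
+          long psz = sysconf(_SC_PAGESIZE);
+          /* Two adjacent pages; turn the second one into a guard page. */
+          char *buf = mmap(NULL, 2 * psz, PROT_READ | PROT_WRITE,
+                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+          if (buf == MAP_FAILED)
+                  return 1;
+          mprotect(buf + psz, psz, PROT_NONE);
+
+          /* A 5-byte name ending exactly at the page boundary. */
+          char *name = buf + psz - 5;
+          memcpy(name, "hello", 5);
+
+          /*
+           * A word-at-a-time load starting at 'name' crosses into the
+           * guard page; this is the read load_unaligned_zeropad's fixup
+           * rescues by shifting the valid bytes and padding with zeroes.
+           */
+          uint64_t word = *(volatile uint64_t *)name;     /* faults */
+          printf("0x%llx\n", (unsigned long long)word);
+          return 0;
+  }
+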
+Fix this by dispatching all translation faults to do_translation_fault,
+which avoids invoking the page fault logic for faults on kernel addresses.
+
+Reported-by: Ankit Jain <ankijain@codeaurora.org>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/mm/fault.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/arm64/mm/fault.c
++++ b/arch/arm64/mm/fault.c
+@@ -614,7 +614,7 @@ static const struct fault_info fault_inf
+ { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" },
+ { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" },
+ { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" },
+- { do_page_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" },
++ { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" },
+ { do_bad, SIGBUS, 0, "unknown 8" },
+ { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" },
+ { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" },
--- /dev/null
+From 5371513fb338fb9989c569dc071326d369d6ade8 Mon Sep 17 00:00:00 2001
+From: Marc Zyngier <marc.zyngier@arm.com>
+Date: Tue, 26 Sep 2017 15:57:16 +0100
+Subject: arm64: Make sure SPsel is always set
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit 5371513fb338fb9989c569dc071326d369d6ade8 upstream.
+
+When the kernel is entered at EL2 on an ARMv8.0 system, we construct
+the EL1 pstate and make sure this uses the EL1 stack pointer
+(we perform an exception return to EL1h).
+
+But if the kernel is either entered at EL1 or stays at EL2 (because
+we're on a VHE-capable system), we fail to set SPsel, and use whatever
+stack selection the higher exception level has chosen for us.
+
+Let's not take any chances, and make sure that SPsel is set to one
+(selecting SP_ELx rather than SP_EL0) before we decide which mode we're
+going to run in.
+
+Acked-by: Mark Rutland <mark.rutland@arm.com>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/kernel/head.S | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/arm64/kernel/head.S
++++ b/arch/arm64/kernel/head.S
+@@ -381,6 +381,7 @@ ENTRY(kimage_vaddr)
+ * booted in EL1 or EL2 respectively.
+ */
+ ENTRY(el2_setup)
++ msr SPsel, #1 // We want to use SP_EL{1,2}
+ mrs x0, CurrentEL
+ cmp x0, #CurrentEL_EL2
+ b.eq 1f
--- /dev/null
+From f069faba688701c4d56b6c3452a130f97bf02e95 Mon Sep 17 00:00:00 2001
+From: Will Deacon <will.deacon@arm.com>
+Date: Fri, 29 Sep 2017 11:29:55 +0100
+Subject: arm64: mm: Use READ_ONCE when dereferencing pointer to pte table
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit f069faba688701c4d56b6c3452a130f97bf02e95 upstream.
+
+On kernels built with support for transparent huge pages, different CPUs
+can access the PMD concurrently due to e.g. fast GUP or page_vma_mapped_walk
+and they must take care to use READ_ONCE to avoid value tearing or caching
+of stale values by the compiler. Unfortunately, these functions call into
+our pgtable macros, which don't use READ_ONCE, and compiler caching has
+been observed to cause the following crash during ext4 writeback:
+
+PC is at check_pte+0x20/0x170
+LR is at page_vma_mapped_walk+0x2e0/0x540
+[...]
+Process doio (pid: 2463, stack limit = 0xffff00000f2e8000)
+Call trace:
+[<ffff000008233328>] check_pte+0x20/0x170
+[<ffff000008233758>] page_vma_mapped_walk+0x2e0/0x540
+[<ffff000008234adc>] page_mkclean_one+0xac/0x278
+[<ffff000008234d98>] rmap_walk_file+0xf0/0x238
+[<ffff000008236e74>] rmap_walk+0x64/0xa0
+[<ffff0000082370c8>] page_mkclean+0x90/0xa8
+[<ffff0000081f3c64>] clear_page_dirty_for_io+0x84/0x2a8
+[<ffff00000832f984>] mpage_submit_page+0x34/0x98
+[<ffff00000832fb4c>] mpage_process_page_bufs+0x164/0x170
+[<ffff00000832fc8c>] mpage_prepare_extent_to_map+0x134/0x2b8
+[<ffff00000833530c>] ext4_writepages+0x484/0xe30
+[<ffff0000081f6ab4>] do_writepages+0x44/0xe8
+[<ffff0000081e5bd4>] __filemap_fdatawrite_range+0xbc/0x110
+[<ffff0000081e5e68>] file_write_and_wait_range+0x48/0xd8
+[<ffff000008324310>] ext4_sync_file+0x80/0x4b8
+[<ffff0000082bd434>] vfs_fsync_range+0x64/0xc0
+[<ffff0000082332b4>] SyS_msync+0x194/0x1e8
+
+This is because page_vma_mapped_walk loads the PMD twice before calling
+pte_offset_map: the first time without READ_ONCE (where it gets all zeroes
+due to a concurrent pmdp_invalidate) and the second time with READ_ONCE
+(where it sees a valid table pointer due to a concurrent pmd_populate).
+However, the compiler inlines everything and caches the first value in
+a register, which is subsequently used in pte_offset_phys which returns
+a junk pointer that is later dereferenced when attempting to access the
+relevant pte.
+
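+A minimal sketch of the hazard (userspace, illustrative; READ_ONCE is
+spelled out rather than taken from the kernel headers, and the pmd_t
+layout and masking are simplified assumptions, not the arm64 encoding):
+
+  #include <stdint.h>
+
+  #define READ_ONCE(x) (*(const volatile __typeof__(x) *)&(x))
+
+  typedef struct { uint64_t val; } pmd_t;
+
+  /*
+   * Plain dereference: the compiler may load *pmdp once and keep the
+   * value in a register across what the source writes as two reads.
+   */
+  uint64_t table_base_plain(pmd_t *pmdp)
+  {
+          return pmdp->val & ~0xfffULL;
+  }
+
+  /*
+   * READ_ONCE forces a fresh load every time, so a concurrent
+   * pmdp_invalidate()/pmd_populate() on another CPU cannot leave us
+   * computing a table pointer from a stale, register-cached value.
+   */
+  uint64_t table_base_once(pmd_t *pmdp)
+  {
+          pmd_t pmd = READ_ONCE(*pmdp);
+
+          return pmd.val & ~0xfffULL;
+  }
+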
+This patch fixes the issue by using READ_ONCE in pte_offset_phys to ensure
+that a stale value is not used. Whilst this is a point fix for a known
+failure (and simple to backport), a full fix moving all of our page table
+accessors over to {READ,WRITE}_ONCE and consistently using READ_ONCE in
+page_vma_mapped_walk is in the works for a future kernel release.
+
+Cc: Jon Masters <jcm@redhat.com>
+Cc: Timur Tabi <timur@codeaurora.org>
+Fixes: f27176cfc363 ("mm: convert page_mkclean_one() to use page_vma_mapped_walk()")
+Tested-by: Richard Ruigrok <rruigrok@codeaurora.org>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/include/asm/pgtable.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/arm64/include/asm/pgtable.h
++++ b/arch/arm64/include/asm/pgtable.h
+@@ -412,7 +412,7 @@ static inline phys_addr_t pmd_page_paddr
+ /* Find an entry in the third-level page table. */
+ #define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
+
+-#define pte_offset_phys(dir,addr) (pmd_page_paddr(*(dir)) + pte_index(addr) * sizeof(pte_t))
++#define pte_offset_phys(dir,addr) (pmd_page_paddr(READ_ONCE(*(dir))) + pte_index(addr) * sizeof(pte_t))
+ #define pte_offset_kernel(dir,addr) ((pte_t *)__va(pte_offset_phys((dir), (addr))))
+
+ #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr))
--- /dev/null
+From 63d71450c8d817649a79e37d685523f988b9cc98 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Fri, 1 Sep 2017 17:58:47 +0900
+Subject: btrfs: clear ordered flag on cleaning up ordered extents
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 63d71450c8d817649a79e37d685523f988b9cc98 upstream.
+
+Commit 524272607e88 ("btrfs: Handle delalloc error correctly to avoid
+ordered extent hang") introduced btrfs_cleanup_ordered_extents() to clean
+up submitted ordered extents. However, it does not clear the ordered bit
+(Private2) of the corresponding pages. Thus, the following BUG occurs from
+free_pages_check_bad() (on btrfs/125 with nospace_cache).
+
+BUG: Bad page state in process btrfs pfn:3fa787
+page:ffffdf2acfe9e1c0 count:0 mapcount:0 mapping: (null) index:0xd
+flags: 0x8000000000002008(uptodate|private_2)
+raw: 8000000000002008 0000000000000000 000000000000000d 00000000ffffffff
+raw: ffffdf2acf5c1b20 ffffb443802238b0 0000000000000000 0000000000000000
+page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
+bad because of flags: 0x2000(private_2)
+
+This patch clears the flag for every page in the specified range, the
+same as the other places that call btrfs_dec_test_ordered_pending().
+
+Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang")
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Reviewed-by: Qu Wenruo <quwenruo.btrfs@gmx.com>
+Reviewed-by: Josef Bacik <jbacik@fb.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c | 12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -135,6 +135,18 @@ static inline void btrfs_cleanup_ordered
+ const u64 offset,
+ const u64 bytes)
+ {
++ unsigned long index = offset >> PAGE_SHIFT;
++ unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
++ struct page *page;
++
++ while (index <= end_index) {
++ page = find_get_page(inode->i_mapping, index);
++ index++;
++ if (!page)
++ continue;
++ ClearPagePrivate2(page);
++ put_page(page);
++ }
+ return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
+ bytes - PAGE_SIZE, false);
+ }
--- /dev/null
+From 67c003f90fd68062d92a7ffade36f9b2a9098bd8 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Fri, 1 Sep 2017 17:59:07 +0900
+Subject: btrfs: finish ordered extent cleaning if no progress is found
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 67c003f90fd68062d92a7ffade36f9b2a9098bd8 upstream.
+
+__endio_write_update_ordered() repeats the search until it reaches the end
+of the specified range. This works well with the direct IO path, because
+before the function is called, it's ensured that there are ordered extents
+filling the whole range. That's not the case, however, when it's called
+from run_delalloc_range(): an error can occur in the middle of the loop in
+e.g. run_delalloc_nocow(), leaving part of the range not covered by any
+ordered extent. When cleaning up such an "incomplete" range,
+__endio_write_update_ordered() gets stuck at the offset where there are no
+ordered extents.
+
+Since the ordered extents are created from head to tail, we can stop the
+search if there is no offset progress.
+
+Fixes: 524272607e88 ("btrfs: Handle delalloc error correctly to avoid ordered extent hang")
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Reviewed-by: Qu Wenruo <quwenruo.btrfs@gmx.com>
+Reviewed-by: Josef Bacik <jbacik@fb.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -8309,6 +8309,7 @@ static void __endio_write_update_ordered
+ btrfs_work_func_t func;
+ u64 ordered_offset = offset;
+ u64 ordered_bytes = bytes;
++ u64 last_offset;
+ int ret;
+
+ if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+@@ -8320,6 +8321,7 @@ static void __endio_write_update_ordered
+ }
+
+ again:
++ last_offset = ordered_offset;
+ ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
+ &ordered_offset,
+ ordered_bytes,
+@@ -8331,6 +8333,12 @@ again:
+ btrfs_queue_work(wq, &ordered->work);
+ out_test:
+ /*
++ * If btrfs_dec_test_ordered_pending does not find any ordered extent
++ * in the range, we can exit.
++ */
++ if (ordered_offset == last_offset)
++ return;
++ /*
+ * our bio might span multiple ordered extents. If we haven't
+ * completed the accounting for the whole dio, go back and try again
+ */
--- /dev/null
+From bb166d7207432d3c7d10c45dc052f12ba3a2121d Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Fri, 25 Aug 2017 14:15:14 +0900
+Subject: btrfs: fix NULL pointer dereference from free_reloc_roots()
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit bb166d7207432d3c7d10c45dc052f12ba3a2121d upstream.
+
+__del_reloc_root() should be called before freeing up reloc_root->node.
+If not, __del_reloc_root() ends up dereferencing the already freed
+reloc_root->node, causing a kernel BUG.
+
+Fixes: 6bdf131fac23 ("Btrfs: don't leak reloc root nodes on error")
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/relocation.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -2393,11 +2393,11 @@ void free_reloc_roots(struct list_head *
+ while (!list_empty(list)) {
+ reloc_root = list_entry(list->next, struct btrfs_root,
+ root_list);
++ __del_reloc_root(reloc_root);
+ free_extent_buffer(reloc_root->node);
+ free_extent_buffer(reloc_root->commit_root);
+ reloc_root->node = NULL;
+ reloc_root->commit_root = NULL;
+- __del_reloc_root(reloc_root);
+ }
+ }
+
--- /dev/null
+From 6d6d282932d1a609e60dc4467677e0e863682f57 Mon Sep 17 00:00:00 2001
+From: satoru takeuchi <satoru.takeuchi@gmail.com>
+Date: Tue, 12 Sep 2017 22:42:52 +0900
+Subject: btrfs: prevent to set invalid default subvolid
+
+From: satoru takeuchi <satoru.takeuchi@gmail.com>
+
+commit 6d6d282932d1a609e60dc4467677e0e863682f57 upstream.
+
+`btrfs sub set-default` succeeds in setting an ID that doesn't correspond
+to any fs/file tree. If such a bad ID is set on a filesystem, the
+filesystem can't be mounted without specifying the `subvol` or `subvolid`
+mount options.
+
+Fixes: 6ef5ed0d386b ("Btrfs: add ioctl and incompat flag to set the default mount subvol")
+Signed-off-by: Satoru Takeuchi <satoru.takeuchi@gmail.com>
+Reviewed-by: Qu Wenruo <quwenruo.btrfs@gmx.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ioctl.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -4072,6 +4072,10 @@ static long btrfs_ioctl_default_subvol(s
+ ret = PTR_ERR(new_root);
+ goto out;
+ }
++ if (!is_fstree(new_root->objectid)) {
++ ret = -ENOENT;
++ goto out;
++ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
--- /dev/null
+From 78ad4ce014d025f41b8dde3a81876832ead643cf Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Fri, 8 Sep 2017 17:48:55 +0900
+Subject: btrfs: propagate error to btrfs_cmp_data_prepare caller
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 78ad4ce014d025f41b8dde3a81876832ead643cf upstream.
+
+btrfs_cmp_data_prepare() (almost) always returns 0, i.e. it ignores errors
+from gather_extent_pages(). While the pages are freed by
+btrfs_cmp_data_free(), cmp->num_pages is still > 0. Then,
+btrfs_extent_same() tries to access the already freed pages, causing
+faults (or violating the PageLocked assertion).
+
+This patch just returns the error as is, so that the caller stops the
+process.
+
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Fixes: f441460202cb ("btrfs: fix deadlock with extent-same and readpage")
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ioctl.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -3063,7 +3063,7 @@ static int btrfs_cmp_data_prepare(struct
+ out:
+ if (ret)
+ btrfs_cmp_data_free(cmp);
+- return 0;
++ return ret;
+ }
+
+ static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp)
--- /dev/null
+From 518417525f3652c12fb5fad6da4ade66c0072fa3 Mon Sep 17 00:00:00 2001
+From: Lucas Stach <l.stach@pengutronix.de>
+Date: Mon, 11 Sep 2017 15:29:31 +0200
+Subject: etnaviv: fix gem object list corruption
+
+From: Lucas Stach <l.stach@pengutronix.de>
+
+commit 518417525f3652c12fb5fad6da4ade66c0072fa3 upstream.
+
+All manipulations of the gem_object list need to be protected by
+the list mutex, as GEM objects can be created and freed in parallel.
+This fixes a kernel memory corruption.
+
+Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/gpu/drm/etnaviv/etnaviv_gem.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
++++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+@@ -551,12 +551,15 @@ static const struct etnaviv_gem_ops etna
+ void etnaviv_gem_free_object(struct drm_gem_object *obj)
+ {
+ struct etnaviv_gem_object *etnaviv_obj = to_etnaviv_bo(obj);
++ struct etnaviv_drm_private *priv = obj->dev->dev_private;
+ struct etnaviv_vram_mapping *mapping, *tmp;
+
+ /* object should not be active */
+ WARN_ON(is_active(etnaviv_obj));
+
++ mutex_lock(&priv->gem_lock);
+ list_del(&etnaviv_obj->gem_node);
++ mutex_unlock(&priv->gem_lock);
+
+ list_for_each_entry_safe(mapping, tmp, &etnaviv_obj->vram_list,
+ obj_node) {
--- /dev/null
+From 5a642e6bc49f59922e19ebd639e74f72753fc77b Mon Sep 17 00:00:00 2001
+From: Lucas Stach <l.stach@pengutronix.de>
+Date: Fri, 8 Sep 2017 16:24:32 +0200
+Subject: etnaviv: fix submit error path
+
+From: Lucas Stach <l.stach@pengutronix.de>
+
+commit 5a642e6bc49f59922e19ebd639e74f72753fc77b upstream.
+
+If the gpu submit fails, bail out to avoid accessing a potentially
+uninitialized fence.
+
+Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
++++ b/drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c
+@@ -445,8 +445,10 @@ int etnaviv_ioctl_gem_submit(struct drm_
+ cmdbuf->user_size = ALIGN(args->stream_size, 8);
+
+ ret = etnaviv_gpu_submit(gpu, submit, cmdbuf);
+- if (ret == 0)
+- cmdbuf = NULL;
++ if (ret)
++ goto out;
++
++ cmdbuf = NULL;
+
+ if (args->flags & ETNA_SUBMIT_FENCE_FD_OUT) {
+ /*
--- /dev/null
+From 6c85501f2fabcfc4fc6ed976543d252c4eaf4be9 Mon Sep 17 00:00:00 2001
+From: Al Viro <viro@zeniv.linux.org.uk>
+Date: Fri, 29 Sep 2017 13:43:15 -0400
+Subject: fix infoleak in waitid(2)
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+commit 6c85501f2fabcfc4fc6ed976543d252c4eaf4be9 upstream.
+
+kernel_waitid() can return a PID, an error or 0. rusage is filled in the first
+case, and waitid(2) should've copied the rusage out exactly in that case, *not*
+whenever kernel_waitid() has not returned an error. The compat variant shares
+that braino; none of the kernel_wait4() callers do, so the below ought to fix it.
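+
+For illustration, the user-visible leak can be reproduced from userspace
+with the raw syscall (a sketch under stated assumptions: glibc's waitid()
+wrapper does not expose the rusage argument, so syscall(2) is used, and
+WNOHANG with a live child is the case where kernel_waitid() returns 0
+without filling the rusage):
+
+  #include <signal.h>
+  #include <stdio.h>
+  #include <string.h>
+  #include <sys/resource.h>
+  #include <sys/syscall.h>
+  #include <sys/wait.h>
+  #include <unistd.h>
+
+  int main(void)
+  {
+          siginfo_t info;
+          struct rusage ru;
+          pid_t child = fork();
+
+          if (child == 0) {
+                  pause();                /* stay alive */
+                  _exit(0);
+          }
+
+          memset(&ru, 0, sizeof(ru));
+          memset(&info, 0, sizeof(info));
+
+          /*
+           * Live child + WNOHANG: kernel_waitid() returns 0 and does
+           * not fill the rusage, yet before this fix the kernel still
+           * copied its uninitialized stack buffer out to 'ru'.
+           */
+          syscall(SYS_waitid, P_PID, child, &info,
+                  WEXITED | WNOHANG, &ru);
+          printf("ru_maxrss = %ld (nonzero garbage before the fix)\n",
+                 ru.ru_maxrss);
+
+          kill(child, SIGKILL);
+          return 0;
+  }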
+
+Reported-and-tested-by: Alexander Potapenko <glider@google.com>
+Fixes: ce72a16fa705 ("wait4(2)/waitid(2): separate copying rusage to userland")
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/exit.c | 23 ++++++++++-------------
+ 1 file changed, 10 insertions(+), 13 deletions(-)
+
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -1601,12 +1601,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_
+ struct waitid_info info = {.status = 0};
+ long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
+ int signo = 0;
++
+ if (err > 0) {
+ signo = SIGCHLD;
+ err = 0;
+- }
+-
+- if (!err) {
+ if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
+ return -EFAULT;
+ }
+@@ -1724,16 +1722,15 @@ COMPAT_SYSCALL_DEFINE5(waitid,
+ if (err > 0) {
+ signo = SIGCHLD;
+ err = 0;
+- }
+-
+- if (!err && uru) {
+- /* kernel_waitid() overwrites everything in ru */
+- if (COMPAT_USE_64BIT_TIME)
+- err = copy_to_user(uru, &ru, sizeof(ru));
+- else
+- err = put_compat_rusage(&ru, uru);
+- if (err)
+- return -EFAULT;
++ if (uru) {
++ /* kernel_waitid() overwrites everything in ru */
++ if (COMPAT_USE_64BIT_TIME)
++ err = copy_to_user(uru, &ru, sizeof(ru));
++ else
++ err = put_compat_rusage(&ru, uru);
++ if (err)
++ return -EFAULT;
++ }
+ }
+
+ if (!infop)
--- /dev/null
+From c74aef2d06a9f59cece89093eecc552933cba72a Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Fri, 22 Sep 2017 17:48:06 +0200
+Subject: futex: Fix pi_state->owner serialization
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit c74aef2d06a9f59cece89093eecc552933cba72a upstream.
+
+There was a reported suspicion about a race between exit_pi_state_list()
+and put_pi_state(). The same report mentioned that the comment on
+put_pi_state() says it should be called with hb->lock held, and that it
+no longer is in all places.
+
+As it turns out, the pi_state->owner serialization is indeed broken. As per
+the new rules:
+
+ 734009e96d19 ("futex: Change locking rules")
+
+pi_state->owner should be serialized by pi_state->pi_mutex.wait_lock.
+For the sites setting pi_state->owner we already hold wait_lock (where
+required) but exit_pi_state_list() and put_pi_state() were not and
+raced on clearing it.
+
+Fixes: 734009e96d19 ("futex: Change locking rules")
+Reported-by: Gratian Crisan <gratian.crisan@ni.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: dvhart@infradead.org
+Link: https://lkml.kernel.org/r/20170922154806.jd3ffltfk24m4o4y@hirez.programming.kicks-ass.net
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/futex.c | 33 ++++++++++++++++++++++-----------
+ 1 file changed, 22 insertions(+), 11 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi
+ /*
+ * Drops a reference to the pi_state object and frees or caches it
+ * when the last reference is gone.
+- *
+- * Must be called with the hb lock held.
+ */
+ static void put_pi_state(struct futex_pi_state *pi_state)
+ {
+@@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi
+ * and has cleaned up the pi_state already
+ */
+ if (pi_state->owner) {
+- raw_spin_lock_irq(&pi_state->owner->pi_lock);
+- list_del_init(&pi_state->list);
+- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
++ struct task_struct *owner;
+
+- rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
++ owner = pi_state->owner;
++ if (owner) {
++ raw_spin_lock(&owner->pi_lock);
++ list_del_init(&pi_state->list);
++ raw_spin_unlock(&owner->pi_lock);
++ }
++ rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ }
+
+- if (current->pi_state_cache)
++ if (current->pi_state_cache) {
+ kfree(pi_state);
+- else {
++ } else {
+ /*
+ * pi_state->list is already empty.
+ * clear pi_state->owner.
+@@ -905,13 +909,14 @@ void exit_pi_state_list(struct task_stru
+ raw_spin_unlock_irq(&curr->pi_lock);
+
+ spin_lock(&hb->lock);
+-
+- raw_spin_lock_irq(&curr->pi_lock);
++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
++ raw_spin_lock(&curr->pi_lock);
+ /*
+ * We dropped the pi-lock, so re-check whether this
+ * task still owns the PI-state:
+ */
+ if (head->next != next) {
++ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+ continue;
+ }
+@@ -920,9 +925,10 @@ void exit_pi_state_list(struct task_stru
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ pi_state->owner = NULL;
+- raw_spin_unlock_irq(&curr->pi_lock);
++ raw_spin_unlock(&curr->pi_lock);
+
+ get_pi_state(pi_state);
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+@@ -1204,6 +1210,10 @@ static int attach_to_pi_owner(u32 uval,
+
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &p->pi_state_list);
++ /*
++ * Assignment without holding pi_state->pi_mutex.wait_lock is safe
++ * because there is no concurrency as the object is not published yet.
++ */
+ pi_state->owner = p;
+ raw_spin_unlock_irq(&p->pi_lock);
+
+@@ -2820,6 +2830,7 @@ retry:
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+
++ /* drops pi_state->pi_mutex.wait_lock */
+ ret = wake_futex_pi(uaddr, uval, pi_state);
+
+ put_pi_state(pi_state);
--- /dev/null
+From 72364d320644c12948786962673772f271039a4a Mon Sep 17 00:00:00 2001
+From: Jeffy Chen <jeffy.chen@rock-chips.com>
+Date: Thu, 28 Sep 2017 12:37:31 +0800
+Subject: irq/generic-chip: Don't replace domain's name
+
+From: Jeffy Chen <jeffy.chen@rock-chips.com>
+
+commit 72364d320644c12948786962673772f271039a4a upstream.
+
+When generic irq chips are allocated for an irq domain, the domain name is
+set to the irq chip name. That was done to have named domains before the
+recent changes enforcing domain naming were merged.
+
+Since then the overwrite causes a memory leak when the domain name is
+dynamically allocated and even worse it would cause the domain free code to
+free the wrong name pointer, which might point to a constant.
+
+Remove the name assignment to prevent this.
+
+Fixes: d59f6617eef0 ("genirq: Allow fwnode to carry name information only")
+Signed-off-by: Jeffy Chen <jeffy.chen@rock-chips.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Link: https://lkml.kernel.org/r/20170928043731.4764-1-jeffy.chen@rock-chips.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/irq/generic-chip.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/kernel/irq/generic-chip.c
++++ b/kernel/irq/generic-chip.c
+@@ -322,7 +322,6 @@ int __irq_alloc_domain_generic_chips(str
+ /* Calc pointer to the next generic chip */
+ tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
+ }
+- d->name = name;
+ return 0;
+ }
+ EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
--- /dev/null
+From 51aa68e7d57e3217192d88ce90fd5b8ef29ec94f Mon Sep 17 00:00:00 2001
+From: Jim Mattson <jmattson@google.com>
+Date: Tue, 12 Sep 2017 13:02:54 -0700
+Subject: kvm: nVMX: Don't allow L2 to access the hardware CR8
+
+From: Jim Mattson <jmattson@google.com>
+
+commit 51aa68e7d57e3217192d88ce90fd5b8ef29ec94f upstream.
+
+If L1 does not specify the "use TPR shadow" VM-execution control in
+vmcs12, then L0 must specify the "CR8-load exiting" and "CR8-store
+exiting" VM-execution controls in vmcs02. Failure to do so will give
+the L2 VM unrestricted read/write access to the hardware CR8.
+
+This fixes CVE-2017-12154.
+
+Signed-off-by: Jim Mattson <jmattson@google.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -10271,6 +10271,11 @@ static int prepare_vmcs02(struct kvm_vcp
+ if (exec_control & CPU_BASED_TPR_SHADOW) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
+ vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
++ } else {
++#ifdef CONFIG_X86_64
++ exec_control |= CPU_BASED_CR8_LOAD_EXITING |
++ CPU_BASED_CR8_STORE_EXITING;
++#endif
+ }
+
+ /*
--- /dev/null
+From 44889942b6eb356eab27ce25fe10701adfec7776 Mon Sep 17 00:00:00 2001
+From: Ladi Prosek <lprosek@redhat.com>
+Date: Fri, 22 Sep 2017 07:53:15 +0200
+Subject: KVM: nVMX: fix HOST_CR3/HOST_CR4 cache
+
+From: Ladi Prosek <lprosek@redhat.com>
+
+commit 44889942b6eb356eab27ce25fe10701adfec7776 upstream.
+
+For nested virt we maintain multiple VMCSs that can run on a vCPU. So it is
+incorrect to keep vmcs_host_cr3 and vmcs_host_cr4, whose purpose is caching
+the value of the rarely changing HOST_CR3 and HOST_CR4 VMCS fields, in
+vCPU-wide data structures.
+
+Hyper-V nested on KVM runs into this consistently for me with PCID enabled.
+CR3 is updated with a new value, unlikely(cr3 != vmx->host_state.vmcs_host_cr3)
+fires, and the currently loaded VMCS is updated. Then we switch from L2 to
+L1 and the next exit reverts CR3 to its old value.
+
+Fixes: d6e41f1151fe ("x86/mm, KVM: Teach KVM's VMX code that CR3 isn't a constant")
+Signed-off-by: Ladi Prosek <lprosek@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx.c | 16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -200,6 +200,8 @@ struct loaded_vmcs {
+ int cpu;
+ bool launched;
+ bool nmi_known_unmasked;
++ unsigned long vmcs_host_cr3; /* May not match real cr3 */
++ unsigned long vmcs_host_cr4; /* May not match real cr4 */
+ struct list_head loaded_vmcss_on_cpu_link;
+ };
+
+@@ -595,8 +597,6 @@ struct vcpu_vmx {
+ int gs_ldt_reload_needed;
+ int fs_reload_needed;
+ u64 msr_host_bndcfgs;
+- unsigned long vmcs_host_cr3; /* May not match real cr3 */
+- unsigned long vmcs_host_cr4; /* May not match real cr4 */
+ } host_state;
+ struct {
+ int vm86_active;
+@@ -5138,12 +5138,12 @@ static void vmx_set_constant_host_state(
+ */
+ cr3 = __read_cr3();
+ vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
+- vmx->host_state.vmcs_host_cr3 = cr3;
++ vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
+
+ /* Save the most likely value for this task's CR4 in the VMCS. */
+ cr4 = cr4_read_shadow();
+ vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
+- vmx->host_state.vmcs_host_cr4 = cr4;
++ vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
+
+ vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
+ #ifdef CONFIG_X86_64
+@@ -8992,15 +8992,15 @@ static void __noclone vmx_vcpu_run(struc
+ vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+
+ cr3 = __get_current_cr3_fast();
+- if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
++ if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+- vmx->host_state.vmcs_host_cr3 = cr3;
++ vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
+ }
+
+ cr4 = cr4_read_shadow();
+- if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
++ if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) {
+ vmcs_writel(HOST_CR4, cr4);
+- vmx->host_state.vmcs_host_cr4 = cr4;
++ vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
+ }
+
+ /* When single-stepping over STI and MOV SS, we must clear the
--- /dev/null
+From 8b306e2f3c41939ea528e6174c88cfbfff893ce1 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Tue, 6 Jun 2017 12:57:05 +0200
+Subject: KVM: VMX: avoid double list add with VT-d posted interrupts
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 8b306e2f3c41939ea528e6174c88cfbfff893ce1 upstream.
+
+In some cases, for example involving hot-unplug of assigned
+devices, pi_post_block can forget to remove the vCPU from the
+blocked_vcpu_list. When this happens, the next call to
+pi_pre_block corrupts the list.
+
+Fix this in two ways. First, check vcpu->pre_pcpu in pi_pre_block
+and WARN instead of adding the element twice in the list. Second,
+always do the list removal in pi_post_block if vcpu->pre_pcpu is
+set (not -1).
+
+The new code keeps interrupts disabled for the whole duration of
+pi_pre_block/pi_post_block. This is not strictly necessary, but
+easier to follow. For the same reason, PI.ON is checked only
+after the cmpxchg, and to handle it we just call the post-block
+code. This removes duplication of the list removal code.
+
+Cc: Huangweidong <weidong.huang@huawei.com>
+Cc: Gonglei <arei.gonglei@huawei.com>
+Cc: wangxin <wangxinxin.wang@huawei.com>
+Cc: Radim Krčmář <rkrcmar@redhat.com>
+Tested-by: Longpeng (Mike) <longpeng2@huawei.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx.c | 62 +++++++++++++++++++++--------------------------------
+ 1 file changed, 25 insertions(+), 37 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -11394,10 +11394,11 @@ static void __pi_post_block(struct kvm_v
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+- unsigned long flags;
+
+ do {
+ old.control = new.control = pi_desc->control;
++ WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
++ "Wakeup handler not enabled while the VCPU is blocked\n");
+
+ dest = cpu_physical_id(vcpu->cpu);
+
+@@ -11414,14 +11415,10 @@ static void __pi_post_block(struct kvm_v
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+- if(vcpu->pre_pcpu != -1) {
+- spin_lock_irqsave(
+- &per_cpu(blocked_vcpu_on_cpu_lock,
+- vcpu->pre_pcpu), flags);
++ if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
++ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ list_del(&vcpu->blocked_vcpu_list);
+- spin_unlock_irqrestore(
+- &per_cpu(blocked_vcpu_on_cpu_lock,
+- vcpu->pre_pcpu), flags);
++ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ vcpu->pre_pcpu = -1;
+ }
+ }
+@@ -11441,7 +11438,6 @@ static void __pi_post_block(struct kvm_v
+ */
+ static int pi_pre_block(struct kvm_vcpu *vcpu)
+ {
+- unsigned long flags;
+ unsigned int dest;
+ struct pi_desc old, new;
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+@@ -11451,34 +11447,20 @@ static int pi_pre_block(struct kvm_vcpu
+ !kvm_vcpu_apicv_active(vcpu))
+ return 0;
+
+- vcpu->pre_pcpu = vcpu->cpu;
+- spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+- vcpu->pre_pcpu), flags);
+- list_add_tail(&vcpu->blocked_vcpu_list,
+- &per_cpu(blocked_vcpu_on_cpu,
+- vcpu->pre_pcpu));
+- spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
+- vcpu->pre_pcpu), flags);
++ WARN_ON(irqs_disabled());
++ local_irq_disable();
++ if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
++ vcpu->pre_pcpu = vcpu->cpu;
++ spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
++ list_add_tail(&vcpu->blocked_vcpu_list,
++ &per_cpu(blocked_vcpu_on_cpu,
++ vcpu->pre_pcpu));
++ spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
++ }
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+- /*
+- * We should not block the vCPU if
+- * an interrupt is posted for it.
+- */
+- if (pi_test_on(pi_desc) == 1) {
+- spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+- vcpu->pre_pcpu), flags);
+- list_del(&vcpu->blocked_vcpu_list);
+- spin_unlock_irqrestore(
+- &per_cpu(blocked_vcpu_on_cpu_lock,
+- vcpu->pre_pcpu), flags);
+- vcpu->pre_pcpu = -1;
+-
+- return 1;
+- }
+-
+ WARN((pi_desc->sn == 1),
+ "Warning: SN field of posted-interrupts "
+ "is set before blocking\n");
+@@ -11503,7 +11485,12 @@ static int pi_pre_block(struct kvm_vcpu
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+- return 0;
++ /* We should not block the vCPU if an interrupt is posted for it. */
++ if (pi_test_on(pi_desc) == 1)
++ __pi_post_block(vcpu);
++
++ local_irq_enable();
++ return (vcpu->pre_pcpu == -1);
+ }
+
+ static int vmx_pre_block(struct kvm_vcpu *vcpu)
+@@ -11519,12 +11506,13 @@ static int vmx_pre_block(struct kvm_vcpu
+
+ static void pi_post_block(struct kvm_vcpu *vcpu)
+ {
+- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+- !irq_remapping_cap(IRQ_POSTING_CAP) ||
+- !kvm_vcpu_apicv_active(vcpu))
++ if (vcpu->pre_pcpu == -1)
+ return;
+
++ WARN_ON(irqs_disabled());
++ local_irq_disable();
+ __pi_post_block(vcpu);
++ local_irq_enable();
+ }
+
+ static void vmx_post_block(struct kvm_vcpu *vcpu)
--- /dev/null
+From 3a8b0677fc6180a467e26cc32ce6b0c09a32f9bb Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= <jschoenh@amazon.de>
+Date: Thu, 7 Sep 2017 19:02:30 +0100
+Subject: KVM: VMX: Do not BUG() on out-of-bounds guest IRQ
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Jan H. Schönherr <jschoenh@amazon.de>
+
+commit 3a8b0677fc6180a467e26cc32ce6b0c09a32f9bb upstream.
+
+The value of the guest_irq argument to vmx_update_pi_irte() is
+ultimately coming from a KVM_IRQFD API call. Do not BUG() in
+vmx_update_pi_irte() if the value is out of bounds. (Especially
+since KVM as a whole seems to hang after that.)
+
+Instead, print a message only once if we find that we don't have a
+route for a certain IRQ (which can be out-of-bounds or within the
+array).
+
+This fixes CVE-2017-1000252.
+
+Fixes: efc644048ecde54 ("KVM: x86: Update IRTE for posted-interrupts")
+Signed-off-by: Jan H. Schönherr <jschoenh@amazon.de>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -11542,7 +11542,7 @@ static int vmx_update_pi_irte(struct kvm
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu;
+ struct vcpu_data vcpu_info;
+- int idx, ret = -EINVAL;
++ int idx, ret = 0;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP) ||
+@@ -11551,7 +11551,12 @@ static int vmx_update_pi_irte(struct kvm
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+- BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
++ if (guest_irq >= irq_rt->nr_rt_entries ||
++ hlist_empty(&irq_rt->map[guest_irq])) {
++ pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
++ guest_irq, irq_rt->nr_rt_entries);
++ goto out;
++ }
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ if (e->type != KVM_IRQ_ROUTING_MSI)
--- /dev/null
+From cd39e1176d320157831ce030b4c869bd2d5eb142 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Tue, 6 Jun 2017 12:57:04 +0200
+Subject: KVM: VMX: extract __pi_post_block
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit cd39e1176d320157831ce030b4c869bd2d5eb142 upstream.
+
+Simple code movement patch, preparing for the next one.
+
+Cc: Huangweidong <weidong.huang@huawei.com>
+Cc: Gonglei <arei.gonglei@huawei.com>
+Cc: wangxin <wangxinxin.wang@huawei.com>
+Cc: Radim Krčmář <rkrcmar@redhat.com>
+Tested-by: Longpeng (Mike) <longpeng2@huawei.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx.c | 71 ++++++++++++++++++++++++++++-------------------------
+ 1 file changed, 38 insertions(+), 33 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -11389,6 +11389,43 @@ static void vmx_enable_log_dirty_pt_mask
+ kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
+ }
+
++static void __pi_post_block(struct kvm_vcpu *vcpu)
++{
++ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
++ struct pi_desc old, new;
++ unsigned int dest;
++ unsigned long flags;
++
++ do {
++ old.control = new.control = pi_desc->control;
++
++ dest = cpu_physical_id(vcpu->cpu);
++
++ if (x2apic_enabled())
++ new.ndst = dest;
++ else
++ new.ndst = (dest << 8) & 0xFF00;
++
++ /* Allow posting non-urgent interrupts */
++ new.sn = 0;
++
++ /* set 'NV' to 'notification vector' */
++ new.nv = POSTED_INTR_VECTOR;
++ } while (cmpxchg(&pi_desc->control, old.control,
++ new.control) != old.control);
++
++ if(vcpu->pre_pcpu != -1) {
++ spin_lock_irqsave(
++ &per_cpu(blocked_vcpu_on_cpu_lock,
++ vcpu->pre_pcpu), flags);
++ list_del(&vcpu->blocked_vcpu_list);
++ spin_unlock_irqrestore(
++ &per_cpu(blocked_vcpu_on_cpu_lock,
++ vcpu->pre_pcpu), flags);
++ vcpu->pre_pcpu = -1;
++ }
++}
++
+ /*
+ * This routine does the following things for vCPU which is going
+ * to be blocked if VT-d PI is enabled.
+@@ -11482,44 +11519,12 @@ static int vmx_pre_block(struct kvm_vcpu
+
+ static void pi_post_block(struct kvm_vcpu *vcpu)
+ {
+- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+- struct pi_desc old, new;
+- unsigned int dest;
+- unsigned long flags;
+-
+ if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP) ||
+ !kvm_vcpu_apicv_active(vcpu))
+ return;
+
+- do {
+- old.control = new.control = pi_desc->control;
+-
+- dest = cpu_physical_id(vcpu->cpu);
+-
+- if (x2apic_enabled())
+- new.ndst = dest;
+- else
+- new.ndst = (dest << 8) & 0xFF00;
+-
+- /* Allow posting non-urgent interrupts */
+- new.sn = 0;
+-
+- /* set 'NV' to 'notification vector' */
+- new.nv = POSTED_INTR_VECTOR;
+- } while (cmpxchg(&pi_desc->control, old.control,
+- new.control) != old.control);
+-
+- if(vcpu->pre_pcpu != -1) {
+- spin_lock_irqsave(
+- &per_cpu(blocked_vcpu_on_cpu_lock,
+- vcpu->pre_pcpu), flags);
+- list_del(&vcpu->blocked_vcpu_list);
+- spin_unlock_irqrestore(
+- &per_cpu(blocked_vcpu_on_cpu_lock,
+- vcpu->pre_pcpu), flags);
+- vcpu->pre_pcpu = -1;
+- }
++ __pi_post_block(vcpu);
+ }
+
+ static void vmx_post_block(struct kvm_vcpu *vcpu)
--- /dev/null
+From 31afb2ea2b10a7d17ce3db4cdb0a12b63b2fe08a Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Tue, 6 Jun 2017 12:57:06 +0200
+Subject: KVM: VMX: simplify and fix vmx_vcpu_pi_load
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 31afb2ea2b10a7d17ce3db4cdb0a12b63b2fe08a upstream.
+
+The simplify part: do not touch pi_desc.nv; we can set it when the
+VCPU is first created. Likewise, pi_desc.sn is only handled by
+vmx_vcpu_pi_load, so do not touch it in __pi_post_block.
+
+The fix part: do not check kvm_arch_has_assigned_device, instead
+check the SN bit to figure out whether vmx_vcpu_pi_put ran before.
+This matches what the previous patch did in pi_post_block.
+
+Cc: Huangweidong <weidong.huang@huawei.com>
+Cc: Gonglei <arei.gonglei@huawei.com>
+Cc: wangxin <wangxinxin.wang@huawei.com>
+Cc: Radim Krčmář <rkrcmar@redhat.com>
+Tested-by: Longpeng (Mike) <longpeng2@huawei.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx.c | 68 +++++++++++++++++++++++++++--------------------------
+ 1 file changed, 35 insertions(+), 33 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -2187,43 +2187,41 @@ static void vmx_vcpu_pi_load(struct kvm_
+ struct pi_desc old, new;
+ unsigned int dest;
+
+- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+- !irq_remapping_cap(IRQ_POSTING_CAP) ||
+- !kvm_vcpu_apicv_active(vcpu))
++ /*
++ * In case of hot-plug or hot-unplug, we may have to undo
++ * vmx_vcpu_pi_put even if there is no assigned device. And we
++ * always keep PI.NDST up to date for simplicity: it makes the
++ * code easier, and CPU migration is not a fast path.
++ */
++ if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
++ return;
++
++ /*
++ * First handle the simple case where no cmpxchg is necessary; just
++ * allow posting non-urgent interrupts.
++ *
++ * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
++ * PI.NDST: pi_post_block will do it for us and the wakeup_handler
++ * expects the VCPU to be on the blocked_vcpu_list that matches
++ * PI.NDST.
++ */
++ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
++ vcpu->cpu == cpu) {
++ pi_clear_sn(pi_desc);
+ return;
++ }
+
++ /* The full case. */
+ do {
+ old.control = new.control = pi_desc->control;
+
+- /*
+- * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
+- * are two possible cases:
+- * 1. After running 'pre_block', context switch
+- * happened. For this case, 'sn' was set in
+- * vmx_vcpu_put(), so we need to clear it here.
+- * 2. After running 'pre_block', we were blocked,
+- * and woken up by some other guy. For this case,
+- * we don't need to do anything, 'pi_post_block'
+- * will do everything for us. However, we cannot
+- * check whether it is case #1 or case #2 here
+- * (maybe, not needed), so we also clear sn here,
+- * I think it is not a big deal.
+- */
+- if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
+- if (vcpu->cpu != cpu) {
+- dest = cpu_physical_id(cpu);
+-
+- if (x2apic_enabled())
+- new.ndst = dest;
+- else
+- new.ndst = (dest << 8) & 0xFF00;
+- }
++ dest = cpu_physical_id(cpu);
+
+- /* set 'NV' to 'notification vector' */
+- new.nv = POSTED_INTR_VECTOR;
+- }
++ if (x2apic_enabled())
++ new.ndst = dest;
++ else
++ new.ndst = (dest << 8) & 0xFF00;
+
+- /* Allow posting non-urgent interrupts */
+ new.sn = 0;
+ } while (cmpxchg(&pi_desc->control, old.control,
+ new.control) != old.control);
+@@ -9310,6 +9308,13 @@ static struct kvm_vcpu *vmx_create_vcpu(
+
+ vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
+
++ /*
++ * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
++ * or POSTED_INTR_WAKEUP_VECTOR.
++ */
++ vmx->pi_desc.nv = POSTED_INTR_VECTOR;
++ vmx->pi_desc.sn = 1;
++
+ return &vmx->vcpu;
+
+ free_vmcs:
+@@ -11407,9 +11412,6 @@ static void __pi_post_block(struct kvm_v
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+- /* Allow posting non-urgent interrupts */
+- new.sn = 0;
+-
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (cmpxchg(&pi_desc->control, old.control,
--- /dev/null
+From b862789aa5186d5ea3a024b7cfe0f80c3a38b980 Mon Sep 17 00:00:00 2001
+From: Boqun Feng <boqun.feng@gmail.com>
+Date: Fri, 29 Sep 2017 19:01:45 +0800
+Subject: kvm/x86: Handle async PF in RCU read-side critical sections
+
+From: Boqun Feng <boqun.feng@gmail.com>
+
+commit b862789aa5186d5ea3a024b7cfe0f80c3a38b980 upstream.
+
+Sasha Levin reported a WARNING:
+
+| WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329
+| rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline]
+| WARNING: CPU: 0 PID: 6974 at kernel/rcu/tree_plugin.h:329
+| rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458
+...
+| CPU: 0 PID: 6974 Comm: syz-fuzzer Not tainted 4.13.0-next-20170908+ #246
+| Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
+| 1.10.1-1ubuntu1 04/01/2014
+| Call Trace:
+...
+| RIP: 0010:rcu_preempt_note_context_switch kernel/rcu/tree_plugin.h:329 [inline]
+| RIP: 0010:rcu_note_context_switch+0x16c/0x2210 kernel/rcu/tree.c:458
+| RSP: 0018:ffff88003b2debc8 EFLAGS: 00010002
+| RAX: 0000000000000001 RBX: 1ffff1000765bd85 RCX: 0000000000000000
+| RDX: 1ffff100075d7882 RSI: ffffffffb5c7da20 RDI: ffff88003aebc410
+| RBP: ffff88003b2def30 R08: dffffc0000000000 R09: 0000000000000001
+| R10: 0000000000000000 R11: 0000000000000000 R12: ffff88003b2def08
+| R13: 0000000000000000 R14: ffff88003aebc040 R15: ffff88003aebc040
+| __schedule+0x201/0x2240 kernel/sched/core.c:3292
+| schedule+0x113/0x460 kernel/sched/core.c:3421
+| kvm_async_pf_task_wait+0x43f/0x940 arch/x86/kernel/kvm.c:158
+| do_async_page_fault+0x72/0x90 arch/x86/kernel/kvm.c:271
+| async_page_fault+0x22/0x30 arch/x86/entry/entry_64.S:1069
+| RIP: 0010:format_decode+0x240/0x830 lib/vsprintf.c:1996
+| RSP: 0018:ffff88003b2df520 EFLAGS: 00010283
+| RAX: 000000000000003f RBX: ffffffffb5d1e141 RCX: ffff88003b2df670
+| RDX: 0000000000000001 RSI: dffffc0000000000 RDI: ffffffffb5d1e140
+| RBP: ffff88003b2df560 R08: dffffc0000000000 R09: 0000000000000000
+| R10: ffff88003b2df718 R11: 0000000000000000 R12: ffff88003b2df5d8
+| R13: 0000000000000064 R14: ffffffffb5d1e140 R15: 0000000000000000
+| vsnprintf+0x173/0x1700 lib/vsprintf.c:2136
+| sprintf+0xbe/0xf0 lib/vsprintf.c:2386
+| proc_self_get_link+0xfb/0x1c0 fs/proc/self.c:23
+| get_link fs/namei.c:1047 [inline]
+| link_path_walk+0x1041/0x1490 fs/namei.c:2127
+...
+
+This happened when the host hit a page fault and delivered it as an
+async page fault, while the guest was in an RCU read-side critical
+section. The guest then tries to reschedule in kvm_async_pf_task_wait(),
+but rcu_preempt_note_context_switch() would treat the reschedule as a
+sleep in an RCU read-side critical section, which is not allowed (even in
+preemptible RCU). Thus the WARN.
+
+To cure this, make kvm_async_pf_task_wait() go to the halt path if the
+PF happens in an RCU read-side critical section.
+
+Reported-by: Sasha Levin <levinsasha928@gmail.com>
+Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/kvm.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kernel/kvm.c
++++ b/arch/x86/kernel/kvm.c
+@@ -140,7 +140,8 @@ void kvm_async_pf_task_wait(u32 token)
+
+ n.token = token;
+ n.cpu = smp_processor_id();
+- n.halted = is_idle_task(current) || preempt_count() > 1;
++ n.halted = is_idle_task(current) || preempt_count() > 1 ||
++ rcu_preempt_depth();
+ init_swait_queue_head(&n.wq);
+ hlist_add_head(&n.link, &b->list);
+ raw_spin_unlock(&b->lock);
--- /dev/null
+From 79bf31a3b2a7ca467cfec8ff97d359a77065d01f Mon Sep 17 00:00:00 2001
+From: Shaohua Li <shli@fb.com>
+Date: Thu, 21 Sep 2017 09:55:28 -0700
+Subject: md: fix a race condition for flush request handling
+
+From: Shaohua Li <shli@fb.com>
+
+commit 79bf31a3b2a7ca467cfec8ff97d359a77065d01f upstream.
+
+md_submit_flush_data() calls pers->make_request, which misses the suspend
+check. Fix it with the new md_handle_request API.
+
+Reported-by: Nate Dailey <nate.dailey@stratus.com>
+Tested-by: Nate Dailey <nate.dailey@stratus.com>
+Fixes: cc27b0c78c79 ("md: fix deadlock between mddev_suspend() and md_write_start()")
+Reviewed-by: NeilBrown <neilb@suse.com>
+Signed-off-by: Shaohua Li <shli@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/md.c | 14 ++++++++++----
+ 1 file changed, 10 insertions(+), 4 deletions(-)
+
+--- a/drivers/md/md.c
++++ b/drivers/md/md.c
+@@ -439,16 +439,22 @@ static void md_submit_flush_data(struct
+ struct mddev *mddev = container_of(ws, struct mddev, flush_work);
+ struct bio *bio = mddev->flush_bio;
+
++ /*
++ * must reset flush_bio before calling into md_handle_request to avoid a
++ * deadlock, because other bios passed md_handle_request suspend check
++ * could wait for this and below md_handle_request could wait for those
++ * bios because of suspend check
++ */
++ mddev->flush_bio = NULL;
++ wake_up(&mddev->sb_wait);
++
+ if (bio->bi_iter.bi_size == 0)
+ /* an empty barrier - all done */
+ bio_endio(bio);
+ else {
+ bio->bi_opf &= ~REQ_PREFLUSH;
+- mddev->pers->make_request(mddev, bio);
++ md_handle_request(mddev, bio);
+ }
+-
+- mddev->flush_bio = NULL;
+- wake_up(&mddev->sb_wait);
+ }
+
+ void md_flush_request(struct mddev *mddev, struct bio *bio)
--- /dev/null
+From 393debc23c7820211d1c8253dd6a8408a7628fe7 Mon Sep 17 00:00:00 2001
+From: Shaohua Li <shli@fb.com>
+Date: Thu, 21 Sep 2017 10:23:35 -0700
+Subject: md: separate request handling
+
+From: Shaohua Li <shli@fb.com>
+
+commit 393debc23c7820211d1c8253dd6a8408a7628fe7 upstream.
+
+With commit cc27b0c78c79, pers->make_request could bail out without handling
+the bio. If that happens, we should retry. The commit fixes md_make_request
+but not other call sites. Separate the request handling part, so other call
+sites can use it.
+
+Reported-by: Nate Dailey <nate.dailey@stratus.com>
+Fixes: cc27b0c78c79 ("md: fix deadlock between mddev_suspend() and md_write_start()")
+Reviewed-by: NeilBrown <neilb@suse.com>
+Signed-off-by: Shaohua Li <shli@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/md.c | 58 +++++++++++++++++++++++++++++++-------------------------
+ drivers/md/md.h | 1
+ 2 files changed, 34 insertions(+), 25 deletions(-)
+
+--- a/drivers/md/md.c
++++ b/drivers/md/md.c
+@@ -266,6 +266,37 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
+ * call has finished, the bio has been linked into some internal structure
+ * and so is visible to ->quiesce(), so we don't need the refcount any more.
+ */
++void md_handle_request(struct mddev *mddev, struct bio *bio)
++{
++check_suspended:
++ rcu_read_lock();
++ if (mddev->suspended) {
++ DEFINE_WAIT(__wait);
++ for (;;) {
++ prepare_to_wait(&mddev->sb_wait, &__wait,
++ TASK_UNINTERRUPTIBLE);
++ if (!mddev->suspended)
++ break;
++ rcu_read_unlock();
++ schedule();
++ rcu_read_lock();
++ }
++ finish_wait(&mddev->sb_wait, &__wait);
++ }
++ atomic_inc(&mddev->active_io);
++ rcu_read_unlock();
++
++ if (!mddev->pers->make_request(mddev, bio)) {
++ atomic_dec(&mddev->active_io);
++ wake_up(&mddev->sb_wait);
++ goto check_suspended;
++ }
++
++ if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
++ wake_up(&mddev->sb_wait);
++}
++EXPORT_SYMBOL(md_handle_request);
++
+ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
+ {
+ const int rw = bio_data_dir(bio);
+@@ -285,23 +316,6 @@ static blk_qc_t md_make_request(struct r
+ bio_endio(bio);
+ return BLK_QC_T_NONE;
+ }
+-check_suspended:
+- rcu_read_lock();
+- if (mddev->suspended) {
+- DEFINE_WAIT(__wait);
+- for (;;) {
+- prepare_to_wait(&mddev->sb_wait, &__wait,
+- TASK_UNINTERRUPTIBLE);
+- if (!mddev->suspended)
+- break;
+- rcu_read_unlock();
+- schedule();
+- rcu_read_lock();
+- }
+- finish_wait(&mddev->sb_wait, &__wait);
+- }
+- atomic_inc(&mddev->active_io);
+- rcu_read_unlock();
+
+ /*
+ * save the sectors now since our bio can
+@@ -310,20 +324,14 @@ check_suspended:
+ sectors = bio_sectors(bio);
+ /* bio could be mergeable after passing to underlayer */
+ bio->bi_opf &= ~REQ_NOMERGE;
+- if (!mddev->pers->make_request(mddev, bio)) {
+- atomic_dec(&mddev->active_io);
+- wake_up(&mddev->sb_wait);
+- goto check_suspended;
+- }
++
++ md_handle_request(mddev, bio);
+
+ cpu = part_stat_lock();
+ part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+ part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
+ part_stat_unlock();
+
+- if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
+- wake_up(&mddev->sb_wait);
+-
+ return BLK_QC_T_NONE;
+ }
+
+--- a/drivers/md/md.h
++++ b/drivers/md/md.h
+@@ -686,6 +686,7 @@ extern void md_stop_writes(struct mddev
+ extern int md_rdev_init(struct md_rdev *rdev);
+ extern void md_rdev_clear(struct md_rdev *rdev);
+
++extern void md_handle_request(struct mddev *mddev, struct bio *bio);
+ extern void mddev_suspend(struct mddev *mddev);
+ extern void mddev_resume(struct mddev *mddev);
+ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
--- /dev/null
+From 7e439681af82984045efc215437ebb2ca8d33a4c Mon Sep 17 00:00:00 2001
+From: Boris Brezillon <boris.brezillon@free-electrons.com>
+Date: Mon, 25 Sep 2017 10:19:57 +0200
+Subject: mtd: Fix partition alignment check on multi-erasesize devices
+
+From: Boris Brezillon <boris.brezillon@free-electrons.com>
+
+commit 7e439681af82984045efc215437ebb2ca8d33a4c upstream.
+
+Commit 1eeef2d7483a ("mtd: handle partitioning on devices with 0
+erasesize") introduced a regression on heterogeneous erase region
+devices. Alignment of the partition was tested against the master
+eraseblock size, which can be bigger than the slave one, thus leading
+to some partitions being marked as read-only.
+
+Update wr_alignment to match the slave erasesize, after that erasesize
+has been determined by picking the biggest erasesize of all the regions
+embedded in the MTD partition.
+
+Reported-by: Mathias Thore <Mathias.Thore@infinera.com>
+Fixes: 1eeef2d7483a ("mtd: handle partitioning on devices with 0 erasesize")
+Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
+Tested-by: Mathias Thore <Mathias.Thore@infinera.com>
+Reviewed-by: Mathias Thore <Mathias.Thore@infinera.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/mtd/mtdpart.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/drivers/mtd/mtdpart.c
++++ b/drivers/mtd/mtdpart.c
+@@ -581,6 +581,14 @@ static struct mtd_part *allocate_partiti
+ slave->mtd.erasesize = parent->erasesize;
+ }
+
++ /*
++ * Slave erasesize might differ from the master one if the master
++ * exposes several regions with different erasesize. Adjust
++ * wr_alignment accordingly.
++ */
++ if (!(slave->mtd.flags & MTD_NO_ERASE))
++ wr_alignment = slave->mtd.erasesize;
++
+ tmp = slave->offset;
+ remainder = do_div(tmp, wr_alignment);
+ if ((slave->mtd.flags & MTD_WRITEABLE) && remainder) {
--- /dev/null
+From 36de80740008e6a4a55115b4a92e2059e47c1cba Mon Sep 17 00:00:00 2001
+From: Richard Genoud <richard.genoud@gmail.com>
+Date: Wed, 27 Sep 2017 14:49:17 +0200
+Subject: mtd: nand: atmel: fix buffer overflow in atmel_pmecc_user
+
+From: Richard Genoud <richard.genoud@gmail.com>
+
+commit 36de80740008e6a4a55115b4a92e2059e47c1cba upstream.
+
+When calculating the size needed for struct atmel_pmecc_user *user,
+the dmu and delta buffer sizes were forgotten.
+This led to memory corruption (especially with a large ecc_strength).
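+
+As a worked example (hypothetical strength, for illustration): with
+req->ecc.strength == 32, the old code reserved (32 + 1) * sizeof(s32) =
+132 bytes for mu, dmu and delta together, while the driver writes
+3 * 132 = 396 bytes into those buffers, leaving the allocation roughly
+264 bytes short.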
+
+Link: http://lkml.kernel.org/r/1506503157.3016.5.camel@gmail.com
+Fixes: f88fc122cc34 ("mtd: nand: Cleanup/rework the atmel_nand driver")
+Reported-by: Richard Genoud <richard.genoud@gmail.com>
+Pointed-at-by: Boris Brezillon <boris.brezillon@free-electrons.com>
+Signed-off-by: Richard Genoud <richard.genoud@gmail.com>
+Reviewed-by: Nicolas Ferre <nicolas.ferre@microchip.com>
+Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/mtd/nand/atmel/pmecc.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/mtd/nand/atmel/pmecc.c
++++ b/drivers/mtd/nand/atmel/pmecc.c
+@@ -363,7 +363,7 @@ atmel_pmecc_create_user(struct atmel_pme
+ size += (req->ecc.strength + 1) * sizeof(u16);
+ /* Reserve space for mu, dmu and delta. */
+ size = ALIGN(size, sizeof(s32));
+- size += (req->ecc.strength + 1) * sizeof(s32);
++ size += (req->ecc.strength + 1) * sizeof(s32) * 3;
+
+ user = kzalloc(size, GFP_KERNEL);
+ if (!user)
--- /dev/null
+From 9561475db680f7144d2223a409dd3d7e322aca03 Mon Sep 17 00:00:00 2001
+From: Nicolai Stange <nstange@suse.de>
+Date: Mon, 11 Sep 2017 09:45:40 +0200
+Subject: PCI: Fix race condition with driver_override
+
+From: Nicolai Stange <nstange@suse.de>
+
+commit 9561475db680f7144d2223a409dd3d7e322aca03 upstream.
+
+The driver_override implementation is susceptible to a race condition
+when one thread reads the driver override while another stores a
+different one. Add locking to avoid the race condition.
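+
+One possible interleaving without the lock (roughly, for
+illustration):
+
+	reader (show)                        writer (store)
+	-------------                        --------------
+	p = pdev->driver_override;
+	                                     old = pdev->driver_override;
+	                                     pdev->driver_override = new;
+	                                     kfree(old);
+	snprintf(buf, PAGE_SIZE, "%s\n", p); /* 'p' was just freed */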
+
+This is in close analogy to commit 6265539776a0 ("driver core: platform:
+fix race condition with driver_override") from Adrian Salido.
+
+Fixes: 782a985d7af2 ("PCI: Introduce new device binding path using pci_dev.driver_override")
+Signed-off-by: Nicolai Stange <nstange@suse.de>
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/pci/pci-sysfs.c | 11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+--- a/drivers/pci/pci-sysfs.c
++++ b/drivers/pci/pci-sysfs.c
+@@ -686,7 +686,7 @@ static ssize_t driver_override_store(str
+ const char *buf, size_t count)
+ {
+ struct pci_dev *pdev = to_pci_dev(dev);
+- char *driver_override, *old = pdev->driver_override, *cp;
++ char *driver_override, *old, *cp;
+
+ /* We need to keep extra room for a newline */
+ if (count >= (PAGE_SIZE - 1))
+@@ -700,12 +700,15 @@ static ssize_t driver_override_store(str
+ if (cp)
+ *cp = '\0';
+
++ device_lock(dev);
++ old = pdev->driver_override;
+ if (strlen(driver_override)) {
+ pdev->driver_override = driver_override;
+ } else {
+ kfree(driver_override);
+ pdev->driver_override = NULL;
+ }
++ device_unlock(dev);
+
+ kfree(old);
+
+@@ -716,8 +719,12 @@ static ssize_t driver_override_show(stru
+ struct device_attribute *attr, char *buf)
+ {
+ struct pci_dev *pdev = to_pci_dev(dev);
++ ssize_t len;
+
+- return snprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override);
++ device_lock(dev);
++ len = snprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override);
++ device_unlock(dev);
++ return len;
+ }
+ static DEVICE_ATTR_RW(driver_override);
+
--- /dev/null
+From ce7c47d60bda6c7f09ccf16e978d971c8fa16ff0 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala@linux.intel.com>
+Date: Mon, 18 Sep 2017 23:00:59 +0300
+Subject: platform/x86: fujitsu-laptop: Don't oops when FUJ02E3 is not present
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Ville Syrjälä <ville.syrjala@linux.intel.com>
+
+commit ce7c47d60bda6c7f09ccf16e978d971c8fa16ff0 upstream.
+
+My Fujitsu-Siemens Lifebook S6120 doesn't have the FUJ02E3 device,
+but it does have FUJ02B1. That means we do register the backlight
+device (and it even seems to work), but the code will oops as soon
+as we try to set the backlight brightness because it's trying to
+call call_fext_func() with a NULL device. Let's just skip those
+function calls when the FUJ02E3 device is not present.
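+
+In sketch form, the backlight update path below becomes:
+
+	if (fext)	/* FUJ02E3 present? */
+		call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4,
+			       b->props.power == FB_BLANK_POWERDOWN ? 0x3 : 0x0);
+	return set_lcd_level(device, b->props.brightness);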
+
+Cc: Jonathan Woithe <jwoithe@just42.net>
+Cc: Andy Shevchenko <andy@infradead.org>
+Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
+Signed-off-by: Darren Hart (VMware) <dvhart@infradead.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/platform/x86/fujitsu-laptop.c | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+--- a/drivers/platform/x86/fujitsu-laptop.c
++++ b/drivers/platform/x86/fujitsu-laptop.c
+@@ -254,10 +254,12 @@ static int bl_update_status(struct backl
+ {
+ struct acpi_device *device = bl_get_data(b);
+
+- if (b->props.power == FB_BLANK_POWERDOWN)
+- call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x3);
+- else
+- call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x0);
++ if (fext) {
++ if (b->props.power == FB_BLANK_POWERDOWN)
++ call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x3);
++ else
++ call_fext_func(fext, FUNC_BACKLIGHT, 0x1, 0x4, 0x0);
++ }
+
+ return set_lcd_level(device, b->props.brightness);
+ }
--- /dev/null
+From e4d8ae00169f7686e1da5a62e5cf797d12bf8822 Mon Sep 17 00:00:00 2001
+From: Viresh Kumar <viresh.kumar@linaro.org>
+Date: Thu, 21 Sep 2017 10:44:36 -0700
+Subject: PM / OPP: Call notifier without holding opp_table->lock
+
+From: Viresh Kumar <viresh.kumar@linaro.org>
+
+commit e4d8ae00169f7686e1da5a62e5cf797d12bf8822 upstream.
+
+The notifier callbacks may want to call some OPP helper routines which
+may try to take the same opp_table->lock again and cause a deadlock. One
+such use case was reported by Chanwoo Choi, where calling
+dev_pm_opp_disable() leads us to the devfreq OPP notifier handler,
+which further calls dev_pm_opp_find_freq_floor() and deadlocks.
+
+We don't really need the opp_table->lock to be held across the notifier
+call, though; all we need to ensure is that the 'opp' doesn't get
+freed while it is being used from within the notifier chain. We can do
+that with the help of dev_pm_opp_get/put(). Let's do it.
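+
+In short, the pattern used below is (a sketch; 'event' stands for
+OPP_EVENT_ENABLE or OPP_EVENT_DISABLE):
+
+	dev_pm_opp_get(opp);               /* keep 'opp' alive        */
+	mutex_unlock(&opp_table->lock);    /* drop lock before notify */
+	blocking_notifier_call_chain(&opp_table->head, event, opp);
+	dev_pm_opp_put(opp);               /* 'opp' may be freed now  */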
+
+Fixes: 5b650b388844 ("PM / OPP: Take kref from _find_opp_table()")
+Reported-by: Chanwoo Choi <cw00.choi@samsung.com>
+Tested-by: Chanwoo Choi <cw00.choi@samsung.com>
+Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
+Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
+Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/base/power/opp/core.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/drivers/base/power/opp/core.c
++++ b/drivers/base/power/opp/core.c
+@@ -1581,6 +1581,9 @@ static int _opp_set_availability(struct
+
+ opp->available = availability_req;
+
++ dev_pm_opp_get(opp);
++ mutex_unlock(&opp_table->lock);
++
+ /* Notify the change of the OPP availability */
+ if (availability_req)
+ blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ENABLE,
+@@ -1589,8 +1592,12 @@ static int _opp_set_availability(struct
+ blocking_notifier_call_chain(&opp_table->head,
+ OPP_EVENT_DISABLE, opp);
+
++ dev_pm_opp_put(opp);
++ goto put_table;
++
+ unlock:
+ mutex_unlock(&opp_table->lock);
++put_table:
+ dev_pm_opp_put_opp_table(opp_table);
+ return r;
+ }
--- /dev/null
+From 5ccba44ba118a5000cccc50076b0344632459779 Mon Sep 17 00:00:00 2001
+From: Ethan Zhao <ethan.zhao@oracle.com>
+Date: Mon, 4 Sep 2017 13:59:34 +0800
+Subject: sched/sysctl: Check user input value of sysctl_sched_time_avg
+
+From: Ethan Zhao <ethan.zhao@oracle.com>
+
+commit 5ccba44ba118a5000cccc50076b0344632459779 upstream.
+
+The system will hang if the user sets sysctl_sched_time_avg to 0:
+
+ [root@XXX ~]# sysctl kernel.sched_time_avg_ms=0
+
+ Stack traceback for pid 0
+ 0xffff883f6406c600 0 0 1 3 R 0xffff883f6406cf50 *swapper/3
+ ffff883f7ccc3ae8 0000000000000018 ffffffff810c4dd0 0000000000000000
+ 0000000000017800 ffff883f7ccc3d78 0000000000000003 ffff883f7ccc3bf8
+ ffffffff810c4fc9 ffff883f7ccc3c08 00000000810c5043 ffff883f7ccc3c08
+ Call Trace:
+ <IRQ> [<ffffffff810c4dd0>] ? update_group_capacity+0x110/0x200
+ [<ffffffff810c4fc9>] ? update_sd_lb_stats+0x109/0x600
+ [<ffffffff810c5507>] ? find_busiest_group+0x47/0x530
+ [<ffffffff810c5b84>] ? load_balance+0x194/0x900
+ [<ffffffff810ad5ca>] ? update_rq_clock.part.83+0x1a/0xe0
+ [<ffffffff810c6d42>] ? rebalance_domains+0x152/0x290
+ [<ffffffff810c6f5c>] ? run_rebalance_domains+0xdc/0x1d0
+ [<ffffffff8108a75b>] ? __do_softirq+0xfb/0x320
+ [<ffffffff8108ac85>] ? irq_exit+0x125/0x130
+ [<ffffffff810b3a17>] ? scheduler_ipi+0x97/0x160
+ [<ffffffff81052709>] ? smp_reschedule_interrupt+0x29/0x30
+ [<ffffffff8173a1be>] ? reschedule_interrupt+0x6e/0x80
+ <EOI> [<ffffffff815bc83c>] ? cpuidle_enter_state+0xcc/0x230
+ [<ffffffff815bc80c>] ? cpuidle_enter_state+0x9c/0x230
+ [<ffffffff815bc9d7>] ? cpuidle_enter+0x17/0x20
+ [<ffffffff810cd6dc>] ? cpu_startup_entry+0x38c/0x420
+ [<ffffffff81053373>] ? start_secondary+0x173/0x1e0
+
+This happens because a divide-by-zero error occurs in the following call path:
+
+update_group_capacity()
+ update_cpu_capacity()
+ scale_rt_capacity()
+ {
+ ...
+ total = sched_avg_period() + delta;
+ used = div_u64(avg, total);
+ ...
+ }
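+
+That is, with sysctl_sched_time_avg == 0 (and delta == 0):
+
+	total = sched_avg_period() + delta;   /* 0 + 0 */
+	used  = div_u64(avg, total);          /* divide by zero */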
+
+To fix this issue, check the user input value of sysctl_sched_time_avg,
+keep it unchanged when the input is invalid, and set the minimum limit
+of sysctl_sched_time_avg to 1 ms.
+
+Reported-by: James Puthukattukaran <james.puthukattukaran@oracle.com>
+Signed-off-by: Ethan Zhao <ethan.zhao@oracle.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: efault@gmx.de
+Cc: ethan.kernel@gmail.com
+Cc: keescook@chromium.org
+Cc: mcgrof@kernel.org
+Link: http://lkml.kernel.org/r/1504504774-18253-1-git-send-email-ethan.zhao@oracle.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sysctl.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/kernel/sysctl.c
++++ b/kernel/sysctl.c
+@@ -367,7 +367,8 @@ static struct ctl_table kern_table[] = {
+ .data = &sysctl_sched_time_avg,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+- .proc_handler = proc_dointvec,
++ .proc_handler = proc_dointvec_minmax,
++ .extra1 = &one,
+ },
+ #ifdef CONFIG_SCHEDSTATS
+ {
extable-enable-rcu-if-it-is-not-watching-in-kernel_text_address.patch
selftests-seccomp-support-glibc-2.26-siginfo_t.h.patch
seccomp-fix-the-usage-of-get-put_seccomp_filter-in-seccomp_get_filter.patch
+arm64-make-sure-spsel-is-always-set.patch
+arm64-mm-use-read_once-when-dereferencing-pointer-to-pte-table.patch
+arm64-fault-route-pte-translation-faults-via-do_translation_fault.patch
+kvm-vmx-extract-__pi_post_block.patch
+kvm-vmx-avoid-double-list-add-with-vt-d-posted-interrupts.patch
+kvm-vmx-simplify-and-fix-vmx_vcpu_pi_load.patch
+kvm-nvmx-fix-host_cr3-host_cr4-cache.patch
+kvm-x86-handle-async-pf-in-rcu-read-side-critical-sections.patch
+kvm-vmx-do-not-bug-on-out-of-bounds-guest-irq.patch
+kvm-nvmx-don-t-allow-l2-to-access-the-hardware-cr8.patch
+xfs-validate-bdev-support-for-dax-inode-flag.patch
+fix-infoleak-in-waitid-2.patch
+sched-sysctl-check-user-input-value-of-sysctl_sched_time_avg.patch
+irq-generic-chip-don-t-replace-domain-s-name.patch
+mtd-fix-partition-alignment-check-on-multi-erasesize-devices.patch
+mtd-nand-atmel-fix-buffer-overflow-in-atmel_pmecc_user.patch
+etnaviv-fix-submit-error-path.patch
+etnaviv-fix-gem-object-list-corruption.patch
+futex-fix-pi_state-owner-serialization.patch
+md-fix-a-race-condition-for-flush-request-handling.patch
+md-separate-request-handling.patch
+pci-fix-race-condition-with-driver_override.patch
+btrfs-fix-null-pointer-dereference-from-free_reloc_roots.patch
+btrfs-clear-ordered-flag-on-cleaning-up-ordered-extents.patch
+btrfs-finish-ordered-extent-cleaning-if-no-progress-is-found.patch
+btrfs-propagate-error-to-btrfs_cmp_data_prepare-caller.patch
+btrfs-prevent-to-set-invalid-default-subvolid.patch
+platform-x86-fujitsu-laptop-don-t-oops-when-fuj02e3-is-not-present.patch
+pm-opp-call-notifier-without-holding-opp_table-lock.patch
+x86-mm-fix-fault-error-path-using-unsafe-vma-pointer.patch
--- /dev/null
+From a3c4fb7c9c2ebfd50b8c60f6c069932bb319bc37 Mon Sep 17 00:00:00 2001
+From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+Date: Mon, 4 Sep 2017 10:32:15 +0200
+Subject: x86/mm: Fix fault error path using unsafe vma pointer
+
+From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+
+commit a3c4fb7c9c2ebfd50b8c60f6c069932bb319bc37 upstream.
+
+commit 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal
+generation code") passes down a vma pointer to the error path, but that is
+done once the mmap_sem is released when calling mm_fault_error() from
+__do_page_fault().
+
+This is dangerous as the vma structure is no longer safe to use once
+the mmap_sem has been released. As only the protection key value is
+required in the error processing, we can just pass that value down
+instead.
+
+Fix it by passing a pointer to a protection key value down to the fault
+signal generation code. Using a pointer preserves the check in
+fill_sig_info_pkey() that generates a warning message when the vma was
+not known. If the pointer is valid, the protection key value can be
+accessed by dereferencing it.
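+
+In short, the fault path below snapshots the pkey while mmap_sem is
+still held and passes a pointer to that snapshot (a sketch of the
+hunks that follow):
+
+	pkey = vma_pkey(vma);     /* snapshot while mmap_sem is held  */
+	up_read(&mm->mmap_sem);   /* 'vma' must not be used past here */
+	mm_fault_error(regs, error_code, address, &pkey, fault);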
+
+[ tglx: Made *pkey u32 as that's the type which is passed in siginfo ]
+
+Fixes: 7b2d0dbac489 ("x86/mm/pkeys: Pass VMA down in to fault signal generation code")
+Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: linux-mm@kvack.org
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Link: http://lkml.kernel.org/r/1504513935-12742-1-git-send-email-ldufour@linux.vnet.ibm.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/mm/fault.c | 47 ++++++++++++++++++++++++-----------------------
+ 1 file changed, 24 insertions(+), 23 deletions(-)
+
+--- a/arch/x86/mm/fault.c
++++ b/arch/x86/mm/fault.c
+@@ -192,8 +192,7 @@ is_prefetch(struct pt_regs *regs, unsign
+ * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
+ * faulted on a pte with its pkey=4.
+ */
+-static void fill_sig_info_pkey(int si_code, siginfo_t *info,
+- struct vm_area_struct *vma)
++static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey)
+ {
+ /* This is effectively an #ifdef */
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+@@ -209,7 +208,7 @@ static void fill_sig_info_pkey(int si_co
+ * valid VMA, so we should never reach this without a
+ * valid VMA.
+ */
+- if (!vma) {
++ if (!pkey) {
+ WARN_ONCE(1, "PKU fault with no VMA passed in");
+ info->si_pkey = 0;
+ return;
+@@ -219,13 +218,12 @@ static void fill_sig_info_pkey(int si_co
+ * absolutely guranteed to be 100% accurate because of
+ * the race explained above.
+ */
+- info->si_pkey = vma_pkey(vma);
++ info->si_pkey = *pkey;
+ }
+
+ static void
+ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
+- struct task_struct *tsk, struct vm_area_struct *vma,
+- int fault)
++ struct task_struct *tsk, u32 *pkey, int fault)
+ {
+ unsigned lsb = 0;
+ siginfo_t info;
+@@ -240,7 +238,7 @@ force_sig_info_fault(int si_signo, int s
+ lsb = PAGE_SHIFT;
+ info.si_addr_lsb = lsb;
+
+- fill_sig_info_pkey(si_code, &info, vma);
++ fill_sig_info_pkey(si_code, &info, pkey);
+
+ force_sig_info(si_signo, &info, tsk);
+ }
+@@ -758,8 +756,6 @@ no_context(struct pt_regs *regs, unsigne
+ struct task_struct *tsk = current;
+ unsigned long flags;
+ int sig;
+- /* No context means no VMA to pass down */
+- struct vm_area_struct *vma = NULL;
+
+ /* Are we prepared to handle this kernel fault? */
+ if (fixup_exception(regs, X86_TRAP_PF)) {
+@@ -784,7 +780,7 @@ no_context(struct pt_regs *regs, unsigne
+
+ /* XXX: hwpoison faults will set the wrong code. */
+ force_sig_info_fault(signal, si_code, address,
+- tsk, vma, 0);
++ tsk, NULL, 0);
+ }
+
+ /*
+@@ -893,8 +889,7 @@ show_signal_msg(struct pt_regs *regs, un
+
+ static void
+ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+- unsigned long address, struct vm_area_struct *vma,
+- int si_code)
++ unsigned long address, u32 *pkey, int si_code)
+ {
+ struct task_struct *tsk = current;
+
+@@ -942,7 +937,7 @@ __bad_area_nosemaphore(struct pt_regs *r
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_PF;
+
+- force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);
++ force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0);
+
+ return;
+ }
+@@ -955,9 +950,9 @@ __bad_area_nosemaphore(struct pt_regs *r
+
+ static noinline void
+ bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+- unsigned long address, struct vm_area_struct *vma)
++ unsigned long address, u32 *pkey)
+ {
+- __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
++ __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR);
+ }
+
+ static void
+@@ -965,6 +960,10 @@ __bad_area(struct pt_regs *regs, unsigne
+ unsigned long address, struct vm_area_struct *vma, int si_code)
+ {
+ struct mm_struct *mm = current->mm;
++ u32 pkey;
++
++ if (vma)
++ pkey = vma_pkey(vma);
+
+ /*
+ * Something tried to access memory that isn't in our memory map..
+@@ -972,7 +971,8 @@ __bad_area(struct pt_regs *regs, unsigne
+ */
+ up_read(&mm->mmap_sem);
+
+- __bad_area_nosemaphore(regs, error_code, address, vma, si_code);
++ __bad_area_nosemaphore(regs, error_code, address,
++ (vma) ? &pkey : NULL, si_code);
+ }
+
+ static noinline void
+@@ -1015,7 +1015,7 @@ bad_area_access_error(struct pt_regs *re
+
+ static void
+ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
+- struct vm_area_struct *vma, unsigned int fault)
++ u32 *pkey, unsigned int fault)
+ {
+ struct task_struct *tsk = current;
+ int code = BUS_ADRERR;
+@@ -1042,13 +1042,12 @@ do_sigbus(struct pt_regs *regs, unsigned
+ code = BUS_MCEERR_AR;
+ }
+ #endif
+- force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault);
++ force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault);
+ }
+
+ static noinline void
+ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
+- unsigned long address, struct vm_area_struct *vma,
+- unsigned int fault)
++ unsigned long address, u32 *pkey, unsigned int fault)
+ {
+ if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+ no_context(regs, error_code, address, 0, 0);
+@@ -1072,9 +1071,9 @@ mm_fault_error(struct pt_regs *regs, uns
+ } else {
+ if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+ VM_FAULT_HWPOISON_LARGE))
+- do_sigbus(regs, error_code, address, vma, fault);
++ do_sigbus(regs, error_code, address, pkey, fault);
+ else if (fault & VM_FAULT_SIGSEGV)
+- bad_area_nosemaphore(regs, error_code, address, vma);
++ bad_area_nosemaphore(regs, error_code, address, pkey);
+ else
+ BUG();
+ }
+@@ -1268,6 +1267,7 @@ __do_page_fault(struct pt_regs *regs, un
+ struct mm_struct *mm;
+ int fault, major = 0;
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
++ u32 pkey;
+
+ tsk = current;
+ mm = tsk->mm;
+@@ -1468,9 +1468,10 @@ good_area:
+ return;
+ }
+
++ pkey = vma_pkey(vma);
+ up_read(&mm->mmap_sem);
+ if (unlikely(fault & VM_FAULT_ERROR)) {
+- mm_fault_error(regs, error_code, address, vma, fault);
++ mm_fault_error(regs, error_code, address, &pkey, fault);
+ return;
+ }
+
--- /dev/null
+From 6851a3db7e224bbb85e23b3c64a506c9e0904382 Mon Sep 17 00:00:00 2001
+From: Ross Zwisler <ross.zwisler@linux.intel.com>
+Date: Mon, 18 Sep 2017 14:46:03 -0700
+Subject: xfs: validate bdev support for DAX inode flag
+
+From: Ross Zwisler <ross.zwisler@linux.intel.com>
+
+commit 6851a3db7e224bbb85e23b3c64a506c9e0904382 upstream.
+
+Currently only the blocksize is checked, but we should really be calling
+bdev_dax_supported(), which also tests that we can get a struct
+dax_device and that the dax_direct_access() path is working.
+
+This is the same check that we do for the "-o dax" mount option in
+xfs_fs_fill_super().
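+
+A sketch of the strengthened check (mirroring the hunk below):
+
+	if (fa->fsx_xflags & FS_XFLAG_DAX) {
+		...
+		/* old: bare blocksize comparison against PAGE_SIZE  */
+		/* new: also verifies the dax_device can be obtained */
+		if (bdev_dax_supported(sb, sb->s_blocksize) < 0)
+			return -EINVAL;
+	}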
+
+This does not fix the race issues that caused the XFS DAX inode option to
+be disabled, so that option will still be disabled. If/when we re-enable
+it, though, I think we will want this issue to have been fixed. I also do
+think that we want to fix this in stable kernels.
+
+Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_ioctl.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_ioctl.c
++++ b/fs/xfs/xfs_ioctl.c
+@@ -1088,6 +1088,7 @@ xfs_ioctl_setattr_dax_invalidate(
+ int *join_flags)
+ {
+ struct inode *inode = VFS_I(ip);
++ struct super_block *sb = inode->i_sb;
+ int error;
+
+ *join_flags = 0;
+@@ -1100,7 +1101,7 @@ xfs_ioctl_setattr_dax_invalidate(
+ if (fa->fsx_xflags & FS_XFLAG_DAX) {
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+ return -EINVAL;
+- if (ip->i_mount->m_sb.sb_blocksize != PAGE_SIZE)
++ if (bdev_dax_supported(sb, sb->s_blocksize) < 0)
+ return -EINVAL;
+ }
+