From d3fac8d7dd1ef6a544657cf152a185c5bf1b74bb Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 14 Apr 2020 16:42:40 +0200 Subject: [PATCH] 4.19-stable patches added patches: btrfs-drop-block-from-cache-on-error-in-relocation.patch btrfs-fix-crash-during-unmount-due-to-race-with-delayed-inode-workers.patch btrfs-fix-missing-file-extent-item-for-hole-after-ranged-fsync.patch btrfs-fix-missing-semaphore-unlock-in-btrfs_sync_file.patch btrfs-set-update-the-uuid-generation-as-soon-as-possible.patch cifs-fix-bug-which-the-return-value-by-asynchronous-read-is-error.patch crypto-mxs-dcp-fix-scatterlist-linearization-for-hash.patch kvm-nvmx-properly-handle-userspace-interrupt-window-request.patch kvm-s390-vsie-fix-delivery-of-addressing-exceptions.patch kvm-s390-vsie-fix-region-1-asce-sanity-shadow-address-checks.patch kvm-vmx-always-vmclear-in-use-vmcses-during-crash-with-kexec-support.patch kvm-vmx-fix-crash-cleanup-when-kvm-wasn-t-used.patch kvm-x86-allocate-new-rmap-and-large-page-tracking-when-moving-memslot.patch kvm-x86-gracefully-handle-__vmalloc-failure-during-vm-allocation.patch mtd-spinand-do-not-erase-the-block-before-writing-a-bad-block-marker.patch mtd-spinand-stop-using-spinand-oobbuf-for-buffering-bad-block-markers.patch --- ...ck-from-cache-on-error-in-relocation.patch | 41 ++++ ...e-to-race-with-delayed-inode-workers.patch | 222 ++++++++++++++++++ ...ent-item-for-hole-after-ranged-fsync.patch | 103 ++++++++ ...-semaphore-unlock-in-btrfs_sync_file.patch | 35 +++ ...-uuid-generation-as-soon-as-possible.patch | 64 +++++ ...-value-by-asynchronous-read-is-error.patch | 63 +++++ ...x-scatterlist-linearization-for-hash.patch | 113 +++++++++ ...e-userspace-interrupt-window-request.patch | 160 +++++++++++++ ...ix-delivery-of-addressing-exceptions.patch | 50 ++++ ...-1-asce-sanity-shadow-address-checks.patch | 56 +++++ ...cses-during-crash-with-kexec-support.patch | 180 ++++++++++++++ ...x-crash-cleanup-when-kvm-wasn-t-used.patch | 75 ++++++ ...ge-page-tracking-when-moving-memslot.patch | 102 ++++++++ ...vmalloc-failure-during-vm-allocation.patch | 50 ++++ ...ck-before-writing-a-bad-block-marker.patch | 50 ++++ ...bbuf-for-buffering-bad-block-markers.patch | 85 +++++++ queue-4.19/series | 16 ++ 17 files changed, 1465 insertions(+) create mode 100644 queue-4.19/btrfs-drop-block-from-cache-on-error-in-relocation.patch create mode 100644 queue-4.19/btrfs-fix-crash-during-unmount-due-to-race-with-delayed-inode-workers.patch create mode 100644 queue-4.19/btrfs-fix-missing-file-extent-item-for-hole-after-ranged-fsync.patch create mode 100644 queue-4.19/btrfs-fix-missing-semaphore-unlock-in-btrfs_sync_file.patch create mode 100644 queue-4.19/btrfs-set-update-the-uuid-generation-as-soon-as-possible.patch create mode 100644 queue-4.19/cifs-fix-bug-which-the-return-value-by-asynchronous-read-is-error.patch create mode 100644 queue-4.19/crypto-mxs-dcp-fix-scatterlist-linearization-for-hash.patch create mode 100644 queue-4.19/kvm-nvmx-properly-handle-userspace-interrupt-window-request.patch create mode 100644 queue-4.19/kvm-s390-vsie-fix-delivery-of-addressing-exceptions.patch create mode 100644 queue-4.19/kvm-s390-vsie-fix-region-1-asce-sanity-shadow-address-checks.patch create mode 100644 queue-4.19/kvm-vmx-always-vmclear-in-use-vmcses-during-crash-with-kexec-support.patch create mode 100644 queue-4.19/kvm-vmx-fix-crash-cleanup-when-kvm-wasn-t-used.patch create mode 100644 queue-4.19/kvm-x86-allocate-new-rmap-and-large-page-tracking-when-moving-memslot.patch create mode 100644 
queue-4.19/kvm-x86-gracefully-handle-__vmalloc-failure-during-vm-allocation.patch create mode 100644 queue-4.19/mtd-spinand-do-not-erase-the-block-before-writing-a-bad-block-marker.patch create mode 100644 queue-4.19/mtd-spinand-stop-using-spinand-oobbuf-for-buffering-bad-block-markers.patch diff --git a/queue-4.19/btrfs-drop-block-from-cache-on-error-in-relocation.patch b/queue-4.19/btrfs-drop-block-from-cache-on-error-in-relocation.patch new file mode 100644 index 00000000000..36dd4d4a532 --- /dev/null +++ b/queue-4.19/btrfs-drop-block-from-cache-on-error-in-relocation.patch @@ -0,0 +1,41 @@ +From 8e19c9732ad1d127b5575a10f4fbcacf740500ff Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 4 Mar 2020 11:18:23 -0500 +Subject: btrfs: drop block from cache on error in relocation + +From: Josef Bacik + +commit 8e19c9732ad1d127b5575a10f4fbcacf740500ff upstream. + +If we have an error while building the backref tree in relocation we'll +process all the pending edges and then free the node. However if we +integrated some edges into the cache we'll lose our link to those edges +by simply freeing this node, which means we'll leak memory and +references to any roots that we've found. + +Instead we need to use remove_backref_node(), which walks through all of +the edges that are still linked to this node and free's them up and +drops any root references we may be holding. + +CC: stable@vger.kernel.org # 4.9+ +Reviewed-by: Qu Wenruo +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/relocation.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -1141,7 +1141,7 @@ out: + free_backref_node(cache, lower); + } + +- free_backref_node(cache, node); ++ remove_backref_node(cache, node); + return ERR_PTR(err); + } + ASSERT(!node || !node->detached); diff --git a/queue-4.19/btrfs-fix-crash-during-unmount-due-to-race-with-delayed-inode-workers.patch b/queue-4.19/btrfs-fix-crash-during-unmount-due-to-race-with-delayed-inode-workers.patch new file mode 100644 index 00000000000..50fd02b95cf --- /dev/null +++ b/queue-4.19/btrfs-fix-crash-during-unmount-due-to-race-with-delayed-inode-workers.patch @@ -0,0 +1,222 @@ +From f0cc2cd70164efe8f75c5d99560f0f69969c72e4 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Fri, 28 Feb 2020 13:04:36 +0000 +Subject: Btrfs: fix crash during unmount due to race with delayed inode workers + +From: Filipe Manana + +commit f0cc2cd70164efe8f75c5d99560f0f69969c72e4 upstream. + +During unmount we can have a job from the delayed inode items work queue +still running, that can lead to at least two bad things: + +1) A crash, because the worker can try to create a transaction just + after the fs roots were freed; + +2) A transaction leak, because the worker can create a transaction + before the fs roots are freed and just after we committed the last + transaction and after we stopped the transaction kthread. + +A stack trace example of the crash: + + [79011.691214] kernel BUG at lib/radix-tree.c:982! + [79011.692056] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI + [79011.693180] CPU: 3 PID: 1394 Comm: kworker/u8:2 Tainted: G W 5.6.0-rc2-btrfs-next-54 #2 + (...) + [79011.696789] Workqueue: btrfs-delayed-meta btrfs_work_helper [btrfs] + [79011.697904] RIP: 0010:radix_tree_tag_set+0xe7/0x170 + (...) 
+ [79011.702014] RSP: 0018:ffffb3c84a317ca0 EFLAGS: 00010293 + [79011.702949] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 + [79011.704202] RDX: ffffb3c84a317cb0 RSI: ffffb3c84a317ca8 RDI: ffff8db3931340a0 + [79011.705463] RBP: 0000000000000005 R08: 0000000000000005 R09: ffffffff974629d0 + [79011.706756] R10: ffffb3c84a317bc0 R11: 0000000000000001 R12: ffff8db393134000 + [79011.708010] R13: ffff8db3931340a0 R14: ffff8db393134068 R15: 0000000000000001 + [79011.709270] FS: 0000000000000000(0000) GS:ffff8db3b6a00000(0000) knlGS:0000000000000000 + [79011.710699] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [79011.711710] CR2: 00007f22c2a0a000 CR3: 0000000232ad4005 CR4: 00000000003606e0 + [79011.712958] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + [79011.714205] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + [79011.715448] Call Trace: + [79011.715925] record_root_in_trans+0x72/0xf0 [btrfs] + [79011.716819] btrfs_record_root_in_trans+0x4b/0x70 [btrfs] + [79011.717925] start_transaction+0xdd/0x5c0 [btrfs] + [79011.718829] btrfs_async_run_delayed_root+0x17e/0x2b0 [btrfs] + [79011.719915] btrfs_work_helper+0xaa/0x720 [btrfs] + [79011.720773] process_one_work+0x26d/0x6a0 + [79011.721497] worker_thread+0x4f/0x3e0 + [79011.722153] ? process_one_work+0x6a0/0x6a0 + [79011.722901] kthread+0x103/0x140 + [79011.723481] ? kthread_create_worker_on_cpu+0x70/0x70 + [79011.724379] ret_from_fork+0x3a/0x50 + (...) + +The following diagram shows a sequence of steps that lead to the crash +during ummount of the filesystem: + + CPU 1 CPU 2 CPU 3 + + btrfs_punch_hole() + btrfs_btree_balance_dirty() + btrfs_balance_delayed_items() + --> sees + fs_info->delayed_root->items + with value 200, which is greater + than + BTRFS_DELAYED_BACKGROUND (128) + and smaller than + BTRFS_DELAYED_WRITEBACK (512) + btrfs_wq_run_delayed_node() + --> queues a job for + fs_info->delayed_workers to run + btrfs_async_run_delayed_root() + + btrfs_async_run_delayed_root() + --> job queued by CPU 1 + + --> starts picking and running + delayed nodes from the + prepare_list list + + close_ctree() + + btrfs_delete_unused_bgs() + + btrfs_commit_super() + + btrfs_join_transaction() + --> gets transaction N + + btrfs_commit_transaction(N) + --> set transaction state + to TRANTS_STATE_COMMIT_START + + btrfs_first_prepared_delayed_node() + --> picks delayed node X through + the prepared_list list + + btrfs_run_delayed_items() + + btrfs_first_delayed_node() + --> also picks delayed node X + but through the node_list + list + + __btrfs_commit_inode_delayed_items() + --> runs all delayed items from + this node and drops the + node's item count to 0 + through call to + btrfs_release_delayed_inode() + + --> finishes running any remaining + delayed nodes + + --> finishes transaction commit + + --> stops cleaner and transaction threads + + btrfs_free_fs_roots() + --> frees all roots and removes them + from the radix tree + fs_info->fs_roots_radix + + btrfs_join_transaction() + start_transaction() + btrfs_record_root_in_trans() + record_root_in_trans() + radix_tree_tag_set() + --> crashes because + the root is not in + the radix tree + anymore + +If the worker is able to call btrfs_join_transaction() before the unmount +task frees the fs roots, we end up leaking a transaction and all its +resources, since after the call to btrfs_commit_super() and stopping the +transaction kthread, we don't expect to have any transaction open anymore. 
+
+When this situation happens the worker has a delayed node that has no
+more items to run, since the task calling btrfs_run_delayed_items(),
+which is doing a transaction commit, picks the same node and runs all
+its items first.
+
+We can not wait for the worker to complete when running delayed items
+through btrfs_run_delayed_items(), because we call that function in
+several phases of a transaction commit, and that could cause a deadlock
+because the worker calls btrfs_join_transaction() and the task doing the
+transaction commit may have already set the transaction state to
+TRANS_STATE_COMMIT_DOING.
+
+Also it's not possible to get into a situation where only some of the
+items of a delayed node are added to the fs/subvolume tree in the current
+transaction and the remaining ones in the next transaction, because when
+running the items of a delayed inode we lock its mutex, effectively
+waiting for the worker if the worker is running the items of the delayed
+node already.
+
+Since this can only cause issues when unmounting a filesystem, fix it in
+a simple way by waiting for any jobs on the delayed workers queue before
+calling btrfs_commit_super() at close_ctree(). This works because at this
+point no one can call btrfs_btree_balance_dirty() or
+btrfs_balance_delayed_items(), and if we end up waiting for any worker to
+complete, btrfs_commit_super() will commit the transaction created by the
+worker.
+
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Filipe Manana
+Reviewed-by: David Sterba
+Signed-off-by: David Sterba
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/btrfs/async-thread.c | 8 ++++++++
+ fs/btrfs/async-thread.h | 1 +
+ fs/btrfs/disk-io.c | 13 +++++++++++++
+ 3 files changed, 22 insertions(+)
+
+--- a/fs/btrfs/async-thread.c
++++ b/fs/btrfs/async-thread.c
+@@ -434,3 +434,11 @@ void btrfs_set_work_high_priority(struct
+ {
+ set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+ }
++
++void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
++{
++ if (wq->high)
++ flush_workqueue(wq->high->normal_wq);
++
++ flush_workqueue(wq->normal->normal_wq);
++}
+--- a/fs/btrfs/async-thread.h
++++ b/fs/btrfs/async-thread.h
+@@ -73,5 +73,6 @@ void btrfs_set_work_high_priority(struct
+ struct btrfs_fs_info *btrfs_work_owner(const struct btrfs_work *work);
+ struct btrfs_fs_info *btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
+ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
++void btrfs_flush_workqueue(struct btrfs_workqueue *wq);
+
+ #endif
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -3949,6 +3949,19 @@ void close_ctree(struct btrfs_fs_info *f
+ */
+ btrfs_delete_unused_bgs(fs_info);
+
++ /*
++ * There might be existing delayed inode workers still running
++ * and holding an empty delayed inode item. We must wait for
++ * them to complete first because they can create a transaction.
++ * This happens when someone calls btrfs_balance_delayed_items()
++ * and then a transaction commit runs the same delayed nodes
++ * before any delayed worker has done something with the nodes.
++ * We must wait for any worker here and not at transaction
++ * commit time since that could cause a deadlock.
++ * This is a very rare case.
++ */
++ btrfs_flush_workqueue(fs_info->delayed_workers);
++
+ ret = btrfs_commit_super(fs_info);
+ if (ret)
+ btrfs_err(fs_info, "commit super ret %d", ret);
diff --git a/queue-4.19/btrfs-fix-missing-file-extent-item-for-hole-after-ranged-fsync.patch b/queue-4.19/btrfs-fix-missing-file-extent-item-for-hole-after-ranged-fsync.patch
new file mode 100644
index 00000000000..d1b27594d07
--- /dev/null
+++ b/queue-4.19/btrfs-fix-missing-file-extent-item-for-hole-after-ranged-fsync.patch
@@ -0,0 +1,103 @@
+From 95418ed1d10774cd9a49af6f39e216c1256f1eeb Mon Sep 17 00:00:00 2001
+From: Filipe Manana
+Date: Mon, 9 Mar 2020 12:41:05 +0000
+Subject: btrfs: fix missing file extent item for hole after ranged fsync
+
+From: Filipe Manana
+
+commit 95418ed1d10774cd9a49af6f39e216c1256f1eeb upstream.
+
+When doing a fast fsync for a range that starts at an offset greater than
+zero, we can end up with a log that when replayed causes the respective
+inode to miss a file extent item representing a hole if we are not using
+the NO_HOLES feature. This is because for fast fsyncs we don't log any
+extents that cover a range different from the one requested in the fsync.
+
+Example scenario to trigger it:
+
+ $ mkfs.btrfs -O ^no-holes -f /dev/sdd
+ $ mount /dev/sdd /mnt
+
+ # Create a file with a single 256K extent and fsync it to clear the full
+ # sync bit in the inode - we want the msync below to trigger a fast fsync.
+ $ xfs_io -f -c "pwrite -S 0xab 0 256K" -c "fsync" /mnt/foo
+
+ # Force a transaction commit and wipe out the log tree.
+ $ sync
+
+ # Dirty 768K of data, increasing the file size to 1Mb, and flush only
+ # the range from 256K to 512K without updating the log tree
+ # (sync_file_range() does not trigger fsync, it only starts writeback
+ # and waits for it to finish).
+
+ $ xfs_io -c "pwrite -S 0xcd 256K 768K" /mnt/foo
+ $ xfs_io -c "sync_range -abw 256K 256K" /mnt/foo
+
+ # Now dirty the range from 768K to 1M again and sync that range.
+ $ xfs_io -c "mmap -w 768K 256K" \
+ -c "mwrite -S 0xef 768K 256K" \
+ -c "msync -s 768K 256K" \
+ -c "munmap" \
+ /mnt/foo
+
+ <power fail>
+
+ # Mount to replay the log.
+ $ mount /dev/sdd /mnt
+ $ umount /mnt
+
+ $ btrfs check /dev/sdd
+ Opening filesystem to check...
+ Checking filesystem on /dev/sdd
+ UUID: 482fb574-b288-478e-a190-a9c44a78fca6
+ [1/7] checking root items
+ [2/7] checking extents
+ [3/7] checking free space cache
+ [4/7] checking fs roots
+ root 5 inode 257 errors 100, file extent discount
+ Found file extent holes:
+ start: 262144, len: 524288
+ ERROR: errors found in fs roots
+ found 720896 bytes used, error(s) found
+ total csum bytes: 512
+ total tree bytes: 131072
+ total fs tree bytes: 32768
+ total extent tree bytes: 16384
+ btree space waste bytes: 123514
+ file data blocks allocated: 589824
+ referenced 589824
+
+Fix this issue by setting the range to full (0 to LLONG_MAX) when the
+NO_HOLES feature is not enabled. This results in extra work being done
+but it gives the guarantee we don't end up with missing holes after
+replaying the log.
+
+CC: stable@vger.kernel.org # 4.19+
+Reviewed-by: Josef Bacik
+Signed-off-by: Filipe Manana
+Signed-off-by: David Sterba
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/btrfs/file.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -2074,6 +2074,16 @@ int btrfs_sync_file(struct file *file, l
+ btrfs_init_log_ctx(&ctx, inode);
+
+ /*
++ * Set the range to full if the NO_HOLES feature is not enabled.
++ * This is to avoid missing file extent items representing holes after ++ * replaying the log. ++ */ ++ if (!btrfs_fs_incompat(fs_info, NO_HOLES)) { ++ start = 0; ++ end = LLONG_MAX; ++ } ++ ++ /* + * We write the dirty pages in the range and wait until they complete + * out of the ->i_mutex. If so, we can flush the dirty pages by + * multi-task, and make the performance up. See diff --git a/queue-4.19/btrfs-fix-missing-semaphore-unlock-in-btrfs_sync_file.patch b/queue-4.19/btrfs-fix-missing-semaphore-unlock-in-btrfs_sync_file.patch new file mode 100644 index 00000000000..9d81e384c7a --- /dev/null +++ b/queue-4.19/btrfs-fix-missing-semaphore-unlock-in-btrfs_sync_file.patch @@ -0,0 +1,35 @@ +From 6ff06729c22ec0b7498d900d79cc88cfb8aceaeb Mon Sep 17 00:00:00 2001 +From: Robbie Ko +Date: Tue, 17 Mar 2020 14:31:02 +0800 +Subject: btrfs: fix missing semaphore unlock in btrfs_sync_file + +From: Robbie Ko + +commit 6ff06729c22ec0b7498d900d79cc88cfb8aceaeb upstream. + +Ordered ops are started twice in sync file, once outside of inode mutex +and once inside, taking the dio semaphore. There was one error path +missing the semaphore unlock. + +Fixes: aab15e8ec2576 ("Btrfs: fix rare chances for data loss when doing a fast fsync") +CC: stable@vger.kernel.org # 4.19+ +Signed-off-by: Robbie Ko +Reviewed-by: Filipe Manana +[ add changelog ] +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/file.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -2137,6 +2137,7 @@ int btrfs_sync_file(struct file *file, l + */ + ret = start_ordered_ops(inode, start, end); + if (ret) { ++ up_write(&BTRFS_I(inode)->dio_sem); + inode_unlock(inode); + goto out; + } diff --git a/queue-4.19/btrfs-set-update-the-uuid-generation-as-soon-as-possible.patch b/queue-4.19/btrfs-set-update-the-uuid-generation-as-soon-as-possible.patch new file mode 100644 index 00000000000..8b96667e20a --- /dev/null +++ b/queue-4.19/btrfs-set-update-the-uuid-generation-as-soon-as-possible.patch @@ -0,0 +1,64 @@ +From 75ec1db8717a8f0a9d9c8d033e542fdaa7b73898 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Fri, 14 Feb 2020 15:22:06 -0500 +Subject: btrfs: set update the uuid generation as soon as possible + +From: Josef Bacik + +commit 75ec1db8717a8f0a9d9c8d033e542fdaa7b73898 upstream. + +In my EIO stress testing I noticed I was getting forced to rescan the +uuid tree pretty often, which was weird. This is because my error +injection stuff would sometimes inject an error after log replay but +before we loaded the UUID tree. If log replay committed the transaction +it wouldn't have updated the uuid tree generation, but the tree was +valid and didn't change, so there's no reason to not update the +generation here. + +Fix this by setting the BTRFS_FS_UPDATE_UUID_TREE_GEN bit immediately +after reading all the fs roots if the uuid tree generation matches the +fs generation. Then any transaction commits that happen during mount +won't screw up our uuid tree state, forcing us to do needless uuid +rescans. 
+ +Fixes: 70f801754728 ("Btrfs: check UUID tree during mount if required") +CC: stable@vger.kernel.org # 4.19+ +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/disk-io.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -3007,6 +3007,18 @@ retry_root_backup: + fs_info->generation = generation; + fs_info->last_trans_committed = generation; + ++ /* ++ * If we have a uuid root and we're not being told to rescan we need to ++ * check the generation here so we can set the ++ * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the ++ * transaction during a balance or the log replay without updating the ++ * uuid generation, and then if we crash we would rescan the uuid tree, ++ * even though it was perfectly fine. ++ */ ++ if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) && ++ fs_info->generation == btrfs_super_uuid_tree_generation(disk_super)) ++ set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); ++ + ret = btrfs_verify_dev_extents(fs_info); + if (ret) { + btrfs_err(fs_info, +@@ -3237,8 +3249,6 @@ retry_root_backup: + close_ctree(fs_info); + return ret; + } +- } else { +- set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); + } + set_bit(BTRFS_FS_OPEN, &fs_info->flags); + diff --git a/queue-4.19/cifs-fix-bug-which-the-return-value-by-asynchronous-read-is-error.patch b/queue-4.19/cifs-fix-bug-which-the-return-value-by-asynchronous-read-is-error.patch new file mode 100644 index 00000000000..0eb9ec4a44e --- /dev/null +++ b/queue-4.19/cifs-fix-bug-which-the-return-value-by-asynchronous-read-is-error.patch @@ -0,0 +1,63 @@ +From 97adda8b3ab703de8e4c8d27646ddd54fe22879c Mon Sep 17 00:00:00 2001 +From: Yilu Lin +Date: Wed, 18 Mar 2020 11:59:19 +0800 +Subject: CIFS: Fix bug which the return value by asynchronous read is error + +From: Yilu Lin + +commit 97adda8b3ab703de8e4c8d27646ddd54fe22879c upstream. + +This patch is used to fix the bug in collect_uncached_read_data() +that rc is automatically converted from a signed number to an +unsigned number when the CIFS asynchronous read fails. +It will cause ctx->rc is error. + +Example: +Share a directory and create a file on the Windows OS. +Mount the directory to the Linux OS using CIFS. +On the CIFS client of the Linux OS, invoke the pread interface to +deliver the read request. + +The size of the read length plus offset of the read request is greater +than the maximum file size. + +In this case, the CIFS server on the Windows OS returns a failure +message (for example, the return value of +smb2.nt_status is STATUS_INVALID_PARAMETER). + +After receiving the response message, the CIFS client parses +smb2.nt_status to STATUS_INVALID_PARAMETER +and converts it to the Linux error code (rdata->result=-22). + +Then the CIFS client invokes the collect_uncached_read_data function to +assign the value of rdata->result to rc, that is, rc=rdata->result=-22. + +The type of the ctx->total_len variable is unsigned integer, +the type of the rc variable is integer, and the type of +the ctx->rc variable is ssize_t. + +Therefore, during the ternary operation, the value of rc is +automatically converted to an unsigned number. The final result is +ctx->rc=4294967274. However, the expected result is ctx->rc=-22. 
+ +Signed-off-by: Yilu Lin +Signed-off-by: Steve French +CC: Stable +Acked-by: Ronnie Sahlberg +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/file.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/cifs/file.c ++++ b/fs/cifs/file.c +@@ -3339,7 +3339,7 @@ again: + if (rc == -ENODATA) + rc = 0; + +- ctx->rc = (rc == 0) ? ctx->total_len : rc; ++ ctx->rc = (rc == 0) ? (ssize_t)ctx->total_len : rc; + + mutex_unlock(&ctx->aio_mutex); + diff --git a/queue-4.19/crypto-mxs-dcp-fix-scatterlist-linearization-for-hash.patch b/queue-4.19/crypto-mxs-dcp-fix-scatterlist-linearization-for-hash.patch new file mode 100644 index 00000000000..d2d871ba680 --- /dev/null +++ b/queue-4.19/crypto-mxs-dcp-fix-scatterlist-linearization-for-hash.patch @@ -0,0 +1,113 @@ +From fa03481b6e2e82355c46644147b614f18c7a8161 Mon Sep 17 00:00:00 2001 +From: Rosioru Dragos +Date: Tue, 25 Feb 2020 17:05:52 +0200 +Subject: crypto: mxs-dcp - fix scatterlist linearization for hash +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Rosioru Dragos + +commit fa03481b6e2e82355c46644147b614f18c7a8161 upstream. + +The incorrect traversal of the scatterlist, during the linearization phase +lead to computing the hash value of the wrong input buffer. +New implementation uses scatterwalk_map_and_copy() +to address this issue. + +Cc: +Fixes: 15b59e7c3733 ("crypto: mxs - Add Freescale MXS DCP driver") +Signed-off-by: Rosioru Dragos +Reviewed-by: Horia Geantă +Signed-off-by: Herbert Xu +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/crypto/mxs-dcp.c | 54 ++++++++++++++++++++++------------------------- + 1 file changed, 26 insertions(+), 28 deletions(-) + +--- a/drivers/crypto/mxs-dcp.c ++++ b/drivers/crypto/mxs-dcp.c +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + + #define DCP_MAX_CHANS 4 + #define DCP_BUF_SZ PAGE_SIZE +@@ -621,49 +622,46 @@ static int dcp_sha_req_to_buf(struct cry + struct dcp_async_ctx *actx = crypto_ahash_ctx(tfm); + struct dcp_sha_req_ctx *rctx = ahash_request_ctx(req); + struct hash_alg_common *halg = crypto_hash_alg_common(tfm); +- const int nents = sg_nents(req->src); + + uint8_t *in_buf = sdcp->coh->sha_in_buf; + uint8_t *out_buf = sdcp->coh->sha_out_buf; + +- uint8_t *src_buf; +- + struct scatterlist *src; + +- unsigned int i, len, clen; ++ unsigned int i, len, clen, oft = 0; + int ret; + + int fin = rctx->fini; + if (fin) + rctx->fini = 0; + +- for_each_sg(req->src, src, nents, i) { +- src_buf = sg_virt(src); +- len = sg_dma_len(src); ++ src = req->src; ++ len = req->nbytes; + +- do { +- if (actx->fill + len > DCP_BUF_SZ) +- clen = DCP_BUF_SZ - actx->fill; +- else +- clen = len; ++ while (len) { ++ if (actx->fill + len > DCP_BUF_SZ) ++ clen = DCP_BUF_SZ - actx->fill; ++ else ++ clen = len; + +- memcpy(in_buf + actx->fill, src_buf, clen); +- len -= clen; +- src_buf += clen; +- actx->fill += clen; ++ scatterwalk_map_and_copy(in_buf + actx->fill, src, oft, clen, ++ 0); + +- /* +- * If we filled the buffer and still have some +- * more data, submit the buffer. +- */ +- if (len && actx->fill == DCP_BUF_SZ) { +- ret = mxs_dcp_run_sha(req); +- if (ret) +- return ret; +- actx->fill = 0; +- rctx->init = 0; +- } +- } while (len); ++ len -= clen; ++ oft += clen; ++ actx->fill += clen; ++ ++ /* ++ * If we filled the buffer and still have some ++ * more data, submit the buffer. 
++ */ ++ if (len && actx->fill == DCP_BUF_SZ) { ++ ret = mxs_dcp_run_sha(req); ++ if (ret) ++ return ret; ++ actx->fill = 0; ++ rctx->init = 0; ++ } + } + + if (fin) { diff --git a/queue-4.19/kvm-nvmx-properly-handle-userspace-interrupt-window-request.patch b/queue-4.19/kvm-nvmx-properly-handle-userspace-interrupt-window-request.patch new file mode 100644 index 00000000000..00f4f39c18d --- /dev/null +++ b/queue-4.19/kvm-nvmx-properly-handle-userspace-interrupt-window-request.patch @@ -0,0 +1,160 @@ +From a1c77abb8d93381e25a8d2df3a917388244ba776 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Mon, 2 Mar 2020 22:27:35 -0800 +Subject: KVM: nVMX: Properly handle userspace interrupt window request + +From: Sean Christopherson + +commit a1c77abb8d93381e25a8d2df3a917388244ba776 upstream. + +Return true for vmx_interrupt_allowed() if the vCPU is in L2 and L1 has +external interrupt exiting enabled. IRQs are never blocked in hardware +if the CPU is in the guest (L2 from L1's perspective) when IRQs trigger +VM-Exit. + +The new check percolates up to kvm_vcpu_ready_for_interrupt_injection() +and thus vcpu_run(), and so KVM will exit to userspace if userspace has +requested an interrupt window (to inject an IRQ into L1). + +Remove the @external_intr param from vmx_check_nested_events(), which is +actually an indicator that userspace wants an interrupt window, e.g. +it's named @req_int_win further up the stack. Injecting a VM-Exit into +L1 to try and bounce out to L0 userspace is all kinds of broken and is +no longer necessary. + +Remove the hack in nested_vmx_vmexit() that attempted to workaround the +breakage in vmx_check_nested_events() by only filling interrupt info if +there's an actual interrupt pending. The hack actually made things +worse because it caused KVM to _never_ fill interrupt info when the +LAPIC resides in userspace (kvm_cpu_has_interrupt() queries +interrupt.injected, which is always cleared by prepare_vmcs12() before +reaching the hack in nested_vmx_vmexit()). 
+ +Fixes: 6550c4df7e50 ("KVM: nVMX: Fix interrupt window request with "Acknowledge interrupt on exit"") +Cc: stable@vger.kernel.org +Cc: Liran Alon +Signed-off-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/kvm_host.h | 2 +- + arch/x86/kvm/vmx.c | 27 +++++++++++---------------- + arch/x86/kvm/x86.c | 10 +++++----- + 3 files changed, 17 insertions(+), 22 deletions(-) + +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1070,7 +1070,7 @@ struct kvm_x86_ops { + bool (*xsaves_supported)(void); + bool (*umip_emulated)(void); + +- int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); ++ int (*check_nested_events)(struct kvm_vcpu *vcpu); + void (*request_immediate_exit)(struct kvm_vcpu *vcpu); + + void (*sched_in)(struct kvm_vcpu *kvm, int cpu); +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -6954,8 +6954,13 @@ static int vmx_nmi_allowed(struct kvm_vc + + static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) + { +- return (!to_vmx(vcpu)->nested.nested_run_pending && +- vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && ++ if (to_vmx(vcpu)->nested.nested_run_pending) ++ return false; ++ ++ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) ++ return true; ++ ++ return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); + } +@@ -12990,7 +12995,7 @@ static void vmcs12_save_pending_event(st + } + } + +-static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) ++static int vmx_check_nested_events(struct kvm_vcpu *vcpu) + { + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long exit_qual; +@@ -13028,8 +13033,7 @@ static int vmx_check_nested_events(struc + return 0; + } + +- if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && +- nested_exit_on_intr(vcpu)) { ++ if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(vcpu)) { + if (block_nested_events) + return -EBUSY; + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); +@@ -13607,17 +13611,8 @@ static void nested_vmx_vmexit(struct kvm + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + if (likely(!vmx->fail)) { +- /* +- * TODO: SDM says that with acknowledge interrupt on +- * exit, bit 31 of the VM-exit interrupt information +- * (valid interrupt) is always set to 1 on +- * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't +- * need kvm_cpu_has_interrupt(). See the commit +- * message for details. +- */ +- if (nested_exit_intr_ack_set(vcpu) && +- exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && +- kvm_cpu_has_interrupt(vcpu)) { ++ if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && ++ nested_exit_intr_ack_set(vcpu)) { + int irq = kvm_cpu_get_interrupt(vcpu); + WARN_ON(irq < 0); + vmcs12->vm_exit_intr_info = irq | +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -7124,7 +7124,7 @@ static void update_cr8_intercept(struct + kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); + } + +-static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) ++static int inject_pending_event(struct kvm_vcpu *vcpu) + { + int r; + +@@ -7160,7 +7160,7 @@ static int inject_pending_event(struct k + * from L2 to L1. 
+ */ + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { +- r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); ++ r = kvm_x86_ops->check_nested_events(vcpu); + if (r != 0) + return r; + } +@@ -7210,7 +7210,7 @@ static int inject_pending_event(struct k + * KVM_REQ_EVENT only on certain events and not unconditionally? + */ + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { +- r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); ++ r = kvm_x86_ops->check_nested_events(vcpu); + if (r != 0) + return r; + } +@@ -7683,7 +7683,7 @@ static int vcpu_enter_guest(struct kvm_v + goto out; + } + +- if (inject_pending_event(vcpu, req_int_win) != 0) ++ if (inject_pending_event(vcpu) != 0) + req_immediate_exit = true; + else { + /* Enable SMI/NMI/IRQ window open exits if needed. +@@ -7894,7 +7894,7 @@ static inline int vcpu_block(struct kvm + static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) + { + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) +- kvm_x86_ops->check_nested_events(vcpu, false); ++ kvm_x86_ops->check_nested_events(vcpu); + + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && + !vcpu->arch.apf.halted); diff --git a/queue-4.19/kvm-s390-vsie-fix-delivery-of-addressing-exceptions.patch b/queue-4.19/kvm-s390-vsie-fix-delivery-of-addressing-exceptions.patch new file mode 100644 index 00000000000..4ac68c1cc8f --- /dev/null +++ b/queue-4.19/kvm-s390-vsie-fix-delivery-of-addressing-exceptions.patch @@ -0,0 +1,50 @@ +From 4d4cee96fb7a3cc53702a9be8299bf525be4ee98 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Fri, 3 Apr 2020 17:30:47 +0200 +Subject: KVM: s390: vsie: Fix delivery of addressing exceptions + +From: David Hildenbrand + +commit 4d4cee96fb7a3cc53702a9be8299bf525be4ee98 upstream. + +Whenever we get an -EFAULT, we failed to read in guest 2 physical +address space. Such addressing exceptions are reported via a program +intercept to the nested hypervisor. + +We faked the intercept, we have to return to guest 2. Instead, right +now we would be returning -EFAULT from the intercept handler, eventually +crashing the VM. +the correct thing to do is to return 1 as rc == 1 is the internal +representation of "we have to go back into g2". + +Addressing exceptions can only happen if the g2->g3 page tables +reference invalid g2 addresses (say, either a table or the final page is +not accessible - so something that basically never happens in sane +environments. + +Identified by manual code inspection. 
+
+Fixes: a3508fbe9dc6 ("KVM: s390: vsie: initial support for nested virtualization")
+Cc: stable@vger.kernel.org # v4.8+
+Signed-off-by: David Hildenbrand
+Link: https://lore.kernel.org/r/20200403153050.20569-3-david@redhat.com
+Reviewed-by: Claudio Imbrenda
+Reviewed-by: Christian Borntraeger
+[borntraeger@de.ibm.com: fix patch description]
+Signed-off-by: Christian Borntraeger
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/s390/kvm/vsie.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/s390/kvm/vsie.c
++++ b/arch/s390/kvm/vsie.c
+@@ -1024,6 +1024,7 @@ static int vsie_run(struct kvm_vcp
+ scb_s->iprcc = PGM_ADDRESSING;
+ scb_s->pgmilc = 4;
+ scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
++ rc = 1;
+ }
+ return rc;
+ }
diff --git a/queue-4.19/kvm-s390-vsie-fix-region-1-asce-sanity-shadow-address-checks.patch b/queue-4.19/kvm-s390-vsie-fix-region-1-asce-sanity-shadow-address-checks.patch
new file mode 100644
index 00000000000..56f6af758a9
--- /dev/null
+++ b/queue-4.19/kvm-s390-vsie-fix-region-1-asce-sanity-shadow-address-checks.patch
@@ -0,0 +1,56 @@
+From a1d032a49522cb5368e5dfb945a85899b4c74f65 Mon Sep 17 00:00:00 2001
+From: David Hildenbrand
+Date: Fri, 3 Apr 2020 17:30:46 +0200
+Subject: KVM: s390: vsie: Fix region 1 ASCE sanity shadow address checks
+
+From: David Hildenbrand
+
+commit a1d032a49522cb5368e5dfb945a85899b4c74f65 upstream.
+
+In case we have a region 1 the following calculation
+(31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)
+results in 64. As shifts beyond the size are undefined the compiler is
+free to use instructions like sllg. sllg will only use 6 bits of the
+shift value (here 64) resulting in no shift at all. That means that ALL
+addresses will be rejected.
+
+This can result in endless loops, e.g. when prefix cannot get mapped.
+ +Fixes: 4be130a08420 ("s390/mm: add shadow gmap support") +Tested-by: Janosch Frank +Reported-by: Janosch Frank +Cc: # v4.8+ +Signed-off-by: David Hildenbrand +Link: https://lore.kernel.org/r/20200403153050.20569-2-david@redhat.com +Reviewed-by: Claudio Imbrenda +Reviewed-by: Christian Borntraeger +[borntraeger@de.ibm.com: fix patch description, remove WARN_ON_ONCE] +Signed-off-by: Christian Borntraeger +Signed-off-by: Greg Kroah-Hartman + +--- + arch/s390/mm/gmap.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/arch/s390/mm/gmap.c ++++ b/arch/s390/mm/gmap.c +@@ -787,14 +787,18 @@ static void gmap_call_notifier(struct gm + static inline unsigned long *gmap_table_walk(struct gmap *gmap, + unsigned long gaddr, int level) + { ++ const int asce_type = gmap->asce & _ASCE_TYPE_MASK; + unsigned long *table; + + if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4)) + return NULL; + if (gmap_is_shadow(gmap) && gmap->removed) + return NULL; +- if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11))) ++ ++ if (asce_type != _ASCE_TYPE_REGION1 && ++ gaddr & (-1UL << (31 + (asce_type >> 2) * 11))) + return NULL; ++ + table = gmap->table; + switch (gmap->asce & _ASCE_TYPE_MASK) { + case _ASCE_TYPE_REGION1: diff --git a/queue-4.19/kvm-vmx-always-vmclear-in-use-vmcses-during-crash-with-kexec-support.patch b/queue-4.19/kvm-vmx-always-vmclear-in-use-vmcses-during-crash-with-kexec-support.patch new file mode 100644 index 00000000000..45fdd41af9b --- /dev/null +++ b/queue-4.19/kvm-vmx-always-vmclear-in-use-vmcses-during-crash-with-kexec-support.patch @@ -0,0 +1,180 @@ +From 31603d4fc2bb4f0815245d496cb970b27b4f636a Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Sat, 21 Mar 2020 12:37:49 -0700 +Subject: KVM: VMX: Always VMCLEAR in-use VMCSes during crash with kexec support + +From: Sean Christopherson + +commit 31603d4fc2bb4f0815245d496cb970b27b4f636a upstream. + +VMCLEAR all in-use VMCSes during a crash, even if kdump's NMI shootdown +interrupted a KVM update of the percpu in-use VMCS list. + +Because NMIs are not blocked by disabling IRQs, it's possible that +crash_vmclear_local_loaded_vmcss() could be called while the percpu list +of VMCSes is being modified, e.g. in the middle of list_add() in +vmx_vcpu_load_vmcs(). This potential corner case was called out in the +original commit[*], but the analysis of its impact was wrong. + +Skipping the VMCLEARs is wrong because it all but guarantees that a +loaded, and therefore cached, VMCS will live across kexec and corrupt +memory in the new kernel. Corruption will occur because the CPU's VMCS +cache is non-coherent, i.e. not snooped, and so the writeback of VMCS +memory on its eviction will overwrite random memory in the new kernel. +The VMCS will live because the NMI shootdown also disables VMX, i.e. the +in-progress VMCLEAR will #UD, and existing Intel CPUs do not flush the +VMCS cache on VMXOFF. + +Furthermore, interrupting list_add() and list_del() is safe due to +crash_vmclear_local_loaded_vmcss() using forward iteration. list_add() +ensures the new entry is not visible to forward iteration unless the +entire add completes, via WRITE_ONCE(prev->next, new). A bad "prev" +pointer could be observed if the NMI shootdown interrupted list_del() or +list_add(), but list_for_each_entry() does not consume ->prev. 
+ +In addition to removing the temporary disabling of VMCLEAR, open code +loaded_vmcs_init() in __loaded_vmcs_clear() and reorder VMCLEAR so that +the VMCS is deleted from the list only after it's been VMCLEAR'd. +Deleting the VMCS before VMCLEAR would allow a race where the NMI +shootdown could arrive between list_del() and vmcs_clear() and thus +neither flow would execute a successful VMCLEAR. Alternatively, more +code could be moved into loaded_vmcs_init(), but that gets rather silly +as the only other user, alloc_loaded_vmcs(), doesn't need the smp_wmb() +and would need to work around the list_del(). + +Update the smp_*() comments related to the list manipulation, and +opportunistically reword them to improve clarity. + +[*] https://patchwork.kernel.org/patch/1675731/#3720461 + +Fixes: 8f536b7697a0 ("KVM: VMX: provide the vmclear function and a bitmap to support VMCLEAR in kdump") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-Id: <20200321193751.24985-2-sean.j.christopherson@intel.com> +Reviewed-by: Vitaly Kuznetsov +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/vmx.c | 67 ++++++++++++----------------------------------------- + 1 file changed, 16 insertions(+), 51 deletions(-) + +--- a/arch/x86/kvm/vmx.c ++++ b/arch/x86/kvm/vmx.c +@@ -2156,43 +2156,15 @@ static void vmcs_load(struct vmcs *vmcs) + } + + #ifdef CONFIG_KEXEC_CORE +-/* +- * This bitmap is used to indicate whether the vmclear +- * operation is enabled on all cpus. All disabled by +- * default. +- */ +-static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; +- +-static inline void crash_enable_local_vmclear(int cpu) +-{ +- cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); +-} +- +-static inline void crash_disable_local_vmclear(int cpu) +-{ +- cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); +-} +- +-static inline int crash_local_vmclear_enabled(int cpu) +-{ +- return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); +-} +- + static void crash_vmclear_local_loaded_vmcss(void) + { + int cpu = raw_smp_processor_id(); + struct loaded_vmcs *v; + +- if (!crash_local_vmclear_enabled(cpu)) +- return; +- + list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + vmcs_clear(v->vmcs); + } +-#else +-static inline void crash_enable_local_vmclear(int cpu) { } +-static inline void crash_disable_local_vmclear(int cpu) { } + #endif /* CONFIG_KEXEC_CORE */ + + static void __loaded_vmcs_clear(void *arg) +@@ -2204,19 +2176,24 @@ static void __loaded_vmcs_clear(void *ar + return; /* vcpu migration can race with cpu offline */ + if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) + per_cpu(current_vmcs, cpu) = NULL; +- crash_disable_local_vmclear(cpu); ++ ++ vmcs_clear(loaded_vmcs->vmcs); ++ if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) ++ vmcs_clear(loaded_vmcs->shadow_vmcs); ++ + list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); + + /* +- * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link +- * is before setting loaded_vmcs->vcpu to -1 which is done in +- * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist +- * then adds the vmcs into percpu list before it is deleted. ++ * Ensure all writes to loaded_vmcs, including deleting it from its ++ * current percpu list, complete before setting loaded_vmcs->vcpu to ++ * -1, otherwise a different cpu can see vcpu == -1 first and add ++ * loaded_vmcs to its percpu list before it's deleted from this cpu's ++ * list. 
Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). + */ + smp_wmb(); + +- loaded_vmcs_init(loaded_vmcs); +- crash_enable_local_vmclear(cpu); ++ loaded_vmcs->cpu = -1; ++ loaded_vmcs->launched = 0; + } + + static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) +@@ -3067,18 +3044,17 @@ static void vmx_vcpu_load(struct kvm_vcp + if (!already_loaded) { + loaded_vmcs_clear(vmx->loaded_vmcs); + local_irq_disable(); +- crash_disable_local_vmclear(cpu); + + /* +- * Read loaded_vmcs->cpu should be before fetching +- * loaded_vmcs->loaded_vmcss_on_cpu_link. +- * See the comments in __loaded_vmcs_clear(). ++ * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to ++ * this cpu's percpu list, otherwise it may not yet be deleted ++ * from its previous cpu's percpu list. Pairs with the ++ * smb_wmb() in __loaded_vmcs_clear(). + */ + smp_rmb(); + + list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, + &per_cpu(loaded_vmcss_on_cpu, cpu)); +- crash_enable_local_vmclear(cpu); + local_irq_enable(); + } + +@@ -4426,17 +4402,6 @@ static int hardware_enable(void) + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); + spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + +- /* +- * Now we can enable the vmclear operation in kdump +- * since the loaded_vmcss_on_cpu list on this cpu +- * has been initialized. +- * +- * Though the cpu is not in VMX operation now, there +- * is no problem to enable the vmclear operation +- * for the loaded_vmcss_on_cpu list is empty! +- */ +- crash_enable_local_vmclear(cpu); +- + rdmsrl(MSR_IA32_FEATURE_CONTROL, old); + + test_bits = FEATURE_CONTROL_LOCKED; diff --git a/queue-4.19/kvm-vmx-fix-crash-cleanup-when-kvm-wasn-t-used.patch b/queue-4.19/kvm-vmx-fix-crash-cleanup-when-kvm-wasn-t-used.patch new file mode 100644 index 00000000000..c2d7a3f6d23 --- /dev/null +++ b/queue-4.19/kvm-vmx-fix-crash-cleanup-when-kvm-wasn-t-used.patch @@ -0,0 +1,75 @@ +From dbef2808af6c594922fe32833b30f55f35e9da6d Mon Sep 17 00:00:00 2001 +From: Vitaly Kuznetsov +Date: Wed, 1 Apr 2020 10:13:48 +0200 +Subject: KVM: VMX: fix crash cleanup when KVM wasn't used + +From: Vitaly Kuznetsov + +commit dbef2808af6c594922fe32833b30f55f35e9da6d upstream. + +If KVM wasn't used at all before we crash the cleanup procedure fails with + BUG: unable to handle page fault for address: ffffffffffffffc8 + #PF: supervisor read access in kernel mode + #PF: error_code(0x0000) - not-present page + PGD 23215067 P4D 23215067 PUD 23217067 PMD 0 + Oops: 0000 [#8] SMP PTI + CPU: 0 PID: 3542 Comm: bash Kdump: loaded Tainted: G D 5.6.0-rc2+ #823 + RIP: 0010:crash_vmclear_local_loaded_vmcss.cold+0x19/0x51 [kvm_intel] + +The root cause is that loaded_vmcss_on_cpu list is not yet initialized, +we initialize it in hardware_enable() but this only happens when we start +a VM. + +Previously, we used to have a bitmap with enabled CPUs and that was +preventing [masking] the issue. + +Initialized loaded_vmcss_on_cpu list earlier, right before we assign +crash_vmclear_loaded_vmcss pointer. blocked_vcpu_on_cpu list and +blocked_vcpu_on_cpu_lock are moved altogether for consistency. 
+
+Fixes: 31603d4fc2bb ("KVM: VMX: Always VMCLEAR in-use VMCSes during crash with kexec support")
+Signed-off-by: Vitaly Kuznetsov
+Message-Id: <20200401081348.1345307-1-vkuznets@redhat.com>
+Reviewed-by: Sean Christopherson
+Signed-off-by: Paolo Bonzini
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/x86/kvm/vmx.c | 12 +++++++-----
+ 1 file changed, 7 insertions(+), 5 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -4398,10 +4398,6 @@ static int hardware_enable(void)
+ !hv_get_vp_assist_page(cpu))
+ return -EFAULT;
+
+- INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+- INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+- spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+-
+ rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
+
+ test_bits = FEATURE_CONTROL_LOCKED;
+@@ -14554,7 +14550,7 @@ module_exit(vmx_exit);
+
+ static int __init vmx_init(void)
+ {
+- int r;
++ int r, cpu;
+
+ #if IS_ENABLED(CONFIG_HYPERV)
+ /*
+@@ -14605,6 +14601,12 @@ static int __init vmx_init(void)
+ }
+ }
+
++ for_each_possible_cpu(cpu) {
++ INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
++ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
++ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
++ }
++
+ #ifdef CONFIG_KEXEC_CORE
+ rcu_assign_pointer(crash_vmclear_loaded_vmcss,
+ crash_vmclear_local_loaded_vmcss);
diff --git a/queue-4.19/kvm-x86-allocate-new-rmap-and-large-page-tracking-when-moving-memslot.patch b/queue-4.19/kvm-x86-allocate-new-rmap-and-large-page-tracking-when-moving-memslot.patch
new file mode 100644
index 00000000000..a950d8427bd
--- /dev/null
+++ b/queue-4.19/kvm-x86-allocate-new-rmap-and-large-page-tracking-when-moving-memslot.patch
@@ -0,0 +1,102 @@
+From edd4fa37baa6ee8e44dc65523b27bd6fe44c94de Mon Sep 17 00:00:00 2001
+From: Sean Christopherson
+Date: Tue, 18 Feb 2020 13:07:15 -0800
+Subject: KVM: x86: Allocate new rmap and large page tracking when moving memslot
+
+From: Sean Christopherson
+
+commit edd4fa37baa6ee8e44dc65523b27bd6fe44c94de upstream.
+
+Reallocate a rmap array and recalculate large page compatibility when
+moving an existing memslot to correctly handle the alignment properties
+of the new memslot. The number of rmap entries required at each level
+is dependent on the alignment of the memslot's base gfn with respect to
+that level, e.g. moving a large-page aligned memslot so that it becomes
+unaligned will increase the number of rmap entries needed at the now
+unaligned level.
+
+Not updating the rmap array is the most obvious bug, as KVM accesses
+garbage data beyond the end of the rmap. KVM interprets the bad data as
+pointers, leading to non-canonical #GPs, unexpected #PFs, etc...
+ + general protection fault: 0000 [#1] SMP + CPU: 0 PID: 1909 Comm: move_memory_reg Not tainted 5.4.0-rc7+ #139 + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 + RIP: 0010:rmap_get_first+0x37/0x50 [kvm] + Code: <48> 8b 3b 48 85 ff 74 ec e8 6c f4 ff ff 85 c0 74 e3 48 89 d8 5b c3 + RSP: 0018:ffffc9000021bbc8 EFLAGS: 00010246 + RAX: ffff00617461642e RBX: ffff00617461642e RCX: 0000000000000012 + RDX: ffff88827400f568 RSI: ffffc9000021bbe0 RDI: ffff88827400f570 + RBP: 0010000000000000 R08: ffffc9000021bd00 R09: ffffc9000021bda8 + R10: ffffc9000021bc48 R11: 0000000000000000 R12: 0030000000000000 + R13: 0000000000000000 R14: ffff88827427d700 R15: ffffc9000021bce8 + FS: 00007f7eda014700(0000) GS:ffff888277a00000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00007f7ed9216ff8 CR3: 0000000274391003 CR4: 0000000000162eb0 + Call Trace: + kvm_mmu_slot_set_dirty+0xa1/0x150 [kvm] + __kvm_set_memory_region.part.64+0x559/0x960 [kvm] + kvm_set_memory_region+0x45/0x60 [kvm] + kvm_vm_ioctl+0x30f/0x920 [kvm] + do_vfs_ioctl+0xa1/0x620 + ksys_ioctl+0x66/0x70 + __x64_sys_ioctl+0x16/0x20 + do_syscall_64+0x4c/0x170 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + RIP: 0033:0x7f7ed9911f47 + Code: <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 21 6f 2c 00 f7 d8 64 89 01 48 + RSP: 002b:00007ffc00937498 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 + RAX: ffffffffffffffda RBX: 0000000001ab0010 RCX: 00007f7ed9911f47 + RDX: 0000000001ab1350 RSI: 000000004020ae46 RDI: 0000000000000004 + RBP: 000000000000000a R08: 0000000000000000 R09: 00007f7ed9214700 + R10: 00007f7ed92149d0 R11: 0000000000000246 R12: 00000000bffff000 + R13: 0000000000000003 R14: 00007f7ed9215000 R15: 0000000000000000 + Modules linked in: kvm_intel kvm irqbypass + ---[ end trace 0c5f570b3358ca89 ]--- + +The disallow_lpage tracking is more subtle. Failure to update results +in KVM creating large pages when it shouldn't, either due to stale data +or again due to indexing beyond the end of the metadata arrays, which +can lead to memory corruption and/or leaking data to guest/userspace. + +Note, the arrays for the old memslot are freed by the unconditional call +to kvm_free_memslot() in __kvm_set_memory_region(). + +Fixes: 05da45583de9b ("KVM: MMU: large page support") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Reviewed-by: Peter Xu +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/x86.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -9229,6 +9229,13 @@ int kvm_arch_create_memslot(struct kvm * + { + int i; + ++ /* ++ * Clear out the previous array pointers for the KVM_MR_MOVE case. The ++ * old arrays will be freed by __kvm_set_memory_region() if installing ++ * the new memslot is successful. 
++ */
++ memset(&slot->arch, 0, sizeof(slot->arch));
++
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ struct kvm_lpage_info *linfo;
+ unsigned long ugfn;
+@@ -9303,6 +9310,10 @@ int kvm_arch_prepare_memory_region(struc
+ const struct kvm_userspace_memory_region *mem,
+ enum kvm_mr_change change)
+ {
++ if (change == KVM_MR_MOVE)
++ return kvm_arch_create_memslot(kvm, memslot,
++ mem->memory_size >> PAGE_SHIFT);
++
+ return 0;
+ }
+
diff --git a/queue-4.19/kvm-x86-gracefully-handle-__vmalloc-failure-during-vm-allocation.patch b/queue-4.19/kvm-x86-gracefully-handle-__vmalloc-failure-during-vm-allocation.patch
new file mode 100644
index 00000000000..52f0ef8aa03
--- /dev/null
+++ b/queue-4.19/kvm-x86-gracefully-handle-__vmalloc-failure-during-vm-allocation.patch
@@ -0,0 +1,50 @@
+From d18b2f43b9147c8005ae0844fb445d8cc6a87e31 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson
+Date: Sun, 26 Jan 2020 16:41:11 -0800
+Subject: KVM: x86: Gracefully handle __vmalloc() failure during VM allocation
+
+From: Sean Christopherson
+
+commit d18b2f43b9147c8005ae0844fb445d8cc6a87e31 upstream.
+
+Check the result of __vmalloc() to avoid dereferencing a NULL pointer in
+the event that allocation fails.
+
+Fixes: d1e5b0e98ea27 ("kvm: Make VM ioctl do valloc for some archs")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson
+Reviewed-by: Vitaly Kuznetsov
+Signed-off-by: Paolo Bonzini
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/x86/kvm/svm.c | 4 ++++
+ arch/x86/kvm/vmx.c | 4 ++++
+ 2 files changed, 8 insertions(+)
+
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -1917,6 +1917,10 @@ static void __unregister_enc_region_lock
+ static struct kvm *svm_vm_alloc(void)
+ {
+ struct kvm_svm *kvm_svm = vzalloc(sizeof(struct kvm_svm));
++
++ if (!kvm_svm)
++ return NULL;
++
+ return &kvm_svm->kvm;
+ }
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -10986,6 +10986,10 @@ STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
+ static struct kvm *vmx_vm_alloc(void)
+ {
+ struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx));
++
++ if (!kvm_vmx)
++ return NULL;
++
+ return &kvm_vmx->kvm;
+ }
+
diff --git a/queue-4.19/mtd-spinand-do-not-erase-the-block-before-writing-a-bad-block-marker.patch b/queue-4.19/mtd-spinand-do-not-erase-the-block-before-writing-a-bad-block-marker.patch
new file mode 100644
index 00000000000..b39723891a8
--- /dev/null
+++ b/queue-4.19/mtd-spinand-do-not-erase-the-block-before-writing-a-bad-block-marker.patch
@@ -0,0 +1,50 @@
+From b645ad39d56846618704e463b24bb994c9585c7f Mon Sep 17 00:00:00 2001
+From: Frieder Schrempf
+Date: Tue, 18 Feb 2020 10:05:35 +0000
+Subject: mtd: spinand: Do not erase the block before writing a bad block marker
+
+From: Frieder Schrempf
+
+commit b645ad39d56846618704e463b24bb994c9585c7f upstream.
+
+Currently when marking a block, we use spinand_erase_op() to erase
+the block before writing the marker to the OOB area. Doing so without
+waiting for the operation to finish can lead to the marking failing
+silently and no bad block marker being written to the flash.
+
+In fact we don't need to do an erase at all before writing the BBM.
+The ECC is disabled for raw accesses to the OOB data and we don't
+need to work around any issues with chips reporting ECC errors as it
+is known to be the case for raw NAND.
+ +Fixes: 7529df465248 ("mtd: nand: Add core infrastructure to support SPI NANDs") +Cc: stable@vger.kernel.org +Signed-off-by: Frieder Schrempf +Reviewed-by: Boris Brezillon +Signed-off-by: Miquel Raynal +Link: https://lore.kernel.org/linux-mtd/20200218100432.32433-4-frieder.schrempf@kontron.de +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/mtd/nand/spi/core.c | 3 --- + 1 file changed, 3 deletions(-) + +--- a/drivers/mtd/nand/spi/core.c ++++ b/drivers/mtd/nand/spi/core.c +@@ -673,7 +673,6 @@ static int spinand_markbad(struct nand_d + }; + int ret; + +- /* Erase block before marking it bad. */ + ret = spinand_select_target(spinand, pos->target); + if (ret) + return ret; +@@ -682,8 +681,6 @@ static int spinand_markbad(struct nand_d + if (ret) + return ret; + +- spinand_erase_op(spinand, pos); +- + return spinand_write_page(spinand, &req); + } + diff --git a/queue-4.19/mtd-spinand-stop-using-spinand-oobbuf-for-buffering-bad-block-markers.patch b/queue-4.19/mtd-spinand-stop-using-spinand-oobbuf-for-buffering-bad-block-markers.patch new file mode 100644 index 00000000000..51b56795e74 --- /dev/null +++ b/queue-4.19/mtd-spinand-stop-using-spinand-oobbuf-for-buffering-bad-block-markers.patch @@ -0,0 +1,85 @@ +From 2148937501ee3d663e0010e519a553fea67ad103 Mon Sep 17 00:00:00 2001 +From: Frieder Schrempf +Date: Tue, 18 Feb 2020 10:05:14 +0000 +Subject: mtd: spinand: Stop using spinand->oobbuf for buffering bad block markers + +From: Frieder Schrempf + +commit 2148937501ee3d663e0010e519a553fea67ad103 upstream. + +For reading and writing the bad block markers, spinand->oobbuf is +currently used as a buffer for the marker bytes. During the +underlying read and write operations to actually get/set the content +of the OOB area, the content of spinand->oobbuf is reused and changed +by accessing it through spinand->oobbuf and/or spinand->databuf. + +This is a flaw in the original design of the SPI NAND core and at the +latest from 13c15e07eedf ("mtd: spinand: Handle the case where +PROGRAM LOAD does not reset the cache") on, it results in not having +the bad block marker written at all, as the spinand->oobbuf is +cleared to 0xff after setting the marker bytes to zero. + +To fix it, we now just store the two bytes for the marker on the +stack and let the read/write operations copy it from/to the page +buffer later. 
+ +Fixes: 7529df465248 ("mtd: nand: Add core infrastructure to support SPI NANDs") +Cc: stable@vger.kernel.org +Signed-off-by: Frieder Schrempf +Reviewed-by: Boris Brezillon +Signed-off-by: Miquel Raynal +Link: https://lore.kernel.org/linux-mtd/20200218100432.32433-2-frieder.schrempf@kontron.de +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/mtd/nand/spi/core.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +--- a/drivers/mtd/nand/spi/core.c ++++ b/drivers/mtd/nand/spi/core.c +@@ -629,18 +629,18 @@ static int spinand_mtd_write(struct mtd_ + static bool spinand_isbad(struct nand_device *nand, const struct nand_pos *pos) + { + struct spinand_device *spinand = nand_to_spinand(nand); ++ u8 marker[2] = { }; + struct nand_page_io_req req = { + .pos = *pos, +- .ooblen = 2, ++ .ooblen = sizeof(marker), + .ooboffs = 0, +- .oobbuf.in = spinand->oobbuf, ++ .oobbuf.in = marker, + .mode = MTD_OPS_RAW, + }; + +- memset(spinand->oobbuf, 0, 2); + spinand_select_target(spinand, pos->target); + spinand_read_page(spinand, &req, false); +- if (spinand->oobbuf[0] != 0xff || spinand->oobbuf[1] != 0xff) ++ if (marker[0] != 0xff || marker[1] != 0xff) + return true; + + return false; +@@ -664,11 +664,12 @@ static int spinand_mtd_block_isbad(struc + static int spinand_markbad(struct nand_device *nand, const struct nand_pos *pos) + { + struct spinand_device *spinand = nand_to_spinand(nand); ++ u8 marker[2] = { }; + struct nand_page_io_req req = { + .pos = *pos, + .ooboffs = 0, +- .ooblen = 2, +- .oobbuf.out = spinand->oobbuf, ++ .ooblen = sizeof(marker), ++ .oobbuf.out = marker, + }; + int ret; + +@@ -683,7 +684,6 @@ static int spinand_markbad(struct nand_d + + spinand_erase_op(spinand, pos); + +- memset(spinand->oobbuf, 0, 2); + return spinand_write_page(spinand, &req); + } + diff --git a/queue-4.19/series b/queue-4.19/series index cb6e20945a5..6b75aa546a8 100644 --- a/queue-4.19/series +++ b/queue-4.19/series @@ -76,3 +76,19 @@ mips-octeon-irq-fix-potential-null-pointer-dereference.patch ath9k-handle-txpower-changes-even-when-tpc-is-disabled.patch signal-extend-exec_id-to-64bits.patch x86-entry-32-add-missing-asm_clac-to-general_protection-entry.patch +kvm-nvmx-properly-handle-userspace-interrupt-window-request.patch +kvm-s390-vsie-fix-region-1-asce-sanity-shadow-address-checks.patch +kvm-s390-vsie-fix-delivery-of-addressing-exceptions.patch +kvm-x86-allocate-new-rmap-and-large-page-tracking-when-moving-memslot.patch +kvm-vmx-always-vmclear-in-use-vmcses-during-crash-with-kexec-support.patch +kvm-x86-gracefully-handle-__vmalloc-failure-during-vm-allocation.patch +kvm-vmx-fix-crash-cleanup-when-kvm-wasn-t-used.patch +cifs-fix-bug-which-the-return-value-by-asynchronous-read-is-error.patch +mtd-spinand-stop-using-spinand-oobbuf-for-buffering-bad-block-markers.patch +mtd-spinand-do-not-erase-the-block-before-writing-a-bad-block-marker.patch +btrfs-fix-crash-during-unmount-due-to-race-with-delayed-inode-workers.patch +btrfs-set-update-the-uuid-generation-as-soon-as-possible.patch +btrfs-drop-block-from-cache-on-error-in-relocation.patch +btrfs-fix-missing-file-extent-item-for-hole-after-ranged-fsync.patch +btrfs-fix-missing-semaphore-unlock-in-btrfs_sync_file.patch +crypto-mxs-dcp-fix-scatterlist-linearization-for-hash.patch -- 2.47.3