From 1ea01c06ca847b14e8e4406911d29c20ef8071c5 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 9 Feb 2020 13:27:55 +0100 Subject: [PATCH] 5.5-stable patches added patches: aio-prevent-potential-eventfd-recursion-on-poll.patch arm-tegra-enable-pllp-bypass-during-tegra124-lp1.patch bcache-add-readahead-cache-policy-options-via-sysfs-interface.patch btrfs-correctly-handle-empty-trees-in-find_first_clear_extent_bit.patch btrfs-drop-log-root-for-dropped-roots.patch btrfs-fix-infinite-loop-during-fsync-after-rename-operations.patch btrfs-fix-missing-hole-after-hole-punching-and-fsync-when-using-no_holes.patch btrfs-fix-race-between-adding-and-putting-tree-mod-seq-elements-and-nodes.patch btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch btrfs-free-block-groups-after-free-ing-fs-trees.patch btrfs-make-deduplication-with-range-including-the-last-block-work.patch btrfs-send-fix-emission-of-invalid-clone-operations-within-the-same-file.patch btrfs-set-trans-drity-in-btrfs_commit_transaction.patch drm-atmel-hlcdc-enable-clock-before-configuring-timing-engine.patch drm-atmel-hlcdc-prefer-a-lower-pixel-clock-than-requested.patch drm-atmel-hlcdc-use-double-rate-for-pixel-clock-only-if-supported.patch drm-rect-avoid-division-by-zero.patch eventfd-track-eventfd_signal-recursion-depth.patch ext4-fix-deadlock-allocating-crypto-bounce-page-from-mempool.patch ext4-fix-race-conditions-in-d_compare-and-d_hash.patch gfs2-fix-gfs2_find_jhead-that-returns-uninitialized-jhead-with-seq-0.patch gfs2-fix-o_sync-write-handling.patch gfs2-move-setting-current-backing_dev_info.patch io_uring-don-t-map-read-write-iovec-potentially-twice.patch io_uring-spin-for-sq-thread-to-idle-on-shutdown.patch iwlwifi-don-t-throw-error-when-trying-to-remove-igtk.patch jbd2_seq_info_next-should-increase-position-index.patch kvm-ppc-book3s-hv-uninit-vcpu-if-vcore-creation-fails.patch kvm-ppc-book3s-pr-free-shared-page-if-mmu-initialization-fails.patch kvm-svm-pku-not-currently-supported.patch kvm-x86-fix-potential-put_fpu-w-o-load_fpu-on-mpx-platform.patch kvm-x86-mmu-apply-max-pa-check-for-mmio-sptes-to-32-bit-kvm.patch kvm-x86-protect-dr-based-index-computations-from-spectre-v1-l1tf-attacks.patch kvm-x86-protect-ioapic_read_indirect-from-spectre-v1-l1tf-attacks.patch kvm-x86-protect-ioapic_write_indirect-from-spectre-v1-l1tf-attacks.patch kvm-x86-protect-kvm_hv_msr__crash_data-from-spectre-v1-l1tf-attacks.patch kvm-x86-protect-kvm_lapic_reg_write-from-spectre-v1-l1tf-attacks.patch kvm-x86-protect-msr-based-index-computations-from-spectre-v1-l1tf-attacks-in-x86.c.patch kvm-x86-protect-msr-based-index-computations-in-fixed_msr_to_seg_unit-from-spectre-v1-l1tf-attacks.patch kvm-x86-protect-msr-based-index-computations-in-pmu.h-from-spectre-v1-l1tf-attacks.patch kvm-x86-protect-pmu_intel.c-from-spectre-v1-l1tf-attacks.patch kvm-x86-protect-x86_decode_insn-from-spectre-v1-l1tf-attacks.patch kvm-x86-refactor-picdev_write-to-prevent-spectre-v1-l1tf-attacks.patch kvm-x86-refactor-prefix-decoding-to-prevent-spectre-v1-l1tf-attacks.patch media-iguanair-fix-endpoint-sanity-check.patch media-rc-ensure-lirc-is-initialized-before-registering-input-device.patch mwifiex-fix-unbalanced-locking-in-mwifiex_process_country_ie.patch nfs-directory-page-cache-pages-need-to-be-locked-when-read.patch nfs-fix-memory-leaks-and-corruption-in-readdir.patch nfsd-fix-filecache-lookup.patch powerpc-futex-fix-incorrect-user-access-blocking.patch scsi-qla2xxx-fix-unbound-nvme-response-length.patch 
sunrpc-expiry_time-should-be-seconds-not-timeval.patch tools-kvm_stat-fix-kvm_exit-filter-name.patch watchdog-fix-uaf-in-reboot-notifier-handling-in-watchdog-core-code.patch xen-balloon-support-xend-based-toolstack-take-two.patch xen-gntdev-do-not-use-mm-notifiers-with-autotranslating-guests.patch --- ...-potential-eventfd-recursion-on-poll.patch | 70 ++ ...able-pllp-bypass-during-tegra124-lp1.patch | 70 ++ ...e-policy-options-via-sysfs-interface.patch | 139 ++++ ...trees-in-find_first_clear_extent_bit.patch | 122 +++ ...trfs-drop-log-root-for-dropped-roots.patch | 86 +++ ...during-fsync-after-rename-operations.patch | 140 ++++ ...nching-and-fsync-when-using-no_holes.patch | 693 ++++++++++++++++++ ...ting-tree-mod-seq-elements-and-nodes.patch | 237 ++++++ ...-we-loop-in-extent_write_cache_pages.patch | 105 +++ ...block-groups-after-free-ing-fs-trees.patch | 53 ++ ...-range-including-the-last-block-work.patch | 67 ++ ...lone-operations-within-the-same-file.patch | 92 +++ ...ns-drity-in-btrfs_commit_transaction.patch | 96 +++ ...ock-before-configuring-timing-engine.patch | 53 ++ ...r-a-lower-pixel-clock-than-requested.patch | 43 ++ ...te-for-pixel-clock-only-if-supported.patch | 45 ++ .../drm-rect-avoid-division-by-zero.patch | 47 ++ ...track-eventfd_signal-recursion-depth.patch | 102 +++ ...ting-crypto-bounce-page-from-mempool.patch | 77 ++ ...e-conditions-in-d_compare-and-d_hash.patch | 79 ++ ...turns-uninitialized-jhead-with-seq-0.patch | 38 + .../gfs2-fix-o_sync-write-handling.patch | 111 +++ ...ove-setting-current-backing_dev_info.patch | 80 ++ ...p-read-write-iovec-potentially-twice.patch | 42 ++ ...in-for-sq-thread-to-idle-on-shutdown.patch | 55 ++ ...row-error-when-trying-to-remove-igtk.patch | 56 ++ ..._next-should-increase-position-index.patch | 39 + ...-uninit-vcpu-if-vcore-creation-fails.patch | 44 ++ ...red-page-if-mmu-initialization-fails.patch | 41 ++ .../kvm-svm-pku-not-currently-supported.patch | 112 +++ ...put_fpu-w-o-load_fpu-on-mpx-platform.patch | 55 ++ ...a-check-for-mmio-sptes-to-32-bit-kvm.patch | 42 ++ ...tations-from-spectre-v1-l1tf-attacks.patch | 57 ++ ...ndirect-from-spectre-v1-l1tf-attacks.patch | 58 ++ ...ndirect-from-spectre-v1-l1tf-attacks.patch | 40 + ...sh_data-from-spectre-v1-l1tf-attacks.patch | 59 ++ ...g_write-from-spectre-v1-l1tf-attacks.patch | 54 ++ ...rom-spectre-v1-l1tf-attacks-in-x86.c.patch | 54 ++ ...eg_unit-from-spectre-v1-l1tf-attacks.patch | 47 ++ ...n-pmu.h-from-spectre-v1-l1tf-attacks.patch | 69 ++ ...intel.c-from-spectre-v1-l1tf-attacks.patch | 76 ++ ...de_insn-from-spectre-v1-l1tf-attacks.patch | 48 ++ ...e-to-prevent-spectre-v1-l1tf-attacks.patch | 45 ++ ...g-to-prevent-spectre-v1-l1tf-attacks.patch | 57 ++ ...a-iguanair-fix-endpoint-sanity-check.patch | 40 + ...ized-before-registering-input-device.patch | 145 ++++ ...ocking-in-mwifiex_process_country_ie.patch | 35 + ...he-pages-need-to-be-locked-when-read.patch | 112 +++ ...mory-leaks-and-corruption-in-readdir.patch | 81 ++ queue-5.5/nfsd-fix-filecache-lookup.patch | 44 ++ ...x-fix-incorrect-user-access-blocking.patch | 105 +++ ...xxx-fix-unbound-nvme-response-length.patch | 78 ++ queue-5.5/series | 57 ++ ...y_time-should-be-seconds-not-timeval.patch | 54 ++ ...ls-kvm_stat-fix-kvm_exit-filter-name.patch | 73 ++ ...ifier-handling-in-watchdog-core-code.patch | 197 +++++ ...upport-xend-based-toolstack-take-two.patch | 47 ++ ...otifiers-with-autotranslating-guests.patch | 63 ++ 58 files changed, 4926 insertions(+) create mode 100644 
queue-5.5/aio-prevent-potential-eventfd-recursion-on-poll.patch create mode 100644 queue-5.5/arm-tegra-enable-pllp-bypass-during-tegra124-lp1.patch create mode 100644 queue-5.5/bcache-add-readahead-cache-policy-options-via-sysfs-interface.patch create mode 100644 queue-5.5/btrfs-correctly-handle-empty-trees-in-find_first_clear_extent_bit.patch create mode 100644 queue-5.5/btrfs-drop-log-root-for-dropped-roots.patch create mode 100644 queue-5.5/btrfs-fix-infinite-loop-during-fsync-after-rename-operations.patch create mode 100644 queue-5.5/btrfs-fix-missing-hole-after-hole-punching-and-fsync-when-using-no_holes.patch create mode 100644 queue-5.5/btrfs-fix-race-between-adding-and-putting-tree-mod-seq-elements-and-nodes.patch create mode 100644 queue-5.5/btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch create mode 100644 queue-5.5/btrfs-free-block-groups-after-free-ing-fs-trees.patch create mode 100644 queue-5.5/btrfs-make-deduplication-with-range-including-the-last-block-work.patch create mode 100644 queue-5.5/btrfs-send-fix-emission-of-invalid-clone-operations-within-the-same-file.patch create mode 100644 queue-5.5/btrfs-set-trans-drity-in-btrfs_commit_transaction.patch create mode 100644 queue-5.5/drm-atmel-hlcdc-enable-clock-before-configuring-timing-engine.patch create mode 100644 queue-5.5/drm-atmel-hlcdc-prefer-a-lower-pixel-clock-than-requested.patch create mode 100644 queue-5.5/drm-atmel-hlcdc-use-double-rate-for-pixel-clock-only-if-supported.patch create mode 100644 queue-5.5/drm-rect-avoid-division-by-zero.patch create mode 100644 queue-5.5/eventfd-track-eventfd_signal-recursion-depth.patch create mode 100644 queue-5.5/ext4-fix-deadlock-allocating-crypto-bounce-page-from-mempool.patch create mode 100644 queue-5.5/ext4-fix-race-conditions-in-d_compare-and-d_hash.patch create mode 100644 queue-5.5/gfs2-fix-gfs2_find_jhead-that-returns-uninitialized-jhead-with-seq-0.patch create mode 100644 queue-5.5/gfs2-fix-o_sync-write-handling.patch create mode 100644 queue-5.5/gfs2-move-setting-current-backing_dev_info.patch create mode 100644 queue-5.5/io_uring-don-t-map-read-write-iovec-potentially-twice.patch create mode 100644 queue-5.5/io_uring-spin-for-sq-thread-to-idle-on-shutdown.patch create mode 100644 queue-5.5/iwlwifi-don-t-throw-error-when-trying-to-remove-igtk.patch create mode 100644 queue-5.5/jbd2_seq_info_next-should-increase-position-index.patch create mode 100644 queue-5.5/kvm-ppc-book3s-hv-uninit-vcpu-if-vcore-creation-fails.patch create mode 100644 queue-5.5/kvm-ppc-book3s-pr-free-shared-page-if-mmu-initialization-fails.patch create mode 100644 queue-5.5/kvm-svm-pku-not-currently-supported.patch create mode 100644 queue-5.5/kvm-x86-fix-potential-put_fpu-w-o-load_fpu-on-mpx-platform.patch create mode 100644 queue-5.5/kvm-x86-mmu-apply-max-pa-check-for-mmio-sptes-to-32-bit-kvm.patch create mode 100644 queue-5.5/kvm-x86-protect-dr-based-index-computations-from-spectre-v1-l1tf-attacks.patch create mode 100644 queue-5.5/kvm-x86-protect-ioapic_read_indirect-from-spectre-v1-l1tf-attacks.patch create mode 100644 queue-5.5/kvm-x86-protect-ioapic_write_indirect-from-spectre-v1-l1tf-attacks.patch create mode 100644 queue-5.5/kvm-x86-protect-kvm_hv_msr__crash_data-from-spectre-v1-l1tf-attacks.patch create mode 100644 queue-5.5/kvm-x86-protect-kvm_lapic_reg_write-from-spectre-v1-l1tf-attacks.patch create mode 100644 queue-5.5/kvm-x86-protect-msr-based-index-computations-from-spectre-v1-l1tf-attacks-in-x86.c.patch create mode 100644 
queue-5.5/kvm-x86-protect-msr-based-index-computations-in-fixed_msr_to_seg_unit-from-spectre-v1-l1tf-attacks.patch create mode 100644 queue-5.5/kvm-x86-protect-msr-based-index-computations-in-pmu.h-from-spectre-v1-l1tf-attacks.patch create mode 100644 queue-5.5/kvm-x86-protect-pmu_intel.c-from-spectre-v1-l1tf-attacks.patch create mode 100644 queue-5.5/kvm-x86-protect-x86_decode_insn-from-spectre-v1-l1tf-attacks.patch create mode 100644 queue-5.5/kvm-x86-refactor-picdev_write-to-prevent-spectre-v1-l1tf-attacks.patch create mode 100644 queue-5.5/kvm-x86-refactor-prefix-decoding-to-prevent-spectre-v1-l1tf-attacks.patch create mode 100644 queue-5.5/media-iguanair-fix-endpoint-sanity-check.patch create mode 100644 queue-5.5/media-rc-ensure-lirc-is-initialized-before-registering-input-device.patch create mode 100644 queue-5.5/mwifiex-fix-unbalanced-locking-in-mwifiex_process_country_ie.patch create mode 100644 queue-5.5/nfs-directory-page-cache-pages-need-to-be-locked-when-read.patch create mode 100644 queue-5.5/nfs-fix-memory-leaks-and-corruption-in-readdir.patch create mode 100644 queue-5.5/nfsd-fix-filecache-lookup.patch create mode 100644 queue-5.5/powerpc-futex-fix-incorrect-user-access-blocking.patch create mode 100644 queue-5.5/scsi-qla2xxx-fix-unbound-nvme-response-length.patch create mode 100644 queue-5.5/sunrpc-expiry_time-should-be-seconds-not-timeval.patch create mode 100644 queue-5.5/tools-kvm_stat-fix-kvm_exit-filter-name.patch create mode 100644 queue-5.5/watchdog-fix-uaf-in-reboot-notifier-handling-in-watchdog-core-code.patch create mode 100644 queue-5.5/xen-balloon-support-xend-based-toolstack-take-two.patch create mode 100644 queue-5.5/xen-gntdev-do-not-use-mm-notifiers-with-autotranslating-guests.patch diff --git a/queue-5.5/aio-prevent-potential-eventfd-recursion-on-poll.patch b/queue-5.5/aio-prevent-potential-eventfd-recursion-on-poll.patch new file mode 100644 index 00000000000..695f9aec342 --- /dev/null +++ b/queue-5.5/aio-prevent-potential-eventfd-recursion-on-poll.patch @@ -0,0 +1,70 @@ +From 01d7a356872eec22ef34a33a5f9cfa917d145468 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Mon, 3 Feb 2020 10:33:42 -0700 +Subject: aio: prevent potential eventfd recursion on poll + +From: Jens Axboe + +commit 01d7a356872eec22ef34a33a5f9cfa917d145468 upstream. + +If we have nested or circular eventfd wakeups, then we can deadlock if +we run them inline from our poll waitqueue wakeup handler. It's also +possible to have very long chains of notifications, to the extent where +we could risk blowing the stack. + +Check the eventfd recursion count before calling eventfd_signal(). If +it's non-zero, then punt the signaling to async context. This is always +safe, as it takes us out-of-line in terms of stack and locking context. 
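+
+As a condensed sketch of the resulting completion path (an illustration
+of the idea only; the fs/aio.c hunk below is the authoritative change):
+
+	if (iocb->ki_eventfd && eventfd_signal_count()) {
+		/* An eventfd signal is already in progress on this CPU:
+		 * signaling inline could recurse or deadlock, so hand the
+		 * final iocb_put() (which signals the eventfd) to a
+		 * workqueue. */
+		iocb = NULL;
+		INIT_WORK(&req->work, aio_poll_put_work);
+		schedule_work(&req->work);
+	}
+	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+	if (iocb)
+		iocb_put(iocb);		/* safe to complete inline */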
+ +Cc: stable@vger.kernel.org # 4.19+ +Reviewed-by: Jeff Moyer +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + fs/aio.c | 20 ++++++++++++++++++-- + 1 file changed, 18 insertions(+), 2 deletions(-) + +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -1610,6 +1610,14 @@ static int aio_fsync(struct fsync_iocb * + return 0; + } + ++static void aio_poll_put_work(struct work_struct *work) ++{ ++ struct poll_iocb *req = container_of(work, struct poll_iocb, work); ++ struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); ++ ++ iocb_put(iocb); ++} ++ + static void aio_poll_complete_work(struct work_struct *work) + { + struct poll_iocb *req = container_of(work, struct poll_iocb, work); +@@ -1674,6 +1682,8 @@ static int aio_poll_wake(struct wait_que + list_del_init(&req->wait.entry); + + if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { ++ struct kioctx *ctx = iocb->ki_ctx; ++ + /* + * Try to complete the iocb inline if we can. Use + * irqsave/irqrestore because not all filesystems (e.g. fuse) +@@ -1683,8 +1693,14 @@ static int aio_poll_wake(struct wait_que + list_del(&iocb->ki_list); + iocb->ki_res.res = mangle_poll(mask); + req->done = true; +- spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); +- iocb_put(iocb); ++ if (iocb->ki_eventfd && eventfd_signal_count()) { ++ iocb = NULL; ++ INIT_WORK(&req->work, aio_poll_put_work); ++ schedule_work(&req->work); ++ } ++ spin_unlock_irqrestore(&ctx->ctx_lock, flags); ++ if (iocb) ++ iocb_put(iocb); + } else { + schedule_work(&req->work); + } diff --git a/queue-5.5/arm-tegra-enable-pllp-bypass-during-tegra124-lp1.patch b/queue-5.5/arm-tegra-enable-pllp-bypass-during-tegra124-lp1.patch new file mode 100644 index 00000000000..b856c3a4752 --- /dev/null +++ b/queue-5.5/arm-tegra-enable-pllp-bypass-during-tegra124-lp1.patch @@ -0,0 +1,70 @@ +From 1a3388d506bf5b45bb283e6a4c4706cfb4897333 Mon Sep 17 00:00:00 2001 +From: Stephen Warren +Date: Thu, 3 Oct 2019 14:50:31 -0600 +Subject: ARM: tegra: Enable PLLP bypass during Tegra124 LP1 + +From: Stephen Warren + +commit 1a3388d506bf5b45bb283e6a4c4706cfb4897333 upstream. + +For a little over a year, U-Boot has configured the flow controller to +perform automatic RAM re-repair on off->on power transitions of the CPU +rail[1]. This is mandatory for correct operation of Tegra124. However, +RAM re-repair relies on certain clocks, which the kernel must enable and +leave running. PLLP is one of those clocks. This clock is shut down +during LP1 in order to save power. Enable bypass (which I believe routes +osc_div_clk, essentially the crystal clock, to the PLL output) so that +this clock signal toggles even though the PLL is not active. This is +required so that LP1 power mode (system suspend) operates correctly. + +The bypass configuration must then be undone when resuming from LP1, so +that all peripheral clocks run at the expected rate. Without this, many +peripherals won't work correctly; for example, the UART baud rate would +be incorrect. + +NVIDIA's downstream kernel code only does this if not compiled for +Tegra30, so the added code is made conditional upon the chip ID. +NVIDIA's downstream code makes this change conditional upon the active +CPU cluster. The upstream kernel currently doesn't support cluster +switching, so this patch doesn't test the active CPU cluster ID. 
+
+[1] 3cc7942a4ae5 ARM: tegra: implement RAM repair
+
+Reported-by: Jonathan Hunter
+Cc: stable@vger.kernel.org
+Signed-off-by: Stephen Warren
+Signed-off-by: Thierry Reding
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/arm/mach-tegra/sleep-tegra30.S | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+--- a/arch/arm/mach-tegra/sleep-tegra30.S
++++ b/arch/arm/mach-tegra/sleep-tegra30.S
+@@ -370,6 +370,14 @@ _pll_m_c_x_done:
+ pll_locked r1, r0, CLK_RESET_PLLC_BASE
+ pll_locked r1, r0, CLK_RESET_PLLX_BASE
+
++ tegra_get_soc_id TEGRA_APB_MISC_BASE, r1
++ cmp r1, #TEGRA30
++ beq 1f
++ ldr r1, [r0, #CLK_RESET_PLLP_BASE]
++ bic r1, r1, #(1<<31) @ disable PllP bypass
++ str r1, [r0, #CLK_RESET_PLLP_BASE]
++1:
++
+ mov32 r7, TEGRA_TMRUS_BASE
+ ldr r1, [r7]
+ add r1, r1, #LOCK_DELAY
+@@ -630,7 +638,10 @@ tegra30_switch_cpu_to_clk32k:
+ str r0, [r4, #PMC_PLLP_WB0_OVERRIDE]
+
+ /* disable PLLP, PLLA, PLLC and PLLX */
++ tegra_get_soc_id TEGRA_APB_MISC_BASE, r1
++ cmp r1, #TEGRA30
+ ldr r0, [r5, #CLK_RESET_PLLP_BASE]
++ orrne r0, r0, #(1 << 31) @ enable PllP bypass on fast cluster
+ bic r0, r0, #(1 << 30)
+ str r0, [r5, #CLK_RESET_PLLP_BASE]
+ ldr r0, [r5, #CLK_RESET_PLLA_BASE]
diff --git a/queue-5.5/bcache-add-readahead-cache-policy-options-via-sysfs-interface.patch b/queue-5.5/bcache-add-readahead-cache-policy-options-via-sysfs-interface.patch
new file mode 100644
index 00000000000..3b0ec95dc2e
--- /dev/null
+++ b/queue-5.5/bcache-add-readahead-cache-policy-options-via-sysfs-interface.patch
@@ -0,0 +1,139 @@
+From 038ba8cc1bffc51250add4a9b9249d4331576d8f Mon Sep 17 00:00:00 2001
+From: Coly Li
+Date: Sat, 1 Feb 2020 22:42:33 +0800
+Subject: bcache: add readahead cache policy options via sysfs interface
+
+From: Coly Li
+
+commit 038ba8cc1bffc51250add4a9b9249d4331576d8f upstream.
+
+In 2007 high-performance SSDs were still expensive, so in order to
+save more space for real workloads or metadata, readahead I/Os
+for non-metadata were bypassed and not cached on the SSD.
+
+Nowadays SSD prices have dropped a lot and people can find larger
+SSDs at more comfortable prices. It is no longer necessary to always
+bypass normal readahead I/Os just to save SSD space.
+
+This patch adds options for readahead data cache policies via the sysfs
+file /sys/block/bcache<N>/bcache/readahead_cache_policy; the options are:
+- "all": cache all readahead data I/Os.
+- "meta-only": only cache metadata, and bypass other regular I/Os.
+
+If users want bcache to continue to cache only readahead requests
+for metadata and bypass regular data readahead, please set "meta-only"
+in this sysfs file. By default, bcache now goes back to caching all
+readahead requests.
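+
+As an illustrative usage sketch (not from the upstream commit; bcache0
+stands in for whatever bcache<N> device is registered, and the show
+output brackets the currently selected policy):
+
+	$ cat /sys/block/bcache0/bcache/readahead_cache_policy
+	[all] meta-only
+	$ echo meta-only > /sys/block/bcache0/bcache/readahead_cache_policy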
+ +Cc: stable@vger.kernel.org +Signed-off-by: Coly Li +Acked-by: Eric Wheeler +Cc: Michael Lyle +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/md/bcache/bcache.h | 3 +++ + drivers/md/bcache/request.c | 17 ++++++++++++----- + drivers/md/bcache/sysfs.c | 22 ++++++++++++++++++++++ + 3 files changed, 37 insertions(+), 5 deletions(-) + +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -329,6 +329,9 @@ struct cached_dev { + */ + atomic_t has_dirty; + ++#define BCH_CACHE_READA_ALL 0 ++#define BCH_CACHE_READA_META_ONLY 1 ++ unsigned int cache_readahead_policy; + struct bch_ratelimit writeback_rate; + struct delayed_work writeback_rate_update; + +--- a/drivers/md/bcache/request.c ++++ b/drivers/md/bcache/request.c +@@ -379,13 +379,20 @@ static bool check_should_bypass(struct c + goto skip; + + /* +- * Flag for bypass if the IO is for read-ahead or background, +- * unless the read-ahead request is for metadata ++ * If the bio is for read-ahead or background IO, bypass it or ++ * not depends on the following situations, ++ * - If the IO is for meta data, always cache it and no bypass ++ * - If the IO is not meta data, check dc->cache_reada_policy, ++ * BCH_CACHE_READA_ALL: cache it and not bypass ++ * BCH_CACHE_READA_META_ONLY: not cache it and bypass ++ * That is, read-ahead request for metadata always get cached + * (eg, for gfs2 or xfs). + */ +- if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) && +- !(bio->bi_opf & (REQ_META|REQ_PRIO))) +- goto skip; ++ if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))) { ++ if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) && ++ (dc->cache_readahead_policy != BCH_CACHE_READA_ALL)) ++ goto skip; ++ } + + if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || + bio_sectors(bio) & (c->sb.block_size - 1)) { +--- a/drivers/md/bcache/sysfs.c ++++ b/drivers/md/bcache/sysfs.c +@@ -27,6 +27,12 @@ static const char * const bch_cache_mode + NULL + }; + ++static const char * const bch_reada_cache_policies[] = { ++ "all", ++ "meta-only", ++ NULL ++}; ++ + /* Default is 0 ("auto") */ + static const char * const bch_stop_on_failure_modes[] = { + "auto", +@@ -100,6 +106,7 @@ rw_attribute(congested_write_threshold_u + rw_attribute(sequential_cutoff); + rw_attribute(data_csum); + rw_attribute(cache_mode); ++rw_attribute(readahead_cache_policy); + rw_attribute(stop_when_cache_set_failed); + rw_attribute(writeback_metadata); + rw_attribute(writeback_running); +@@ -168,6 +175,11 @@ SHOW(__bch_cached_dev) + bch_cache_modes, + BDEV_CACHE_MODE(&dc->sb)); + ++ if (attr == &sysfs_readahead_cache_policy) ++ return bch_snprint_string_list(buf, PAGE_SIZE, ++ bch_reada_cache_policies, ++ dc->cache_readahead_policy); ++ + if (attr == &sysfs_stop_when_cache_set_failed) + return bch_snprint_string_list(buf, PAGE_SIZE, + bch_stop_on_failure_modes, +@@ -353,6 +365,15 @@ STORE(__cached_dev) + } + } + ++ if (attr == &sysfs_readahead_cache_policy) { ++ v = __sysfs_match_string(bch_reada_cache_policies, -1, buf); ++ if (v < 0) ++ return v; ++ ++ if ((unsigned int) v != dc->cache_readahead_policy) ++ dc->cache_readahead_policy = v; ++ } ++ + if (attr == &sysfs_stop_when_cache_set_failed) { + v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf); + if (v < 0) +@@ -467,6 +488,7 @@ static struct attribute *bch_cached_dev_ + &sysfs_data_csum, + #endif + &sysfs_cache_mode, ++ &sysfs_readahead_cache_policy, + &sysfs_stop_when_cache_set_failed, + &sysfs_writeback_metadata, + &sysfs_writeback_running, diff --git 
a/queue-5.5/btrfs-correctly-handle-empty-trees-in-find_first_clear_extent_bit.patch b/queue-5.5/btrfs-correctly-handle-empty-trees-in-find_first_clear_extent_bit.patch
new file mode 100644
index 00000000000..336fe537608
--- /dev/null
+++ b/queue-5.5/btrfs-correctly-handle-empty-trees-in-find_first_clear_extent_bit.patch
@@ -0,0 +1,122 @@
+From 5750c37523a2c8cbb450b9ef31e21c2ba876b05e Mon Sep 17 00:00:00 2001
+From: Nikolay Borisov
+Date: Mon, 27 Jan 2020 11:59:26 +0200
+Subject: btrfs: Correctly handle empty trees in find_first_clear_extent_bit
+
+From: Nikolay Borisov
+
+commit 5750c37523a2c8cbb450b9ef31e21c2ba876b05e upstream.
+
+Raviu reported that running his regular fstrim segfaulted with the
+following backtrace:
+
+[ 237.525947] assertion failed: prev, in ../fs/btrfs/extent_io.c:1595
+[ 237.525984] ------------[ cut here ]------------
+[ 237.525985] kernel BUG at ../fs/btrfs/ctree.h:3117!
+[ 237.525992] invalid opcode: 0000 [#1] SMP PTI
+[ 237.525998] CPU: 4 PID: 4423 Comm: fstrim Tainted: G U OE 5.4.14-8-vanilla #1
+[ 237.526001] Hardware name: ASUSTeK COMPUTER INC.
+[ 237.526044] RIP: 0010:assfail.constprop.58+0x18/0x1a [btrfs]
+[ 237.526079] Call Trace:
+[ 237.526120] find_first_clear_extent_bit+0x13d/0x150 [btrfs]
+[ 237.526148] btrfs_trim_fs+0x211/0x3f0 [btrfs]
+[ 237.526184] btrfs_ioctl_fitrim+0x103/0x170 [btrfs]
+[ 237.526219] btrfs_ioctl+0x129a/0x2ed0 [btrfs]
+[ 237.526227] ? filemap_map_pages+0x190/0x3d0
+[ 237.526232] ? do_filp_open+0xaf/0x110
+[ 237.526238] ? _copy_to_user+0x22/0x30
+[ 237.526242] ? cp_new_stat+0x150/0x180
+[ 237.526247] ? do_vfs_ioctl+0xa4/0x640
+[ 237.526278] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
+[ 237.526283] do_vfs_ioctl+0xa4/0x640
+[ 237.526288] ? __do_sys_newfstat+0x3c/0x60
+[ 237.526292] ksys_ioctl+0x70/0x80
+[ 237.526297] __x64_sys_ioctl+0x16/0x20
+[ 237.526303] do_syscall_64+0x5a/0x1c0
+[ 237.526310] entry_SYSCALL_64_after_hwframe+0x49/0xbe
+
+That was due to btrfs_fs_device::aloc_tree being empty. Initially I
+thought this wasn't possible and as a precaution put the assert in
+find_first_clear_extent_bit. Turns out this is indeed possible and could
+happen when a file system with SINGLE data/metadata profile has a 2nd
+device added. Until balance is run or a new chunk is allocated on this
+device it will be completely empty.
+
+In this case find_first_clear_extent_bit should return the full range
+[0, -1ULL] and let the caller handle this, i.e. for trim the end will be
+capped at the size of the actual device.
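+
+A rough reproducer based on the description above (illustrative only;
+device names are placeholders):
+
+	$ mkfs.btrfs -f /dev/sdb		# SINGLE data/metadata profile
+	$ mount /dev/sdb /mnt
+	$ btrfs device add /dev/sdc /mnt	# second device, still empty
+	$ fstrim /mnt				# hit the assertion before this fix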
+ +Link: https://lore.kernel.org/linux-btrfs/izW2WNyvy1dEDweBICizKnd2KDwDiDyY2EYQr4YCwk7pkuIpthx-JRn65MPBde00ND6V0_Lh8mW0kZwzDiLDv25pUYWxkskWNJnVP0kgdMA=@protonmail.com/ +Fixes: 45bfcfc168f8 ("btrfs: Implement find_first_clear_extent_bit") +CC: stable@vger.kernel.org # 5.2+ +Signed-off-by: Nikolay Borisov +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent_io.c | 32 ++++++++++++++++++-------------- + fs/btrfs/tests/extent-io-tests.c | 9 +++++++++ + 2 files changed, 27 insertions(+), 14 deletions(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -1593,21 +1593,25 @@ void find_first_clear_extent_bit(struct + /* Find first extent with bits cleared */ + while (1) { + node = __etree_search(tree, start, &next, &prev, NULL, NULL); +- if (!node) { ++ if (!node && !next && !prev) { ++ /* ++ * Tree is completely empty, send full range and let ++ * caller deal with it ++ */ ++ *start_ret = 0; ++ *end_ret = -1; ++ goto out; ++ } else if (!node && !next) { ++ /* ++ * We are past the last allocated chunk, set start at ++ * the end of the last extent. ++ */ ++ state = rb_entry(prev, struct extent_state, rb_node); ++ *start_ret = state->end + 1; ++ *end_ret = -1; ++ goto out; ++ } else if (!node) { + node = next; +- if (!node) { +- /* +- * We are past the last allocated chunk, +- * set start at the end of the last extent. The +- * device alloc tree should never be empty so +- * prev is always set. +- */ +- ASSERT(prev); +- state = rb_entry(prev, struct extent_state, rb_node); +- *start_ret = state->end + 1; +- *end_ret = -1; +- goto out; +- } + } + /* + * At this point 'node' either contains 'start' or start is +--- a/fs/btrfs/tests/extent-io-tests.c ++++ b/fs/btrfs/tests/extent-io-tests.c +@@ -441,8 +441,17 @@ static int test_find_first_clear_extent_ + int ret = -EINVAL; + + test_msg("running find_first_clear_extent_bit test"); ++ + extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); + ++ /* Test correct handling of empty tree */ ++ find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); ++ if (start != 0 || end != -1) { ++ test_err( ++ "error getting a range from completely empty tree: start %llu end %llu", ++ start, end); ++ goto out; ++ } + /* + * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between + * 4M-32M diff --git a/queue-5.5/btrfs-drop-log-root-for-dropped-roots.patch b/queue-5.5/btrfs-drop-log-root-for-dropped-roots.patch new file mode 100644 index 00000000000..449865d85e9 --- /dev/null +++ b/queue-5.5/btrfs-drop-log-root-for-dropped-roots.patch @@ -0,0 +1,86 @@ +From 889bfa39086e86b52fcfaa04d72c95eaeb12f9a5 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Fri, 17 Jan 2020 09:12:45 -0500 +Subject: btrfs: drop log root for dropped roots + +From: Josef Bacik + +commit 889bfa39086e86b52fcfaa04d72c95eaeb12f9a5 upstream. + +If we fsync on a subvolume and create a log root for that volume, and +then later delete that subvolume we'll never clean up its log root. Fix +this by making switch_commit_roots free the log for any dropped roots we +encounter. The extra churn is because we need a btrfs_trans_handle, not +the btrfs_transaction. 
+ +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/transaction.c | 22 ++++++++++++---------- + 1 file changed, 12 insertions(+), 10 deletions(-) + +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -147,13 +147,14 @@ void btrfs_put_transaction(struct btrfs_ + } + } + +-static noinline void switch_commit_roots(struct btrfs_transaction *trans) ++static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) + { ++ struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *root, *tmp; + + down_write(&fs_info->commit_root_sem); +- list_for_each_entry_safe(root, tmp, &trans->switch_commits, ++ list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits, + dirty_list) { + list_del_init(&root->dirty_list); + free_extent_buffer(root->commit_root); +@@ -165,16 +166,17 @@ static noinline void switch_commit_roots + } + + /* We can free old roots now. */ +- spin_lock(&trans->dropped_roots_lock); +- while (!list_empty(&trans->dropped_roots)) { +- root = list_first_entry(&trans->dropped_roots, ++ spin_lock(&cur_trans->dropped_roots_lock); ++ while (!list_empty(&cur_trans->dropped_roots)) { ++ root = list_first_entry(&cur_trans->dropped_roots, + struct btrfs_root, root_list); + list_del_init(&root->root_list); +- spin_unlock(&trans->dropped_roots_lock); ++ spin_unlock(&cur_trans->dropped_roots_lock); ++ btrfs_free_log(trans, root); + btrfs_drop_and_free_fs_root(fs_info, root); +- spin_lock(&trans->dropped_roots_lock); ++ spin_lock(&cur_trans->dropped_roots_lock); + } +- spin_unlock(&trans->dropped_roots_lock); ++ spin_unlock(&cur_trans->dropped_roots_lock); + up_write(&fs_info->commit_root_sem); + } + +@@ -1421,7 +1423,7 @@ static int qgroup_account_snapshot(struc + ret = commit_cowonly_roots(trans); + if (ret) + goto out; +- switch_commit_roots(trans->transaction); ++ switch_commit_roots(trans); + ret = btrfs_write_and_wait_transaction(trans); + if (ret) + btrfs_handle_fs_error(fs_info, ret, +@@ -2309,7 +2311,7 @@ int btrfs_commit_transaction(struct btrf + list_add_tail(&fs_info->chunk_root->dirty_list, + &cur_trans->switch_commits); + +- switch_commit_roots(cur_trans); ++ switch_commit_roots(trans); + + ASSERT(list_empty(&cur_trans->dirty_bgs)); + ASSERT(list_empty(&cur_trans->io_bgs)); diff --git a/queue-5.5/btrfs-fix-infinite-loop-during-fsync-after-rename-operations.patch b/queue-5.5/btrfs-fix-infinite-loop-during-fsync-after-rename-operations.patch new file mode 100644 index 00000000000..d0c9354933a --- /dev/null +++ b/queue-5.5/btrfs-fix-infinite-loop-during-fsync-after-rename-operations.patch @@ -0,0 +1,140 @@ +From b5e4ff9d465da1233a2d9a47ebce487c70d8f4ab Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 15 Jan 2020 13:21:35 +0000 +Subject: Btrfs: fix infinite loop during fsync after rename operations + +From: Filipe Manana + +commit b5e4ff9d465da1233a2d9a47ebce487c70d8f4ab upstream. + +Recently fsstress (from fstests) sporadically started to trigger an +infinite loop during fsync operations. This turned out to be because +support for the rename exchange and whiteout operations was added to +fsstress in fstests. These operations, unlike any others in fsstress, +cause file names to be reused, whence triggering this issue. 
However it's not necessary to use rename exchange and rename whiteout
+operations to trigger this issue; simple rename operations and file
+creations are enough to trigger it.
+
+The issue boils down to this: when we are logging inodes that conflict
+(that had the name of any inode we need to log during the fsync
+operation), we keep logging them even if they were already logged
+before, and after that we check if there's any other inode that
+conflicts with them and then add it again to the list of inodes to log.
+Skipping already logged inodes fixes the issue.
+
+Consider the following example:
+
+ $ mkfs.btrfs -f /dev/sdb
+ $ mount /dev/sdb /mnt
+
+ $ mkdir /mnt/testdir # inode 257
+
+ $ touch /mnt/testdir/zz # inode 258
+ $ ln /mnt/testdir/zz /mnt/testdir/zz_link
+
+ $ touch /mnt/testdir/a # inode 259
+
+ $ sync
+
+ # The following 3 renames achieve the same result as a rename exchange
+ # operation (/mnt/testdir/zz_link to /mnt/testdir/a).
+
+ $ mv /mnt/testdir/a /mnt/testdir/a/tmp
+ $ mv /mnt/testdir/zz_link /mnt/testdir/a
+ $ mv /mnt/testdir/a/tmp /mnt/testdir/zz_link
+
+ # The following rename and file creation give the same result as a
+ # rename whiteout operation (zz to a2).
+
+ $ mv /mnt/testdir/zz /mnt/testdir/a2
+ $ touch /mnt/testdir/zz # inode 260
+
+ $ xfs_io -c fsync /mnt/testdir/zz
+ --> results in the infinite loop
+
+The following steps happen:
+
+1) When logging inode 260, we find that its reference named "zz" was
+ used by inode 258 in the previous transaction (through the commit
+ root), so inode 258 is added to the list of conflicting inodes that
+ need to be logged;
+
+2) After logging inode 258, we find that its reference named "a" was
+ used by inode 259 in the previous transaction, and therefore we add
+ inode 259 to the list of conflicting inodes to be logged;
+
+3) After logging inode 259, we find that its reference named "zz_link"
+ was used by inode 258 in the previous transaction - we add inode 258
+ to the list of conflicting inodes to log again, even though we had
+ already logged it before. After logging it again, we find again that
+ inode 259 conflicts with it, we add 259 to the list again, etc - we
+ end up repeating all the previous steps.
+
+So fix this by skipping logging of conflicting inodes that were already
+logged.
+
+Fixes: 6b5fc433a7ad67 ("Btrfs: fix fsync after succession of renames of different files")
+CC: stable@vger.kernel.org # 5.1+
+Signed-off-by: Filipe Manana
+Reviewed-by: Josef Bacik
+Signed-off-by: David Sterba
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/btrfs/tree-log.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 44 insertions(+)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -4836,6 +4836,50 @@ static int log_conflicting_inodes(struct
+ continue;
+ }
+ /*
++ * If the inode was already logged skip it - otherwise we can
++ * hit an infinite loop.
Example:
++ *
++ * From the commit root (previous transaction) we have the
++ * following inodes:
++ *
++ * inode 257 a directory
++ * inode 258 with references "zz" and "zz_link" on inode 257
++ * inode 259 with reference "a" on inode 257
++ *
++ * And in the current (uncommitted) transaction we have:
++ *
++ * inode 257 a directory, unchanged
++ * inode 258 with references "a" and "a2" on inode 257
++ * inode 259 with reference "zz_link" on inode 257
++ * inode 261 with reference "zz" on inode 257
++ *
++ * When logging inode 261 the following infinite loop could
++ * happen if we don't skip already logged inodes:
++ *
++ * - we detect inode 258 as a conflicting inode, with inode 261
++ * on reference "zz", and log it;
++ *
++ * - we detect inode 259 as a conflicting inode, with inode 258
++ * on reference "a", and log it;
++ *
++ * - we detect inode 258 as a conflicting inode, with inode 259
++ * on reference "zz_link", and log it - again! After this we
++ * repeat the above steps forever.
++ */
++ spin_lock(&BTRFS_I(inode)->lock);
++ /*
++ * Check the inode's logged_trans only instead of
++ * btrfs_inode_in_log(). This is because the last_log_commit of
++ * the inode is not updated when we only log that it exists
++ * and it has the full sync bit set (see btrfs_log_inode()).
++ */
++ if (BTRFS_I(inode)->logged_trans == trans->transid) {
++ spin_unlock(&BTRFS_I(inode)->lock);
++ btrfs_add_delayed_iput(inode);
++ continue;
++ }
++ spin_unlock(&BTRFS_I(inode)->lock);
++ /*
+ * We are safe logging the other inode without acquiring its
+ * lock as long as we log with the LOG_INODE_EXISTS mode. We
+ * are safe against concurrent renames of the other inode as
diff --git a/queue-5.5/btrfs-fix-missing-hole-after-hole-punching-and-fsync-when-using-no_holes.patch b/queue-5.5/btrfs-fix-missing-hole-after-hole-punching-and-fsync-when-using-no_holes.patch
new file mode 100644
index 00000000000..d703b281888
--- /dev/null
+++ b/queue-5.5/btrfs-fix-missing-hole-after-hole-punching-and-fsync-when-using-no_holes.patch
@@ -0,0 +1,693 @@
+From 0e56315ca147b3e60c7bf240233a301d3c7fb508 Mon Sep 17 00:00:00 2001
+From: Filipe Manana
+Date: Tue, 19 Nov 2019 12:07:33 +0000
+Subject: Btrfs: fix missing hole after hole punching and fsync when using NO_HOLES
+
+From: Filipe Manana
+
+commit 0e56315ca147b3e60c7bf240233a301d3c7fb508 upstream.
+
+When using the NO_HOLES feature, if we punch a hole into a file and then
+fsync it, there are cases where a subsequent fsync will miss the fact that
+a hole was punched, resulting in the holes not existing after replaying
+the log tree.
+
+Essentially these cases all imply that tree-log.c:copy_items() is not
+invoked for the leafs that delimit holes, because nothing changed those
+leafs in the current transaction. And it's precisely copy_items() where
+we currently detect and log holes, which works as long as the holes are
+between file extent items in the input leaf or between the beginning of
+the input leaf and the previous leaf or between the last item in the leaf
+and the next leaf.
+
+First example where we miss a hole:
+
+ *) The extent items of the inode span multiple leafs;
+
+ *) The punched hole covers a range that affects only the extent items of
+ the first leaf;
+
+ *) The fsync operation is done in full mode (BTRFS_INODE_NEEDS_FULL_SYNC
+ is set in the inode's runtime flags).
+
+ That results in the hole not existing after replaying the log tree.
+
+ For example, if the fs/subvolume tree has the following layout for a
+ particular inode:
+
+ Leaf N, generation 10:
+
+ [ ... INODE_ITEM INODE_REF EXTENT_ITEM (0 64K) EXTENT_ITEM (64K 128K) ]
+
+ Leaf N + 1, generation 10:
+
+ [ EXTENT_ITEM (128K 64K) ... ]
+
+ If at transaction 11 we punch a hole covering the range [0, 128K[, we end
+ up dropping the two extent items from leaf N, but we don't touch the other
+ leaf, so we end up in the following state:
+
+ Leaf N, generation 11:
+
+ [ ... INODE_ITEM INODE_REF ]
+
+ Leaf N + 1, generation 10:
+
+ [ EXTENT_ITEM (128K 64K) ... ]
+
+ A full fsync after punching the hole will only process leaf N because it
+ was modified in the current transaction, but not leaf N + 1, since it
+ was not modified in the current transaction (generation 10 and not 11).
+ As a result the fsync will not log any holes, because it didn't process
+ any leaf with extent items.
+
+Second example where we will miss a hole:
+
+ *) An inode has its items spanning 5 (or more) leafs;
+
+ *) A hole is punched and it covers only the extent items of the 3rd
+ leaf. This results in deleting the entire leaf and not touching any
+ of the other leafs.
+
+ So the only leaf that is modified in the current transaction, when
+ punching the hole, is the first leaf, which contains the inode item.
+ During the full fsync, the only leaf that is passed to copy_items()
+ is that first leaf, and that's not enough for the hole detection
+ code in copy_items() to determine there's a hole between the last
+ file extent item in the 2nd leaf and the first file extent item in
+ the 3rd leaf (which was the 4th leaf before punching the hole).
+
+Fix this by scanning all leafs and punching holes as necessary when doing
+a full fsync (less common than a non-full fsync) when the NO_HOLES feature
+is enabled. The lack of explicit file extent items to mark holes makes it
+necessary to scan existing extents to determine if holes exist.
+
+A test case for fstests follows soon.
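+
+A rough reproducer for the first case, based on the description above
+(illustrative only; the file name, sizes and loop count are placeholders,
+and the umount/mount cycle is one way to force the next fsync into full
+sync mode):
+
+	$ mkfs.btrfs -f -O no-holes /dev/sdb
+	$ mount /dev/sdb /mnt
+	# O_SYNC writes create many separate extents, so the file's
+	# extent items span multiple leafs
+	$ for ((i = 0; i < 500; i++)); do \
+		xfs_io -f -s -c "pwrite $((i * 4096)) 4K" /mnt/foo; done
+	$ umount /mnt; mount /dev/sdb /mnt
+	$ xfs_io -c "fpunch 0 512K" -c "fsync" /mnt/foo
+	<power fail, mount again so log replay runs>
+	$ od -A d -c /mnt/foo	# before the fix, the hole could be missing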
+ +Fixes: 16e7549f045d33 ("Btrfs: incompatible format change to remove hole extents") +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/tree-log.c | 388 +++++++++++++--------------------------------------- + 1 file changed, 100 insertions(+), 288 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -3935,7 +3935,7 @@ static int log_csums(struct btrfs_trans_ + static noinline int copy_items(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *dst_path, +- struct btrfs_path *src_path, u64 *last_extent, ++ struct btrfs_path *src_path, + int start_slot, int nr, int inode_only, + u64 logged_isize) + { +@@ -3946,7 +3946,6 @@ static noinline int copy_items(struct bt + struct btrfs_file_extent_item *extent; + struct btrfs_inode_item *inode_item; + struct extent_buffer *src = src_path->nodes[0]; +- struct btrfs_key first_key, last_key, key; + int ret; + struct btrfs_key *ins_keys; + u32 *ins_sizes; +@@ -3954,9 +3953,6 @@ static noinline int copy_items(struct bt + int i; + struct list_head ordered_sums; + int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; +- bool has_extents = false; +- bool need_find_last_extent = true; +- bool done = false; + + INIT_LIST_HEAD(&ordered_sums); + +@@ -3965,8 +3961,6 @@ static noinline int copy_items(struct bt + if (!ins_data) + return -ENOMEM; + +- first_key.objectid = (u64)-1; +- + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); + +@@ -3987,9 +3981,6 @@ static noinline int copy_items(struct bt + + src_offset = btrfs_item_ptr_offset(src, start_slot + i); + +- if (i == nr - 1) +- last_key = ins_keys[i]; +- + if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { + inode_item = btrfs_item_ptr(dst_path->nodes[0], + dst_path->slots[0], +@@ -4003,20 +3994,6 @@ static noinline int copy_items(struct bt + src_offset, ins_sizes[i]); + } + +- /* +- * We set need_find_last_extent here in case we know we were +- * processing other items and then walk into the first extent in +- * the inode. If we don't hit an extent then nothing changes, +- * we'll do the last search the next time around. +- */ +- if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { +- has_extents = true; +- if (first_key.objectid == (u64)-1) +- first_key = ins_keys[i]; +- } else { +- need_find_last_extent = false; +- } +- + /* take a reference on file data extents so that truncates + * or deletes of this inode don't have to relog the inode + * again +@@ -4082,167 +4059,6 @@ static noinline int copy_items(struct bt + kfree(sums); + } + +- if (!has_extents) +- return ret; +- +- if (need_find_last_extent && *last_extent == first_key.offset) { +- /* +- * We don't have any leafs between our current one and the one +- * we processed before that can have file extent items for our +- * inode (and have a generation number smaller than our current +- * transaction id). +- */ +- need_find_last_extent = false; +- } +- +- /* +- * Because we use btrfs_search_forward we could skip leaves that were +- * not modified and then assume *last_extent is valid when it really +- * isn't. So back up to the previous leaf and read the end of the last +- * extent before we go and fill in holes. 
+- */ +- if (need_find_last_extent) { +- u64 len; +- +- ret = btrfs_prev_leaf(inode->root, src_path); +- if (ret < 0) +- return ret; +- if (ret) +- goto fill_holes; +- if (src_path->slots[0]) +- src_path->slots[0]--; +- src = src_path->nodes[0]; +- btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); +- if (key.objectid != btrfs_ino(inode) || +- key.type != BTRFS_EXTENT_DATA_KEY) +- goto fill_holes; +- extent = btrfs_item_ptr(src, src_path->slots[0], +- struct btrfs_file_extent_item); +- if (btrfs_file_extent_type(src, extent) == +- BTRFS_FILE_EXTENT_INLINE) { +- len = btrfs_file_extent_ram_bytes(src, extent); +- *last_extent = ALIGN(key.offset + len, +- fs_info->sectorsize); +- } else { +- len = btrfs_file_extent_num_bytes(src, extent); +- *last_extent = key.offset + len; +- } +- } +-fill_holes: +- /* So we did prev_leaf, now we need to move to the next leaf, but a few +- * things could have happened +- * +- * 1) A merge could have happened, so we could currently be on a leaf +- * that holds what we were copying in the first place. +- * 2) A split could have happened, and now not all of the items we want +- * are on the same leaf. +- * +- * So we need to adjust how we search for holes, we need to drop the +- * path and re-search for the first extent key we found, and then walk +- * forward until we hit the last one we copied. +- */ +- if (need_find_last_extent) { +- /* btrfs_prev_leaf could return 1 without releasing the path */ +- btrfs_release_path(src_path); +- ret = btrfs_search_slot(NULL, inode->root, &first_key, +- src_path, 0, 0); +- if (ret < 0) +- return ret; +- ASSERT(ret == 0); +- src = src_path->nodes[0]; +- i = src_path->slots[0]; +- } else { +- i = start_slot; +- } +- +- /* +- * Ok so here we need to go through and fill in any holes we may have +- * to make sure that holes are punched for those areas in case they had +- * extents previously. +- */ +- while (!done) { +- u64 offset, len; +- u64 extent_end; +- +- if (i >= btrfs_header_nritems(src_path->nodes[0])) { +- ret = btrfs_next_leaf(inode->root, src_path); +- if (ret < 0) +- return ret; +- ASSERT(ret == 0); +- src = src_path->nodes[0]; +- i = 0; +- need_find_last_extent = true; +- } +- +- btrfs_item_key_to_cpu(src, &key, i); +- if (!btrfs_comp_cpu_keys(&key, &last_key)) +- done = true; +- if (key.objectid != btrfs_ino(inode) || +- key.type != BTRFS_EXTENT_DATA_KEY) { +- i++; +- continue; +- } +- extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); +- if (btrfs_file_extent_type(src, extent) == +- BTRFS_FILE_EXTENT_INLINE) { +- len = btrfs_file_extent_ram_bytes(src, extent); +- extent_end = ALIGN(key.offset + len, +- fs_info->sectorsize); +- } else { +- len = btrfs_file_extent_num_bytes(src, extent); +- extent_end = key.offset + len; +- } +- i++; +- +- if (*last_extent == key.offset) { +- *last_extent = extent_end; +- continue; +- } +- offset = *last_extent; +- len = key.offset - *last_extent; +- ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), +- offset, 0, 0, len, 0, len, 0, 0, 0); +- if (ret) +- break; +- *last_extent = extent_end; +- } +- +- /* +- * Check if there is a hole between the last extent found in our leaf +- * and the first extent in the next leaf. If there is one, we need to +- * log an explicit hole so that at replay time we can punch the hole. 
+- */ +- if (ret == 0 && +- key.objectid == btrfs_ino(inode) && +- key.type == BTRFS_EXTENT_DATA_KEY && +- i == btrfs_header_nritems(src_path->nodes[0])) { +- ret = btrfs_next_leaf(inode->root, src_path); +- need_find_last_extent = true; +- if (ret > 0) { +- ret = 0; +- } else if (ret == 0) { +- btrfs_item_key_to_cpu(src_path->nodes[0], &key, +- src_path->slots[0]); +- if (key.objectid == btrfs_ino(inode) && +- key.type == BTRFS_EXTENT_DATA_KEY && +- *last_extent < key.offset) { +- const u64 len = key.offset - *last_extent; +- +- ret = btrfs_insert_file_extent(trans, log, +- btrfs_ino(inode), +- *last_extent, 0, +- 0, len, 0, len, +- 0, 0, 0); +- *last_extent += len; +- } +- } +- } +- /* +- * Need to let the callers know we dropped the path so they should +- * re-search. +- */ +- if (!ret && need_find_last_extent) +- ret = 1; + return ret; + } + +@@ -4407,7 +4223,7 @@ static int btrfs_log_prealloc_extents(st + const u64 i_size = i_size_read(&inode->vfs_inode); + const u64 ino = btrfs_ino(inode); + struct btrfs_path *dst_path = NULL; +- u64 last_extent = (u64)-1; ++ bool dropped_extents = false; + int ins_nr = 0; + int start_slot; + int ret; +@@ -4429,8 +4245,7 @@ static int btrfs_log_prealloc_extents(st + if (slot >= btrfs_header_nritems(leaf)) { + if (ins_nr > 0) { + ret = copy_items(trans, inode, dst_path, path, +- &last_extent, start_slot, +- ins_nr, 1, 0); ++ start_slot, ins_nr, 1, 0); + if (ret < 0) + goto out; + ins_nr = 0; +@@ -4454,8 +4269,7 @@ static int btrfs_log_prealloc_extents(st + path->slots[0]++; + continue; + } +- if (last_extent == (u64)-1) { +- last_extent = key.offset; ++ if (!dropped_extents) { + /* + * Avoid logging extent items logged in past fsync calls + * and leading to duplicate keys in the log tree. +@@ -4469,6 +4283,7 @@ static int btrfs_log_prealloc_extents(st + } while (ret == -EAGAIN); + if (ret) + goto out; ++ dropped_extents = true; + } + if (ins_nr == 0) + start_slot = slot; +@@ -4483,7 +4298,7 @@ static int btrfs_log_prealloc_extents(st + } + } + if (ins_nr > 0) { +- ret = copy_items(trans, inode, dst_path, path, &last_extent, ++ ret = copy_items(trans, inode, dst_path, path, + start_slot, ins_nr, 1, 0); + if (ret > 0) + ret = 0; +@@ -4670,13 +4485,8 @@ static int btrfs_log_all_xattrs(struct b + + if (slot >= nritems) { + if (ins_nr > 0) { +- u64 last_extent = 0; +- + ret = copy_items(trans, inode, dst_path, path, +- &last_extent, start_slot, +- ins_nr, 1, 0); +- /* can't be 1, extent items aren't processed */ +- ASSERT(ret <= 0); ++ start_slot, ins_nr, 1, 0); + if (ret < 0) + return ret; + ins_nr = 0; +@@ -4700,13 +4510,8 @@ static int btrfs_log_all_xattrs(struct b + cond_resched(); + } + if (ins_nr > 0) { +- u64 last_extent = 0; +- + ret = copy_items(trans, inode, dst_path, path, +- &last_extent, start_slot, +- ins_nr, 1, 0); +- /* can't be 1, extent items aren't processed */ +- ASSERT(ret <= 0); ++ start_slot, ins_nr, 1, 0); + if (ret < 0) + return ret; + } +@@ -4715,100 +4520,119 @@ static int btrfs_log_all_xattrs(struct b + } + + /* +- * If the no holes feature is enabled we need to make sure any hole between the +- * last extent and the i_size of our inode is explicitly marked in the log. 
This +- * is to make sure that doing something like: +- * +- * 1) create file with 128Kb of data +- * 2) truncate file to 64Kb +- * 3) truncate file to 256Kb +- * 4) fsync file +- * 5) +- * 6) mount fs and trigger log replay +- * +- * Will give us a file with a size of 256Kb, the first 64Kb of data match what +- * the file had in its first 64Kb of data at step 1 and the last 192Kb of the +- * file correspond to a hole. The presence of explicit holes in a log tree is +- * what guarantees that log replay will remove/adjust file extent items in the +- * fs/subvol tree. +- * +- * Here we do not need to care about holes between extents, that is already done +- * by copy_items(). We also only need to do this in the full sync path, where we +- * lookup for extents from the fs/subvol tree only. In the fast path case, we +- * lookup the list of modified extent maps and if any represents a hole, we +- * insert a corresponding extent representing a hole in the log tree. ++ * When using the NO_HOLES feature if we punched a hole that causes the ++ * deletion of entire leafs or all the extent items of the first leaf (the one ++ * that contains the inode item and references) we may end up not processing ++ * any extents, because there are no leafs with a generation matching the ++ * current transaction that have extent items for our inode. So we need to find ++ * if any holes exist and then log them. We also need to log holes after any ++ * truncate operation that changes the inode's size. + */ +-static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct btrfs_inode *inode, +- struct btrfs_path *path) ++static int btrfs_log_holes(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct btrfs_inode *inode, ++ struct btrfs_path *path) + { + struct btrfs_fs_info *fs_info = root->fs_info; +- int ret; + struct btrfs_key key; +- u64 hole_start; +- u64 hole_size; +- struct extent_buffer *leaf; +- struct btrfs_root *log = root->log_root; + const u64 ino = btrfs_ino(inode); + const u64 i_size = i_size_read(&inode->vfs_inode); ++ u64 prev_extent_end = 0; ++ int ret; + +- if (!btrfs_fs_incompat(fs_info, NO_HOLES)) ++ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) + return 0; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; +- key.offset = (u64)-1; ++ key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +- ASSERT(ret != 0); + if (ret < 0) + return ret; + +- ASSERT(path->slots[0] > 0); +- path->slots[0]--; +- leaf = path->nodes[0]; +- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); +- +- if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { +- /* inode does not have any extents */ +- hole_start = 0; +- hole_size = i_size; +- } else { ++ while (true) { + struct btrfs_file_extent_item *extent; ++ struct extent_buffer *leaf = path->nodes[0]; + u64 len; + +- /* +- * If there's an extent beyond i_size, an explicit hole was +- * already inserted by copy_items(). +- */ +- if (key.offset >= i_size) +- return 0; ++ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ++ ret = btrfs_next_leaf(root, path); ++ if (ret < 0) ++ return ret; ++ if (ret > 0) { ++ ret = 0; ++ break; ++ } ++ leaf = path->nodes[0]; ++ } ++ ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) ++ break; ++ ++ /* We have a hole, log it. 
*/ ++ if (prev_extent_end < key.offset) { ++ const u64 hole_len = key.offset - prev_extent_end; ++ ++ /* ++ * Release the path to avoid deadlocks with other code ++ * paths that search the root while holding locks on ++ * leafs from the log root. ++ */ ++ btrfs_release_path(path); ++ ret = btrfs_insert_file_extent(trans, root->log_root, ++ ino, prev_extent_end, 0, ++ 0, hole_len, 0, hole_len, ++ 0, 0, 0); ++ if (ret < 0) ++ return ret; ++ ++ /* ++ * Search for the same key again in the root. Since it's ++ * an extent item and we are holding the inode lock, the ++ * key must still exist. If it doesn't just emit warning ++ * and return an error to fall back to a transaction ++ * commit. ++ */ ++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); ++ if (ret < 0) ++ return ret; ++ if (WARN_ON(ret > 0)) ++ return -ENOENT; ++ leaf = path->nodes[0]; ++ } + + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); +- + if (btrfs_file_extent_type(leaf, extent) == +- BTRFS_FILE_EXTENT_INLINE) +- return 0; ++ BTRFS_FILE_EXTENT_INLINE) { ++ len = btrfs_file_extent_ram_bytes(leaf, extent); ++ prev_extent_end = ALIGN(key.offset + len, ++ fs_info->sectorsize); ++ } else { ++ len = btrfs_file_extent_num_bytes(leaf, extent); ++ prev_extent_end = key.offset + len; ++ } + +- len = btrfs_file_extent_num_bytes(leaf, extent); +- /* Last extent goes beyond i_size, no need to log a hole. */ +- if (key.offset + len > i_size) +- return 0; +- hole_start = key.offset + len; +- hole_size = i_size - hole_start; ++ path->slots[0]++; ++ cond_resched(); + } +- btrfs_release_path(path); + +- /* Last extent ends at i_size. */ +- if (hole_size == 0) +- return 0; ++ if (prev_extent_end < i_size) { ++ u64 hole_len; + +- hole_size = ALIGN(hole_size, fs_info->sectorsize); +- ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, +- hole_size, 0, hole_size, 0, 0, 0); +- return ret; ++ btrfs_release_path(path); ++ hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); ++ ret = btrfs_insert_file_extent(trans, root->log_root, ++ ino, prev_extent_end, 0, 0, ++ hole_len, 0, hole_len, ++ 0, 0, 0); ++ if (ret < 0) ++ return ret; ++ } ++ ++ return 0; + } + + /* +@@ -5110,7 +4934,6 @@ static int btrfs_log_inode(struct btrfs_ + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; +- u64 last_extent = 0; + int err = 0; + int ret; + int nritems; +@@ -5288,7 +5111,7 @@ again: + ins_start_slot = path->slots[0]; + } + ret = copy_items(trans, inode, dst_path, path, +- &last_extent, ins_start_slot, ++ ins_start_slot, + ins_nr, inode_only, + logged_isize); + if (ret < 0) { +@@ -5311,17 +5134,13 @@ again: + if (ins_nr == 0) + goto next_slot; + ret = copy_items(trans, inode, dst_path, path, +- &last_extent, ins_start_slot, ++ ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ins_nr = 0; +- if (ret) { +- btrfs_release_path(path); +- continue; +- } + goto next_slot; + } + +@@ -5334,18 +5153,13 @@ again: + goto next_slot; + } + +- ret = copy_items(trans, inode, dst_path, path, &last_extent, ++ ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, ins_nr, inode_only, + logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } +- if (ret) { +- ins_nr = 0; +- btrfs_release_path(path); +- continue; +- } + ins_nr = 1; + ins_start_slot = path->slots[0]; + next_slot: +@@ -5359,13 +5173,12 @@ next_slot: + } + if (ins_nr) { + ret = copy_items(trans, inode, dst_path, path, +- &last_extent, 
ins_start_slot, ++ ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } +- ret = 0; + ins_nr = 0; + } + btrfs_release_path(path); +@@ -5380,14 +5193,13 @@ next_key: + } + } + if (ins_nr) { +- ret = copy_items(trans, inode, dst_path, path, &last_extent, ++ ret = copy_items(trans, inode, dst_path, path, + ins_start_slot, ins_nr, inode_only, + logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } +- ret = 0; + ins_nr = 0; + } + +@@ -5400,7 +5212,7 @@ next_key: + if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { + btrfs_release_path(path); + btrfs_release_path(dst_path); +- err = btrfs_log_trailing_hole(trans, root, inode, path); ++ err = btrfs_log_holes(trans, root, inode, path); + if (err) + goto out_unlock; + } diff --git a/queue-5.5/btrfs-fix-race-between-adding-and-putting-tree-mod-seq-elements-and-nodes.patch b/queue-5.5/btrfs-fix-race-between-adding-and-putting-tree-mod-seq-elements-and-nodes.patch new file mode 100644 index 00000000000..89db330edb8 --- /dev/null +++ b/queue-5.5/btrfs-fix-race-between-adding-and-putting-tree-mod-seq-elements-and-nodes.patch @@ -0,0 +1,237 @@ +From 7227ff4de55d931bbdc156c8ef0ce4f100c78a5b Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 22 Jan 2020 12:23:20 +0000 +Subject: Btrfs: fix race between adding and putting tree mod seq elements and nodes + +From: Filipe Manana + +commit 7227ff4de55d931bbdc156c8ef0ce4f100c78a5b upstream. + +There is a race between adding elements to and removing elements from the +tree mod log list and rbtree that can lead to use-after-free problems. + +Consider the following example that explains how/why the problem happens: + +1) Task A has a mod log element with sequence number 200. It currently is + the only element in the mod log list; + +2) Task A calls btrfs_put_tree_mod_seq() because it no longer needs to + access the tree mod log. When it enters the function, it initializes + 'min_seq' to (u64)-1. Then it acquires the lock 'tree_mod_seq_lock' + before checking if there are other elements in the mod seq list. + Since the list is empty, 'min_seq' remains set to (u64)-1. Then it + unlocks the lock 'tree_mod_seq_lock'; + +3) Before task A acquires the lock 'tree_mod_log_lock', task B adds + itself to the mod seq list through btrfs_get_tree_mod_seq() and gets a + sequence number of 201; + +4) Some other task, name it task C, modifies a btree and because there are + elements in the mod seq list, it adds a tree mod elem to the tree + mod log rbtree. That node added to the mod log rbtree is assigned + a sequence number of 202; + +5) Task B, which is doing fiemap and resolving indirect back references, + calls btrfs's get_old_root(), with 'time_seq' == 201, which in turn + calls tree_mod_log_search() - the search returns the mod log node + from the rbtree with sequence number 202, created by task C; + +6) Task A now acquires the lock 'tree_mod_log_lock', starts iterating + the mod log rbtree and finds the node with sequence number 202. Since + 202 is less than the previously computed 'min_seq', (u64)-1, it + removes the node and frees it; + +7) Task B still has a pointer to the node with sequence number 202, and + it dereferences the pointer, both directly and through the call to + __tree_mod_log_rewind(), resulting in a use-after-free problem. + +This issue can be triggered sporadically with the test case generic/561 +from fstests, and it happens more frequently with a higher number of +duperemove processes.
When it happens to me, it either freezes the VM or +it produces a trace like the following before crashing: + + [ 1245.321140] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI + [ 1245.321200] CPU: 1 PID: 26997 Comm: pool Not tainted 5.5.0-rc6-btrfs-next-52 #1 + [ 1245.321235] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014 + [ 1245.321287] RIP: 0010:rb_next+0x16/0x50 + [ 1245.321307] Code: .... + [ 1245.321372] RSP: 0018:ffffa151c4d039b0 EFLAGS: 00010202 + [ 1245.321388] RAX: 6b6b6b6b6b6b6b6b RBX: ffff8ae221363c80 RCX: 6b6b6b6b6b6b6b6b + [ 1245.321409] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff8ae221363c80 + [ 1245.321439] RBP: ffff8ae20fcc4688 R08: 0000000000000002 R09: 0000000000000000 + [ 1245.321475] R10: ffff8ae20b120910 R11: 00000000243f8bb1 R12: 0000000000000038 + [ 1245.321506] R13: ffff8ae221363c80 R14: 000000000000075f R15: ffff8ae223f762b8 + [ 1245.321539] FS: 00007fdee1ec7700(0000) GS:ffff8ae236c80000(0000) knlGS:0000000000000000 + [ 1245.321591] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [ 1245.321614] CR2: 00007fded4030c48 CR3: 000000021da16003 CR4: 00000000003606e0 + [ 1245.321642] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + [ 1245.321668] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + [ 1245.321706] Call Trace: + [ 1245.321798] __tree_mod_log_rewind+0xbf/0x280 [btrfs] + [ 1245.321841] btrfs_search_old_slot+0x105/0xd00 [btrfs] + [ 1245.321877] resolve_indirect_refs+0x1eb/0xc60 [btrfs] + [ 1245.321912] find_parent_nodes+0x3dc/0x11b0 [btrfs] + [ 1245.321947] btrfs_check_shared+0x115/0x1c0 [btrfs] + [ 1245.321980] ? extent_fiemap+0x59d/0x6d0 [btrfs] + [ 1245.322029] extent_fiemap+0x59d/0x6d0 [btrfs] + [ 1245.322066] do_vfs_ioctl+0x45a/0x750 + [ 1245.322081] ksys_ioctl+0x70/0x80 + [ 1245.322092] ? trace_hardirqs_off_thunk+0x1a/0x1c + [ 1245.322113] __x64_sys_ioctl+0x16/0x20 + [ 1245.322126] do_syscall_64+0x5c/0x280 + [ 1245.322139] entry_SYSCALL_64_after_hwframe+0x49/0xbe + [ 1245.322155] RIP: 0033:0x7fdee3942dd7 + [ 1245.322177] Code: .... + [ 1245.322258] RSP: 002b:00007fdee1ec6c88 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 + [ 1245.322294] RAX: ffffffffffffffda RBX: 00007fded40210d8 RCX: 00007fdee3942dd7 + [ 1245.322314] RDX: 00007fded40210d8 RSI: 00000000c020660b RDI: 0000000000000004 + [ 1245.322337] RBP: 0000562aa89e7510 R08: 0000000000000000 R09: 00007fdee1ec6d44 + [ 1245.322369] R10: 0000000000000073 R11: 0000000000000246 R12: 00007fdee1ec6d48 + [ 1245.322390] R13: 00007fdee1ec6d40 R14: 00007fded40210d0 R15: 00007fdee1ec6d50 + [ 1245.322423] Modules linked in: .... + [ 1245.323443] ---[ end trace 01de1e9ec5dff3cd ]--- + +Fix this by ensuring that btrfs_put_tree_mod_seq() computes the minimum +sequence number and iterates the rbtree while holding the lock +'tree_mod_log_lock' in write mode. Also get rid of the 'tree_mod_seq_lock' +lock, since it is now redundant. 
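+ +As a condensed sketch of the fixed btrfs_put_tree_mod_seq() flow (not the +patch hunks themselves, which follow below; smallest_remaining_seq() is a +hypothetical helper standing in for the open-coded list walk): + + write_lock(&fs_info->tree_mod_log_lock); + list_del(&elem->list); /* leave the mod seq list */ + min_seq = smallest_remaining_seq(fs_info); /* walk tree_mod_seq_list */ + /* prune every rbtree node with seq < min_seq under the same lock */ + write_unlock(&fs_info->tree_mod_log_lock); + +Holding the single write lock across both steps means no task can still +observe an rbtree node that another task is about to free.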
+ +Fixes: bd989ba359f2ac ("Btrfs: add tree modification log functions") +Fixes: 097b8a7c9e48e2 ("Btrfs: join tree mod log code with the code holding back delayed refs") +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Josef Bacik +Reviewed-by: Nikolay Borisov +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.c | 8 ++------ + fs/btrfs/ctree.h | 6 ++---- + fs/btrfs/delayed-ref.c | 8 ++++---- + fs/btrfs/disk-io.c | 1 - + fs/btrfs/tests/btrfs-tests.c | 1 - + 5 files changed, 8 insertions(+), 16 deletions(-) + +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -326,12 +326,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_ + struct seq_list *elem) + { + write_lock(&fs_info->tree_mod_log_lock); +- spin_lock(&fs_info->tree_mod_seq_lock); + if (!elem->seq) { + elem->seq = btrfs_inc_tree_mod_seq(fs_info); + list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); + } +- spin_unlock(&fs_info->tree_mod_seq_lock); + write_unlock(&fs_info->tree_mod_log_lock); + + return elem->seq; +@@ -351,7 +349,7 @@ void btrfs_put_tree_mod_seq(struct btrfs + if (!seq_putting) + return; + +- spin_lock(&fs_info->tree_mod_seq_lock); ++ write_lock(&fs_info->tree_mod_log_lock); + list_del(&elem->list); + elem->seq = 0; + +@@ -362,19 +360,17 @@ void btrfs_put_tree_mod_seq(struct btrfs + * blocker with lower sequence number exists, we + * cannot remove anything from the log + */ +- spin_unlock(&fs_info->tree_mod_seq_lock); ++ write_unlock(&fs_info->tree_mod_log_lock); + return; + } + min_seq = cur_elem->seq; + } + } +- spin_unlock(&fs_info->tree_mod_seq_lock); + + /* + * anything that's lower than the lowest existing (read: blocked) + * sequence number can be removed from the tree. + */ +- write_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + for (node = rb_first(tm_root); node; node = next) { + next = rb_next(node); +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -673,14 +673,12 @@ struct btrfs_fs_info { + atomic_t nr_delayed_iputs; + wait_queue_head_t delayed_iputs_wait; + +- /* this protects tree_mod_seq_list */ +- spinlock_t tree_mod_seq_lock; + atomic64_t tree_mod_seq; +- struct list_head tree_mod_seq_list; + +- /* this protects tree_mod_log */ ++ /* this protects tree_mod_log and tree_mod_seq_list */ + rwlock_t tree_mod_log_lock; + struct rb_root tree_mod_log; ++ struct list_head tree_mod_seq_list; + + atomic_t async_delalloc_pages; + +--- a/fs/btrfs/delayed-ref.c ++++ b/fs/btrfs/delayed-ref.c +@@ -492,7 +492,7 @@ void btrfs_merge_delayed_refs(struct btr + if (head->is_data) + return; + +- spin_lock(&fs_info->tree_mod_seq_lock); ++ read_lock(&fs_info->tree_mod_log_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + struct seq_list *elem; + +@@ -500,7 +500,7 @@ void btrfs_merge_delayed_refs(struct btr + struct seq_list, list); + seq = elem->seq; + } +- spin_unlock(&fs_info->tree_mod_seq_lock); ++ read_unlock(&fs_info->tree_mod_log_lock); + + again: + for (node = rb_first_cached(&head->ref_tree); node; +@@ -518,7 +518,7 @@ int btrfs_check_delayed_seq(struct btrfs + struct seq_list *elem; + int ret = 0; + +- spin_lock(&fs_info->tree_mod_seq_lock); ++ read_lock(&fs_info->tree_mod_log_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); +@@ -531,7 +531,7 @@ int btrfs_check_delayed_seq(struct btrfs + } + } + +- spin_unlock(&fs_info->tree_mod_seq_lock); ++ read_unlock(&fs_info->tree_mod_log_lock); + return ret; + } + +--- a/fs/btrfs/disk-io.c 
++++ b/fs/btrfs/disk-io.c +@@ -2691,7 +2691,6 @@ int __cold open_ctree(struct super_block + spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->delayed_iput_lock); + spin_lock_init(&fs_info->defrag_inodes_lock); +- spin_lock_init(&fs_info->tree_mod_seq_lock); + spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->buffer_lock); + spin_lock_init(&fs_info->unused_bgs_lock); +--- a/fs/btrfs/tests/btrfs-tests.c ++++ b/fs/btrfs/tests/btrfs-tests.c +@@ -121,7 +121,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_ + spin_lock_init(&fs_info->qgroup_lock); + spin_lock_init(&fs_info->super_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); +- spin_lock_init(&fs_info->tree_mod_seq_lock); + mutex_init(&fs_info->qgroup_ioctl_lock); + mutex_init(&fs_info->qgroup_rescan_lock); + rwlock_init(&fs_info->tree_mod_log_lock); diff --git a/queue-5.5/btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch b/queue-5.5/btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch new file mode 100644 index 00000000000..7fa5643dff6 --- /dev/null +++ b/queue-5.5/btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch @@ -0,0 +1,105 @@ +From 42ffb0bf584ae5b6b38f72259af1e0ee417ac77f Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Thu, 23 Jan 2020 15:33:02 -0500 +Subject: btrfs: flush write bio if we loop in extent_write_cache_pages + +From: Josef Bacik + +commit 42ffb0bf584ae5b6b38f72259af1e0ee417ac77f upstream. + +There exists a deadlock with range_cyclic that has existed forever. If +we loop around with a bio already built we could deadlock with a writer +who has the page locked that we're attempting to write but is waiting on +a page in our bio to be written out. The task traces are as follows + + PID: 1329874 TASK: ffff889ebcdf3800 CPU: 33 COMMAND: "kworker/u113:5" + #0 [ffffc900297bb658] __schedule at ffffffff81a4c33f + #1 [ffffc900297bb6e0] schedule at ffffffff81a4c6e3 + #2 [ffffc900297bb6f8] io_schedule at ffffffff81a4ca42 + #3 [ffffc900297bb708] __lock_page at ffffffff811f145b + #4 [ffffc900297bb798] __process_pages_contig at ffffffff814bc502 + #5 [ffffc900297bb8c8] lock_delalloc_pages at ffffffff814bc684 + #6 [ffffc900297bb900] find_lock_delalloc_range at ffffffff814be9ff + #7 [ffffc900297bb9a0] writepage_delalloc at ffffffff814bebd0 + #8 [ffffc900297bba18] __extent_writepage at ffffffff814bfbf2 + #9 [ffffc900297bba98] extent_write_cache_pages at ffffffff814bffbd + + PID: 2167901 TASK: ffff889dc6a59c00 CPU: 14 COMMAND: + "aio-dio-invalid" + #0 [ffffc9003b50bb18] __schedule at ffffffff81a4c33f + #1 [ffffc9003b50bba0] schedule at ffffffff81a4c6e3 + #2 [ffffc9003b50bbb8] io_schedule at ffffffff81a4ca42 + #3 [ffffc9003b50bbc8] wait_on_page_bit at ffffffff811f24d6 + #4 [ffffc9003b50bc60] prepare_pages at ffffffff814b05a7 + #5 [ffffc9003b50bcd8] btrfs_buffered_write at ffffffff814b1359 + #6 [ffffc9003b50bdb0] btrfs_file_write_iter at ffffffff814b5933 + #7 [ffffc9003b50be38] new_sync_write at ffffffff8128f6a8 + #8 [ffffc9003b50bec8] vfs_write at ffffffff81292b9d + #9 [ffffc9003b50bf00] ksys_pwrite64 at ffffffff81293032 + +I used drgn to find the respective pages we were stuck on + +page_entry.page 0xffffea00fbfc7500 index 8148 bit 15 pid 2167901 +page_entry.page 0xffffea00f9bb7400 index 7680 bit 0 pid 1329874 + +As you can see the kworker is waiting for bit 0 (PG_locked) on index +7680, and aio-dio-invalid is waiting for bit 15 (PG_writeback) on index +8148. 
aio-dio-invalid has 7680, and the kworker epd looks like the +following + + crash> struct extent_page_data ffffc900297bbbb0 + struct extent_page_data { + bio = 0xffff889f747ed830, + tree = 0xffff889eed6ba448, + extent_locked = 0, + sync_io = 0 + } + +Probably worth mentioning as well that it waits for writeback of the +page to complete while holding a lock on it (at prepare_pages()). + +Using drgn I walked the bio pages looking for page +0xffffea00fbfc7500 which is the one we're waiting for writeback on + + bio = Object(prog, 'struct bio', address=0xffff889f747ed830) + for i in range(0, bio.bi_vcnt.value_()): + bv = bio.bi_io_vec[i] + if bv.bv_page.value_() == 0xffffea00fbfc7500: + print("FOUND IT") + +which validated what I suspected. + +The fix for this is simple, flush the epd before we loop back around to +the beginning of the file during writeout. + +Fixes: b293f02e1423 ("Btrfs: Add writepages support") +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent_io.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -4188,7 +4188,16 @@ retry: + */ + scanned = 1; + index = 0; +- goto retry; ++ ++ /* ++ * If we're looping we could run into a page that is locked by a ++ * writer and that writer could be waiting on writeback for a ++ * page in our current bio, and thus deadlock, so flush the ++ * write bio here. ++ */ ++ ret = flush_write_bio(epd); ++ if (!ret) ++ goto retry; + } + + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) diff --git a/queue-5.5/btrfs-free-block-groups-after-free-ing-fs-trees.patch b/queue-5.5/btrfs-free-block-groups-after-free-ing-fs-trees.patch new file mode 100644 index 00000000000..346b61a6372 --- /dev/null +++ b/queue-5.5/btrfs-free-block-groups-after-free-ing-fs-trees.patch @@ -0,0 +1,53 @@ +From 4e19443da1941050b346f8fc4c368aa68413bc88 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Tue, 21 Jan 2020 09:17:06 -0500 +Subject: btrfs: free block groups after free'ing fs trees + +From: Josef Bacik + +commit 4e19443da1941050b346f8fc4c368aa68413bc88 upstream. + +Sometimes when running generic/475 we would trip the +WARN_ON(cache->reserved) check when free'ing the block groups on umount. +This is because sometimes we don't commit the transaction because of IO +errors and thus do not cleanup the tree logs until at umount time. + +These blocks are still reserved until they are cleaned up, but they +aren't cleaned up until _after_ we do the free block groups work. Fix +this by moving the free after free'ing the fs roots, that way all of the +tree logs are cleaned up and we have a properly cleaned fs. A bunch of +loops of generic/475 confirmed this fixes the problem. 
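+ +In close_ctree() terms, the required teardown order is (a sketch of the +result, simplified from the hunk below): + + clear_bit(BTRFS_FS_OPEN, &fs_info->flags); + free_root_pointers(fs_info, true); /* tree log blocks released here */ + btrfs_free_block_groups(fs_info); /* cache->reserved is now zero */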
+ +CC: stable@vger.kernel.org # 4.9+ +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/disk-io.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -4026,11 +4026,18 @@ void __cold close_ctree(struct btrfs_fs_ + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + btrfs_stop_all_workers(fs_info); + +- btrfs_free_block_groups(fs_info); +- + clear_bit(BTRFS_FS_OPEN, &fs_info->flags); + free_root_pointers(fs_info, true); + ++ /* ++ * We must free the block groups after dropping the fs_roots as we could ++ * have had an IO error and have left over tree log blocks that aren't ++ * cleaned up until the fs roots are freed. This makes the block group ++ * accounting appear to be wrong because there's pending reserved bytes, ++ * so make sure we do the block group cleanup afterwards. ++ */ ++ btrfs_free_block_groups(fs_info); ++ + iput(fs_info->btree_inode); + + #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY diff --git a/queue-5.5/btrfs-make-deduplication-with-range-including-the-last-block-work.patch b/queue-5.5/btrfs-make-deduplication-with-range-including-the-last-block-work.patch new file mode 100644 index 00000000000..91cadb5c2ba --- /dev/null +++ b/queue-5.5/btrfs-make-deduplication-with-range-including-the-last-block-work.patch @@ -0,0 +1,67 @@ +From 831d2fa25ab8e27592b1b0268dae6f2dfaf7cc43 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 16 Dec 2019 18:26:56 +0000 +Subject: Btrfs: make deduplication with range including the last block work + +From: Filipe Manana + +commit 831d2fa25ab8e27592b1b0268dae6f2dfaf7cc43 upstream. + +Since btrfs was migrated to use the generic VFS helpers for clone and +deduplication, it stopped allowing the last block of a file to be +deduplicated when the source file size is not sector size aligned (when +eof is somewhere in the middle of the last block). There are two reasons +for that: + +1) The generic code always rounds down, to a multiple of the block size, + the range's length for deduplications. This means we end up never + deduplicating the last block when the eof is not block size aligned, + even for the safe case where the destination range's end offset matches + the destination file's size. That rounding down operation is done at + generic_remap_check_len(); + +2) Because of that, the btrfs specific code no longer expects any + non-aligned range lengths for deduplication and therefore does not + work if such a non-aligned length is given. + +This patch addresses that second part, and it depends on a patch that +fixes generic_remap_check_len(), in the VFS, which was submitted earlier +and has the following subject: + + "fs: allow deduplication of eof block into the end of the destination file" + +These two patches address reports from users that started seeing lower +deduplication rates due to the last block never being deduplicated when +the file size is not aligned to the filesystem's block size.
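+ +As a worked example (illustrative numbers only, for a 4K block size): if +the dedupe range ends at an eof of 65636 bytes, the fix hands the +block-aligned length to the internal clone: + + ALIGN(65636, 4096) == 69632 /* 17 blocks, covering the whole eof block */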
+ +Link: https://lore.kernel.org/linux-btrfs/2019-1576167349.500456@svIo.N5dq.dFFD/ +CC: stable@vger.kernel.org # 5.1+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ioctl.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -3243,6 +3243,7 @@ static void btrfs_double_extent_lock(str + static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, + struct inode *dst, u64 dst_loff) + { ++ const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + int ret; + + /* +@@ -3250,7 +3251,7 @@ static int btrfs_extent_same_range(struc + * source range to serialize with relocation. + */ + btrfs_double_extent_lock(src, loff, dst, dst_loff, len); +- ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1); ++ ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); + btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + + return ret; diff --git a/queue-5.5/btrfs-send-fix-emission-of-invalid-clone-operations-within-the-same-file.patch b/queue-5.5/btrfs-send-fix-emission-of-invalid-clone-operations-within-the-same-file.patch new file mode 100644 index 00000000000..937b535a621 --- /dev/null +++ b/queue-5.5/btrfs-send-fix-emission-of-invalid-clone-operations-within-the-same-file.patch @@ -0,0 +1,92 @@ +From 9722b10148504c4153a74a9c89725af271e490fc Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 29 Jan 2020 17:09:53 +0000 +Subject: Btrfs: send, fix emission of invalid clone operations within the same file + +From: Filipe Manana + +commit 9722b10148504c4153a74a9c89725af271e490fc upstream. + +When doing an incremental send and a file has extents shared with itself +at different file offsets, it's possible for send to emit clone operations +that will fail at the destination because the source range goes beyond the +file's current size. This happens when the file size has increased in the +send snapshot, there is a hole between the shared extents and both shared +extents are at file offsets which are greater the file's size in the +parent snapshot. + +Example: + + $ mkfs.btrfs -f /dev/sdb + $ mount /dev/sdb /mnt/sdb + + $ xfs_io -f -c "pwrite -S 0xf1 0 64K" /mnt/sdb/foobar + $ btrfs subvolume snapshot -r /mnt/sdb /mnt/sdb/base + $ btrfs send -f /tmp/1.snap /mnt/sdb/base + + # Create a 320K extent at file offset 512K. + $ xfs_io -c "pwrite -S 0xab 512K 64K" /mnt/sdb/foobar + $ xfs_io -c "pwrite -S 0xcd 576K 64K" /mnt/sdb/foobar + $ xfs_io -c "pwrite -S 0xef 640K 64K" /mnt/sdb/foobar + $ xfs_io -c "pwrite -S 0x64 704K 64K" /mnt/sdb/foobar + $ xfs_io -c "pwrite -S 0x73 768K 64K" /mnt/sdb/foobar + + # Clone part of that 320K extent into a lower file offset (192K). + # This file offset is greater than the file's size in the parent + # snapshot (64K). Also the clone range is a bit behind the offset of + # the 320K extent so that we leave a hole between the shared extents. 
+ $ xfs_io -c "reflink /mnt/sdb/foobar 448K 192K 192K" /mnt/sdb/foobar + + $ btrfs subvolume snapshot -r /mnt/sdb /mnt/sdb/incr + $ btrfs send -p /mnt/sdb/base -f /tmp/2.snap /mnt/sdb/incr + + $ mkfs.btrfs -f /dev/sdc + $ mount /dev/sdc /mnt/sdc + + $ btrfs receive -f /tmp/1.snap /mnt/sdc + $ btrfs receive -f /tmp/2.snap /mnt/sdc + ERROR: failed to clone extents to foobar: Invalid argument + +The problem is that after processing the extent at file offset 256K, which +refers to the first 128K of the 320K extent created by the buffered write +operations, we have 'cur_inode_next_write_offset' set to 384K, which +corresponds to the end offset of the partially shared extent (256K + 128K) +and to the current file size in the receiver. Then when we process the +extent at offset 512K, we do extent backreference iteration to figure out +if we can clone the extent from some other inode or from the same inode, +and we consider the extent at offset 256K of the same inode as a valid +source for a clone operation, which is not correct because at that point +the current file size in the receiver is 384K, which corresponds to the +end of last processed extent (at file offset 256K), so using a clone +source range from 256K to 256K + 320K is invalid because that goes past +the current size of the file (384K) - this makes the receiver get an +-EINVAL error when attempting the clone operation. + +So fix this by excluding clone sources that have a range that goes beyond +the current file size in the receiver when iterating extent backreferences. + +A test case for fstests follows soon. + +Fixes: 11f2069c113e02 ("Btrfs: send, allow clone operations within the same file") +CC: stable@vger.kernel.org # 5.5+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/send.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/send.c ++++ b/fs/btrfs/send.c +@@ -1269,7 +1269,8 @@ static int __iterate_backrefs(u64 ino, u + * destination of the stream. + */ + if (ino == bctx->cur_objectid && +- offset >= bctx->sctx->cur_inode_next_write_offset) ++ offset + bctx->extent_len > ++ bctx->sctx->cur_inode_next_write_offset) + return 0; + } + diff --git a/queue-5.5/btrfs-set-trans-drity-in-btrfs_commit_transaction.patch b/queue-5.5/btrfs-set-trans-drity-in-btrfs_commit_transaction.patch new file mode 100644 index 00000000000..21e983434de --- /dev/null +++ b/queue-5.5/btrfs-set-trans-drity-in-btrfs_commit_transaction.patch @@ -0,0 +1,96 @@ +From d62b23c94952e78211a383b7d90ef0afbd9a3717 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Fri, 17 Jan 2020 08:57:51 -0500 +Subject: btrfs: set trans->drity in btrfs_commit_transaction + +From: Josef Bacik + +commit d62b23c94952e78211a383b7d90ef0afbd9a3717 upstream. + +If we abort a transaction we have the following sequence + +if (!trans->dirty && list_empty(&trans->new_bgs)) + return; +WRITE_ONCE(trans->transaction->aborted, err); + +The idea being if we didn't modify anything with our trans handle then +we don't really need to abort the whole transaction, maybe the other +trans handles are fine and we can carry on. + +However in the case of create_snapshot we add a pending_snapshot object +to our transaction and then commit the transaction. We don't actually +modify anything. sync() behaves the same way, attach to an existing +transaction and commit it. 
This means that if we have an IO error in +the right places we could abort the committing transaction with our +trans->dirty not set, and thus we would never set transaction->aborted. + +This is a problem because in the create_snapshot() case we depend on +pending->error being set to something, or btrfs_commit_transaction +returning an error. + +If we are not the trans handle that gets to commit the transaction, and +we're waiting on the commit to happen we get our return value from +cur_trans->aborted. If this was not set to anything because sync() hit +an error in the transaction commit before it could modify anything then +cur_trans->aborted would be 0. Thus we'd return 0 from +btrfs_commit_transaction() in create_snapshot. + +This is a problem because we then try to do things with +pending_snapshot->snap, which will be NULL because we didn't create the +snapshot, and then we'll get a NULL pointer dereference like the +following + +"BUG: kernel NULL pointer dereference, address: 00000000000001f0" +RIP: 0010:btrfs_orphan_cleanup+0x2d/0x330 +Call Trace: + ? btrfs_mksubvol.isra.31+0x3f2/0x510 + btrfs_mksubvol.isra.31+0x4bc/0x510 + ? __sb_start_write+0xfa/0x200 + ? mnt_want_write_file+0x24/0x50 + btrfs_ioctl_snap_create_transid+0x16c/0x1a0 + btrfs_ioctl_snap_create_v2+0x11e/0x1a0 + btrfs_ioctl+0x1534/0x2c10 + ? free_debug_processing+0x262/0x2a3 + do_vfs_ioctl+0xa6/0x6b0 + ? do_sys_open+0x188/0x220 + ? syscall_trace_enter+0x1f8/0x330 + ksys_ioctl+0x60/0x90 + __x64_sys_ioctl+0x16/0x20 + do_syscall_64+0x4a/0x1b0 + +In order to fix this we need to make sure anybody who calls +commit_transaction has trans->dirty set so that they properly set the +trans->transaction->aborted value and any waiters know bad +things happened. + +This was found while I was running generic/475 with my modified +fsstress; it reproduced within a few runs. I ran with this patch all +night and didn't see the problem again. + +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/transaction.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -2013,6 +2013,14 @@ int btrfs_commit_transaction(struct btrf + + ASSERT(refcount_read(&trans->use_count) == 1); + ++ /* ++ * Some places just start a transaction to commit it. We need to make ++ * sure that if this commit fails that the abort code actually marks the ++ * transaction as failed, so set trans->dirty to make the abort code do ++ * the right thing. ++ */ ++ trans->dirty = true; ++ + /* Stop the commit early if ->aborted is set */ + if (unlikely(READ_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; diff --git a/queue-5.5/drm-atmel-hlcdc-enable-clock-before-configuring-timing-engine.patch b/queue-5.5/drm-atmel-hlcdc-enable-clock-before-configuring-timing-engine.patch new file mode 100644 index 00000000000..87b11928dee --- /dev/null +++ b/queue-5.5/drm-atmel-hlcdc-enable-clock-before-configuring-timing-engine.patch @@ -0,0 +1,53 @@ +From 2c1fb9d86f6820abbfaa38a6836157c76ccb4e7b Mon Sep 17 00:00:00 2001 +From: Claudiu Beznea +Date: Wed, 18 Dec 2019 14:28:25 +0200 +Subject: drm: atmel-hlcdc: enable clock before configuring timing engine + +From: Claudiu Beznea + +commit 2c1fb9d86f6820abbfaa38a6836157c76ccb4e7b upstream.
+ +Changing pixel clock source without having this clock source enabled +will block the timing engine and the next operations after (in this case +setting ATMEL_HLCDC_CFG(5) settings in atmel_hlcdc_crtc_mode_set_nofb() +will fail). It is recommended (although this is not stated in the +datasheet) to actually enable the pixel clock source before making any +changes to the timing engine (only the SAM9X60 datasheet specifies that +the peripheral clock and pixel clock must be enabled before using the LCD +controller). + +Fixes: 1a396789f65a ("drm: add Atmel HLCDC Display Controller support") +Signed-off-by: Claudiu Beznea +Signed-off-by: Sam Ravnborg +Cc: Boris Brezillon +Cc: # v4.0+ +Link: https://patchwork.freedesktop.org/patch/msgid/1576672109-22707-3-git-send-email-claudiu.beznea@microchip.com +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c ++++ b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c +@@ -73,7 +73,11 @@ static void atmel_hlcdc_crtc_mode_set_no + unsigned long prate; + unsigned int mask = ATMEL_HLCDC_CLKDIV_MASK | ATMEL_HLCDC_CLKPOL; + unsigned int cfg = 0; +- int div; ++ int div, ret; ++ ++ ret = clk_prepare_enable(crtc->dc->hlcdc->sys_clk); ++ if (ret) ++ return; + + vm.vfront_porch = adj->crtc_vsync_start - adj->crtc_vdisplay; + vm.vback_porch = adj->crtc_vtotal - adj->crtc_vsync_end; +@@ -147,6 +151,8 @@ static void atmel_hlcdc_crtc_mode_set_no + ATMEL_HLCDC_VSPSU | ATMEL_HLCDC_VSPHO | + ATMEL_HLCDC_GUARDTIME_MASK | ATMEL_HLCDC_MODE_MASK, + cfg); ++ ++ clk_disable_unprepare(crtc->dc->hlcdc->sys_clk); + } + + static enum drm_mode_status diff --git a/queue-5.5/drm-atmel-hlcdc-prefer-a-lower-pixel-clock-than-requested.patch b/queue-5.5/drm-atmel-hlcdc-prefer-a-lower-pixel-clock-than-requested.patch new file mode 100644 index 00000000000..fa88f0e92e5 --- /dev/null +++ b/queue-5.5/drm-atmel-hlcdc-prefer-a-lower-pixel-clock-than-requested.patch @@ -0,0 +1,43 @@ +From 51a19d150b520f6cb42143f3bdffacd3c33d7ac5 Mon Sep 17 00:00:00 2001 +From: Peter Rosin +Date: Wed, 18 Dec 2019 14:28:28 +0200 +Subject: drm: atmel-hlcdc: prefer a lower pixel-clock than requested + +From: Peter Rosin + +commit 51a19d150b520f6cb42143f3bdffacd3c33d7ac5 upstream. + +The intention was to only select a higher pixel-clock rate than the +requested one, if a slight overclocking would result in a rate significantly +closer to the requested rate than if the conservative lower pixel-clock +rate is selected. The patch being fixed has the logic the other way around +and actually prefers the higher frequency. Fix that.
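+ +A worked illustration of the corrected rule (made-up numbers; the real +code compares rates in Hz): take prate = 100 MHz and a requested +mode_rate = 33 MHz. + + div = DIV_ROUND_UP(100, 33) = 4 /* 25 MHz, 8 MHz below the request */ + div_low = 100 / 33 = 3 /* ~33.33 MHz, only ~0.33 MHz above */ + +Since 10 * 0.33 < 8, the slightly faster clock is at least ten times +closer to the request, so it is the one worth selecting; in every other +case the conservative lower rate wins.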
+ +Signed-off-by: Peter Rosin +Signed-off-by: Claudiu Beznea +Signed-off-by: Sam Ravnborg +Fixes: 9946a3a9dbed ("drm/atmel-hlcdc: allow selecting a higher pixel-clock than requested") +Reported-by: Claudiu Beznea +Tested-by: Claudiu Beznea +Cc: Boris Brezillon +Cc: # v4.20+ +Link: https://patchwork.freedesktop.org/patch/msgid/1576672109-22707-6-git-send-email-claudiu.beznea@microchip.com +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c ++++ b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c +@@ -121,8 +121,8 @@ static void atmel_hlcdc_crtc_mode_set_no + int div_low = prate / mode_rate; + + if (div_low >= 2 && +- ((prate / div_low - mode_rate) < +- 10 * (mode_rate - prate / div))) ++ (10 * (prate / div_low - mode_rate) < ++ (mode_rate - prate / div))) + /* + * At least 10 times better when using a higher + * frequency than requested, instead of a lower. diff --git a/queue-5.5/drm-atmel-hlcdc-use-double-rate-for-pixel-clock-only-if-supported.patch b/queue-5.5/drm-atmel-hlcdc-use-double-rate-for-pixel-clock-only-if-supported.patch new file mode 100644 index 00000000000..ca21ebf58cf --- /dev/null +++ b/queue-5.5/drm-atmel-hlcdc-use-double-rate-for-pixel-clock-only-if-supported.patch @@ -0,0 +1,45 @@ +From 07acf4bafe81dd37eff3fbcfbbdbc48084bc202b Mon Sep 17 00:00:00 2001 +From: Claudiu Beznea +Date: Wed, 18 Dec 2019 14:28:24 +0200 +Subject: drm: atmel-hlcdc: use double rate for pixel clock only if supported + +From: Claudiu Beznea + +commit 07acf4bafe81dd37eff3fbcfbbdbc48084bc202b upstream. + +The doubled system clock should be used as the pixel clock source only if +this is supported. This is emphasized by the value of +atmel_hlcdc_crtc::dc::desc::fixed_clksrc. + +Fixes: a6eca2abdd42 ("drm: atmel-hlcdc: add config option for clock selection") +Signed-off-by: Claudiu Beznea +Signed-off-by: Sam Ravnborg +Cc: Boris Brezillon +Cc: # v5.3+ +Link: https://patchwork.freedesktop.org/patch/msgid/1576672109-22707-2-git-send-email-claudiu.beznea@microchip.com +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c ++++ b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c +@@ -95,14 +95,14 @@ static void atmel_hlcdc_crtc_mode_set_no + (adj->crtc_hdisplay - 1) | + ((adj->crtc_vdisplay - 1) << 16)); + ++ prate = clk_get_rate(crtc->dc->hlcdc->sys_clk); ++ mode_rate = adj->crtc_clock * 1000; + if (!crtc->dc->desc->fixed_clksrc) { ++ prate *= 2; + cfg |= ATMEL_HLCDC_CLKSEL; + mask |= ATMEL_HLCDC_CLKSEL; + } + +- prate = 2 * clk_get_rate(crtc->dc->hlcdc->sys_clk); +- mode_rate = adj->crtc_clock * 1000; +- + div = DIV_ROUND_UP(prate, mode_rate); + if (div < 2) { + div = 2; diff --git a/queue-5.5/drm-rect-avoid-division-by-zero.patch b/queue-5.5/drm-rect-avoid-division-by-zero.patch new file mode 100644 index 00000000000..10086397799 --- /dev/null +++ b/queue-5.5/drm-rect-avoid-division-by-zero.patch @@ -0,0 +1,47 @@ +From 433480c1afd44f3e1e664b85063d98cefeefa0ed Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= +Date: Fri, 22 Nov 2019 19:56:20 +0200 +Subject: drm/rect: Avoid division by zero +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Ville Syrjälä + +commit 433480c1afd44f3e1e664b85063d98cefeefa0ed upstream.
+ +Check for zero width/height destination rectangle in +drm_rect_clip_scaled() to avoid a division by zero. + +Cc: stable@vger.kernel.org +Fixes: f96bdf564f3e ("drm/rect: Handle rounding errors in drm_rect_clip_scaled, v3.") +Cc: Maarten Lankhorst +Cc: Benjamin Gaignard +Cc: Daniel Vetter +Testcase: igt/kms_selftest/drm_rect_clip_scaled_div_by_zero +Signed-off-by: Ville Syrjälä +Link: https://patchwork.freedesktop.org/patch/msgid/20191122175623.13565-2-ville.syrjala@linux.intel.com +Reviewed-by: Daniel Vetter +Reviewed-by: Benjamin Gaignard +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/gpu/drm/drm_rect.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/drivers/gpu/drm/drm_rect.c ++++ b/drivers/gpu/drm/drm_rect.c +@@ -54,7 +54,12 @@ EXPORT_SYMBOL(drm_rect_intersect); + + static u32 clip_scaled(u32 src, u32 dst, u32 clip) + { +- u64 tmp = mul_u32_u32(src, dst - clip); ++ u64 tmp; ++ ++ if (dst == 0) ++ return 0; ++ ++ tmp = mul_u32_u32(src, dst - clip); + + /* + * Round toward 1.0 when clipping so that we don't accidentally diff --git a/queue-5.5/eventfd-track-eventfd_signal-recursion-depth.patch b/queue-5.5/eventfd-track-eventfd_signal-recursion-depth.patch new file mode 100644 index 00000000000..389b191ac21 --- /dev/null +++ b/queue-5.5/eventfd-track-eventfd_signal-recursion-depth.patch @@ -0,0 +1,102 @@ +From b5e683d5cab8cd433b06ae178621f083cabd4f63 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Sun, 2 Feb 2020 08:23:03 -0700 +Subject: eventfd: track eventfd_signal() recursion depth + +From: Jens Axboe + +commit b5e683d5cab8cd433b06ae178621f083cabd4f63 upstream. + +eventfd use cases from aio and io_uring can deadlock due to circular +or recursive calling, when eventfd_signal() tries to grab the waitqueue +lock. On top of that, it's also possible to construct notification +chains that are deep enough that we could blow the stack. + +Add a percpu counter that tracks the percpu recursion depth, and warn if +we exceed it. The counter is also exposed so that users of eventfd_signal() +can do the right thing if it's non-zero in the context where it is +called. + +Cc: stable@vger.kernel.org # 4.19+ +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + fs/eventfd.c | 15 +++++++++++++++ + include/linux/eventfd.h | 14 ++++++++++++++ + 2 files changed, 29 insertions(+) + +--- a/fs/eventfd.c ++++ b/fs/eventfd.c +@@ -24,6 +24,8 @@ + #include + #include + ++DEFINE_PER_CPU(int, eventfd_wake_count); ++ + static DEFINE_IDA(eventfd_ida); + + struct eventfd_ctx { +@@ -60,12 +62,25 @@ __u64 eventfd_signal(struct eventfd_ctx + { + unsigned long flags; + ++ /* ++ * Deadlock or stack overflow issues can happen if we recurse here ++ * through waitqueue wakeup handlers. If the caller users potentially ++ * nested waitqueues with custom wakeup handlers, then it should ++ * check eventfd_signal_count() before calling this function. If ++ * it returns true, the eventfd_signal() call should be deferred to a ++ * safe context.
++ */ ++ if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) ++ return 0; ++ + spin_lock_irqsave(&ctx->wqh.lock, flags); ++ this_cpu_inc(eventfd_wake_count); + if (ULLONG_MAX - ctx->count < n) + n = ULLONG_MAX - ctx->count; + ctx->count += n; + if (waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, EPOLLIN); ++ this_cpu_dec(eventfd_wake_count); + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return n; +--- a/include/linux/eventfd.h ++++ b/include/linux/eventfd.h +@@ -12,6 +12,8 @@ + #include + #include + #include ++#include ++#include + + /* + * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining +@@ -40,6 +42,13 @@ __u64 eventfd_signal(struct eventfd_ctx + int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, + __u64 *cnt); + ++DECLARE_PER_CPU(int, eventfd_wake_count); ++ ++static inline bool eventfd_signal_count(void) ++{ ++ return this_cpu_read(eventfd_wake_count); ++} ++ + #else /* CONFIG_EVENTFD */ + + /* +@@ -68,6 +77,11 @@ static inline int eventfd_ctx_remove_wai + return -ENOSYS; + } + ++static inline bool eventfd_signal_count(void) ++{ ++ return false; ++} ++ + #endif + + #endif /* _LINUX_EVENTFD_H */ diff --git a/queue-5.5/ext4-fix-deadlock-allocating-crypto-bounce-page-from-mempool.patch b/queue-5.5/ext4-fix-deadlock-allocating-crypto-bounce-page-from-mempool.patch new file mode 100644 index 00000000000..dd949250107 --- /dev/null +++ b/queue-5.5/ext4-fix-deadlock-allocating-crypto-bounce-page-from-mempool.patch @@ -0,0 +1,77 @@ +From 547c556f4db7c09447ecf5f833ab6aaae0c5ab58 Mon Sep 17 00:00:00 2001 +From: Eric Biggers +Date: Tue, 31 Dec 2019 12:11:49 -0600 +Subject: ext4: fix deadlock allocating crypto bounce page from mempool + +From: Eric Biggers + +commit 547c556f4db7c09447ecf5f833ab6aaae0c5ab58 upstream. + +ext4_writepages() on an encrypted file has to encrypt the data, but it +can't modify the pagecache pages in-place, so it encrypts the data into +bounce pages and writes those instead. All bounce pages are allocated +from a mempool using GFP_NOFS. + +This is not correct use of a mempool, and it can deadlock. This is +because GFP_NOFS includes __GFP_DIRECT_RECLAIM, which enables the "never +fail" mode for mempool_alloc() where a failed allocation will fall back +to waiting for one of the preallocated elements in the pool. + +But since this mode is used for all a bio's pages and not just the +first, it can deadlock waiting for pages already in the bio to be freed. + +This deadlock can be reproduced by patching mempool_alloc() to pretend +that pool->alloc() always fails (so that it always falls back to the +preallocations), and then creating an encrypted file of size > 128 KiB. + +Fix it by only using GFP_NOFS for the first page in the bio. For +subsequent pages just use GFP_NOWAIT, and if any of those fail, just +submit the bio and start a new one. + +This will need to be fixed in f2fs too, but that's less straightforward. 
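+ +The resulting allocation policy, in sketch form (mirroring the hunk +below): + + gfp_t gfp_flags = io->io_bio ? (GFP_NOWAIT | __GFP_NOWARN) : GFP_NOFS; + /* only the first bounce page of a bio may block on the mempool; later + * pages must fail fast so we can submit the bio and start a new one */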
+ +Fixes: c9af28fdd449 ("ext4 crypto: don't let data integrity writebacks fail with ENOMEM") +Cc: stable@vger.kernel.org +Signed-off-by: Eric Biggers +Link: https://lore.kernel.org/r/20191231181149.47619-1-ebiggers@kernel.org +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/page-io.c | 19 ++++++++++++++----- + 1 file changed, 14 insertions(+), 5 deletions(-) + +--- a/fs/ext4/page-io.c ++++ b/fs/ext4/page-io.c +@@ -512,17 +512,26 @@ int ext4_bio_write_page(struct ext4_io_s + gfp_t gfp_flags = GFP_NOFS; + unsigned int enc_bytes = round_up(len, i_blocksize(inode)); + ++ /* ++ * Since bounce page allocation uses a mempool, we can only use ++ * a waiting mask (i.e. request guaranteed allocation) on the ++ * first page of the bio. Otherwise it can deadlock. ++ */ ++ if (io->io_bio) ++ gfp_flags = GFP_NOWAIT | __GFP_NOWARN; + retry_encrypt: + bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes, + 0, gfp_flags); + if (IS_ERR(bounce_page)) { + ret = PTR_ERR(bounce_page); +- if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { +- if (io->io_bio) { ++ if (ret == -ENOMEM && ++ (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) { ++ gfp_flags = GFP_NOFS; ++ if (io->io_bio) + ext4_io_submit(io); +- congestion_wait(BLK_RW_ASYNC, HZ/50); +- } +- gfp_flags |= __GFP_NOFAIL; ++ else ++ gfp_flags |= __GFP_NOFAIL; ++ congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_encrypt; + } + diff --git a/queue-5.5/ext4-fix-race-conditions-in-d_compare-and-d_hash.patch b/queue-5.5/ext4-fix-race-conditions-in-d_compare-and-d_hash.patch new file mode 100644 index 00000000000..9d20d18aba0 --- /dev/null +++ b/queue-5.5/ext4-fix-race-conditions-in-d_compare-and-d_hash.patch @@ -0,0 +1,79 @@ +From ec772f01307a2c06ebf6cdd221e6b518a71ddae7 Mon Sep 17 00:00:00 2001 +From: Eric Biggers +Date: Thu, 23 Jan 2020 20:12:34 -0800 +Subject: ext4: fix race conditions in ->d_compare() and ->d_hash() + +From: Eric Biggers + +commit ec772f01307a2c06ebf6cdd221e6b518a71ddae7 upstream. + +Since ->d_compare() and ->d_hash() can be called in RCU-walk mode, +->d_parent and ->d_inode can be concurrently modified, and in +particular, ->d_inode may be changed to NULL. For ext4_d_hash() this +resulted in a reproducible NULL dereference if a lookup is done in a +directory being deleted, e.g. with: + + int main() + { + if (fork()) { + for (;;) { + mkdir("subdir", 0700); + rmdir("subdir"); + } + } else { + for (;;) + access("subdir/file", 0); + } + } + +... or by running the 't_encrypted_d_revalidate' program from xfstests. +Both repros work in any directory on a filesystem with the encoding +feature, even if the directory doesn't actually have the casefold flag. + +I couldn't reproduce a crash in ext4_d_compare(), but it appears that a +similar crash is possible there. + +Fix these bugs by reading ->d_parent and ->d_inode using READ_ONCE() and +falling back to the case sensitive behavior if the inode is NULL. 
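+ +The safe pattern under RCU-walk, in miniature (mirroring the hunk below): + + const struct dentry *parent = READ_ONCE(dentry->d_parent); + const struct inode *inode = READ_ONCE(parent->d_inode); + + if (!inode || !IS_CASEFOLDED(inode)) + return memcmp(str, name->name, len); /* case-sensitive path, after the length check */ + +A plain dentry->d_parent->d_inode chain can race with rename/unlink in +RCU-walk mode and observe a NULL inode mid-lookup.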
+ +Reported-by: Al Viro +Fixes: b886ee3e778e ("ext4: Support case-insensitive file name lookups") +Cc: # v5.2+ +Signed-off-by: Eric Biggers +Link: https://lore.kernel.org/r/20200124041234.159740-1-ebiggers@kernel.org +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/dir.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +--- a/fs/ext4/dir.c ++++ b/fs/ext4/dir.c +@@ -672,9 +672,11 @@ static int ext4_d_compare(const struct d + const char *str, const struct qstr *name) + { + struct qstr qstr = {.name = str, .len = len }; +- struct inode *inode = dentry->d_parent->d_inode; ++ const struct dentry *parent = READ_ONCE(dentry->d_parent); ++ const struct inode *inode = READ_ONCE(parent->d_inode); + +- if (!IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) { ++ if (!inode || !IS_CASEFOLDED(inode) || ++ !EXT4_SB(inode->i_sb)->s_encoding) { + if (len != name->len) + return -1; + return memcmp(str, name->name, len); +@@ -687,10 +689,11 @@ static int ext4_d_hash(const struct dent + { + const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb); + const struct unicode_map *um = sbi->s_encoding; ++ const struct inode *inode = READ_ONCE(dentry->d_inode); + unsigned char *norm; + int len, ret = 0; + +- if (!IS_CASEFOLDED(dentry->d_inode) || !um) ++ if (!inode || !IS_CASEFOLDED(inode) || !um) + return 0; + + norm = kmalloc(PATH_MAX, GFP_ATOMIC); diff --git a/queue-5.5/gfs2-fix-gfs2_find_jhead-that-returns-uninitialized-jhead-with-seq-0.patch b/queue-5.5/gfs2-fix-gfs2_find_jhead-that-returns-uninitialized-jhead-with-seq-0.patch new file mode 100644 index 00000000000..e02f54cb37c --- /dev/null +++ b/queue-5.5/gfs2-fix-gfs2_find_jhead-that-returns-uninitialized-jhead-with-seq-0.patch @@ -0,0 +1,38 @@ +From 7582026f6f3588ecebd281965c8a71aff6fb6158 Mon Sep 17 00:00:00 2001 +From: Abhi Das +Date: Tue, 4 Feb 2020 14:14:56 -0600 +Subject: gfs2: fix gfs2_find_jhead that returns uninitialized jhead with seq 0 + +From: Abhi Das + +commit 7582026f6f3588ecebd281965c8a71aff6fb6158 upstream. + +When the first log header in a journal happens to have a sequence +number of 0, a bug in gfs2_find_jhead() causes it to prematurely exit, +and return an uninitialized jhead with seq 0. This can cause failures +in the caller. For instance, a mount fails in one test case. + +The correct behavior is for it to continue searching through the journal +to find the correct journal head with the highest sequence number. 
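+ +Concretely (an illustrative walk-through): the caller starts with +head->lh_sequence == 0, so when the first on-disk header also carries +sequence 0 the old test + + if (lh.lh_sequence > head->lh_sequence) /* 0 > 0 is false */ + +rejected it and the search stopped right away; with ">=" the scan adopts +that header and keeps walking toward the highest sequence number.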
+ +Fixes: f4686c26ecc3 ("gfs2: read journal in large chunks") +Cc: stable@vger.kernel.org # v5.2+ +Signed-off-by: Abhi Das +Signed-off-by: Andreas Gruenbacher +Signed-off-by: Greg Kroah-Hartman + +--- + fs/gfs2/lops.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/gfs2/lops.c ++++ b/fs/gfs2/lops.c +@@ -422,7 +422,7 @@ static bool gfs2_jhead_pg_srch(struct gf + + for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) { + if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) { +- if (lh.lh_sequence > head->lh_sequence) ++ if (lh.lh_sequence >= head->lh_sequence) + *head = lh; + else { + ret = true; diff --git a/queue-5.5/gfs2-fix-o_sync-write-handling.patch b/queue-5.5/gfs2-fix-o_sync-write-handling.patch new file mode 100644 index 00000000000..b44e8cc4942 --- /dev/null +++ b/queue-5.5/gfs2-fix-o_sync-write-handling.patch @@ -0,0 +1,111 @@ +From 6e5e41e2dc4e4413296d5a4af54ac92d7cd52317 Mon Sep 17 00:00:00 2001 +From: Andreas Gruenbacher +Date: Tue, 14 Jan 2020 17:12:18 +0100 +Subject: gfs2: fix O_SYNC write handling + +From: Andreas Gruenbacher + +commit 6e5e41e2dc4e4413296d5a4af54ac92d7cd52317 upstream. + +In gfs2_file_write_iter, for direct writes, the error checking in the buffered +write fallback case is incomplete. This can cause inode write errors to go +undetected. Fix and clean up gfs2_file_write_iter along the way. + +Based on a proposed fix by Christoph Hellwig . + +Fixes: 967bcc91b044 ("gfs2: iomap direct I/O support") +Cc: stable@vger.kernel.org # v4.19+ +Signed-off-by: Andreas Gruenbacher +Signed-off-by: Greg Kroah-Hartman + +--- + fs/gfs2/file.c | 51 +++++++++++++++++++++------------------------------ + 1 file changed, 21 insertions(+), 30 deletions(-) + +--- a/fs/gfs2/file.c ++++ b/fs/gfs2/file.c +@@ -847,7 +847,7 @@ static ssize_t gfs2_file_write_iter(stru + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct gfs2_inode *ip = GFS2_I(inode); +- ssize_t written = 0, ret; ++ ssize_t ret; + + ret = gfs2_rsqa_alloc(ip); + if (ret) +@@ -879,55 +879,46 @@ static ssize_t gfs2_file_write_iter(stru + + if (iocb->ki_flags & IOCB_DIRECT) { + struct address_space *mapping = file->f_mapping; +- loff_t pos, endbyte; +- ssize_t buffered; ++ ssize_t buffered, ret2; + +- written = gfs2_file_direct_write(iocb, from); +- if (written < 0 || !iov_iter_count(from)) ++ ret = gfs2_file_direct_write(iocb, from); ++ if (ret < 0 || !iov_iter_count(from)) + goto out_unlock; + ++ iocb->ki_flags |= IOCB_DSYNC; + current->backing_dev_info = inode_to_bdi(inode); +- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); ++ buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); + current->backing_dev_info = NULL; +- if (unlikely(ret < 0)) ++ if (unlikely(buffered <= 0)) + goto out_unlock; +- buffered = ret; + + /* + * We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT +- * semantics. ++ * semantics. If the writeback or invalidate fails, only report ++ * the direct I/O range as we don't know if the buffered pages ++ * made it to disk. 
+ */ +- pos = iocb->ki_pos; +- endbyte = pos + buffered - 1; +- ret = filemap_write_and_wait_range(mapping, pos, endbyte); +- if (!ret) { +- iocb->ki_pos += buffered; +- written += buffered; +- invalidate_mapping_pages(mapping, +- pos >> PAGE_SHIFT, +- endbyte >> PAGE_SHIFT); +- } else { +- /* +- * We don't know how much we wrote, so just return +- * the number of bytes which were direct-written +- */ +- } ++ iocb->ki_pos += buffered; ++ ret2 = generic_write_sync(iocb, buffered); ++ invalidate_mapping_pages(mapping, ++ (iocb->ki_pos - buffered) >> PAGE_SHIFT, ++ (iocb->ki_pos - 1) >> PAGE_SHIFT); ++ if (!ret || ret2 > 0) ++ ret += ret2; + } else { + current->backing_dev_info = inode_to_bdi(inode); + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); + current->backing_dev_info = NULL; +- if (likely(ret > 0)) ++ if (likely(ret > 0)) { + iocb->ki_pos += ret; ++ ret = generic_write_sync(iocb, ret); ++ } + } + + out_unlock: + inode_unlock(inode); +- if (likely(ret > 0)) { +- /* Handle various SYNC-type writes */ +- ret = generic_write_sync(iocb, ret); +- } +- return written ? written : ret; ++ return ret; + } + + static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, diff --git a/queue-5.5/gfs2-move-setting-current-backing_dev_info.patch b/queue-5.5/gfs2-move-setting-current-backing_dev_info.patch new file mode 100644 index 00000000000..1612434f28b --- /dev/null +++ b/queue-5.5/gfs2-move-setting-current-backing_dev_info.patch @@ -0,0 +1,80 @@ +From 4c0e8dda608a51855225c611b5c6b442f95fbc56 Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Wed, 15 Jan 2020 16:38:29 +0100 +Subject: gfs2: move setting current->backing_dev_info + +From: Christoph Hellwig + +commit 4c0e8dda608a51855225c611b5c6b442f95fbc56 upstream. + +Set current->backing_dev_info just around the buffered write calls to +prepare for the next fix. 
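+ +The resulting scope, in outline (a sketch of the code after this patch): + + current->backing_dev_info = inode_to_bdi(inode); + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); + current->backing_dev_info = NULL; + +so the pointer is only set while the buffered write can actually recurse +into page reclaim, and the earlier error paths no longer need to clear it.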
+ +Fixes: 967bcc91b044 ("gfs2: iomap direct I/O support") +Cc: stable@vger.kernel.org # v4.19+ +Signed-off-by: Christoph Hellwig +Signed-off-by: Andreas Gruenbacher +Signed-off-by: Greg Kroah-Hartman + +--- + fs/gfs2/file.c | 21 ++++++++++----------- + 1 file changed, 10 insertions(+), 11 deletions(-) + +--- a/fs/gfs2/file.c ++++ b/fs/gfs2/file.c +@@ -867,18 +867,15 @@ static ssize_t gfs2_file_write_iter(stru + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) +- goto out; +- +- /* We can write back this queue in page reclaim */ +- current->backing_dev_info = inode_to_bdi(inode); ++ goto out_unlock; + + ret = file_remove_privs(file); + if (ret) +- goto out2; ++ goto out_unlock; + + ret = file_update_time(file); + if (ret) +- goto out2; ++ goto out_unlock; + + if (iocb->ki_flags & IOCB_DIRECT) { + struct address_space *mapping = file->f_mapping; +@@ -887,11 +884,13 @@ static ssize_t gfs2_file_write_iter(stru + + written = gfs2_file_direct_write(iocb, from); + if (written < 0 || !iov_iter_count(from)) +- goto out2; ++ goto out_unlock; + ++ current->backing_dev_info = inode_to_bdi(inode); + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); ++ current->backing_dev_info = NULL; + if (unlikely(ret < 0)) +- goto out2; ++ goto out_unlock; + buffered = ret; + + /* +@@ -915,14 +914,14 @@ static ssize_t gfs2_file_write_iter(stru + */ + } + } else { ++ current->backing_dev_info = inode_to_bdi(inode); + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); ++ current->backing_dev_info = NULL; + if (likely(ret > 0)) + iocb->ki_pos += ret; + } + +-out2: +- current->backing_dev_info = NULL; +-out: ++out_unlock: + inode_unlock(inode); + if (likely(ret > 0)) { + /* Handle various SYNC-type writes */ diff --git a/queue-5.5/io_uring-don-t-map-read-write-iovec-potentially-twice.patch b/queue-5.5/io_uring-don-t-map-read-write-iovec-potentially-twice.patch new file mode 100644 index 00000000000..1a6b684d577 --- /dev/null +++ b/queue-5.5/io_uring-don-t-map-read-write-iovec-potentially-twice.patch @@ -0,0 +1,42 @@ +From 5d204bcfa09330972ad3428a8f81c23f371d3e6d Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Fri, 31 Jan 2020 12:06:52 -0700 +Subject: io_uring: don't map read/write iovec potentially twice + +From: Jens Axboe + +commit 5d204bcfa09330972ad3428a8f81c23f371d3e6d upstream. + +If we have a read/write that is deferred, we already setup the async IO +context for that request, and mapped it. When we later try and execute +the request and we get -EAGAIN, we don't want to attempt to re-map it. +If we do, we end up with garbage in the iovec, which typically leads +to an -EFAULT or -EINVAL completion. 
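+ +The guard, reduced to its essence (a sketch mirroring the hunk below): + + if (!req->io) { /* no async context yet */ + if (io_alloc_async_ctx(req)) + return -ENOMEM; + io_req_map_rw(req, io_size, iovec, fast_iov, iter); + } /* else: the iovec was already mapped on the first pass */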
+ +Cc: stable@vger.kernel.org # 5.5 +Reported-by: Dan Melnic +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + fs/io_uring.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -1789,10 +1789,12 @@ static int io_setup_async_rw(struct io_k + if (req->opcode == IORING_OP_READ_FIXED || + req->opcode == IORING_OP_WRITE_FIXED) + return 0; +- if (!req->io && io_alloc_async_ctx(req)) +- return -ENOMEM; ++ if (!req->io) { ++ if (io_alloc_async_ctx(req)) ++ return -ENOMEM; + +- io_req_map_rw(req, io_size, iovec, fast_iov, iter); ++ io_req_map_rw(req, io_size, iovec, fast_iov, iter); ++ } + req->work.func = io_rw_async; + return 0; + } diff --git a/queue-5.5/io_uring-spin-for-sq-thread-to-idle-on-shutdown.patch b/queue-5.5/io_uring-spin-for-sq-thread-to-idle-on-shutdown.patch new file mode 100644 index 00000000000..1df797d0c16 --- /dev/null +++ b/queue-5.5/io_uring-spin-for-sq-thread-to-idle-on-shutdown.patch @@ -0,0 +1,55 @@ +From df069d80c8e38c19531c392322e9a16617475c44 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Tue, 4 Feb 2020 16:48:34 -0700 +Subject: io_uring: spin for sq thread to idle on shutdown + +From: Jens Axboe + +commit df069d80c8e38c19531c392322e9a16617475c44 upstream. + +As part of io_uring shutdown, we cancel work that is pending and won't +necessarily complete on its own. That includes requests like poll +commands and timeouts. + +If we're using SQPOLL for kernel side submission and we shutdown the +ring immediately after queueing such work, we can race with the sqthread +doing the submission. This means we may miss cancelling some work, which +results in the io_uring shutdown hanging forever. + +Cc: stable@vger.kernel.org +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + fs/io_uring.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -3902,7 +3902,8 @@ static int io_sq_thread(void *data) + * reap events and wake us up. + */ + if (inflight || +- (!time_after(jiffies, timeout) && ret != -EBUSY)) { ++ (!time_after(jiffies, timeout) && ret != -EBUSY && ++ !percpu_ref_is_dying(&ctx->refs))) { + cond_resched(); + continue; + } +@@ -4983,6 +4984,16 @@ static void io_ring_ctx_wait_and_kill(st + percpu_ref_kill(&ctx->refs); + mutex_unlock(&ctx->uring_lock); + ++ /* ++ * Wait for sq thread to idle, if we have one. It won't spin on new ++ * work after we've killed the ctx ref above. This is important to do ++ * before we cancel existing commands, as the thread could otherwise ++ * be queueing new work post that. If that's work we need to cancel, ++ * it could cause shutdown to hang. ++ */ ++ while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait)) ++ cpu_relax(); ++ + io_kill_timeouts(ctx); + io_poll_remove_all(ctx); + diff --git a/queue-5.5/iwlwifi-don-t-throw-error-when-trying-to-remove-igtk.patch b/queue-5.5/iwlwifi-don-t-throw-error-when-trying-to-remove-igtk.patch new file mode 100644 index 00000000000..d4f7747fea4 --- /dev/null +++ b/queue-5.5/iwlwifi-don-t-throw-error-when-trying-to-remove-igtk.patch @@ -0,0 +1,56 @@ +From 197288d5ba8a5289f22d3aeb4fca3824bfd9b4af Mon Sep 17 00:00:00 2001 +From: Luca Coelho +Date: Fri, 31 Jan 2020 15:45:25 +0200 +Subject: iwlwifi: don't throw error when trying to remove IGTK + +From: Luca Coelho + +commit 197288d5ba8a5289f22d3aeb4fca3824bfd9b4af upstream. + +The IGTK keys are only removed by mac80211 after it has already +removed the AP station. 
This causes the driver to throw an error +because mac80211 is trying to remove the IGTK when the station doesn't +exist anymore. + +The firmware is aware that the station has been removed and can deal +with it the next time we try to add an IGTK for a station, so we +shouldn't try to remove the key if the station ID is +IWL_MVM_INVALID_STA. Do this by removing the check for mvm_sta before +calling iwl_mvm_send_sta_igtk() and check return from that function +gracefully if the station ID is invalid. + +Cc: stable@vger.kernel.org # 4.12+ +Signed-off-by: Luca Coelho +Signed-off-by: Kalle Valo +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +--- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c ++++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c +@@ -3320,6 +3320,10 @@ static int iwl_mvm_send_sta_igtk(struct + igtk_cmd.sta_id = cpu_to_le32(sta_id); + + if (remove_key) { ++ /* This is a valid situation for IGTK */ ++ if (sta_id == IWL_MVM_INVALID_STA) ++ return 0; ++ + igtk_cmd.ctrl_flags |= cpu_to_le32(STA_KEY_NOT_VALID); + } else { + struct ieee80211_key_seq seq; +@@ -3574,9 +3578,9 @@ int iwl_mvm_remove_sta_key(struct iwl_mv + IWL_DEBUG_WEP(mvm, "mvm remove dynamic key: idx=%d sta=%d\n", + keyconf->keyidx, sta_id); + +- if (mvm_sta && (keyconf->cipher == WLAN_CIPHER_SUITE_AES_CMAC || +- keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_128 || +- keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_256)) ++ if (keyconf->cipher == WLAN_CIPHER_SUITE_AES_CMAC || ++ keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_128 || ++ keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_256) + return iwl_mvm_send_sta_igtk(mvm, keyconf, sta_id, true); + + if (!__test_and_clear_bit(keyconf->hw_key_idx, mvm->fw_key_table)) { diff --git a/queue-5.5/jbd2_seq_info_next-should-increase-position-index.patch b/queue-5.5/jbd2_seq_info_next-should-increase-position-index.patch new file mode 100644 index 00000000000..182ee3a5fe8 --- /dev/null +++ b/queue-5.5/jbd2_seq_info_next-should-increase-position-index.patch @@ -0,0 +1,39 @@ +From 1a8e9cf40c9a6a2e40b1e924b13ed303aeea4418 Mon Sep 17 00:00:00 2001 +From: Vasily Averin +Date: Thu, 23 Jan 2020 12:05:10 +0300 +Subject: jbd2_seq_info_next should increase position index + +From: Vasily Averin + +commit 1a8e9cf40c9a6a2e40b1e924b13ed303aeea4418 upstream. + +If a seq_file .next function does not change the position index, +read after some lseek can generate unexpected output.
+ +Script below generates endless output + $ q=;while read -r r;do echo "$((++q)) $r";done +Reviewed-by: Jan Kara +Link: https://lore.kernel.org/r/d13805e5-695e-8ac3-b678-26ca2313629f@virtuozzo.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/jbd2/journal.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/jbd2/journal.c ++++ b/fs/jbd2/journal.c +@@ -982,6 +982,7 @@ static void *jbd2_seq_info_start(struct + + static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos) + { ++ (*pos)++; + return NULL; + } + diff --git a/queue-5.5/kvm-ppc-book3s-hv-uninit-vcpu-if-vcore-creation-fails.patch b/queue-5.5/kvm-ppc-book3s-hv-uninit-vcpu-if-vcore-creation-fails.patch new file mode 100644 index 00000000000..5d0066e706a --- /dev/null +++ b/queue-5.5/kvm-ppc-book3s-hv-uninit-vcpu-if-vcore-creation-fails.patch @@ -0,0 +1,44 @@ +From 1a978d9d3e72ddfa40ac60d26301b154247ee0bc Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Wed, 18 Dec 2019 13:54:46 -0800 +Subject: KVM: PPC: Book3S HV: Uninit vCPU if vcore creation fails + +From: Sean Christopherson + +commit 1a978d9d3e72ddfa40ac60d26301b154247ee0bc upstream. + +Call kvm_vcpu_uninit() if vcore creation fails to avoid leaking any +resources allocated by kvm_vcpu_init(), i.e. the vcpu->run page. + +Fixes: 371fefd6f2dc4 ("KVM: PPC: Allow book3s_hv guests to use SMT processor modes") +Cc: stable@vger.kernel.org +Reviewed-by: Greg Kurz +Signed-off-by: Sean Christopherson +Acked-by: Paul Mackerras +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/powerpc/kvm/book3s_hv.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/arch/powerpc/kvm/book3s_hv.c ++++ b/arch/powerpc/kvm/book3s_hv.c +@@ -2368,7 +2368,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu + mutex_unlock(&kvm->lock); + + if (!vcore) +- goto free_vcpu; ++ goto uninit_vcpu; + + spin_lock(&vcore->lock); + ++vcore->num_threads; +@@ -2385,6 +2385,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu + + return vcpu; + ++uninit_vcpu: ++ kvm_vcpu_uninit(vcpu); + free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vcpu); + out: diff --git a/queue-5.5/kvm-ppc-book3s-pr-free-shared-page-if-mmu-initialization-fails.patch b/queue-5.5/kvm-ppc-book3s-pr-free-shared-page-if-mmu-initialization-fails.patch new file mode 100644 index 00000000000..05f0a261888 --- /dev/null +++ b/queue-5.5/kvm-ppc-book3s-pr-free-shared-page-if-mmu-initialization-fails.patch @@ -0,0 +1,41 @@ +From cb10bf9194f4d2c5d830eddca861f7ca0fecdbb4 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Wed, 18 Dec 2019 13:54:47 -0800 +Subject: KVM: PPC: Book3S PR: Free shared page if mmu initialization fails + +From: Sean Christopherson + +commit cb10bf9194f4d2c5d830eddca861f7ca0fecdbb4 upstream. + +Explicitly free the shared page if kvmppc_mmu_init() fails during +kvmppc_core_vcpu_create(), as the page is freed only in +kvmppc_core_vcpu_free(), which is not reached via kvm_vcpu_uninit(). 
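The fix follows the usual kernel unwind idiom: each setup step gets a label that releases it, a failure jumps to the label covering everything acquired so far, and adding a new step (here, the shared page) means adding a new label above the existing ones. A minimal stand-alone sketch of the idiom, with hypothetical resources in place of the real Book3S state:

    #include <errno.h>
    #include <stdlib.h>

    struct vcpu { void *shared; void *mmu; };

    static int vcpu_create(struct vcpu *v)
    {
        v->shared = malloc(4096);
        if (!v->shared)
            goto out;

        v->mmu = malloc(256);
        if (!v->mmu)
            goto free_shared;  /* unwind exactly what exists so far */

        return 0;

    free_shared:
        free(v->shared);
    out:
        return -ENOMEM;
    }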
+ +Fixes: 96bc451a15329 ("KVM: PPC: Introduce shared page") +Cc: stable@vger.kernel.org +Reviewed-by: Greg Kurz +Signed-off-by: Sean Christopherson +Acked-by: Paul Mackerras +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/powerpc/kvm/book3s_pr.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/arch/powerpc/kvm/book3s_pr.c ++++ b/arch/powerpc/kvm/book3s_pr.c +@@ -1806,10 +1806,12 @@ static struct kvm_vcpu *kvmppc_core_vcpu + + err = kvmppc_mmu_init(vcpu); + if (err < 0) +- goto uninit_vcpu; ++ goto free_shared_page; + + return vcpu; + ++free_shared_page: ++ free_page((unsigned long)vcpu->arch.shared); + uninit_vcpu: + kvm_vcpu_uninit(vcpu); + free_shadow_vcpu: diff --git a/queue-5.5/kvm-svm-pku-not-currently-supported.patch b/queue-5.5/kvm-svm-pku-not-currently-supported.patch new file mode 100644 index 00000000000..dc89124a1d9 --- /dev/null +++ b/queue-5.5/kvm-svm-pku-not-currently-supported.patch @@ -0,0 +1,112 @@ +From a47970ed74a535b1accb4bc73643fd5a93993c3e Mon Sep 17 00:00:00 2001 +From: John Allen +Date: Thu, 19 Dec 2019 14:17:59 -0600 +Subject: kvm/svm: PKU not currently supported + +From: John Allen + +commit a47970ed74a535b1accb4bc73643fd5a93993c3e upstream. + +Current SVM implementation does not have support for handling PKU. Guests +running on a host with future AMD cpus that support the feature will read +garbage from the PKRU register and will hit segmentation faults on boot as +memory is getting marked as protected that should not be. Ensure that cpuid +from SVM does not advertise the feature. + +Signed-off-by: John Allen +Cc: stable@vger.kernel.org +Fixes: 0556cbdc2fbc ("x86/pkeys: Don't check if PKRU is zero before writing it") +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/kvm_host.h | 1 + + arch/x86/kvm/cpuid.c | 4 +++- + arch/x86/kvm/svm.c | 6 ++++++ + arch/x86/kvm/vmx/capabilities.h | 5 +++++ + arch/x86/kvm/vmx/vmx.c | 1 + + 5 files changed, 16 insertions(+), 1 deletion(-) + +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -1145,6 +1145,7 @@ struct kvm_x86_ops { + bool (*xsaves_supported)(void); + bool (*umip_emulated)(void); + bool (*pt_supported)(void); ++ bool (*pku_supported)(void); + + int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); + void (*request_immediate_exit)(struct kvm_vcpu *vcpu); +--- a/arch/x86/kvm/cpuid.c ++++ b/arch/x86/kvm/cpuid.c +@@ -352,6 +352,7 @@ static inline void do_cpuid_7_mask(struc + unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; + unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; + unsigned f_la57; ++ unsigned f_pku = kvm_x86_ops->pku_supported() ? F(PKU) : 0; + + /* cpuid 7.0.ebx */ + const u32 kvm_cpuid_7_0_ebx_x86_features = +@@ -363,7 +364,7 @@ static inline void do_cpuid_7_mask(struc + + /* cpuid 7.0.ecx*/ + const u32 kvm_cpuid_7_0_ecx_x86_features = +- F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | ++ F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) | + F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | + F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/; +@@ -392,6 +393,7 @@ static inline void do_cpuid_7_mask(struc + /* Set LA57 based on hardware capability. */ + entry->ecx |= f_la57; + entry->ecx |= f_umip; ++ entry->ecx |= f_pku; + /* PKU is not yet implemented for shadow paging. 
*/ + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) + entry->ecx &= ~F(PKU); +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -6001,6 +6001,11 @@ static bool svm_has_wbinvd_exit(void) + return true; + } + ++static bool svm_pku_supported(void) ++{ ++ return false; ++} ++ + #define PRE_EX(exit) { .exit_code = (exit), \ + .stage = X86_ICPT_PRE_EXCEPT, } + #define POST_EX(exit) { .exit_code = (exit), \ +@@ -7341,6 +7346,7 @@ static struct kvm_x86_ops svm_x86_ops __ + .xsaves_supported = svm_xsaves_supported, + .umip_emulated = svm_umip_emulated, + .pt_supported = svm_pt_supported, ++ .pku_supported = svm_pku_supported, + + .set_supported_cpuid = svm_set_supported_cpuid, + +--- a/arch/x86/kvm/vmx/capabilities.h ++++ b/arch/x86/kvm/vmx/capabilities.h +@@ -145,6 +145,11 @@ static inline bool vmx_umip_emulated(voi + SECONDARY_EXEC_DESC; + } + ++static inline bool vmx_pku_supported(void) ++{ ++ return boot_cpu_has(X86_FEATURE_PKU); ++} ++ + static inline bool cpu_has_vmx_rdtscp(void) + { + return vmcs_config.cpu_based_2nd_exec_ctrl & +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -7870,6 +7870,7 @@ static struct kvm_x86_ops vmx_x86_ops __ + .xsaves_supported = vmx_xsaves_supported, + .umip_emulated = vmx_umip_emulated, + .pt_supported = vmx_pt_supported, ++ .pku_supported = vmx_pku_supported, + + .request_immediate_exit = vmx_request_immediate_exit, + diff --git a/queue-5.5/kvm-x86-fix-potential-put_fpu-w-o-load_fpu-on-mpx-platform.patch b/queue-5.5/kvm-x86-fix-potential-put_fpu-w-o-load_fpu-on-mpx-platform.patch new file mode 100644 index 00000000000..5b97d338c8c --- /dev/null +++ b/queue-5.5/kvm-x86-fix-potential-put_fpu-w-o-load_fpu-on-mpx-platform.patch @@ -0,0 +1,55 @@ +From f958bd2314d117f8c29f4821401bc1925bc2e5ef Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Mon, 9 Dec 2019 12:19:31 -0800 +Subject: KVM: x86: Fix potential put_fpu() w/o load_fpu() on MPX platform + +From: Sean Christopherson + +commit f958bd2314d117f8c29f4821401bc1925bc2e5ef upstream. + +Unlike most state managed by XSAVE, MPX is initialized to zero on INIT. +Because INITs are usually recognized in the context of a VCPU_RUN call, +kvm_vcpu_reset() puts the guest's FPU so that the FPU state is resident +in memory, zeros the MPX state, and reloads FPU state to hardware. But, +in the unlikely event that an INIT is recognized during +kvm_arch_vcpu_ioctl_get_mpstate() via kvm_apic_accept_events(), +kvm_vcpu_reset() will call kvm_put_guest_fpu() without a preceding +kvm_load_guest_fpu() and corrupt the guest's FPU state (and possibly +userspace's FPU state as well). + +Given that MPX is being removed from the kernel[*], fix the bug with the +simple-but-ugly approach of loading the guest's FPU during +KVM_GET_MP_STATE. + +[*] See commit f240652b6032b ("x86/mpx: Remove MPX APIs"). 
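The invariant being restored here is strict pairing: kvm_put_guest_fpu() may only run after a matching kvm_load_guest_fpu(). A toy user-space model of why the unpaired put corrupts state; it assumes nothing about KVM internals beyond what the commit message states, and the function bodies are stand-ins:

    #include <assert.h>
    #include <stdbool.h>

    static bool fpu_loaded;

    static void load_guest_fpu(void) { assert(!fpu_loaded); fpu_loaded = true; }
    static void put_guest_fpu(void)  { assert(fpu_loaded);  fpu_loaded = false; }

    /* Stands in for kvm_apic_accept_events(): recognizing an INIT may
     * internally put and reload the FPU, which is only legal if the
     * caller loaded it first. */
    static void accept_events(void)
    {
        put_guest_fpu();   /* would fire the assert without the caller's load */
        load_guest_fpu();
    }

    int main(void)
    {
        load_guest_fpu();  /* the fix: load before accepting events */
        accept_events();
        put_guest_fpu();
        return 0;
    }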
+ +Fixes: f775b13eedee2 ("x86,kvm: move qemu/guest FPU switching out to vcpu_run") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/x86.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -8724,6 +8724,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(stru + struct kvm_mp_state *mp_state) + { + vcpu_load(vcpu); ++ if (kvm_mpx_supported()) ++ kvm_load_guest_fpu(vcpu); + + kvm_apic_accept_events(vcpu); + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && +@@ -8732,6 +8734,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(stru + else + mp_state->mp_state = vcpu->arch.mp_state; + ++ if (kvm_mpx_supported()) ++ kvm_put_guest_fpu(vcpu); + vcpu_put(vcpu); + return 0; + } diff --git a/queue-5.5/kvm-x86-mmu-apply-max-pa-check-for-mmio-sptes-to-32-bit-kvm.patch b/queue-5.5/kvm-x86-mmu-apply-max-pa-check-for-mmio-sptes-to-32-bit-kvm.patch new file mode 100644 index 00000000000..96b1c35c062 --- /dev/null +++ b/queue-5.5/kvm-x86-mmu-apply-max-pa-check-for-mmio-sptes-to-32-bit-kvm.patch @@ -0,0 +1,42 @@ +From e30a7d623dccdb3f880fbcad980b0cb589a1da45 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Tue, 7 Jan 2020 16:12:10 -0800 +Subject: KVM: x86/mmu: Apply max PA check for MMIO sptes to 32-bit KVM + +From: Sean Christopherson + +commit e30a7d623dccdb3f880fbcad980b0cb589a1da45 upstream. + +Remove the bogus 64-bit only condition from the check that disables MMIO +spte optimization when the system supports the max PA, i.e. doesn't have +any reserved PA bits. 32-bit KVM always uses PAE paging for the shadow +MMU, and per Intel's SDM: + + PAE paging translates 32-bit linear addresses to 52-bit physical + addresses. + +The kernel's restrictions on max physical addresses are limits on how +much memory the kernel can reasonably use, not what physical addresses +are supported by hardware. + +Fixes: ce88decffd17 ("KVM: MMU: mmio page fault support") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/mmu/mmu.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kvm/mmu/mmu.c ++++ b/arch/x86/kvm/mmu/mmu.c +@@ -6249,7 +6249,7 @@ static void kvm_set_mmio_spte_mask(void) + * If reserved bit is not supported, clear the present bit to disable + * mmio page fault. + */ +- if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52) ++ if (shadow_phys_bits == 52) + mask &= ~1ull; + + kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); diff --git a/queue-5.5/kvm-x86-protect-dr-based-index-computations-from-spectre-v1-l1tf-attacks.patch b/queue-5.5/kvm-x86-protect-dr-based-index-computations-from-spectre-v1-l1tf-attacks.patch new file mode 100644 index 00000000000..b2f61ddb152 --- /dev/null +++ b/queue-5.5/kvm-x86-protect-dr-based-index-computations-from-spectre-v1-l1tf-attacks.patch @@ -0,0 +1,57 @@ +From ea740059ecb37807ba47b84b33d1447435a8d868 Mon Sep 17 00:00:00 2001 +From: Marios Pomonis +Date: Wed, 11 Dec 2019 12:47:52 -0800 +Subject: KVM: x86: Protect DR-based index computations from Spectre-v1/L1TF attacks + +From: Marios Pomonis + +commit ea740059ecb37807ba47b84b33d1447435a8d868 upstream. + +This fixes a Spectre-v1/L1TF vulnerability in __kvm_set_dr() and +kvm_get_dr(). +Both kvm_get_dr() and kvm_set_dr() (a wrapper of __kvm_set_dr()) are +exported symbols so KVM should treat them conservatively from a security +perspective.
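Most of the Spectre-v1 fixes that follow rely on array_index_nospec() from include/linux/nospec.h, which clamps an out-of-range index to zero with pure arithmetic, leaving no conditional branch for the CPU to mispredict. A simplified user-space model of the generic fallback (the real macro also has architecture-specific implementations):

    #include <stdio.h>

    /* All ones when index < size, all zeroes otherwise, computed
     * without a branch; mirrors array_index_mask_nospec(). */
    static unsigned long index_mask(unsigned long index, unsigned long size)
    {
        return ~(long)(index | (size - 1 - index)) >> (sizeof(long) * 8 - 1);
    }

    static unsigned long safe_read(const unsigned long *arr,
                                   unsigned long size, unsigned long idx)
    {
        if (idx >= size)                /* architectural bounds check... */
            return 0;                   /* ...which speculation can skip */
        idx &= index_mask(idx, size);   /* speculative clamp to index 0 */
        return arr[idx];
    }

    int main(void)
    {
        unsigned long a[4] = { 1, 2, 3, 4 };
        printf("%lu\n", safe_read(a, 4, 2));  /* prints 3 */
        return 0;
    }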
+ +Fixes: 020df0794f57 ("KVM: move DR register access handling into generic code") + +Signed-off-by: Nick Finco +Signed-off-by: Marios Pomonis +Reviewed-by: Andrew Honig +Cc: stable@vger.kernel.org +Reviewed-by: Jim Mattson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/x86.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -1047,9 +1047,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu + + static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) + { ++ size_t size = ARRAY_SIZE(vcpu->arch.db); ++ + switch (dr) { + case 0 ... 3: +- vcpu->arch.db[dr] = val; ++ vcpu->arch.db[array_index_nospec(dr, size)] = val; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + vcpu->arch.eff_db[dr] = val; + break; +@@ -1086,9 +1088,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr); + + int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) + { ++ size_t size = ARRAY_SIZE(vcpu->arch.db); ++ + switch (dr) { + case 0 ... 3: +- *val = vcpu->arch.db[dr]; ++ *val = vcpu->arch.db[array_index_nospec(dr, size)]; + break; + case 4: + /* fall through */ diff --git a/queue-5.5/kvm-x86-protect-ioapic_read_indirect-from-spectre-v1-l1tf-attacks.patch b/queue-5.5/kvm-x86-protect-ioapic_read_indirect-from-spectre-v1-l1tf-attacks.patch new file mode 100644 index 00000000000..82db466398d --- /dev/null +++ b/queue-5.5/kvm-x86-protect-ioapic_read_indirect-from-spectre-v1-l1tf-attacks.patch @@ -0,0 +1,58 @@ +From 8c86405f606ca8508b8d9280680166ca26723695 Mon Sep 17 00:00:00 2001 +From: Marios Pomonis +Date: Wed, 11 Dec 2019 12:47:44 -0800 +Subject: KVM: x86: Protect ioapic_read_indirect() from Spectre-v1/L1TF attacks + +From: Marios Pomonis + +commit 8c86405f606ca8508b8d9280680166ca26723695 upstream. + +This fixes a Spectre-v1/L1TF vulnerability in ioapic_read_indirect(). +This function contains index computations based on the +(attacker-controlled) IOREGSEL register. + +Fixes: a2c118bfab8b ("KVM: Fix bounds checking in ioapic indirect register reads (CVE-2013-1798)") + +Signed-off-by: Nick Finco +Signed-off-by: Marios Pomonis +Reviewed-by: Andrew Honig +Cc: stable@vger.kernel.org +Reviewed-by: Jim Mattson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/ioapic.c | 14 ++++++++------ + 1 file changed, 8 insertions(+), 6 deletions(-) + +--- a/arch/x86/kvm/ioapic.c ++++ b/arch/x86/kvm/ioapic.c +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -68,13 +69,14 @@ static unsigned long ioapic_read_indirec + default: + { + u32 redir_index = (ioapic->ioregsel - 0x10) >> 1; +- u64 redir_content; ++ u64 redir_content = ~0ULL; + +- if (redir_index < IOAPIC_NUM_PINS) +- redir_content = +- ioapic->redirtbl[redir_index].bits; +- else +- redir_content = ~0ULL; ++ if (redir_index < IOAPIC_NUM_PINS) { ++ u32 index = array_index_nospec( ++ redir_index, IOAPIC_NUM_PINS); ++ ++ redir_content = ioapic->redirtbl[index].bits; ++ } + + result = (ioapic->ioregsel & 0x1) ? 
+ (redir_content >> 32) & 0xffffffff : diff --git a/queue-5.5/kvm-x86-protect-ioapic_write_indirect-from-spectre-v1-l1tf-attacks.patch b/queue-5.5/kvm-x86-protect-ioapic_write_indirect-from-spectre-v1-l1tf-attacks.patch new file mode 100644 index 00000000000..5770eef965d --- /dev/null +++ b/queue-5.5/kvm-x86-protect-ioapic_write_indirect-from-spectre-v1-l1tf-attacks.patch @@ -0,0 +1,40 @@ +From 670564559ca35b439c8d8861fc399451ddf95137 Mon Sep 17 00:00:00 2001 +From: Marios Pomonis +Date: Wed, 11 Dec 2019 12:47:45 -0800 +Subject: KVM: x86: Protect ioapic_write_indirect() from Spectre-v1/L1TF attacks + +From: Marios Pomonis + +commit 670564559ca35b439c8d8861fc399451ddf95137 upstream. + +This fixes a Spectre-v1/L1TF vulnerability in ioapic_write_indirect(). +This function contains index computations based on the +(attacker-controlled) IOREGSEL register. + +This patch depends on patch +"KVM: x86: Protect ioapic_read_indirect() from Spectre-v1/L1TF attacks". + +Fixes: 70f93dae32ac ("KVM: Use temporary variable to shorten lines.") + +Signed-off-by: Nick Finco +Signed-off-by: Marios Pomonis +Reviewed-by: Andrew Honig +Cc: stable@vger.kernel.org +Reviewed-by: Jim Mattson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/ioapic.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/x86/kvm/ioapic.c ++++ b/arch/x86/kvm/ioapic.c +@@ -292,6 +292,7 @@ static void ioapic_write_indirect(struct + + if (index >= IOAPIC_NUM_PINS) + return; ++ index = array_index_nospec(index, IOAPIC_NUM_PINS); + e = &ioapic->redirtbl[index]; + mask_before = e->fields.mask; + /* Preserve read-only fields */ diff --git a/queue-5.5/kvm-x86-protect-kvm_hv_msr__crash_data-from-spectre-v1-l1tf-attacks.patch b/queue-5.5/kvm-x86-protect-kvm_hv_msr__crash_data-from-spectre-v1-l1tf-attacks.patch new file mode 100644 index 00000000000..f2c12645b67 --- /dev/null +++ b/queue-5.5/kvm-x86-protect-kvm_hv_msr__crash_data-from-spectre-v1-l1tf-attacks.patch @@ -0,0 +1,59 @@ +From 8618793750071d66028584a83ed0b4fa7eb4f607 Mon Sep 17 00:00:00 2001 +From: Marios Pomonis +Date: Wed, 11 Dec 2019 12:47:42 -0800 +Subject: KVM: x86: Protect kvm_hv_msr_[get|set]_crash_data() from Spectre-v1/L1TF attacks + +From: Marios Pomonis + +commit 8618793750071d66028584a83ed0b4fa7eb4f607 upstream. + +This fixes Spectre-v1/L1TF vulnerabilities in kvm_hv_msr_get_crash_data() +and kvm_hv_msr_set_crash_data(). +These functions contain index computations that use the +(attacker-controlled) MSR number. 
+ +Fixes: e7d9513b60e8 ("kvm/x86: added hyper-v crash msrs into kvm hyperv context") + +Signed-off-by: Nick Finco +Signed-off-by: Marios Pomonis +Reviewed-by: Andrew Honig +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/hyperv.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +--- a/arch/x86/kvm/hyperv.c ++++ b/arch/x86/kvm/hyperv.c +@@ -809,11 +809,12 @@ static int kvm_hv_msr_get_crash_data(str + u32 index, u64 *pdata) + { + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; ++ size_t size = ARRAY_SIZE(hv->hv_crash_param); + +- if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) ++ if (WARN_ON_ONCE(index >= size)) + return -EINVAL; + +- *pdata = hv->hv_crash_param[index]; ++ *pdata = hv->hv_crash_param[array_index_nospec(index, size)]; + return 0; + } + +@@ -852,11 +853,12 @@ static int kvm_hv_msr_set_crash_data(str + u32 index, u64 data) + { + struct kvm_hv *hv = &vcpu->kvm->arch.hyperv; ++ size_t size = ARRAY_SIZE(hv->hv_crash_param); + +- if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param))) ++ if (WARN_ON_ONCE(index >= size)) + return -EINVAL; + +- hv->hv_crash_param[index] = data; ++ hv->hv_crash_param[array_index_nospec(index, size)] = data; + return 0; + } + diff --git a/queue-5.5/kvm-x86-protect-kvm_lapic_reg_write-from-spectre-v1-l1tf-attacks.patch b/queue-5.5/kvm-x86-protect-kvm_lapic_reg_write-from-spectre-v1-l1tf-attacks.patch new file mode 100644 index 00000000000..ae78aa358b4 --- /dev/null +++ b/queue-5.5/kvm-x86-protect-kvm_lapic_reg_write-from-spectre-v1-l1tf-attacks.patch @@ -0,0 +1,54 @@ +From 4bf79cb089f6b1c6c632492c0271054ce52ad766 Mon Sep 17 00:00:00 2001 +From: Marios Pomonis +Date: Wed, 11 Dec 2019 12:47:46 -0800 +Subject: KVM: x86: Protect kvm_lapic_reg_write() from Spectre-v1/L1TF attacks + +From: Marios Pomonis + +commit 4bf79cb089f6b1c6c632492c0271054ce52ad766 upstream. + +This fixes a Spectre-v1/L1TF vulnerability in kvm_lapic_reg_write(). +This function contains index computations based on the +(attacker-controlled) MSR number. 
+ +Fixes: 0105d1a52640 ("KVM: x2apic interface to lapic") + +Signed-off-by: Nick Finco +Signed-off-by: Marios Pomonis +Reviewed-by: Andrew Honig +Cc: stable@vger.kernel.org +Reviewed-by: Jim Mattson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/lapic.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +--- a/arch/x86/kvm/lapic.c ++++ b/arch/x86/kvm/lapic.c +@@ -1963,15 +1963,20 @@ int kvm_lapic_reg_write(struct kvm_lapic + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT1: +- case APIC_LVTERR: ++ case APIC_LVTERR: { + /* TODO: Check vector */ ++ size_t size; ++ u32 index; ++ + if (!kvm_apic_sw_enabled(apic)) + val |= APIC_LVT_MASKED; +- +- val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4]; ++ size = ARRAY_SIZE(apic_lvt_mask); ++ index = array_index_nospec( ++ (reg - APIC_LVTT) >> 4, size); ++ val &= apic_lvt_mask[index]; + kvm_lapic_set_reg(apic, reg, val); +- + break; ++ } + + case APIC_LVTT: + if (!kvm_apic_sw_enabled(apic)) diff --git a/queue-5.5/kvm-x86-protect-msr-based-index-computations-from-spectre-v1-l1tf-attacks-in-x86.c.patch b/queue-5.5/kvm-x86-protect-msr-based-index-computations-from-spectre-v1-l1tf-attacks-in-x86.c.patch new file mode 100644 index 00000000000..79218fb168b --- /dev/null +++ b/queue-5.5/kvm-x86-protect-msr-based-index-computations-from-spectre-v1-l1tf-attacks-in-x86.c.patch @@ -0,0 +1,54 @@ +From 6ec4c5eee1750d5d17951c4e1960d953376a0dda Mon Sep 17 00:00:00 2001 +From: Marios Pomonis +Date: Wed, 11 Dec 2019 12:47:49 -0800 +Subject: KVM: x86: Protect MSR-based index computations from Spectre-v1/L1TF attacks in x86.c + +From: Marios Pomonis + +commit 6ec4c5eee1750d5d17951c4e1960d953376a0dda upstream. + +This fixes a Spectre-v1/L1TF vulnerability in set_msr_mce() and +get_msr_mce(). +Both functions contain index computations based on the +(attacker-controlled) MSR number. 
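When the index is an offset derived from an MSR number, as in this patch, the clamp applies to the derived offset and the bound is the width of the whole MSR range. A condensed kernel-style sketch; the bank count and one-register-per-bank layout are simplifications (the real code bounds the offset by MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL):

    #include <linux/nospec.h>
    #include <linux/types.h>

    #define FIRST_BANK_MSR 0x400   /* MSR_IA32_MC0_CTL */
    #define NR_BANK_MSRS   32      /* hypothetical range width */

    static u64 read_bank_msr(const u64 *banks, u32 msr)
    {
        if (msr < FIRST_BANK_MSR || msr >= FIRST_BANK_MSR + NR_BANK_MSRS)
            return 0;
        /* clamp the derived offset, not the raw MSR number */
        return banks[array_index_nospec(msr - FIRST_BANK_MSR, NR_BANK_MSRS)];
    }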
+ +Fixes: 890ca9aefa78 ("KVM: Add MCE support") + +Signed-off-by: Nick Finco +Signed-off-by: Marios Pomonis +Reviewed-by: Andrew Honig +Cc: stable@vger.kernel.org +Reviewed-by: Jim Mattson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/x86.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -2489,7 +2489,10 @@ static int set_msr_mce(struct kvm_vcpu * + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MCx_CTL(bank_num)) { +- u32 offset = msr - MSR_IA32_MC0_CTL; ++ u32 offset = array_index_nospec( ++ msr - MSR_IA32_MC0_CTL, ++ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); ++ + /* only 0 or all 1s can be written to IA32_MCi_CTL + * some Linux kernels though clear bit 10 in bank 4 to + * workaround a BIOS/GART TBL issue on AMD K8s, ignore +@@ -2930,7 +2933,10 @@ static int get_msr_mce(struct kvm_vcpu * + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MCx_CTL(bank_num)) { +- u32 offset = msr - MSR_IA32_MC0_CTL; ++ u32 offset = array_index_nospec( ++ msr - MSR_IA32_MC0_CTL, ++ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); ++ + data = vcpu->arch.mce_banks[offset]; + break; + } diff --git a/queue-5.5/kvm-x86-protect-msr-based-index-computations-in-fixed_msr_to_seg_unit-from-spectre-v1-l1tf-attacks.patch b/queue-5.5/kvm-x86-protect-msr-based-index-computations-in-fixed_msr_to_seg_unit-from-spectre-v1-l1tf-attacks.patch new file mode 100644 index 00000000000..57ffe4a3157 --- /dev/null +++ b/queue-5.5/kvm-x86-protect-msr-based-index-computations-in-fixed_msr_to_seg_unit-from-spectre-v1-l1tf-attacks.patch @@ -0,0 +1,47 @@ +From 25a5edea71b7c154b6a0b8cec14c711cafa31d26 Mon Sep 17 00:00:00 2001 +From: Marios Pomonis +Date: Wed, 11 Dec 2019 12:47:47 -0800 +Subject: KVM: x86: Protect MSR-based index computations in fixed_msr_to_seg_unit() from Spectre-v1/L1TF attacks + +From: Marios Pomonis + +commit 25a5edea71b7c154b6a0b8cec14c711cafa31d26 upstream. + +This fixes a Spectre-v1/L1TF vulnerability in fixed_msr_to_seg_unit(). +This function contains index computations based on the +(attacker-controlled) MSR number. + +Fixes: de9aef5e1ad6 ("KVM: MTRR: introduce fixed_mtrr_segment table") + +Signed-off-by: Nick Finco +Signed-off-by: Marios Pomonis +Reviewed-by: Andrew Honig +Cc: stable@vger.kernel.org +Reviewed-by: Jim Mattson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/mtrr.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/arch/x86/kvm/mtrr.c ++++ b/arch/x86/kvm/mtrr.c +@@ -192,11 +192,15 @@ static bool fixed_msr_to_seg_unit(u32 ms + break; + case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000: + *seg = 1; +- *unit = msr - MSR_MTRRfix16K_80000; ++ *unit = array_index_nospec( ++ msr - MSR_MTRRfix16K_80000, ++ MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1); + break; + case MSR_MTRRfix4K_C0000 ... 
MSR_MTRRfix4K_F8000: + *seg = 2; +- *unit = msr - MSR_MTRRfix4K_C0000; ++ *unit = array_index_nospec( ++ msr - MSR_MTRRfix4K_C0000, ++ MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1); + break; + default: + return false; diff --git a/queue-5.5/kvm-x86-protect-msr-based-index-computations-in-pmu.h-from-spectre-v1-l1tf-attacks.patch b/queue-5.5/kvm-x86-protect-msr-based-index-computations-in-pmu.h-from-spectre-v1-l1tf-attacks.patch new file mode 100644 index 00000000000..2c24782754c --- /dev/null +++ b/queue-5.5/kvm-x86-protect-msr-based-index-computations-in-pmu.h-from-spectre-v1-l1tf-attacks.patch @@ -0,0 +1,69 @@ +From 13c5183a4e643cc2b03a22d0e582c8e17bb7457d Mon Sep 17 00:00:00 2001 +From: Marios Pomonis +Date: Wed, 11 Dec 2019 12:47:48 -0800 +Subject: KVM: x86: Protect MSR-based index computations in pmu.h from Spectre-v1/L1TF attacks + +From: Marios Pomonis + +commit 13c5183a4e643cc2b03a22d0e582c8e17bb7457d upstream. + +This fixes a Spectre-v1/L1TF vulnerability in the get_gp_pmc() and +get_fixed_pmc() functions. +They both contain index computations based on the (attacker-controlled) +MSR number. + +Fixes: 25462f7f5295 ("KVM: x86/vPMU: Define kvm_pmu_ops to support vPMU function dispatch") + +Signed-off-by: Nick Finco +Signed-off-by: Marios Pomonis +Reviewed-by: Andrew Honig +Cc: stable@vger.kernel.org +Reviewed-by: Jim Mattson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/pmu.h | 18 ++++++++++++++---- + 1 file changed, 14 insertions(+), 4 deletions(-) + +--- a/arch/x86/kvm/pmu.h ++++ b/arch/x86/kvm/pmu.h +@@ -2,6 +2,8 @@ + #ifndef __KVM_X86_PMU_H + #define __KVM_X86_PMU_H + ++#include ++ + #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu) + #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) + #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) +@@ -102,8 +104,12 @@ static inline bool kvm_valid_perf_global + static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, + u32 base) + { +- if (msr >= base && msr < base + pmu->nr_arch_gp_counters) +- return &pmu->gp_counters[msr - base]; ++ if (msr >= base && msr < base + pmu->nr_arch_gp_counters) { ++ u32 index = array_index_nospec(msr - base, ++ pmu->nr_arch_gp_counters); ++ ++ return &pmu->gp_counters[index]; ++ } + + return NULL; + } +@@ -113,8 +119,12 @@ static inline struct kvm_pmc *get_fixed_ + { + int base = MSR_CORE_PERF_FIXED_CTR0; + +- if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) +- return &pmu->fixed_counters[msr - base]; ++ if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) { ++ u32 index = array_index_nospec(msr - base, ++ pmu->nr_arch_fixed_counters); ++ ++ return &pmu->fixed_counters[index]; ++ } + + return NULL; + } diff --git a/queue-5.5/kvm-x86-protect-pmu_intel.c-from-spectre-v1-l1tf-attacks.patch b/queue-5.5/kvm-x86-protect-pmu_intel.c-from-spectre-v1-l1tf-attacks.patch new file mode 100644 index 00000000000..92758bf0d28 --- /dev/null +++ b/queue-5.5/kvm-x86-protect-pmu_intel.c-from-spectre-v1-l1tf-attacks.patch @@ -0,0 +1,76 @@ +From 66061740f1a487f4ed54fde75e724709f805da53 Mon Sep 17 00:00:00 2001 +From: Marios Pomonis +Date: Wed, 11 Dec 2019 12:47:53 -0800 +Subject: KVM: x86: Protect pmu_intel.c from Spectre-v1/L1TF attacks + +From: Marios Pomonis + +commit 66061740f1a487f4ed54fde75e724709f805da53 upstream. + +This fixes Spectre-v1/L1TF vulnerabilities in intel_find_fixed_event() +and intel_rdpmc_ecx_to_pmc(). 
+kvm_rdpmc() (ancestor of intel_find_fixed_event()) and +reprogram_fixed_counter() (ancestor of intel_rdpmc_ecx_to_pmc()) are +exported symbols so KVM should treat them conservatively from a security +perspective. + +Fixes: 25462f7f5295 ("KVM: x86/vPMU: Define kvm_pmu_ops to support vPMU function dispatch") + +Signed-off-by: Nick Finco +Signed-off-by: Marios Pomonis +Reviewed-by: Andrew Honig +Cc: stable@vger.kernel.org +Reviewed-by: Jim Mattson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/vmx/pmu_intel.c | 24 ++++++++++++++++-------- + 1 file changed, 16 insertions(+), 8 deletions(-) + +--- a/arch/x86/kvm/vmx/pmu_intel.c ++++ b/arch/x86/kvm/vmx/pmu_intel.c +@@ -86,10 +86,14 @@ static unsigned intel_find_arch_event(st + + static unsigned intel_find_fixed_event(int idx) + { +- if (idx >= ARRAY_SIZE(fixed_pmc_events)) ++ u32 event; ++ size_t size = ARRAY_SIZE(fixed_pmc_events); ++ ++ if (idx >= size) + return PERF_COUNT_HW_MAX; + +- return intel_arch_events[fixed_pmc_events[idx]].event_type; ++ event = fixed_pmc_events[array_index_nospec(idx, size)]; ++ return intel_arch_events[event].event_type; + } + + /* check if a PMC is enabled by comparing it with globl_ctrl bits. */ +@@ -130,16 +134,20 @@ static struct kvm_pmc *intel_rdpmc_ecx_t + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + bool fixed = idx & (1u << 30); + struct kvm_pmc *counters; ++ unsigned int num_counters; + + idx &= ~(3u << 30); +- if (!fixed && idx >= pmu->nr_arch_gp_counters) +- return NULL; +- if (fixed && idx >= pmu->nr_arch_fixed_counters) ++ if (fixed) { ++ counters = pmu->fixed_counters; ++ num_counters = pmu->nr_arch_fixed_counters; ++ } else { ++ counters = pmu->gp_counters; ++ num_counters = pmu->nr_arch_gp_counters; ++ } ++ if (idx >= num_counters) + return NULL; +- counters = fixed ? pmu->fixed_counters : pmu->gp_counters; + *mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP]; +- +- return &counters[idx]; ++ return &counters[array_index_nospec(idx, num_counters)]; + } + + static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) diff --git a/queue-5.5/kvm-x86-protect-x86_decode_insn-from-spectre-v1-l1tf-attacks.patch b/queue-5.5/kvm-x86-protect-x86_decode_insn-from-spectre-v1-l1tf-attacks.patch new file mode 100644 index 00000000000..854ae734b22 --- /dev/null +++ b/queue-5.5/kvm-x86-protect-x86_decode_insn-from-spectre-v1-l1tf-attacks.patch @@ -0,0 +1,48 @@ +From 3c9053a2cae7ba2ba73766a34cea41baa70f57f7 Mon Sep 17 00:00:00 2001 +From: Marios Pomonis +Date: Wed, 11 Dec 2019 12:47:41 -0800 +Subject: KVM: x86: Protect x86_decode_insn from Spectre-v1/L1TF attacks + +From: Marios Pomonis + +commit 3c9053a2cae7ba2ba73766a34cea41baa70f57f7 upstream. + +This fixes a Spectre-v1/L1TF vulnerability in x86_decode_insn(). +kvm_emulate_instruction() (an ancestor of x86_decode_insn()) is an exported +symbol, so KVM should treat it conservatively from a security perspective. 
+ +Fixes: 045a282ca415 ("KVM: emulator: implement fninit, fnstsw, fnstcw") + +Signed-off-by: Nick Finco +Signed-off-by: Marios Pomonis +Reviewed-by: Andrew Honig +Cc: stable@vger.kernel.org +Reviewed-by: Jim Mattson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/emulate.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -5315,10 +5315,15 @@ done_prefixes: + } + break; + case Escape: +- if (ctxt->modrm > 0xbf) +- opcode = opcode.u.esc->high[ctxt->modrm - 0xc0]; +- else ++ if (ctxt->modrm > 0xbf) { ++ size_t size = ARRAY_SIZE(opcode.u.esc->high); ++ u32 index = array_index_nospec( ++ ctxt->modrm - 0xc0, size); ++ ++ opcode = opcode.u.esc->high[index]; ++ } else { + opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7]; ++ } + break; + case InstrDual: + if ((ctxt->modrm >> 6) == 3) diff --git a/queue-5.5/kvm-x86-refactor-picdev_write-to-prevent-spectre-v1-l1tf-attacks.patch b/queue-5.5/kvm-x86-refactor-picdev_write-to-prevent-spectre-v1-l1tf-attacks.patch new file mode 100644 index 00000000000..eb7e20daeab --- /dev/null +++ b/queue-5.5/kvm-x86-refactor-picdev_write-to-prevent-spectre-v1-l1tf-attacks.patch @@ -0,0 +1,45 @@ +From 14e32321f3606e4b0970200b6e5e47ee6f1e6410 Mon Sep 17 00:00:00 2001 +From: Marios Pomonis +Date: Wed, 11 Dec 2019 12:47:43 -0800 +Subject: KVM: x86: Refactor picdev_write() to prevent Spectre-v1/L1TF attacks + +From: Marios Pomonis + +commit 14e32321f3606e4b0970200b6e5e47ee6f1e6410 upstream. + +This fixes a Spectre-v1/L1TF vulnerability in picdev_write(). +It replaces index computations based on the (attacker-controlled) port +number with constants through a minor refactoring. + +Fixes: 85f455f7ddbe ("KVM: Add support for in-kernel PIC emulation") + +Signed-off-by: Nick Finco +Signed-off-by: Marios Pomonis +Reviewed-by: Andrew Honig +Cc: stable@vger.kernel.org +Reviewed-by: Jim Mattson +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/i8259.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/arch/x86/kvm/i8259.c ++++ b/arch/x86/kvm/i8259.c +@@ -460,10 +460,14 @@ static int picdev_write(struct kvm_pic * + switch (addr) { + case 0x20: + case 0x21: ++ pic_lock(s); ++ pic_ioport_write(&s->pics[0], addr, data); ++ pic_unlock(s); ++ break; + case 0xa0: + case 0xa1: + pic_lock(s); +- pic_ioport_write(&s->pics[addr >> 7], addr, data); ++ pic_ioport_write(&s->pics[1], addr, data); + pic_unlock(s); + break; + case 0x4d0: diff --git a/queue-5.5/kvm-x86-refactor-prefix-decoding-to-prevent-spectre-v1-l1tf-attacks.patch b/queue-5.5/kvm-x86-refactor-prefix-decoding-to-prevent-spectre-v1-l1tf-attacks.patch new file mode 100644 index 00000000000..16323a9346e --- /dev/null +++ b/queue-5.5/kvm-x86-refactor-prefix-decoding-to-prevent-spectre-v1-l1tf-attacks.patch @@ -0,0 +1,57 @@ +From 125ffc5e0a56a3eded608dc51e09d5ebf72cf652 Mon Sep 17 00:00:00 2001 +From: Marios Pomonis +Date: Wed, 11 Dec 2019 12:47:50 -0800 +Subject: KVM: x86: Refactor prefix decoding to prevent Spectre-v1/L1TF attacks + +From: Marios Pomonis + +commit 125ffc5e0a56a3eded608dc51e09d5ebf72cf652 upstream. + +This fixes Spectre-v1/L1TF vulnerabilities in +vmx_read_guest_seg_selector(), vmx_read_guest_seg_base(), +vmx_read_guest_seg_limit() and vmx_read_guest_seg_ar(). When +invoked from emulation, these functions contain index computations +based on the (attacker-influenced) segment value. Using constants +prevents the attack.
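The two refactoring patches take the other route: instead of clamping a computed index, they remove the attacker-influenced index altogether by giving each input value its own case that stores a compile-time constant, so nothing is left to clamp. The shape of the transformation, with hypothetical segment constants standing in for VCPU_SREG_*:

    enum seg { SEG_ES, SEG_CS, SEG_SS, SEG_DS, SEG_NONE };

    /* before: seg = (opcode >> 3) & 3; -- an index computed from a
     * guest-controlled byte, usable as a Spectre-v1 gadget */
    static enum seg seg_override(unsigned char opcode)
    {
        switch (opcode) {
        case 0x26: return SEG_ES;  /* constants only: nothing left */
        case 0x2e: return SEG_CS;  /* for speculation to abuse     */
        case 0x36: return SEG_SS;
        case 0x3e: return SEG_DS;
        default:   return SEG_NONE;
        }
    }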
+ +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/emulate.c | 16 ++++++++++++++-- + 1 file changed, 14 insertions(+), 2 deletions(-) + +--- a/arch/x86/kvm/emulate.c ++++ b/arch/x86/kvm/emulate.c +@@ -5210,16 +5210,28 @@ int x86_decode_insn(struct x86_emulate_c + ctxt->ad_bytes = def_ad_bytes ^ 6; + break; + case 0x26: /* ES override */ ++ has_seg_override = true; ++ ctxt->seg_override = VCPU_SREG_ES; ++ break; + case 0x2e: /* CS override */ ++ has_seg_override = true; ++ ctxt->seg_override = VCPU_SREG_CS; ++ break; + case 0x36: /* SS override */ ++ has_seg_override = true; ++ ctxt->seg_override = VCPU_SREG_SS; ++ break; + case 0x3e: /* DS override */ + has_seg_override = true; +- ctxt->seg_override = (ctxt->b >> 3) & 3; ++ ctxt->seg_override = VCPU_SREG_DS; + break; + case 0x64: /* FS override */ ++ has_seg_override = true; ++ ctxt->seg_override = VCPU_SREG_FS; ++ break; + case 0x65: /* GS override */ + has_seg_override = true; +- ctxt->seg_override = ctxt->b & 7; ++ ctxt->seg_override = VCPU_SREG_GS; + break; + case 0x40 ... 0x4f: /* REX */ + if (mode != X86EMUL_MODE_PROT64) diff --git a/queue-5.5/media-iguanair-fix-endpoint-sanity-check.patch b/queue-5.5/media-iguanair-fix-endpoint-sanity-check.patch new file mode 100644 index 00000000000..641d7a30c39 --- /dev/null +++ b/queue-5.5/media-iguanair-fix-endpoint-sanity-check.patch @@ -0,0 +1,40 @@ +From 1b257870a78b0a9ce98fdfb052c58542022ffb5b Mon Sep 17 00:00:00 2001 +From: Johan Hovold +Date: Fri, 3 Jan 2020 17:35:13 +0100 +Subject: media: iguanair: fix endpoint sanity check + +From: Johan Hovold + +commit 1b257870a78b0a9ce98fdfb052c58542022ffb5b upstream. + +Make sure to use the current alternate setting, which need not be the +first one by index, when verifying the endpoint descriptors and +initialising the URBs. + +Failing to do so could cause the driver to misbehave or trigger a WARN() +in usb_submit_urb() that kernels with panic_on_warn set would choke on. + +Fixes: 26ff63137c45 ("[media] Add support for the IguanaWorks USB IR Transceiver") +Fixes: ab1cbdf159be ("media: iguanair: add sanity checks") +Cc: stable # 3.6 +Cc: Oliver Neukum +Signed-off-by: Johan Hovold +Signed-off-by: Sean Young +Signed-off-by: Mauro Carvalho Chehab +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/media/rc/iguanair.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/media/rc/iguanair.c ++++ b/drivers/media/rc/iguanair.c +@@ -413,7 +413,7 @@ static int iguanair_probe(struct usb_int + int ret, pipein, pipeout; + struct usb_host_interface *idesc; + +- idesc = intf->altsetting; ++ idesc = intf->cur_altsetting; + if (idesc->desc.bNumEndpoints < 2) + return -ENODEV; + diff --git a/queue-5.5/media-rc-ensure-lirc-is-initialized-before-registering-input-device.patch b/queue-5.5/media-rc-ensure-lirc-is-initialized-before-registering-input-device.patch new file mode 100644 index 00000000000..8cdd96b8317 --- /dev/null +++ b/queue-5.5/media-rc-ensure-lirc-is-initialized-before-registering-input-device.patch @@ -0,0 +1,145 @@ +From 080d89f522e2baddb4fbbd1af4b67b5f92537ef8 Mon Sep 17 00:00:00 2001 +From: Sean Young +Date: Thu, 21 Nov 2019 11:10:47 +0100 +Subject: media: rc: ensure lirc is initialized before registering input device + +From: Sean Young + +commit 080d89f522e2baddb4fbbd1af4b67b5f92537ef8 upstream. + +Once rc_open is called on the input device, lirc events can be delivered. 
+Ensure lirc is ready to do so else we might get this: + +Registered IR keymap rc-hauppauge +rc rc0: Hauppauge WinTV PVR-350 as +/devices/pci0000:00/0000:00:1e.0/0000:04:00.0/i2c-0/0-0018/rc/rc0 +input: Hauppauge WinTV PVR-350 as +/devices/pci0000:00/0000:00:1e.0/0000:04:00.0/i2c-0/0-0018/rc/rc0/input9 +BUG: kernel NULL pointer dereference, address: 0000000000000038 +PGD 0 P4D 0 +Oops: 0000 [#1] SMP PTI +CPU: 1 PID: 17 Comm: kworker/1:0 Not tainted 5.3.11-300.fc31.x86_64 #1 +Hardware name: /DG43NB, BIOS NBG4310H.86A.0096.2009.0903.1845 09/03/2009 +Workqueue: events ir_work [ir_kbd_i2c] +RIP: 0010:ir_lirc_scancode_event+0x3d/0xb0 +Code: a6 b4 07 00 00 49 81 c6 b8 07 00 00 55 53 e8 ba a7 9d ff 4c 89 +e7 49 89 45 00 e8 5e 7a 25 00 49 8b 1e 48 89 c5 4c 39 f3 74 58 <8b> 43 +38 8b 53 40 89 c1 2b 4b 3c 39 ca 72 41 21 d0 49 8b 7d 00 49 +RSP: 0018:ffffaae2000b3d88 EFLAGS: 00010017 +RAX: 0000000000000002 RBX: 0000000000000000 RCX: 0000000000000019 +RDX: 0000000000000001 RSI: 006e801b1f26ce6a RDI: ffff9e39797c37b4 +RBP: 0000000000000002 R08: 0000000000000001 R09: 0000000000000001 +R10: 0000000000000001 R11: 0000000000000001 R12: ffff9e39797c37b4 +R13: ffffaae2000b3db8 R14: ffff9e39797c37b8 R15: ffff9e39797c33d8 +FS: 0000000000000000(0000) GS:ffff9e397b680000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 0000000000000038 CR3: 0000000035844000 CR4: 00000000000006e0 +Call Trace: +ir_do_keydown+0x8e/0x2b0 +rc_keydown+0x52/0xc0 +ir_work+0xb8/0x130 [ir_kbd_i2c] +process_one_work+0x19d/0x340 +worker_thread+0x50/0x3b0 +kthread+0xfb/0x130 +? process_one_work+0x340/0x340 +? kthread_park+0x80/0x80 +ret_from_fork+0x35/0x40 +Modules linked in: rc_hauppauge tuner msp3400 saa7127 saa7115 ivtv(+) +tveeprom cx2341x v4l2_common videodev mc i2c_algo_bit ir_kbd_i2c +ip_tables firewire_ohci e1000e serio_raw firewire_core ata_generic +crc_itu_t pata_acpi pata_jmicron fuse +CR2: 0000000000000038 +---[ end trace c67c2697a99fa74b ]--- +RIP: 0010:ir_lirc_scancode_event+0x3d/0xb0 +Code: a6 b4 07 00 00 49 81 c6 b8 07 00 00 55 53 e8 ba a7 9d ff 4c 89 +e7 49 89 45 00 e8 5e 7a 25 00 49 8b 1e 48 89 c5 4c 39 f3 74 58 <8b> 43 +38 8b 53 40 89 c1 2b 4b 3c 39 ca 72 41 21 d0 49 8b 7d 00 49 +RSP: 0018:ffffaae2000b3d88 EFLAGS: 00010017 +RAX: 0000000000000002 RBX: 0000000000000000 RCX: 0000000000000019 +RDX: 0000000000000001 RSI: 006e801b1f26ce6a RDI: ffff9e39797c37b4 +RBP: 0000000000000002 R08: 0000000000000001 R09: 0000000000000001 +R10: 0000000000000001 R11: 0000000000000001 R12: ffff9e39797c37b4 +R13: ffffaae2000b3db8 R14: ffff9e39797c37b8 R15: ffff9e39797c33d8 +FS: 0000000000000000(0000) GS:ffff9e397b680000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 0000000000000038 CR3: 0000000035844000 CR4: 00000000000006e0 +rc rc0: lirc_dev: driver ir_kbd_i2c registered at minor = 0, scancode +receiver, no transmitter +tuner-simple 0-0061: creating new instance +tuner-simple 0-0061: type set to 2 (Philips NTSC (FI1236,FM1236 and +compatibles)) +ivtv0: Registered device video0 for encoder MPG (4096 kB) +ivtv0: Registered device video32 for encoder YUV (2048 kB) +ivtv0: Registered device vbi0 for encoder VBI (1024 kB) +ivtv0: Registered device video24 for encoder PCM (320 kB) +ivtv0: Registered device radio0 for encoder radio +ivtv0: Registered device video16 for decoder MPG (1024 kB) +ivtv0: Registered device vbi8 for decoder VBI (64 kB) +ivtv0: Registered device vbi16 for decoder VOUT +ivtv0: Registered device video48 for decoder YUV (1024 kB) + +Cc: 
stable@vger.kernel.org +Tested-by: Nick French +Reported-by: Nick French +Signed-off-by: Sean Young +Signed-off-by: Mauro Carvalho Chehab +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/media/rc/rc-main.c | 27 ++++++++++++++++----------- + 1 file changed, 16 insertions(+), 11 deletions(-) + +--- a/drivers/media/rc/rc-main.c ++++ b/drivers/media/rc/rc-main.c +@@ -1891,23 +1891,28 @@ int rc_register_device(struct rc_dev *de + + dev->registered = true; + +- if (dev->driver_type != RC_DRIVER_IR_RAW_TX) { +- rc = rc_setup_rx_device(dev); +- if (rc) +- goto out_dev; +- } +- +- /* Ensure that the lirc kfifo is setup before we start the thread */ ++ /* ++ * once the input device is registered in rc_setup_rx_device, ++ * userspace can open the input device and rc_open() will be called ++ * as a result. This results in driver code being allowed to submit ++ * keycodes with rc_keydown, so lirc must be registered first. ++ */ + if (dev->allowed_protocols != RC_PROTO_BIT_CEC) { + rc = ir_lirc_register(dev); + if (rc < 0) +- goto out_rx; ++ goto out_dev; ++ } ++ ++ if (dev->driver_type != RC_DRIVER_IR_RAW_TX) { ++ rc = rc_setup_rx_device(dev); ++ if (rc) ++ goto out_lirc; + } + + if (dev->driver_type == RC_DRIVER_IR_RAW) { + rc = ir_raw_event_register(dev); + if (rc < 0) +- goto out_lirc; ++ goto out_rx; + } + + dev_dbg(&dev->dev, "Registered rc%u (driver: %s)\n", dev->minor, +@@ -1915,11 +1920,11 @@ int rc_register_device(struct rc_dev *de + + return 0; + ++out_rx: ++ rc_free_rx_device(dev); + out_lirc: + if (dev->allowed_protocols != RC_PROTO_BIT_CEC) + ir_lirc_unregister(dev); +-out_rx: +- rc_free_rx_device(dev); + out_dev: + device_del(&dev->dev); + out_rx_free: diff --git a/queue-5.5/mwifiex-fix-unbalanced-locking-in-mwifiex_process_country_ie.patch b/queue-5.5/mwifiex-fix-unbalanced-locking-in-mwifiex_process_country_ie.patch new file mode 100644 index 00000000000..e740b6ec0e1 --- /dev/null +++ b/queue-5.5/mwifiex-fix-unbalanced-locking-in-mwifiex_process_country_ie.patch @@ -0,0 +1,35 @@ +From 65b1aae0d9d5962faccc06bdb8e91a2a0b09451c Mon Sep 17 00:00:00 2001 +From: Brian Norris +Date: Mon, 6 Jan 2020 14:42:12 -0800 +Subject: mwifiex: fix unbalanced locking in mwifiex_process_country_ie() + +From: Brian Norris + +commit 65b1aae0d9d5962faccc06bdb8e91a2a0b09451c upstream. + +We called rcu_read_lock(), so we need to call rcu_read_unlock() before +we return.
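The rule being enforced: every path out of an rcu_read_lock()/rcu_read_unlock() section, including early error returns, must drop the lock. A toy model that makes the imbalance observable; the counter stands in for the real RCU API:

    #include <assert.h>
    #include <stdbool.h>

    static int rcu_depth;

    static void toy_rcu_lock(void)   { rcu_depth++; }
    static void toy_rcu_unlock(void) { rcu_depth--; }

    static int process_country_ie(bool ie_too_long)
    {
        toy_rcu_lock();
        if (ie_too_long) {
            toy_rcu_unlock();  /* the fix: unlock on the error path too */
            return -1;
        }
        /* ... use the RCU-protected IE data ... */
        toy_rcu_unlock();
        return 0;
    }

    int main(void)
    {
        process_country_ie(true);
        assert(rcu_depth == 0);  /* fails if the error path leaks the lock */
        return 0;
    }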
+ +Fixes: 3d94a4a8373b ("mwifiex: fix possible heap overflow in mwifiex_process_country_ie()") +Cc: stable@vger.kernel.org +Cc: huangwen +Cc: Ganapathi Bhat +Signed-off-by: Brian Norris +Acked-by: Ganapathi Bhat +Signed-off-by: Kalle Valo +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/net/wireless/marvell/mwifiex/sta_ioctl.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c ++++ b/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c +@@ -232,6 +232,7 @@ static int mwifiex_process_country_ie(st + + if (country_ie_len > + (IEEE80211_COUNTRY_STRING_LEN + MWIFIEX_MAX_TRIPLET_802_11D)) { ++ rcu_read_unlock(); + mwifiex_dbg(priv->adapter, ERROR, + "11D: country_ie_len overflow!, deauth AP\n"); + return -EINVAL; diff --git a/queue-5.5/nfs-directory-page-cache-pages-need-to-be-locked-when-read.patch b/queue-5.5/nfs-directory-page-cache-pages-need-to-be-locked-when-read.patch new file mode 100644 index 00000000000..43b2d9faa1e --- /dev/null +++ b/queue-5.5/nfs-directory-page-cache-pages-need-to-be-locked-when-read.patch @@ -0,0 +1,112 @@ +From 114de38225d9b300f027e2aec9afbb6e0def154b Mon Sep 17 00:00:00 2001 +From: Trond Myklebust +Date: Sun, 2 Feb 2020 17:53:54 -0500 +Subject: NFS: Directory page cache pages need to be locked when read + +From: Trond Myklebust + +commit 114de38225d9b300f027e2aec9afbb6e0def154b upstream. + +When a NFS directory page cache page is removed from the page cache, +its contents are freed through a call to nfs_readdir_clear_array(). +To prevent the removal of the page cache entry until after we've +finished reading it, we must take the page lock. + +Fixes: 11de3b11e08c ("NFS: Fix a memory leak in nfs_readdir") +Cc: stable@vger.kernel.org # v2.6.37+ +Signed-off-by: Trond Myklebust +Reviewed-by: Benjamin Coddington +Signed-off-by: Anna Schumaker +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nfs/dir.c | 30 +++++++++++++++++++----------- + 1 file changed, 19 insertions(+), 11 deletions(-) + +--- a/fs/nfs/dir.c ++++ b/fs/nfs/dir.c +@@ -702,8 +702,6 @@ int nfs_readdir_filler(void *data, struc + static + void cache_page_release(nfs_readdir_descriptor_t *desc) + { +- if (!desc->page->mapping) +- nfs_readdir_clear_array(desc->page); + put_page(desc->page); + desc->page = NULL; + } +@@ -717,19 +715,28 @@ struct page *get_cache_page(nfs_readdir_ + + /* + * Returns 0 if desc->dir_cookie was found on page desc->page_index ++ * and locks the page to prevent removal from the page cache. 
+ */ + static +-int find_cache_page(nfs_readdir_descriptor_t *desc) ++int find_and_lock_cache_page(nfs_readdir_descriptor_t *desc) + { + int res; + + desc->page = get_cache_page(desc); + if (IS_ERR(desc->page)) + return PTR_ERR(desc->page); +- +- res = nfs_readdir_search_array(desc); ++ res = lock_page_killable(desc->page); + if (res != 0) +- cache_page_release(desc); ++ goto error; ++ res = -EAGAIN; ++ if (desc->page->mapping != NULL) { ++ res = nfs_readdir_search_array(desc); ++ if (res == 0) ++ return 0; ++ } ++ unlock_page(desc->page); ++error: ++ cache_page_release(desc); + return res; + } + +@@ -744,7 +751,7 @@ int readdir_search_pagecache(nfs_readdir + desc->last_cookie = 0; + } + do { +- res = find_cache_page(desc); ++ res = find_and_lock_cache_page(desc); + } while (res == -EAGAIN); + return res; + } +@@ -783,7 +790,6 @@ int nfs_do_filldir(nfs_readdir_descripto + desc->eof = true; + + kunmap(desc->page); +- cache_page_release(desc); + dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", + (unsigned long long)*desc->dir_cookie, res); + return res; +@@ -829,13 +835,13 @@ int uncached_readdir(nfs_readdir_descrip + + status = nfs_do_filldir(desc); + ++ out_release: ++ nfs_readdir_clear_array(desc->page); ++ cache_page_release(desc); + out: + dfprintk(DIRCACHE, "NFS: %s: returns %d\n", + __func__, status); + return status; +- out_release: +- cache_page_release(desc); +- goto out; + } + + /* The file offset position represents the dirent entry number. A +@@ -900,6 +906,8 @@ static int nfs_readdir(struct file *file + break; + + res = nfs_do_filldir(desc); ++ unlock_page(desc->page); ++ cache_page_release(desc); + if (res < 0) + break; + } while (!desc->eof); diff --git a/queue-5.5/nfs-fix-memory-leaks-and-corruption-in-readdir.patch b/queue-5.5/nfs-fix-memory-leaks-and-corruption-in-readdir.patch new file mode 100644 index 00000000000..9202917ed15 --- /dev/null +++ b/queue-5.5/nfs-fix-memory-leaks-and-corruption-in-readdir.patch @@ -0,0 +1,81 @@ +From 4b310319c6a8ce708f1033d57145e2aa027a883c Mon Sep 17 00:00:00 2001 +From: Trond Myklebust +Date: Sun, 2 Feb 2020 17:53:53 -0500 +Subject: NFS: Fix memory leaks and corruption in readdir + +From: Trond Myklebust + +commit 4b310319c6a8ce708f1033d57145e2aa027a883c upstream. + +nfs_readdir_xdr_to_array() must not exit without having initialised +the array, so that the page cache deletion routines can safely +call nfs_readdir_clear_array(). +Furthermore, we should ensure that if we exit nfs_readdir_filler() +with an error, we free up any page contents to prevent a leak +if we try to fill the page again. 
+ +Fixes: 11de3b11e08c ("NFS: Fix a memory leak in nfs_readdir") +Cc: stable@vger.kernel.org # v2.6.37+ +Signed-off-by: Trond Myklebust +Reviewed-by: Benjamin Coddington +Signed-off-by: Anna Schumaker +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nfs/dir.c | 17 +++++++++++++++-- + 1 file changed, 15 insertions(+), 2 deletions(-) + +--- a/fs/nfs/dir.c ++++ b/fs/nfs/dir.c +@@ -162,6 +162,17 @@ typedef struct { + bool eof; + } nfs_readdir_descriptor_t; + ++static ++void nfs_readdir_init_array(struct page *page) ++{ ++ struct nfs_cache_array *array; ++ ++ array = kmap_atomic(page); ++ memset(array, 0, sizeof(struct nfs_cache_array)); ++ array->eof_index = -1; ++ kunmap_atomic(array); ++} ++ + /* + * we are freeing strings created by nfs_add_to_readdir_array() + */ +@@ -174,6 +185,7 @@ void nfs_readdir_clear_array(struct page + array = kmap_atomic(page); + for (i = 0; i < array->size; i++) + kfree(array->array[i].string.name); ++ array->size = 0; + kunmap_atomic(array); + } + +@@ -610,6 +622,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir + int status = -ENOMEM; + unsigned int array_size = ARRAY_SIZE(pages); + ++ nfs_readdir_init_array(page); ++ + entry.prev_cookie = 0; + entry.cookie = desc->last_cookie; + entry.eof = 0; +@@ -626,8 +640,6 @@ int nfs_readdir_xdr_to_array(nfs_readdir + } + + array = kmap(page); +- memset(array, 0, sizeof(struct nfs_cache_array)); +- array->eof_index = -1; + + status = nfs_readdir_alloc_pages(pages, array_size); + if (status < 0) +@@ -682,6 +694,7 @@ int nfs_readdir_filler(void *data, struc + unlock_page(page); + return 0; + error: ++ nfs_readdir_clear_array(page); + unlock_page(page); + return ret; + } diff --git a/queue-5.5/nfsd-fix-filecache-lookup.patch b/queue-5.5/nfsd-fix-filecache-lookup.patch new file mode 100644 index 00000000000..5c206281dd3 --- /dev/null +++ b/queue-5.5/nfsd-fix-filecache-lookup.patch @@ -0,0 +1,44 @@ +From 28c7d86bb6172ffbb1a1237c6388e77f9fe5f181 Mon Sep 17 00:00:00 2001 +From: Trond Myklebust +Date: Mon, 6 Jan 2020 13:18:03 -0500 +Subject: nfsd: fix filecache lookup + +From: Trond Myklebust + +commit 28c7d86bb6172ffbb1a1237c6388e77f9fe5f181 upstream. + +If the lookup keeps finding a nfsd_file with an unhashed open file, +then retry once only. + +Signed-off-by: Trond Myklebust +Cc: stable@vger.kernel.org +Fixes: 65294c1f2c5e "nfsd: add a new struct file caching facility to nfsd" +Signed-off-by: J. Bruce Fields +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nfsd/filecache.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/fs/nfsd/filecache.c ++++ b/fs/nfsd/filecache.c +@@ -789,6 +789,7 @@ nfsd_file_acquire(struct svc_rqst *rqstp + struct nfsd_file *nf, *new; + struct inode *inode; + unsigned int hashval; ++ bool retry = true; + + /* FIXME: skip this if fh_dentry is already set? */ + status = fh_verify(rqstp, fhp, S_IFREG, +@@ -824,6 +825,11 @@ wait_for_construction: + + /* Did construction of this file fail? 
*/ + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { ++ if (!retry) { ++ status = nfserr_jukebox; ++ goto out; ++ } ++ retry = false; + nfsd_file_put_noref(nf); + goto retry; + } diff --git a/queue-5.5/powerpc-futex-fix-incorrect-user-access-blocking.patch b/queue-5.5/powerpc-futex-fix-incorrect-user-access-blocking.patch new file mode 100644 index 00000000000..1c4bfb667bf --- /dev/null +++ b/queue-5.5/powerpc-futex-fix-incorrect-user-access-blocking.patch @@ -0,0 +1,105 @@ +From 9dc086f1e9ef39dd823bd27954b884b2062f9e70 Mon Sep 17 00:00:00 2001 +From: Michael Ellerman +Date: Fri, 7 Feb 2020 22:15:46 +1100 +Subject: powerpc/futex: Fix incorrect user access blocking + +From: Michael Ellerman + +commit 9dc086f1e9ef39dd823bd27954b884b2062f9e70 upstream. + +The early versions of our kernel user access prevention (KUAP) were +written by Russell and Christophe, and didn't have separate +read/write access. + +At some point I picked up the series and added the read/write access, +but I failed to update the usages in futex.h to correctly allow read +and write. + +However we didn't notice because of another bug which was causing the +low-level code to always enable read and write. That bug was fixed +recently in commit 1d8f739b07bd ("powerpc/kuap: Fix set direction in +allow/prevent_user_access()"). + +futex_atomic_cmpxchg_inatomic() is passed the user address as %3 and +does: + + 1: lwarx %1, 0, %3 + cmpw 0, %1, %4 + bne- 3f + 2: stwcx. %5, 0, %3 + +Which clearly loads and stores from/to %3. The logic in +arch_futex_atomic_op_inuser() is similar, so fix both of them to use +allow_read_write_user(). + +Without this fix, and with PPC_KUAP_DEBUG=y, we see eg: + + Bug: Read fault blocked by AMR! + WARNING: CPU: 94 PID: 149215 at arch/powerpc/include/asm/book3s/64/kup-radix.h:126 __do_page_fault+0x600/0xf30 + CPU: 94 PID: 149215 Comm: futex_requeue_p Tainted: G W 5.5.0-rc7-gcc9x-g4c25df5640ae #1 + ... 
+ NIP [c000000000070680] __do_page_fault+0x600/0xf30
+ LR [c00000000007067c] __do_page_fault+0x5fc/0xf30
+ Call Trace:
+ [c00020138e5637e0] [c00000000007067c] __do_page_fault+0x5fc/0xf30 (unreliable)
+ [c00020138e5638c0] [c00000000000ada8] handle_page_fault+0x10/0x30
+ --- interrupt: 301 at cmpxchg_futex_value_locked+0x68/0xd0
+ LR = futex_lock_pi_atomic+0xe0/0x1f0
+ [c00020138e563bc0] [c000000000217b50] futex_lock_pi_atomic+0x80/0x1f0 (unreliable)
+ [c00020138e563c30] [c00000000021b668] futex_requeue+0x438/0xb60
+ [c00020138e563d60] [c00000000021c6cc] do_futex+0x1ec/0x2b0
+ [c00020138e563d90] [c00000000021c8b8] sys_futex+0x128/0x200
+ [c00020138e563e20] [c00000000000b7ac] system_call+0x5c/0x68
+
+Fixes: de78a9c42a79 ("powerpc: Add a framework for Kernel Userspace Access Protection")
+Cc: stable@vger.kernel.org # v5.2+
+Reported-by: syzbot+e808452bad7c375cbee6@syzkaller-ppc64.appspotmail.com
+Signed-off-by: Michael Ellerman
+Reviewed-by: Christophe Leroy
+Link: https://lore.kernel.org/r/20200207122145.11928-1-mpe@ellerman.id.au
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/powerpc/include/asm/futex.h | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+--- a/arch/powerpc/include/asm/futex.h
++++ b/arch/powerpc/include/asm/futex.h
+@@ -35,7 +35,7 @@ static inline int arch_futex_atomic_op_i
+ {
+ int oldval = 0, ret;
+
+- allow_write_to_user(uaddr, sizeof(*uaddr));
++ allow_read_write_user(uaddr, uaddr, sizeof(*uaddr));
+ pagefault_disable();
+
+ switch (op) {
+@@ -62,7 +62,7 @@ static inline int arch_futex_atomic_op_i
+
+ *oval = oldval;
+
+- prevent_write_to_user(uaddr, sizeof(*uaddr));
++ prevent_read_write_user(uaddr, uaddr, sizeof(*uaddr));
+ return ret;
+ }
+
+@@ -76,7 +76,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval,
+ if (!access_ok(uaddr, sizeof(u32)))
+ return -EFAULT;
+
+- allow_write_to_user(uaddr, sizeof(*uaddr));
++ allow_read_write_user(uaddr, uaddr, sizeof(*uaddr));
++
+ __asm__ __volatile__ (
+ PPC_ATOMIC_ENTRY_BARRIER
+ "1: lwarx %1,0,%3 # futex_atomic_cmpxchg_inatomic\n\
+@@ -97,7 +98,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval,
+ : "cc", "memory");
+
+ *uval = prev;
+- prevent_write_to_user(uaddr, sizeof(*uaddr));
++ prevent_read_write_user(uaddr, uaddr, sizeof(*uaddr));
++
+ return ret;
+ }
+
diff --git a/queue-5.5/scsi-qla2xxx-fix-unbound-nvme-response-length.patch b/queue-5.5/scsi-qla2xxx-fix-unbound-nvme-response-length.patch
new file mode 100644
index 00000000000..7b622b9be37
--- /dev/null
+++ b/queue-5.5/scsi-qla2xxx-fix-unbound-nvme-response-length.patch
@@ -0,0 +1,78 @@
+From 00fe717ee1ea3c2979db4f94b1533c57aed8dea9 Mon Sep 17 00:00:00 2001
+From: Arun Easi
+Date: Thu, 23 Jan 2020 20:50:14 -0800
+Subject: scsi: qla2xxx: Fix unbound NVME response length
+
+From: Arun Easi
+
+commit 00fe717ee1ea3c2979db4f94b1533c57aed8dea9 upstream.
+
+In certain cases, when the response length is less than 32, NVME response
+data is supplied inline in the IOCB. This is indicated by some combination
+of state flags. There was an instance when a high, and incorrect, response
+length was indicated, causing the driver to overrun buffers. Fix this by
+checking and limiting the response payload length.
+
+Fixes: 7401bc18d1ee3 ("scsi: qla2xxx: Add FC-NVMe command handling")
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20200124045014.23554-1-hmadhani@marvell.com
+Signed-off-by: Arun Easi
+Signed-off-by: Himanshu Madhani
+Reviewed-by: Ewan D. Milne
+Signed-off-by: Martin K. Petersen
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/scsi/qla2xxx/qla_dbg.c | 6 ------
+ drivers/scsi/qla2xxx/qla_dbg.h | 6 ++++++
+ drivers/scsi/qla2xxx/qla_isr.c | 12 ++++++++++++
+ 3 files changed, 18 insertions(+), 6 deletions(-)
+
+--- a/drivers/scsi/qla2xxx/qla_dbg.c
++++ b/drivers/scsi/qla2xxx/qla_dbg.c
+@@ -2519,12 +2519,6 @@ qla83xx_fw_dump_failed:
+ /* Driver Debug Functions. */
+ /****************************************************************************/
+
+-static inline int
+-ql_mask_match(uint level)
+-{
+- return (level & ql2xextended_error_logging) == level;
+-}
+-
+ /*
+ * This function is for formatting and logging debug information.
+ * It is to be used when vha is available. It formats the message
+--- a/drivers/scsi/qla2xxx/qla_dbg.h
++++ b/drivers/scsi/qla2xxx/qla_dbg.h
+@@ -374,3 +374,9 @@ extern int qla24xx_dump_ram(struct qla_h
+ extern void qla24xx_pause_risc(struct device_reg_24xx __iomem *,
+ struct qla_hw_data *);
+ extern int qla24xx_soft_reset(struct qla_hw_data *);
++
++static inline int
++ql_mask_match(uint level)
++{
++ return (level & ql2xextended_error_logging) == level;
++}
+--- a/drivers/scsi/qla2xxx/qla_isr.c
++++ b/drivers/scsi/qla2xxx/qla_isr.c
+@@ -1918,6 +1918,18 @@ static void qla24xx_nvme_iocb_entry(scsi
+ inbuf = (uint32_t *)&sts->nvme_ersp_data;
+ outbuf = (uint32_t *)fd->rspaddr;
+ iocb->u.nvme.rsp_pyld_len = le16_to_cpu(sts->nvme_rsp_pyld_len);
++ if (unlikely(iocb->u.nvme.rsp_pyld_len >
++ sizeof(struct nvme_fc_ersp_iu))) {
++ if (ql_mask_match(ql_dbg_io)) {
++ WARN_ONCE(1, "Unexpected response payload length %u.\n",
++ iocb->u.nvme.rsp_pyld_len);
++ ql_log(ql_log_warn, fcport->vha, 0x5100,
++ "Unexpected response payload length %u.\n",
++ iocb->u.nvme.rsp_pyld_len);
++ }
++ iocb->u.nvme.rsp_pyld_len =
++ sizeof(struct nvme_fc_ersp_iu);
++ }
+ iter = iocb->u.nvme.rsp_pyld_len >> 2;
+ for (; iter; iter--)
+ *outbuf++ = swab32(*inbuf++);
diff --git a/queue-5.5/series b/queue-5.5/series
index 72a505ca0f2..f7041a62f0c 100644
--- a/queue-5.5/series
+++ b/queue-5.5/series
@@ -184,3 +184,60 @@ crypto-hisilicon-select-crypto_skcipher-not-crypto_blkcipher.patch
 crypto-atmel-aes-fix-counter-overflow-in-ctr-mode.patch
 crypto-api-fix-race-condition-in-crypto_spawn_alg.patch
 crypto-picoxcell-adjust-the-position-of-tasklet_init-and-fix-missed-tasklet_kill.patch
+powerpc-futex-fix-incorrect-user-access-blocking.patch
+scsi-qla2xxx-fix-unbound-nvme-response-length.patch
+nfs-fix-memory-leaks-and-corruption-in-readdir.patch
+nfs-directory-page-cache-pages-need-to-be-locked-when-read.patch
+nfsd-fix-filecache-lookup.patch
+jbd2_seq_info_next-should-increase-position-index.patch
+ext4-fix-deadlock-allocating-crypto-bounce-page-from-mempool.patch
+ext4-fix-race-conditions-in-d_compare-and-d_hash.patch
+btrfs-fix-missing-hole-after-hole-punching-and-fsync-when-using-no_holes.patch
+btrfs-make-deduplication-with-range-including-the-last-block-work.patch
+btrfs-fix-infinite-loop-during-fsync-after-rename-operations.patch
+btrfs-set-trans-drity-in-btrfs_commit_transaction.patch
+btrfs-drop-log-root-for-dropped-roots.patch
+btrfs-free-block-groups-after-free-ing-fs-trees.patch
+btrfs-fix-race-between-adding-and-putting-tree-mod-seq-elements-and-nodes.patch
+btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch
+btrfs-correctly-handle-empty-trees-in-find_first_clear_extent_bit.patch
+btrfs-send-fix-emission-of-invalid-clone-operations-within-the-same-file.patch
+arm-tegra-enable-pllp-bypass-during-tegra124-lp1.patch
+iwlwifi-don-t-throw-error-when-trying-to-remove-igtk.patch
+mwifiex-fix-unbalanced-locking-in-mwifiex_process_country_ie.patch
+sunrpc-expiry_time-should-be-seconds-not-timeval.patch
+gfs2-fix-gfs2_find_jhead-that-returns-uninitialized-jhead-with-seq-0.patch
+gfs2-move-setting-current-backing_dev_info.patch
+gfs2-fix-o_sync-write-handling.patch
+drm-atmel-hlcdc-use-double-rate-for-pixel-clock-only-if-supported.patch
+drm-atmel-hlcdc-enable-clock-before-configuring-timing-engine.patch
+drm-atmel-hlcdc-prefer-a-lower-pixel-clock-than-requested.patch
+drm-rect-avoid-division-by-zero.patch
+media-iguanair-fix-endpoint-sanity-check.patch
+media-rc-ensure-lirc-is-initialized-before-registering-input-device.patch
+tools-kvm_stat-fix-kvm_exit-filter-name.patch
+xen-balloon-support-xend-based-toolstack-take-two.patch
+xen-gntdev-do-not-use-mm-notifiers-with-autotranslating-guests.patch
+watchdog-fix-uaf-in-reboot-notifier-handling-in-watchdog-core-code.patch
+bcache-add-readahead-cache-policy-options-via-sysfs-interface.patch
+io_uring-don-t-map-read-write-iovec-potentially-twice.patch
+io_uring-spin-for-sq-thread-to-idle-on-shutdown.patch
+eventfd-track-eventfd_signal-recursion-depth.patch
+aio-prevent-potential-eventfd-recursion-on-poll.patch
+kvm-x86-refactor-picdev_write-to-prevent-spectre-v1-l1tf-attacks.patch
+kvm-x86-refactor-prefix-decoding-to-prevent-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-pmu_intel.c-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-dr-based-index-computations-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-kvm_lapic_reg_write-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-kvm_hv_msr__crash_data-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-ioapic_write_indirect-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-msr-based-index-computations-in-pmu.h-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-ioapic_read_indirect-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-msr-based-index-computations-from-spectre-v1-l1tf-attacks-in-x86.c.patch
+kvm-x86-protect-x86_decode_insn-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-msr-based-index-computations-in-fixed_msr_to_seg_unit-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-fix-potential-put_fpu-w-o-load_fpu-on-mpx-platform.patch
+kvm-ppc-book3s-hv-uninit-vcpu-if-vcore-creation-fails.patch
+kvm-ppc-book3s-pr-free-shared-page-if-mmu-initialization-fails.patch
+kvm-svm-pku-not-currently-supported.patch
+kvm-x86-mmu-apply-max-pa-check-for-mmio-sptes-to-32-bit-kvm.patch
diff --git a/queue-5.5/sunrpc-expiry_time-should-be-seconds-not-timeval.patch b/queue-5.5/sunrpc-expiry_time-should-be-seconds-not-timeval.patch
new file mode 100644
index 00000000000..13de17a4b1e
--- /dev/null
+++ b/queue-5.5/sunrpc-expiry_time-should-be-seconds-not-timeval.patch
@@ -0,0 +1,54 @@
+From 3d96208c30f84d6edf9ab4fac813306ac0d20c10 Mon Sep 17 00:00:00 2001
+From: Roberto Bergantinos Corpas
+Date: Tue, 4 Feb 2020 11:32:56 +0100
+Subject: sunrpc: expiry_time should be seconds not timeval
+
+From: Roberto Bergantinos Corpas
+
+commit 3d96208c30f84d6edf9ab4fac813306ac0d20c10 upstream.
+
+When upcalling gssproxy, cache_head.expiry_time is set as a
+timeval, not seconds since boot. As such, RPC cache expiry
+logic will not clean expired objects created under
+auth.rpcsec.context cache.
+
+This has proven to cause kernel memory leaks in the field. Fix this by
+using the 64-bit variants of getboottime/timespec.
+
+Expiration times have worked this way since 2010's c5b29f885afe "sunrpc:
+use seconds since boot in expiry cache".
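
To make the unit mismatch concrete, the arithmetic can be reproduced
entirely in userspace. The sketch below is illustrative only, not kernel
code: the Linux-specific CLOCK_BOOTTIME stands in for the kernel's
seconds-since-boot clock, and the 30-minute credential lifetime is
invented.

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
        struct timespec real, uptime;

        clock_gettime(CLOCK_REALTIME, &real);   /* wall-clock seconds */
        clock_gettime(CLOCK_BOOTTIME, &uptime); /* seconds since boot */

        /* Wall-clock time of the boot itself, i.e. roughly what the
         * kernel's getboottime64() reports: */
        time_t boot_epoch = real.tv_sec - uptime.tv_sec;

        /* gssproxy hands back a wall-clock expiry, say 30 minutes out: */
        time_t expiry_wall = real.tv_sec + 1800;

        /* Stored raw (the bug): the cache compares this value against
         * seconds since boot, so the entry looks valid for decades. */
        printf("raw expiry: %lld vs. uptime %lld\n",
               (long long)expiry_wall, (long long)uptime.tv_sec);

        /* Converted (the fix): subtract the boot epoch so the expiry
         * lands 1800 seconds past the current uptime. */
        printf("converted expiry: %lld\n",
               (long long)(expiry_wall - boot_epoch));
        return 0;
    }

The conversion in the last printf() is exactly what the fix below performs
in gss_proxy_save_rsc() via getboottime64().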
The gssproxy code introduced in 2012 added gss_proxy_save_rsc and
introduced the bug. That's a while for this to lurk, but it required a bit
of an extreme case to make it obvious.
+
+Signed-off-by: Roberto Bergantinos Corpas
+Cc: stable@vger.kernel.org
+Fixes: 030d794bf498 "SUNRPC: Use gssproxy upcall for server..."
+Tested-By: Frank Sorenson
+Signed-off-by: J. Bruce Fields
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ net/sunrpc/auth_gss/svcauth_gss.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/net/sunrpc/auth_gss/svcauth_gss.c
++++ b/net/sunrpc/auth_gss/svcauth_gss.c
+@@ -1248,6 +1248,7 @@ static int gss_proxy_save_rsc(struct cac
+ dprintk("RPC: No creds found!\n");
+ goto out;
+ } else {
++ struct timespec64 boot;
+
+ /* steal creds */
+ rsci.cred = ud->creds;
+@@ -1268,6 +1269,9 @@ static int gss_proxy_save_rsc(struct cac
+ &expiry, GFP_KERNEL);
+ if (status)
+ goto out;
++
++ getboottime64(&boot);
++ expiry -= boot.tv_sec;
+ }
+
+ rsci.h.expiry_time = expiry;
diff --git a/queue-5.5/tools-kvm_stat-fix-kvm_exit-filter-name.patch b/queue-5.5/tools-kvm_stat-fix-kvm_exit-filter-name.patch
new file mode 100644
index 00000000000..21d720fccf8
--- /dev/null
+++ b/queue-5.5/tools-kvm_stat-fix-kvm_exit-filter-name.patch
@@ -0,0 +1,73 @@
+From 5fcf3a55a62afb0760ccb6f391d62f20bce4a42f Mon Sep 17 00:00:00 2001
+From: Gavin Shan
+Date: Tue, 10 Dec 2019 15:48:29 +1100
+Subject: tools/kvm_stat: Fix kvm_exit filter name
+
+From: Gavin Shan
+
+commit 5fcf3a55a62afb0760ccb6f391d62f20bce4a42f upstream.
+
+The filter name is fixed to "exit_reason" for some kvm_exit events, no
+matter what architecture we have. Actually, the filter name ("exit_reason")
+is only applicable to x86, meaning it's broken on other architectures,
+including aarch64.
+
+This fixes the issue by providing various kvm_exit filter names, depending
+on the architecture we're on. Afterwards, the variable filter name is
+picked and applied through ioctl(fd, SET_FILTER).
+
+Reported-by: Andrew Jones
+Signed-off-by: Gavin Shan
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ tools/kvm/kvm_stat/kvm_stat | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/tools/kvm/kvm_stat/kvm_stat
++++ b/tools/kvm/kvm_stat/kvm_stat
+@@ -270,6 +270,7 @@ class ArchX86(Arch):
+ def __init__(self, exit_reasons):
+ self.sc_perf_evt_open = 298
+ self.ioctl_numbers = IOCTL_NUMBERS
++ self.exit_reason_field = 'exit_reason'
+ self.exit_reasons = exit_reasons
+
+ def debugfs_is_child(self, field):
+@@ -289,6 +290,7 @@ class ArchPPC(Arch):
+ # numbers depend on the wordsize.
+ char_ptr_size = ctypes.sizeof(ctypes.c_char_p)
+ self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16
++ self.exit_reason_field = 'exit_nr'
+ self.exit_reasons = {}
+
+ def debugfs_is_child(self, field):
+@@ -300,6 +302,7 @@ class ArchA64(Arch):
+ def __init__(self):
+ self.sc_perf_evt_open = 241
+ self.ioctl_numbers = IOCTL_NUMBERS
++ self.exit_reason_field = 'esr_ec'
+ self.exit_reasons = AARCH64_EXIT_REASONS
+
+ def debugfs_is_child(self, field):
+@@ -311,6 +314,7 @@ class ArchS390(Arch):
+ def __init__(self):
+ self.sc_perf_evt_open = 331
+ self.ioctl_numbers = IOCTL_NUMBERS
++ self.exit_reason_field = None
+ self.exit_reasons = None
+
+ def debugfs_is_child(self, field):
+@@ -541,8 +545,8 @@ class TracepointProvider(Provider):
+ """
+ filters = {}
+ filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS)
+- if ARCH.exit_reasons:
+- filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons)
++ if ARCH.exit_reason_field and ARCH.exit_reasons:
++ filters['kvm_exit'] = (ARCH.exit_reason_field, ARCH.exit_reasons)
+ return filters
+
+ def _get_available_fields(self):
diff --git a/queue-5.5/watchdog-fix-uaf-in-reboot-notifier-handling-in-watchdog-core-code.patch b/queue-5.5/watchdog-fix-uaf-in-reboot-notifier-handling-in-watchdog-core-code.patch
new file mode 100644
index 00000000000..c30f0f18113
--- /dev/null
+++ b/queue-5.5/watchdog-fix-uaf-in-reboot-notifier-handling-in-watchdog-core-code.patch
@@ -0,0 +1,197 @@
+From 69503e585192fdd84b240f18a0873d20e18a2e0a Mon Sep 17 00:00:00 2001
+From: Vladis Dronov
+Date: Wed, 8 Jan 2020 13:53:47 +0100
+Subject: watchdog: fix UAF in reboot notifier handling in watchdog core code
+
+From: Vladis Dronov
+
+commit 69503e585192fdd84b240f18a0873d20e18a2e0a upstream.
+
+After the commit 44ea39420fc9 ("drivers/watchdog: make use of
+devm_register_reboot_notifier()") the struct notifier_block reboot_nb in
+the struct watchdog_device is removed from the reboot notifiers chain at
+the time the watchdog's chardev is closed. But at least in the i6300esb.c
+case reboot_nb is embedded in the struct esb_dev which can be freed on its
+device removal and before the chardev is closed, thus a UAF at reboot:
+
+[ 7.728581] esb_probe: esb_dev.watchdog_device ffff91316f91ab28
+ts# uname -r note the address ^^^
+5.5.0-rc5-ae6088-wdog
+ts# ./openwdog0 &
+[1] 696
+ts# opened /dev/watchdog0, sleeping 10s...
+ts# echo 1 > /sys/devices/pci0000\:00/0000\:00\:09.0/remove
+[ 178.086079] devres:rel_nodes: dev ffff91317668a0b0 data ffff91316f91ab28
+ esb_dev.watchdog_device.reboot_nb memory is freed here ^^^
+ts# ...woken up
+[ 181.459010] devres:rel_nodes: dev ffff913171781000 data ffff913174a1dae8
+[ 181.460195] devm_unreg_reboot_notifier: res ffff913174a1dae8 nb ffff91316f91ab78
+ attempt to use memory already freed ^^^
+[ 181.461063] devm_unreg_reboot_notifier: nb->call 6b6b6b6b6b6b6b6b
+[ 181.461243] devm_unreg_reboot_notifier: nb->next 6b6b6b6b6b6b6b6b
+ freed memory is filled with a slub poison ^^^
+[1]+ Done ./openwdog0
+ts# reboot
+[ 229.921862] systemd-shutdown[1]: Rebooting.
+[ 229.939265] notifier_call_chain: nb ffffffff9c6c2f20 nb->next ffffffff9c6d50c0
+[ 229.943080] notifier_call_chain: nb ffffffff9c6d50c0 nb->next 6b6b6b6b6b6b6b6b
+[ 229.946054] notifier_call_chain: nb 6b6b6b6b6b6b6b6b INVAL
+[ 229.957584] general protection fault: 0000 [#1] SMP
+[ 229.958770] CPU: 0 PID: 1 Comm: systemd-shutdow Not tainted 5.5.0-rc5-ae6088-wdog
+[ 229.960224] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...
+[ 229.963288] RIP: 0010:notifier_call_chain+0x66/0xd0
+[ 229.969082] RSP: 0018:ffffb20dc0013d88 EFLAGS: 00010246
+[ 229.970812] RAX: 000000000000002e RBX: 6b6b6b6b6b6b6b6b RCX: 00000000000008b3
+[ 229.972929] RDX: 0000000000000000 RSI: 0000000000000096 RDI: ffffffff9ccc46ac
+[ 229.975028] RBP: 0000000000000001 R08: 0000000000000000 R09: 00000000000008b3
+[ 229.977039] R10: 0000000000000001 R11: ffffffff9c26c740 R12: 0000000000000000
+[ 229.979155] R13: 6b6b6b6b6b6b6b6b R14: 0000000000000000 R15: 00000000fffffffa
+... slub_debug=FZP poison ^^^
+[ 229.989089] Call Trace:
+[ 229.990157] blocking_notifier_call_chain+0x43/0x59
+[ 229.991401] kernel_restart_prepare+0x14/0x30
+[ 229.992607] kernel_restart+0x9/0x30
+[ 229.993800] __do_sys_reboot+0x1d2/0x210
+[ 230.000149] do_syscall_64+0x3d/0x130
+[ 230.001277] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+[ 230.002639] RIP: 0033:0x7f5461bdd177
+[ 230.016402] Modules linked in: i6300esb
+[ 230.050261] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
+
+Fix the crash by reverting 44ea39420fc9 so unregister_reboot_notifier()
+is called when the watchdog device is removed. This also makes handling of
+the reboot notifier unified with the handling of the restart handler,
+which is freed with unregister_restart_handler() in the same place.
+
+Fixes: 44ea39420fc9 ("drivers/watchdog: make use of devm_register_reboot_notifier()")
+Cc: stable@vger.kernel.org # v4.15+
+Signed-off-by: Vladis Dronov
+Reviewed-by: Guenter Roeck
+Link: https://lore.kernel.org/r/20200108125347.6067-1-vdronov@redhat.com
+Signed-off-by: Guenter Roeck
+Signed-off-by: Wim Van Sebroeck
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/watchdog/watchdog_core.c | 35 +++++++++++++++++++++++++++++++++++
+ drivers/watchdog/watchdog_dev.c | 36 +-----------------------------------
+ 2 files changed, 36 insertions(+), 35 deletions(-)
+
+--- a/drivers/watchdog/watchdog_core.c
++++ b/drivers/watchdog/watchdog_core.c
+@@ -147,6 +147,25 @@ int watchdog_init_timeout(struct watchdo
+ }
+ EXPORT_SYMBOL_GPL(watchdog_init_timeout);
+
++static int watchdog_reboot_notifier(struct notifier_block *nb,
++ unsigned long code, void *data)
++{
++ struct watchdog_device *wdd;
++
++ wdd = container_of(nb, struct watchdog_device, reboot_nb);
++ if (code == SYS_DOWN || code == SYS_HALT) {
++ if (watchdog_active(wdd)) {
++ int ret;
++
++ ret = wdd->ops->stop(wdd);
++ if (ret)
++ return NOTIFY_BAD;
++ }
++ }
++
++ return NOTIFY_DONE;
++}
++
+ static int watchdog_restart_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+ {
+@@ -235,6 +254,19 @@ static int __watchdog_register_device(st
+ }
+ }
+
++ if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) {
++ wdd->reboot_nb.notifier_call = watchdog_reboot_notifier;
++
++ ret = register_reboot_notifier(&wdd->reboot_nb);
++ if (ret) {
++ pr_err("watchdog%d: Cannot register reboot notifier (%d)\n",
++ wdd->id, ret);
++ watchdog_dev_unregister(wdd);
++ ida_simple_remove(&watchdog_ida, id);
++ return ret;
++ }
++ }
++
+ if (wdd->ops->restart) {
+ wdd->restart_nb.notifier_call = watchdog_restart_notifier;
+
+@@ -289,6 +321,9 @@ static void __watchdog_unregister_device
+ if (wdd->ops->restart)
+ unregister_restart_handler(&wdd->restart_nb);
+
++ if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status))
++ unregister_reboot_notifier(&wdd->reboot_nb);
++
+ watchdog_dev_unregister(wdd);
+ ida_simple_remove(&watchdog_ida, wdd->id);
+ }
+--- a/drivers/watchdog/watchdog_dev.c
++++ b/drivers/watchdog/watchdog_dev.c
+@@ -38,7 +38,6 @@
+ #include <linux/miscdevice.h> /* For handling misc devices */
+ #include <linux/module.h> /* For module stuff/... */
+ #include <linux/mutex.h> /* For mutexes */
+-#include <linux/reboot.h> /* For reboot notifier */
+ #include <linux/slab.h> /* For memory functions */
+ #include <linux/types.h> /* For standard types (like size_t) */
+ #include <linux/watchdog.h> /* For watchdog specific items */
+@@ -1097,25 +1096,6 @@ static void watchdog_cdev_unregister(str
+ put_device(&wd_data->dev);
+ }
+
+-static int watchdog_reboot_notifier(struct notifier_block *nb,
+- unsigned long code, void *data)
+-{
+- struct watchdog_device *wdd;
+-
+- wdd = container_of(nb, struct watchdog_device, reboot_nb);
+- if (code == SYS_DOWN || code == SYS_HALT) {
+- if (watchdog_active(wdd)) {
+- int ret;
+-
+- ret = wdd->ops->stop(wdd);
+- if (ret)
+- return NOTIFY_BAD;
+- }
+- }
+-
+- return NOTIFY_DONE;
+-}
+-
+ /*
+ * watchdog_dev_register: register a watchdog device
+ * @wdd: watchdog device
+@@ -1134,22 +1114,8 @@ int watchdog_dev_register(struct watchdo
+ return ret;
+
+ ret = watchdog_register_pretimeout(wdd);
+- if (ret) {
++ if (ret)
+ watchdog_cdev_unregister(wdd);
+- return ret;
+- }
+-
+- if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) {
+- wdd->reboot_nb.notifier_call = watchdog_reboot_notifier;
+-
+- ret = devm_register_reboot_notifier(&wdd->wd_data->dev,
+- &wdd->reboot_nb);
+- if (ret) {
+- pr_err("watchdog%d: Cannot register reboot notifier (%d)\n",
+- wdd->id, ret);
+- watchdog_dev_unregister(wdd);
+- }
+- }
+
+ return ret;
+ }
diff --git a/queue-5.5/xen-balloon-support-xend-based-toolstack-take-two.patch b/queue-5.5/xen-balloon-support-xend-based-toolstack-take-two.patch
new file mode 100644
index 00000000000..8090dfc11ae
--- /dev/null
+++ b/queue-5.5/xen-balloon-support-xend-based-toolstack-take-two.patch
@@ -0,0 +1,47 @@
+From eda4eabf86fd6806eaabc23fb90dd056fdac037b Mon Sep 17 00:00:00 2001
+From: Juergen Gross
+Date: Fri, 17 Jan 2020 14:49:31 +0100
+Subject: xen/balloon: Support xend-based toolstack take two
+
+From: Juergen Gross
+
+commit eda4eabf86fd6806eaabc23fb90dd056fdac037b upstream.
+
+Commit 3aa6c19d2f38be ("xen/balloon: Support xend-based toolstack")
+tried to fix a regression with running on rather ancient Xen versions.
+Unfortunately the fix was based on the assumption that xend would
+just use another Xenstore node, but in reality only some downstream
+versions of xend are doing that. The upstream xend does not write
+that Xenstore node at all, so the problem must be fixed in another
+way.
+
+The easiest way to achieve that is to fall back to the behavior
+before commit 96edd61dcf4436 ("xen/balloon: don't online new memory
+initially") in case the static memory maximum can't be read.
+
+This is achieved by setting static_max to the current number of
+memory pages known by the system, resulting in target_diff becoming
+zero.
+
+Fixes: 3aa6c19d2f38be ("xen/balloon: Support xend-based toolstack")
+Signed-off-by: Juergen Gross
+Reviewed-by: Boris Ostrovsky
+Cc: stable@vger.kernel.org # 4.13
+Signed-off-by: Boris Ostrovsky
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/xen/xen-balloon.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/xen/xen-balloon.c
++++ b/drivers/xen/xen-balloon.c
+@@ -94,7 +94,7 @@ static void watch_target(struct xenbus_w
+ "%llu", &static_max) == 1))
+ static_max >>= PAGE_SHIFT - 10;
+ else
+- static_max = new_target;
++ static_max = balloon_stats.current_pages;
+
+ target_diff = (xen_pv_domain() || xen_initial_domain()) ? 0
+ : static_max - balloon_stats.target_pages;
diff --git a/queue-5.5/xen-gntdev-do-not-use-mm-notifiers-with-autotranslating-guests.patch b/queue-5.5/xen-gntdev-do-not-use-mm-notifiers-with-autotranslating-guests.patch
new file mode 100644
index 00000000000..e448da44c20
--- /dev/null
+++ b/queue-5.5/xen-gntdev-do-not-use-mm-notifiers-with-autotranslating-guests.patch
@@ -0,0 +1,63 @@
+From 9293724192a73f49c722e9685d45649c6df67dfe Mon Sep 17 00:00:00 2001
+From: Boris Ostrovsky
+Date: Tue, 28 Jan 2020 10:31:26 -0500
+Subject: xen/gntdev: Do not use mm notifiers with autotranslating guests
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Boris Ostrovsky
+
+commit 9293724192a73f49c722e9685d45649c6df67dfe upstream.
+
+Commit d3eeb1d77c5d ("xen/gntdev: use mmu_interval_notifier_insert")
+missed a test for use_ptemod when calling mmu_interval_read_begin(). Fix
+that.
+
+Fixes: d3eeb1d77c5d ("xen/gntdev: use mmu_interval_notifier_insert")
+CC: stable@vger.kernel.org # 5.5
+Reported-by: Ilpo Järvinen
+Tested-by: Ilpo Järvinen
+Signed-off-by: Boris Ostrovsky
+Reviewed-by: Jason Gunthorpe
+Acked-by: Juergen Gross
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/xen/gntdev.c | 24 ++++++++++++------------
+ 1 file changed, 12 insertions(+), 12 deletions(-)
+
+--- a/drivers/xen/gntdev.c
++++ b/drivers/xen/gntdev.c
+@@ -1006,19 +1006,19 @@ static int gntdev_mmap(struct file *flip
+ }
+ mutex_unlock(&priv->lock);
+
+- /*
+- * gntdev takes the address of the PTE in find_grant_ptes() and passes
+- * it to the hypervisor in gntdev_map_grant_pages(). The purpose of
+- * the notifier is to prevent the hypervisor pointer to the PTE from
+- * going stale.
+- *
+- * Since this vma's mappings can't be touched without the mmap_sem,
+- * and we are holding it now, there is no need for the notifier_range
+- * locking pattern.
+- */
+- mmu_interval_read_begin(&map->notifier);
+-
+ if (use_ptemod) {
++ /*
++ * gntdev takes the address of the PTE in find_grant_ptes() and
++ * passes it to the hypervisor in gntdev_map_grant_pages(). The
++ * purpose of the notifier is to prevent the hypervisor pointer
++ * to the PTE from going stale.
++ *
++ * Since this vma's mappings can't be touched without the
++ * mmap_sem, and we are holding it now, there is no need for
++ * the notifier_range locking pattern.
++ */
++ mmu_interval_read_begin(&map->notifier);
++
+ map->pages_vm_start = vma->vm_start;
+ err = apply_to_page_range(vma->vm_mm, vma->vm_start,
+ vma->vm_end - vma->vm_start,
--
2.47.3