]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.19-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 9 Feb 2020 12:27:34 +0000 (13:27 +0100)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 9 Feb 2020 12:27:34 +0000 (13:27 +0100)
added patches:
aio-prevent-potential-eventfd-recursion-on-poll.patch
arm-tegra-enable-pllp-bypass-during-tegra124-lp1.patch
bcache-add-readahead-cache-policy-options-via-sysfs-interface.patch
btrfs-fix-missing-hole-after-hole-punching-and-fsync-when-using-no_holes.patch
btrfs-fix-race-between-adding-and-putting-tree-mod-seq-elements-and-nodes.patch
btrfs-set-trans-drity-in-btrfs_commit_transaction.patch
drm-rect-avoid-division-by-zero.patch
eventfd-track-eventfd_signal-recursion-depth.patch
gfs2-fix-o_sync-write-handling.patch
gfs2-move-setting-current-backing_dev_info.patch
iwlwifi-don-t-throw-error-when-trying-to-remove-igtk.patch
jbd2_seq_info_next-should-increase-position-index.patch
kvm-ppc-book3s-hv-uninit-vcpu-if-vcore-creation-fails.patch
kvm-ppc-book3s-pr-free-shared-page-if-mmu-initialization-fails.patch
kvm-x86-fix-potential-put_fpu-w-o-load_fpu-on-mpx-platform.patch
kvm-x86-protect-dr-based-index-computations-from-spectre-v1-l1tf-attacks.patch
kvm-x86-protect-ioapic_read_indirect-from-spectre-v1-l1tf-attacks.patch
kvm-x86-protect-ioapic_write_indirect-from-spectre-v1-l1tf-attacks.patch
kvm-x86-protect-kvm_hv_msr__crash_data-from-spectre-v1-l1tf-attacks.patch
kvm-x86-protect-kvm_lapic_reg_write-from-spectre-v1-l1tf-attacks.patch
kvm-x86-protect-msr-based-index-computations-from-spectre-v1-l1tf-attacks-in-x86.c.patch
kvm-x86-protect-msr-based-index-computations-in-fixed_msr_to_seg_unit-from-spectre-v1-l1tf-attacks.patch
kvm-x86-protect-msr-based-index-computations-in-pmu.h-from-spectre-v1-l1tf-attacks.patch
kvm-x86-protect-pmu_intel.c-from-spectre-v1-l1tf-attacks.patch
kvm-x86-protect-x86_decode_insn-from-spectre-v1-l1tf-attacks.patch
kvm-x86-refactor-picdev_write-to-prevent-spectre-v1-l1tf-attacks.patch
kvm-x86-refactor-prefix-decoding-to-prevent-spectre-v1-l1tf-attacks.patch
media-rc-ensure-lirc-is-initialized-before-registering-input-device.patch
mwifiex-fix-unbalanced-locking-in-mwifiex_process_country_ie.patch
nfs-directory-page-cache-pages-need-to-be-locked-when-read.patch
nfs-fix-memory-leaks-and-corruption-in-readdir.patch
scsi-qla2xxx-fix-unbound-nvme-response-length.patch
sunrpc-expiry_time-should-be-seconds-not-timeval.patch
tools-kvm_stat-fix-kvm_exit-filter-name.patch
watchdog-fix-uaf-in-reboot-notifier-handling-in-watchdog-core-code.patch
xen-balloon-support-xend-based-toolstack-take-two.patch

37 files changed:
queue-4.19/aio-prevent-potential-eventfd-recursion-on-poll.patch [new file with mode: 0644]
queue-4.19/arm-tegra-enable-pllp-bypass-during-tegra124-lp1.patch [new file with mode: 0644]
queue-4.19/bcache-add-readahead-cache-policy-options-via-sysfs-interface.patch [new file with mode: 0644]
queue-4.19/btrfs-fix-missing-hole-after-hole-punching-and-fsync-when-using-no_holes.patch [new file with mode: 0644]
queue-4.19/btrfs-fix-race-between-adding-and-putting-tree-mod-seq-elements-and-nodes.patch [new file with mode: 0644]
queue-4.19/btrfs-set-trans-drity-in-btrfs_commit_transaction.patch [new file with mode: 0644]
queue-4.19/drm-rect-avoid-division-by-zero.patch [new file with mode: 0644]
queue-4.19/eventfd-track-eventfd_signal-recursion-depth.patch [new file with mode: 0644]
queue-4.19/gfs2-fix-o_sync-write-handling.patch [new file with mode: 0644]
queue-4.19/gfs2-move-setting-current-backing_dev_info.patch [new file with mode: 0644]
queue-4.19/iwlwifi-don-t-throw-error-when-trying-to-remove-igtk.patch [new file with mode: 0644]
queue-4.19/jbd2_seq_info_next-should-increase-position-index.patch [new file with mode: 0644]
queue-4.19/kvm-ppc-book3s-hv-uninit-vcpu-if-vcore-creation-fails.patch [new file with mode: 0644]
queue-4.19/kvm-ppc-book3s-pr-free-shared-page-if-mmu-initialization-fails.patch [new file with mode: 0644]
queue-4.19/kvm-x86-fix-potential-put_fpu-w-o-load_fpu-on-mpx-platform.patch [new file with mode: 0644]
queue-4.19/kvm-x86-protect-dr-based-index-computations-from-spectre-v1-l1tf-attacks.patch [new file with mode: 0644]
queue-4.19/kvm-x86-protect-ioapic_read_indirect-from-spectre-v1-l1tf-attacks.patch [new file with mode: 0644]
queue-4.19/kvm-x86-protect-ioapic_write_indirect-from-spectre-v1-l1tf-attacks.patch [new file with mode: 0644]
queue-4.19/kvm-x86-protect-kvm_hv_msr__crash_data-from-spectre-v1-l1tf-attacks.patch [new file with mode: 0644]
queue-4.19/kvm-x86-protect-kvm_lapic_reg_write-from-spectre-v1-l1tf-attacks.patch [new file with mode: 0644]
queue-4.19/kvm-x86-protect-msr-based-index-computations-from-spectre-v1-l1tf-attacks-in-x86.c.patch [new file with mode: 0644]
queue-4.19/kvm-x86-protect-msr-based-index-computations-in-fixed_msr_to_seg_unit-from-spectre-v1-l1tf-attacks.patch [new file with mode: 0644]
queue-4.19/kvm-x86-protect-msr-based-index-computations-in-pmu.h-from-spectre-v1-l1tf-attacks.patch [new file with mode: 0644]
queue-4.19/kvm-x86-protect-pmu_intel.c-from-spectre-v1-l1tf-attacks.patch [new file with mode: 0644]
queue-4.19/kvm-x86-protect-x86_decode_insn-from-spectre-v1-l1tf-attacks.patch [new file with mode: 0644]
queue-4.19/kvm-x86-refactor-picdev_write-to-prevent-spectre-v1-l1tf-attacks.patch [new file with mode: 0644]
queue-4.19/kvm-x86-refactor-prefix-decoding-to-prevent-spectre-v1-l1tf-attacks.patch [new file with mode: 0644]
queue-4.19/media-rc-ensure-lirc-is-initialized-before-registering-input-device.patch [new file with mode: 0644]
queue-4.19/mwifiex-fix-unbalanced-locking-in-mwifiex_process_country_ie.patch [new file with mode: 0644]
queue-4.19/nfs-directory-page-cache-pages-need-to-be-locked-when-read.patch [new file with mode: 0644]
queue-4.19/nfs-fix-memory-leaks-and-corruption-in-readdir.patch [new file with mode: 0644]
queue-4.19/scsi-qla2xxx-fix-unbound-nvme-response-length.patch [new file with mode: 0644]
queue-4.19/series
queue-4.19/sunrpc-expiry_time-should-be-seconds-not-timeval.patch [new file with mode: 0644]
queue-4.19/tools-kvm_stat-fix-kvm_exit-filter-name.patch [new file with mode: 0644]
queue-4.19/watchdog-fix-uaf-in-reboot-notifier-handling-in-watchdog-core-code.patch [new file with mode: 0644]
queue-4.19/xen-balloon-support-xend-based-toolstack-take-two.patch [new file with mode: 0644]

diff --git a/queue-4.19/aio-prevent-potential-eventfd-recursion-on-poll.patch b/queue-4.19/aio-prevent-potential-eventfd-recursion-on-poll.patch
new file mode 100644 (file)
index 0000000..b8e742e
--- /dev/null
@@ -0,0 +1,70 @@
+From 01d7a356872eec22ef34a33a5f9cfa917d145468 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Mon, 3 Feb 2020 10:33:42 -0700
+Subject: aio: prevent potential eventfd recursion on poll
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 01d7a356872eec22ef34a33a5f9cfa917d145468 upstream.
+
+If we have nested or circular eventfd wakeups, then we can deadlock if
+we run them inline from our poll waitqueue wakeup handler. It's also
+possible to have very long chains of notifications, to the extent where
+we could risk blowing the stack.
+
+Check the eventfd recursion count before calling eventfd_signal(). If
+it's non-zero, then punt the signaling to async context. This is always
+safe, as it takes us out-of-line in terms of stack and locking context.
+
+Cc: stable@vger.kernel.org # 4.19+
+Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/aio.c |   20 ++++++++++++++++++--
+ 1 file changed, 18 insertions(+), 2 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -1600,6 +1600,14 @@ static int aio_fsync(struct fsync_iocb *
+       return 0;
+ }
++static void aio_poll_put_work(struct work_struct *work)
++{
++      struct poll_iocb *req = container_of(work, struct poll_iocb, work);
++      struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
++
++      iocb_put(iocb);
++}
++
+ static void aio_poll_complete_work(struct work_struct *work)
+ {
+       struct poll_iocb *req = container_of(work, struct poll_iocb, work);
+@@ -1664,6 +1672,8 @@ static int aio_poll_wake(struct wait_que
+       list_del_init(&req->wait.entry);
+       if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
++              struct kioctx *ctx = iocb->ki_ctx;
++
+               /*
+                * Try to complete the iocb inline if we can. Use
+                * irqsave/irqrestore because not all filesystems (e.g. fuse)
+@@ -1673,8 +1683,14 @@ static int aio_poll_wake(struct wait_que
+               list_del(&iocb->ki_list);
+               iocb->ki_res.res = mangle_poll(mask);
+               req->done = true;
+-              spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
+-              iocb_put(iocb);
++              if (iocb->ki_eventfd && eventfd_signal_count()) {
++                      iocb = NULL;
++                      INIT_WORK(&req->work, aio_poll_put_work);
++                      schedule_work(&req->work);
++              }
++              spin_unlock_irqrestore(&ctx->ctx_lock, flags);
++              if (iocb)
++                      iocb_put(iocb);
+       } else {
+               schedule_work(&req->work);
+       }
diff --git a/queue-4.19/arm-tegra-enable-pllp-bypass-during-tegra124-lp1.patch b/queue-4.19/arm-tegra-enable-pllp-bypass-during-tegra124-lp1.patch
new file mode 100644 (file)
index 0000000..4b48d3a
--- /dev/null
@@ -0,0 +1,70 @@
+From 1a3388d506bf5b45bb283e6a4c4706cfb4897333 Mon Sep 17 00:00:00 2001
+From: Stephen Warren <swarren@nvidia.com>
+Date: Thu, 3 Oct 2019 14:50:31 -0600
+Subject: ARM: tegra: Enable PLLP bypass during Tegra124 LP1
+
+From: Stephen Warren <swarren@nvidia.com>
+
+commit 1a3388d506bf5b45bb283e6a4c4706cfb4897333 upstream.
+
+For a little over a year, U-Boot has configured the flow controller to
+perform automatic RAM re-repair on off->on power transitions of the CPU
+rail[1]. This is mandatory for correct operation of Tegra124. However,
+RAM re-repair relies on certain clocks, which the kernel must enable and
+leave running. PLLP is one of those clocks. This clock is shut down
+during LP1 in order to save power. Enable bypass (which I believe routes
+osc_div_clk, essentially the crystal clock, to the PLL output) so that
+this clock signal toggles even though the PLL is not active. This is
+required so that LP1 power mode (system suspend) operates correctly.
+
+The bypass configuration must then be undone when resuming from LP1, so
+that all peripheral clocks run at the expected rate. Without this, many
+peripherals won't work correctly; for example, the UART baud rate would
+be incorrect.
+
+NVIDIA's downstream kernel code only does this if not compiled for
+Tegra30, so the added code is made conditional upon the chip ID.
+NVIDIA's downstream code makes this change conditional upon the active
+CPU cluster. The upstream kernel currently doesn't support cluster
+switching, so this patch doesn't test the active CPU cluster ID.
+
+[1] 3cc7942a4ae5 ARM: tegra: implement RAM repair
+
+Reported-by: Jonathan Hunter <jonathanh@nvidia.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Stephen Warren <swarren@nvidia.com>
+Signed-off-by: Thierry Reding <treding@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm/mach-tegra/sleep-tegra30.S |   11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+--- a/arch/arm/mach-tegra/sleep-tegra30.S
++++ b/arch/arm/mach-tegra/sleep-tegra30.S
+@@ -382,6 +382,14 @@ _pll_m_c_x_done:
+       pll_locked r1, r0, CLK_RESET_PLLC_BASE
+       pll_locked r1, r0, CLK_RESET_PLLX_BASE
++      tegra_get_soc_id TEGRA_APB_MISC_BASE, r1
++      cmp     r1, #TEGRA30
++      beq     1f
++      ldr     r1, [r0, #CLK_RESET_PLLP_BASE]
++      bic     r1, r1, #(1<<31)        @ disable PllP bypass
++      str     r1, [r0, #CLK_RESET_PLLP_BASE]
++1:
++
+       mov32   r7, TEGRA_TMRUS_BASE
+       ldr     r1, [r7]
+       add     r1, r1, #LOCK_DELAY
+@@ -641,7 +649,10 @@ tegra30_switch_cpu_to_clk32k:
+       str     r0, [r4, #PMC_PLLP_WB0_OVERRIDE]
+       /* disable PLLP, PLLA, PLLC and PLLX */
++      tegra_get_soc_id TEGRA_APB_MISC_BASE, r1
++      cmp     r1, #TEGRA30
+       ldr     r0, [r5, #CLK_RESET_PLLP_BASE]
++      orrne   r0, r0, #(1 << 31)      @ enable PllP bypass on fast cluster
+       bic     r0, r0, #(1 << 30)
+       str     r0, [r5, #CLK_RESET_PLLP_BASE]
+       ldr     r0, [r5, #CLK_RESET_PLLA_BASE]
diff --git a/queue-4.19/bcache-add-readahead-cache-policy-options-via-sysfs-interface.patch b/queue-4.19/bcache-add-readahead-cache-policy-options-via-sysfs-interface.patch
new file mode 100644 (file)
index 0000000..b3b364f
--- /dev/null
@@ -0,0 +1,139 @@
+From 038ba8cc1bffc51250add4a9b9249d4331576d8f Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 1 Feb 2020 22:42:33 +0800
+Subject: bcache: add readahead cache policy options via sysfs interface
+
+From: Coly Li <colyli@suse.de>
+
+commit 038ba8cc1bffc51250add4a9b9249d4331576d8f upstream.
+
+In year 2007 high performance SSD was still expensive, in order to
+save more space for real workload or meta data, the readahead I/Os
+for non-meta data was bypassed and not cached on SSD.
+
+In now days, SSD price drops a lot and people can find larger size
+SSD with more comfortable price. It is unncessary to alway bypass
+normal readahead I/Os to save SSD space for now.
+
+This patch adds options for readahead data cache policies via sysfs
+file /sys/block/bcache<N>/readahead_cache_policy, the options are,
+- "all": cache all readahead data I/Os.
+- "meta-only": only cache meta data, and bypass other regular I/Os.
+
+If users want to make bcache continue to only cache readahead request
+for metadata and bypass regular data readahead, please set "meta-only"
+to this sysfs file. By default, bcache will back to cache all read-
+ahead requests now.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Coly Li <colyli@suse.de>
+Acked-by: Eric Wheeler <bcache@linux.ewheeler.net>
+Cc: Michael Lyle <mlyle@lyle.org>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/bcache/bcache.h  |    3 +++
+ drivers/md/bcache/request.c |   17 ++++++++++++-----
+ drivers/md/bcache/sysfs.c   |   22 ++++++++++++++++++++++
+ 3 files changed, 37 insertions(+), 5 deletions(-)
+
+--- a/drivers/md/bcache/bcache.h
++++ b/drivers/md/bcache/bcache.h
+@@ -329,6 +329,9 @@ struct cached_dev {
+        */
+       atomic_t                has_dirty;
++#define BCH_CACHE_READA_ALL           0
++#define BCH_CACHE_READA_META_ONLY     1
++      unsigned int            cache_readahead_policy;
+       struct bch_ratelimit    writeback_rate;
+       struct delayed_work     writeback_rate_update;
+--- a/drivers/md/bcache/request.c
++++ b/drivers/md/bcache/request.c
+@@ -391,13 +391,20 @@ static bool check_should_bypass(struct c
+               goto skip;
+       /*
+-       * Flag for bypass if the IO is for read-ahead or background,
+-       * unless the read-ahead request is for metadata
++       * If the bio is for read-ahead or background IO, bypass it or
++       * not depends on the following situations,
++       * - If the IO is for meta data, always cache it and no bypass
++       * - If the IO is not meta data, check dc->cache_reada_policy,
++       *      BCH_CACHE_READA_ALL: cache it and not bypass
++       *      BCH_CACHE_READA_META_ONLY: not cache it and bypass
++       * That is, read-ahead request for metadata always get cached
+        * (eg, for gfs2 or xfs).
+        */
+-      if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
+-          !(bio->bi_opf & (REQ_META|REQ_PRIO)))
+-              goto skip;
++      if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))) {
++              if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
++                  (dc->cache_readahead_policy != BCH_CACHE_READA_ALL))
++                      goto skip;
++      }
+       if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
+           bio_sectors(bio) & (c->sb.block_size - 1)) {
+--- a/drivers/md/bcache/sysfs.c
++++ b/drivers/md/bcache/sysfs.c
+@@ -25,6 +25,12 @@ static const char * const bch_cache_mode
+       NULL
+ };
++static const char * const bch_reada_cache_policies[] = {
++      "all",
++      "meta-only",
++      NULL
++};
++
+ /* Default is -1; we skip past it for stop_when_cache_set_failed */
+ static const char * const bch_stop_on_failure_modes[] = {
+       "auto",
+@@ -94,6 +100,7 @@ rw_attribute(congested_write_threshold_u
+ rw_attribute(sequential_cutoff);
+ rw_attribute(data_csum);
+ rw_attribute(cache_mode);
++rw_attribute(readahead_cache_policy);
+ rw_attribute(stop_when_cache_set_failed);
+ rw_attribute(writeback_metadata);
+ rw_attribute(writeback_running);
+@@ -160,6 +167,11 @@ SHOW(__bch_cached_dev)
+                                              bch_cache_modes,
+                                              BDEV_CACHE_MODE(&dc->sb));
++      if (attr == &sysfs_readahead_cache_policy)
++              return bch_snprint_string_list(buf, PAGE_SIZE,
++                                            bch_reada_cache_policies,
++                                            dc->cache_readahead_policy);
++
+       if (attr == &sysfs_stop_when_cache_set_failed)
+               return bch_snprint_string_list(buf, PAGE_SIZE,
+                                              bch_stop_on_failure_modes,
+@@ -324,6 +336,15 @@ STORE(__cached_dev)
+               }
+       }
++      if (attr == &sysfs_readahead_cache_policy) {
++              v = __sysfs_match_string(bch_reada_cache_policies, -1, buf);
++              if (v < 0)
++                      return v;
++
++              if ((unsigned int) v != dc->cache_readahead_policy)
++                      dc->cache_readahead_policy = v;
++      }
++
+       if (attr == &sysfs_stop_when_cache_set_failed) {
+               v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf);
+               if (v < 0)
+@@ -417,6 +438,7 @@ static struct attribute *bch_cached_dev_
+       &sysfs_data_csum,
+ #endif
+       &sysfs_cache_mode,
++      &sysfs_readahead_cache_policy,
+       &sysfs_stop_when_cache_set_failed,
+       &sysfs_writeback_metadata,
+       &sysfs_writeback_running,
diff --git a/queue-4.19/btrfs-fix-missing-hole-after-hole-punching-and-fsync-when-using-no_holes.patch b/queue-4.19/btrfs-fix-missing-hole-after-hole-punching-and-fsync-when-using-no_holes.patch
new file mode 100644 (file)
index 0000000..172b38f
--- /dev/null
@@ -0,0 +1,693 @@
+From 0e56315ca147b3e60c7bf240233a301d3c7fb508 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 19 Nov 2019 12:07:33 +0000
+Subject: Btrfs: fix missing hole after hole punching and fsync when using NO_HOLES
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 0e56315ca147b3e60c7bf240233a301d3c7fb508 upstream.
+
+When using the NO_HOLES feature, if we punch a hole into a file and then
+fsync it, there are cases where a subsequent fsync will miss the fact that
+a hole was punched, resulting in the holes not existing after replaying
+the log tree.
+
+Essentially these cases all imply that, tree-log.c:copy_items(), is not
+invoked for the leafs that delimit holes, because nothing changed those
+leafs in the current transaction. And it's precisely copy_items() where
+we currenly detect and log holes, which works as long as the holes are
+between file extent items in the input leaf or between the beginning of
+input leaf and the previous leaf or between the last item in the leaf
+and the next leaf.
+
+First example where we miss a hole:
+
+  *) The extent items of the inode span multiple leafs;
+
+  *) The punched hole covers a range that affects only the extent items of
+     the first leaf;
+
+  *) The fsync operation is done in full mode (BTRFS_INODE_NEEDS_FULL_SYNC
+     is set in the inode's runtime flags).
+
+  That results in the hole not existing after replaying the log tree.
+
+  For example, if the fs/subvolume tree has the following layout for a
+  particular inode:
+
+      Leaf N, generation 10:
+
+      [ ... INODE_ITEM INODE_REF EXTENT_ITEM (0 64K) EXTENT_ITEM (64K 128K) ]
+
+      Leaf N + 1, generation 10:
+
+      [ EXTENT_ITEM (128K 64K) ... ]
+
+  If at transaction 11 we punch a hole coverting the range [0, 128K[, we end
+  up dropping the two extent items from leaf N, but we don't touch the other
+  leaf, so we end up in the following state:
+
+      Leaf N, generation 11:
+
+      [ ... INODE_ITEM INODE_REF ]
+
+      Leaf N + 1, generation 10:
+
+      [ EXTENT_ITEM (128K 64K) ... ]
+
+  A full fsync after punching the hole will only process leaf N because it
+  was modified in the current transaction, but not leaf N + 1, since it
+  was not modified in the current transaction (generation 10 and not 11).
+  As a result the fsync will not log any holes, because it didn't process
+  any leaf with extent items.
+
+Second example where we will miss a hole:
+
+  *) An inode as its items spanning 5 (or more) leafs;
+
+  *) A hole is punched and it covers only the extents items of the 3rd
+     leaf. This resulsts in deleting the entire leaf and not touching any
+     of the other leafs.
+
+  So the only leaf that is modified in the current transaction, when
+  punching the hole, is the first leaf, which contains the inode item.
+  During the full fsync, the only leaf that is passed to copy_items()
+  is that first leaf, and that's not enough for the hole detection
+  code in copy_items() to determine there's a hole between the last
+  file extent item in the 2nd leaf and the first file extent item in
+  the 3rd leaf (which was the 4th leaf before punching the hole).
+
+Fix this by scanning all leafs and punch holes as necessary when doing a
+full fsync (less common than a non-full fsync) when the NO_HOLES feature
+is enabled. The lack of explicit file extent items to mark holes makes it
+necessary to scan existing extents to determine if holes exist.
+
+A test case for fstests follows soon.
+
+Fixes: 16e7549f045d33 ("Btrfs: incompatible format change to remove hole extents")
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/tree-log.c |  388 +++++++++++++---------------------------------------
+ 1 file changed, 100 insertions(+), 288 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -3892,7 +3892,7 @@ static int log_csums(struct btrfs_trans_
+ static noinline int copy_items(struct btrfs_trans_handle *trans,
+                              struct btrfs_inode *inode,
+                              struct btrfs_path *dst_path,
+-                             struct btrfs_path *src_path, u64 *last_extent,
++                             struct btrfs_path *src_path,
+                              int start_slot, int nr, int inode_only,
+                              u64 logged_isize)
+ {
+@@ -3903,7 +3903,6 @@ static noinline int copy_items(struct bt
+       struct btrfs_file_extent_item *extent;
+       struct btrfs_inode_item *inode_item;
+       struct extent_buffer *src = src_path->nodes[0];
+-      struct btrfs_key first_key, last_key, key;
+       int ret;
+       struct btrfs_key *ins_keys;
+       u32 *ins_sizes;
+@@ -3911,9 +3910,6 @@ static noinline int copy_items(struct bt
+       int i;
+       struct list_head ordered_sums;
+       int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
+-      bool has_extents = false;
+-      bool need_find_last_extent = true;
+-      bool done = false;
+       INIT_LIST_HEAD(&ordered_sums);
+@@ -3922,8 +3918,6 @@ static noinline int copy_items(struct bt
+       if (!ins_data)
+               return -ENOMEM;
+-      first_key.objectid = (u64)-1;
+-
+       ins_sizes = (u32 *)ins_data;
+       ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+@@ -3944,9 +3938,6 @@ static noinline int copy_items(struct bt
+               src_offset = btrfs_item_ptr_offset(src, start_slot + i);
+-              if (i == nr - 1)
+-                      last_key = ins_keys[i];
+-
+               if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
+                       inode_item = btrfs_item_ptr(dst_path->nodes[0],
+                                                   dst_path->slots[0],
+@@ -3960,20 +3951,6 @@ static noinline int copy_items(struct bt
+                                          src_offset, ins_sizes[i]);
+               }
+-              /*
+-               * We set need_find_last_extent here in case we know we were
+-               * processing other items and then walk into the first extent in
+-               * the inode.  If we don't hit an extent then nothing changes,
+-               * we'll do the last search the next time around.
+-               */
+-              if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
+-                      has_extents = true;
+-                      if (first_key.objectid == (u64)-1)
+-                              first_key = ins_keys[i];
+-              } else {
+-                      need_find_last_extent = false;
+-              }
+-
+               /* take a reference on file data extents so that truncates
+                * or deletes of this inode don't have to relog the inode
+                * again
+@@ -4039,167 +4016,6 @@ static noinline int copy_items(struct bt
+               kfree(sums);
+       }
+-      if (!has_extents)
+-              return ret;
+-
+-      if (need_find_last_extent && *last_extent == first_key.offset) {
+-              /*
+-               * We don't have any leafs between our current one and the one
+-               * we processed before that can have file extent items for our
+-               * inode (and have a generation number smaller than our current
+-               * transaction id).
+-               */
+-              need_find_last_extent = false;
+-      }
+-
+-      /*
+-       * Because we use btrfs_search_forward we could skip leaves that were
+-       * not modified and then assume *last_extent is valid when it really
+-       * isn't.  So back up to the previous leaf and read the end of the last
+-       * extent before we go and fill in holes.
+-       */
+-      if (need_find_last_extent) {
+-              u64 len;
+-
+-              ret = btrfs_prev_leaf(inode->root, src_path);
+-              if (ret < 0)
+-                      return ret;
+-              if (ret)
+-                      goto fill_holes;
+-              if (src_path->slots[0])
+-                      src_path->slots[0]--;
+-              src = src_path->nodes[0];
+-              btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
+-              if (key.objectid != btrfs_ino(inode) ||
+-                  key.type != BTRFS_EXTENT_DATA_KEY)
+-                      goto fill_holes;
+-              extent = btrfs_item_ptr(src, src_path->slots[0],
+-                                      struct btrfs_file_extent_item);
+-              if (btrfs_file_extent_type(src, extent) ==
+-                  BTRFS_FILE_EXTENT_INLINE) {
+-                      len = btrfs_file_extent_ram_bytes(src, extent);
+-                      *last_extent = ALIGN(key.offset + len,
+-                                           fs_info->sectorsize);
+-              } else {
+-                      len = btrfs_file_extent_num_bytes(src, extent);
+-                      *last_extent = key.offset + len;
+-              }
+-      }
+-fill_holes:
+-      /* So we did prev_leaf, now we need to move to the next leaf, but a few
+-       * things could have happened
+-       *
+-       * 1) A merge could have happened, so we could currently be on a leaf
+-       * that holds what we were copying in the first place.
+-       * 2) A split could have happened, and now not all of the items we want
+-       * are on the same leaf.
+-       *
+-       * So we need to adjust how we search for holes, we need to drop the
+-       * path and re-search for the first extent key we found, and then walk
+-       * forward until we hit the last one we copied.
+-       */
+-      if (need_find_last_extent) {
+-              /* btrfs_prev_leaf could return 1 without releasing the path */
+-              btrfs_release_path(src_path);
+-              ret = btrfs_search_slot(NULL, inode->root, &first_key,
+-                              src_path, 0, 0);
+-              if (ret < 0)
+-                      return ret;
+-              ASSERT(ret == 0);
+-              src = src_path->nodes[0];
+-              i = src_path->slots[0];
+-      } else {
+-              i = start_slot;
+-      }
+-
+-      /*
+-       * Ok so here we need to go through and fill in any holes we may have
+-       * to make sure that holes are punched for those areas in case they had
+-       * extents previously.
+-       */
+-      while (!done) {
+-              u64 offset, len;
+-              u64 extent_end;
+-
+-              if (i >= btrfs_header_nritems(src_path->nodes[0])) {
+-                      ret = btrfs_next_leaf(inode->root, src_path);
+-                      if (ret < 0)
+-                              return ret;
+-                      ASSERT(ret == 0);
+-                      src = src_path->nodes[0];
+-                      i = 0;
+-                      need_find_last_extent = true;
+-              }
+-
+-              btrfs_item_key_to_cpu(src, &key, i);
+-              if (!btrfs_comp_cpu_keys(&key, &last_key))
+-                      done = true;
+-              if (key.objectid != btrfs_ino(inode) ||
+-                  key.type != BTRFS_EXTENT_DATA_KEY) {
+-                      i++;
+-                      continue;
+-              }
+-              extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
+-              if (btrfs_file_extent_type(src, extent) ==
+-                  BTRFS_FILE_EXTENT_INLINE) {
+-                      len = btrfs_file_extent_ram_bytes(src, extent);
+-                      extent_end = ALIGN(key.offset + len,
+-                                         fs_info->sectorsize);
+-              } else {
+-                      len = btrfs_file_extent_num_bytes(src, extent);
+-                      extent_end = key.offset + len;
+-              }
+-              i++;
+-
+-              if (*last_extent == key.offset) {
+-                      *last_extent = extent_end;
+-                      continue;
+-              }
+-              offset = *last_extent;
+-              len = key.offset - *last_extent;
+-              ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
+-                              offset, 0, 0, len, 0, len, 0, 0, 0);
+-              if (ret)
+-                      break;
+-              *last_extent = extent_end;
+-      }
+-
+-      /*
+-       * Check if there is a hole between the last extent found in our leaf
+-       * and the first extent in the next leaf. If there is one, we need to
+-       * log an explicit hole so that at replay time we can punch the hole.
+-       */
+-      if (ret == 0 &&
+-          key.objectid == btrfs_ino(inode) &&
+-          key.type == BTRFS_EXTENT_DATA_KEY &&
+-          i == btrfs_header_nritems(src_path->nodes[0])) {
+-              ret = btrfs_next_leaf(inode->root, src_path);
+-              need_find_last_extent = true;
+-              if (ret > 0) {
+-                      ret = 0;
+-              } else if (ret == 0) {
+-                      btrfs_item_key_to_cpu(src_path->nodes[0], &key,
+-                                            src_path->slots[0]);
+-                      if (key.objectid == btrfs_ino(inode) &&
+-                          key.type == BTRFS_EXTENT_DATA_KEY &&
+-                          *last_extent < key.offset) {
+-                              const u64 len = key.offset - *last_extent;
+-
+-                              ret = btrfs_insert_file_extent(trans, log,
+-                                                             btrfs_ino(inode),
+-                                                             *last_extent, 0,
+-                                                             0, len, 0, len,
+-                                                             0, 0, 0);
+-                              *last_extent += len;
+-                      }
+-              }
+-      }
+-      /*
+-       * Need to let the callers know we dropped the path so they should
+-       * re-search.
+-       */
+-      if (!ret && need_find_last_extent)
+-              ret = 1;
+       return ret;
+ }
+@@ -4365,7 +4181,7 @@ static int btrfs_log_prealloc_extents(st
+       const u64 i_size = i_size_read(&inode->vfs_inode);
+       const u64 ino = btrfs_ino(inode);
+       struct btrfs_path *dst_path = NULL;
+-      u64 last_extent = (u64)-1;
++      bool dropped_extents = false;
+       int ins_nr = 0;
+       int start_slot;
+       int ret;
+@@ -4387,8 +4203,7 @@ static int btrfs_log_prealloc_extents(st
+               if (slot >= btrfs_header_nritems(leaf)) {
+                       if (ins_nr > 0) {
+                               ret = copy_items(trans, inode, dst_path, path,
+-                                               &last_extent, start_slot,
+-                                               ins_nr, 1, 0);
++                                               start_slot, ins_nr, 1, 0);
+                               if (ret < 0)
+                                       goto out;
+                               ins_nr = 0;
+@@ -4412,8 +4227,7 @@ static int btrfs_log_prealloc_extents(st
+                       path->slots[0]++;
+                       continue;
+               }
+-              if (last_extent == (u64)-1) {
+-                      last_extent = key.offset;
++              if (!dropped_extents) {
+                       /*
+                        * Avoid logging extent items logged in past fsync calls
+                        * and leading to duplicate keys in the log tree.
+@@ -4427,6 +4241,7 @@ static int btrfs_log_prealloc_extents(st
+                       } while (ret == -EAGAIN);
+                       if (ret)
+                               goto out;
++                      dropped_extents = true;
+               }
+               if (ins_nr == 0)
+                       start_slot = slot;
+@@ -4441,7 +4256,7 @@ static int btrfs_log_prealloc_extents(st
+               }
+       }
+       if (ins_nr > 0) {
+-              ret = copy_items(trans, inode, dst_path, path, &last_extent,
++              ret = copy_items(trans, inode, dst_path, path,
+                                start_slot, ins_nr, 1, 0);
+               if (ret > 0)
+                       ret = 0;
+@@ -4636,13 +4451,8 @@ static int btrfs_log_all_xattrs(struct b
+               if (slot >= nritems) {
+                       if (ins_nr > 0) {
+-                              u64 last_extent = 0;
+-
+                               ret = copy_items(trans, inode, dst_path, path,
+-                                               &last_extent, start_slot,
+-                                               ins_nr, 1, 0);
+-                              /* can't be 1, extent items aren't processed */
+-                              ASSERT(ret <= 0);
++                                               start_slot, ins_nr, 1, 0);
+                               if (ret < 0)
+                                       return ret;
+                               ins_nr = 0;
+@@ -4666,13 +4476,8 @@ static int btrfs_log_all_xattrs(struct b
+               cond_resched();
+       }
+       if (ins_nr > 0) {
+-              u64 last_extent = 0;
+-
+               ret = copy_items(trans, inode, dst_path, path,
+-                               &last_extent, start_slot,
+-                               ins_nr, 1, 0);
+-              /* can't be 1, extent items aren't processed */
+-              ASSERT(ret <= 0);
++                               start_slot, ins_nr, 1, 0);
+               if (ret < 0)
+                       return ret;
+       }
+@@ -4681,100 +4486,119 @@ static int btrfs_log_all_xattrs(struct b
+ }
+ /*
+- * If the no holes feature is enabled we need to make sure any hole between the
+- * last extent and the i_size of our inode is explicitly marked in the log. This
+- * is to make sure that doing something like:
+- *
+- *      1) create file with 128Kb of data
+- *      2) truncate file to 64Kb
+- *      3) truncate file to 256Kb
+- *      4) fsync file
+- *      5) <crash/power failure>
+- *      6) mount fs and trigger log replay
+- *
+- * Will give us a file with a size of 256Kb, the first 64Kb of data match what
+- * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
+- * file correspond to a hole. The presence of explicit holes in a log tree is
+- * what guarantees that log replay will remove/adjust file extent items in the
+- * fs/subvol tree.
+- *
+- * Here we do not need to care about holes between extents, that is already done
+- * by copy_items(). We also only need to do this in the full sync path, where we
+- * lookup for extents from the fs/subvol tree only. In the fast path case, we
+- * lookup the list of modified extent maps and if any represents a hole, we
+- * insert a corresponding extent representing a hole in the log tree.
++ * When using the NO_HOLES feature if we punched a hole that causes the
++ * deletion of entire leafs or all the extent items of the first leaf (the one
++ * that contains the inode item and references) we may end up not processing
++ * any extents, because there are no leafs with a generation matching the
++ * current transaction that have extent items for our inode. So we need to find
++ * if any holes exist and then log them. We also need to log holes after any
++ * truncate operation that changes the inode's size.
+  */
+-static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
+-                                 struct btrfs_root *root,
+-                                 struct btrfs_inode *inode,
+-                                 struct btrfs_path *path)
++static int btrfs_log_holes(struct btrfs_trans_handle *trans,
++                         struct btrfs_root *root,
++                         struct btrfs_inode *inode,
++                         struct btrfs_path *path)
+ {
+       struct btrfs_fs_info *fs_info = root->fs_info;
+-      int ret;
+       struct btrfs_key key;
+-      u64 hole_start;
+-      u64 hole_size;
+-      struct extent_buffer *leaf;
+-      struct btrfs_root *log = root->log_root;
+       const u64 ino = btrfs_ino(inode);
+       const u64 i_size = i_size_read(&inode->vfs_inode);
++      u64 prev_extent_end = 0;
++      int ret;
+-      if (!btrfs_fs_incompat(fs_info, NO_HOLES))
++      if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
+               return 0;
+       key.objectid = ino;
+       key.type = BTRFS_EXTENT_DATA_KEY;
+-      key.offset = (u64)-1;
++      key.offset = 0;
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+-      ASSERT(ret != 0);
+       if (ret < 0)
+               return ret;
+-      ASSERT(path->slots[0] > 0);
+-      path->slots[0]--;
+-      leaf = path->nodes[0];
+-      btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+-
+-      if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+-              /* inode does not have any extents */
+-              hole_start = 0;
+-              hole_size = i_size;
+-      } else {
++      while (true) {
+               struct btrfs_file_extent_item *extent;
++              struct extent_buffer *leaf = path->nodes[0];
+               u64 len;
+-              /*
+-               * If there's an extent beyond i_size, an explicit hole was
+-               * already inserted by copy_items().
+-               */
+-              if (key.offset >= i_size)
+-                      return 0;
++              if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
++                      ret = btrfs_next_leaf(root, path);
++                      if (ret < 0)
++                              return ret;
++                      if (ret > 0) {
++                              ret = 0;
++                              break;
++                      }
++                      leaf = path->nodes[0];
++              }
++
++              btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
++              if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
++                      break;
++
++              /* We have a hole, log it. */
++              if (prev_extent_end < key.offset) {
++                      const u64 hole_len = key.offset - prev_extent_end;
++
++                      /*
++                       * Release the path to avoid deadlocks with other code
++                       * paths that search the root while holding locks on
++                       * leafs from the log root.
++                       */
++                      btrfs_release_path(path);
++                      ret = btrfs_insert_file_extent(trans, root->log_root,
++                                                     ino, prev_extent_end, 0,
++                                                     0, hole_len, 0, hole_len,
++                                                     0, 0, 0);
++                      if (ret < 0)
++                              return ret;
++
++                      /*
++                       * Search for the same key again in the root. Since it's
++                       * an extent item and we are holding the inode lock, the
++                       * key must still exist. If it doesn't just emit warning
++                       * and return an error to fall back to a transaction
++                       * commit.
++                       */
++                      ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
++                      if (ret < 0)
++                              return ret;
++                      if (WARN_ON(ret > 0))
++                              return -ENOENT;
++                      leaf = path->nodes[0];
++              }
+               extent = btrfs_item_ptr(leaf, path->slots[0],
+                                       struct btrfs_file_extent_item);
+-
+               if (btrfs_file_extent_type(leaf, extent) ==
+-                  BTRFS_FILE_EXTENT_INLINE)
+-                      return 0;
++                  BTRFS_FILE_EXTENT_INLINE) {
++                      len = btrfs_file_extent_ram_bytes(leaf, extent);
++                      prev_extent_end = ALIGN(key.offset + len,
++                                              fs_info->sectorsize);
++              } else {
++                      len = btrfs_file_extent_num_bytes(leaf, extent);
++                      prev_extent_end = key.offset + len;
++              }
+-              len = btrfs_file_extent_num_bytes(leaf, extent);
+-              /* Last extent goes beyond i_size, no need to log a hole. */
+-              if (key.offset + len > i_size)
+-                      return 0;
+-              hole_start = key.offset + len;
+-              hole_size = i_size - hole_start;
++              path->slots[0]++;
++              cond_resched();
+       }
+-      btrfs_release_path(path);
+-      /* Last extent ends at i_size. */
+-      if (hole_size == 0)
+-              return 0;
++      if (prev_extent_end < i_size) {
++              u64 hole_len;
+-      hole_size = ALIGN(hole_size, fs_info->sectorsize);
+-      ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
+-                                     hole_size, 0, hole_size, 0, 0, 0);
+-      return ret;
++              btrfs_release_path(path);
++              hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
++              ret = btrfs_insert_file_extent(trans, root->log_root,
++                                             ino, prev_extent_end, 0, 0,
++                                             hole_len, 0, hole_len,
++                                             0, 0, 0);
++              if (ret < 0)
++                      return ret;
++      }
++
++      return 0;
+ }
+ /*
+@@ -4934,7 +4758,6 @@ static int btrfs_log_inode(struct btrfs_
+       struct btrfs_key min_key;
+       struct btrfs_key max_key;
+       struct btrfs_root *log = root->log_root;
+-      u64 last_extent = 0;
+       int err = 0;
+       int ret;
+       int nritems;
+@@ -5108,7 +4931,7 @@ again:
+                                       ins_start_slot = path->slots[0];
+                               }
+                               ret = copy_items(trans, inode, dst_path, path,
+-                                               &last_extent, ins_start_slot,
++                                               ins_start_slot,
+                                                ins_nr, inode_only,
+                                                logged_isize);
+                               if (ret < 0) {
+@@ -5161,17 +4984,13 @@ again:
+                       if (ins_nr == 0)
+                               goto next_slot;
+                       ret = copy_items(trans, inode, dst_path, path,
+-                                       &last_extent, ins_start_slot,
++                                       ins_start_slot,
+                                        ins_nr, inode_only, logged_isize);
+                       if (ret < 0) {
+                               err = ret;
+                               goto out_unlock;
+                       }
+                       ins_nr = 0;
+-                      if (ret) {
+-                              btrfs_release_path(path);
+-                              continue;
+-                      }
+                       goto next_slot;
+               }
+@@ -5184,18 +5003,13 @@ again:
+                       goto next_slot;
+               }
+-              ret = copy_items(trans, inode, dst_path, path, &last_extent,
++              ret = copy_items(trans, inode, dst_path, path,
+                                ins_start_slot, ins_nr, inode_only,
+                                logged_isize);
+               if (ret < 0) {
+                       err = ret;
+                       goto out_unlock;
+               }
+-              if (ret) {
+-                      ins_nr = 0;
+-                      btrfs_release_path(path);
+-                      continue;
+-              }
+               ins_nr = 1;
+               ins_start_slot = path->slots[0];
+ next_slot:
+@@ -5209,13 +5023,12 @@ next_slot:
+               }
+               if (ins_nr) {
+                       ret = copy_items(trans, inode, dst_path, path,
+-                                       &last_extent, ins_start_slot,
++                                       ins_start_slot,
+                                        ins_nr, inode_only, logged_isize);
+                       if (ret < 0) {
+                               err = ret;
+                               goto out_unlock;
+                       }
+-                      ret = 0;
+                       ins_nr = 0;
+               }
+               btrfs_release_path(path);
+@@ -5230,14 +5043,13 @@ next_key:
+               }
+       }
+       if (ins_nr) {
+-              ret = copy_items(trans, inode, dst_path, path, &last_extent,
++              ret = copy_items(trans, inode, dst_path, path,
+                                ins_start_slot, ins_nr, inode_only,
+                                logged_isize);
+               if (ret < 0) {
+                       err = ret;
+                       goto out_unlock;
+               }
+-              ret = 0;
+               ins_nr = 0;
+       }
+@@ -5250,7 +5062,7 @@ next_key:
+       if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
+               btrfs_release_path(path);
+               btrfs_release_path(dst_path);
+-              err = btrfs_log_trailing_hole(trans, root, inode, path);
++              err = btrfs_log_holes(trans, root, inode, path);
+               if (err)
+                       goto out_unlock;
+       }
diff --git a/queue-4.19/btrfs-fix-race-between-adding-and-putting-tree-mod-seq-elements-and-nodes.patch b/queue-4.19/btrfs-fix-race-between-adding-and-putting-tree-mod-seq-elements-and-nodes.patch
new file mode 100644 (file)
index 0000000..0dbdfed
--- /dev/null
@@ -0,0 +1,237 @@
+From 7227ff4de55d931bbdc156c8ef0ce4f100c78a5b Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 22 Jan 2020 12:23:20 +0000
+Subject: Btrfs: fix race between adding and putting tree mod seq elements and nodes
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 7227ff4de55d931bbdc156c8ef0ce4f100c78a5b upstream.
+
+There is a race between adding and removing elements to the tree mod log
+list and rbtree that can lead to use-after-free problems.
+
+Consider the following example that explains how/why the problems happens:
+
+1) Task A has mod log element with sequence number 200. It currently is
+   the only element in the mod log list;
+
+2) Task A calls btrfs_put_tree_mod_seq() because it no longer needs to
+   access the tree mod log. When it enters the function, it initializes
+   'min_seq' to (u64)-1. Then it acquires the lock 'tree_mod_seq_lock'
+   before checking if there are other elements in the mod seq list.
+   Since the list it empty, 'min_seq' remains set to (u64)-1. Then it
+   unlocks the lock 'tree_mod_seq_lock';
+
+3) Before task A acquires the lock 'tree_mod_log_lock', task B adds
+   itself to the mod seq list through btrfs_get_tree_mod_seq() and gets a
+   sequence number of 201;
+
+4) Some other task, name it task C, modifies a btree and because there
+   elements in the mod seq list, it adds a tree mod elem to the tree
+   mod log rbtree. That node added to the mod log rbtree is assigned
+   a sequence number of 202;
+
+5) Task B, which is doing fiemap and resolving indirect back references,
+   calls btrfs get_old_root(), with 'time_seq' == 201, which in turn
+   calls tree_mod_log_search() - the search returns the mod log node
+   from the rbtree with sequence number 202, created by task C;
+
+6) Task A now acquires the lock 'tree_mod_log_lock', starts iterating
+   the mod log rbtree and finds the node with sequence number 202. Since
+   202 is less than the previously computed 'min_seq', (u64)-1, it
+   removes the node and frees it;
+
+7) Task B still has a pointer to the node with sequence number 202, and
+   it dereferences the pointer itself and through the call to
+   __tree_mod_log_rewind(), resulting in a use-after-free problem.
+
+This issue can be triggered sporadically with the test case generic/561
+from fstests, and it happens more frequently with a higher number of
+duperemove processes. When it happens to me, it either freezes the VM or
+it produces a trace like the following before crashing:
+
+  [ 1245.321140] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
+  [ 1245.321200] CPU: 1 PID: 26997 Comm: pool Not tainted 5.5.0-rc6-btrfs-next-52 #1
+  [ 1245.321235] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014
+  [ 1245.321287] RIP: 0010:rb_next+0x16/0x50
+  [ 1245.321307] Code: ....
+  [ 1245.321372] RSP: 0018:ffffa151c4d039b0 EFLAGS: 00010202
+  [ 1245.321388] RAX: 6b6b6b6b6b6b6b6b RBX: ffff8ae221363c80 RCX: 6b6b6b6b6b6b6b6b
+  [ 1245.321409] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff8ae221363c80
+  [ 1245.321439] RBP: ffff8ae20fcc4688 R08: 0000000000000002 R09: 0000000000000000
+  [ 1245.321475] R10: ffff8ae20b120910 R11: 00000000243f8bb1 R12: 0000000000000038
+  [ 1245.321506] R13: ffff8ae221363c80 R14: 000000000000075f R15: ffff8ae223f762b8
+  [ 1245.321539] FS:  00007fdee1ec7700(0000) GS:ffff8ae236c80000(0000) knlGS:0000000000000000
+  [ 1245.321591] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+  [ 1245.321614] CR2: 00007fded4030c48 CR3: 000000021da16003 CR4: 00000000003606e0
+  [ 1245.321642] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+  [ 1245.321668] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+  [ 1245.321706] Call Trace:
+  [ 1245.321798]  __tree_mod_log_rewind+0xbf/0x280 [btrfs]
+  [ 1245.321841]  btrfs_search_old_slot+0x105/0xd00 [btrfs]
+  [ 1245.321877]  resolve_indirect_refs+0x1eb/0xc60 [btrfs]
+  [ 1245.321912]  find_parent_nodes+0x3dc/0x11b0 [btrfs]
+  [ 1245.321947]  btrfs_check_shared+0x115/0x1c0 [btrfs]
+  [ 1245.321980]  ? extent_fiemap+0x59d/0x6d0 [btrfs]
+  [ 1245.322029]  extent_fiemap+0x59d/0x6d0 [btrfs]
+  [ 1245.322066]  do_vfs_ioctl+0x45a/0x750
+  [ 1245.322081]  ksys_ioctl+0x70/0x80
+  [ 1245.322092]  ? trace_hardirqs_off_thunk+0x1a/0x1c
+  [ 1245.322113]  __x64_sys_ioctl+0x16/0x20
+  [ 1245.322126]  do_syscall_64+0x5c/0x280
+  [ 1245.322139]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
+  [ 1245.322155] RIP: 0033:0x7fdee3942dd7
+  [ 1245.322177] Code: ....
+  [ 1245.322258] RSP: 002b:00007fdee1ec6c88 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
+  [ 1245.322294] RAX: ffffffffffffffda RBX: 00007fded40210d8 RCX: 00007fdee3942dd7
+  [ 1245.322314] RDX: 00007fded40210d8 RSI: 00000000c020660b RDI: 0000000000000004
+  [ 1245.322337] RBP: 0000562aa89e7510 R08: 0000000000000000 R09: 00007fdee1ec6d44
+  [ 1245.322369] R10: 0000000000000073 R11: 0000000000000246 R12: 00007fdee1ec6d48
+  [ 1245.322390] R13: 00007fdee1ec6d40 R14: 00007fded40210d0 R15: 00007fdee1ec6d50
+  [ 1245.322423] Modules linked in: ....
+  [ 1245.323443] ---[ end trace 01de1e9ec5dff3cd ]---
+
+Fix this by ensuring that btrfs_put_tree_mod_seq() computes the minimum
+sequence number and iterates the rbtree while holding the lock
+'tree_mod_log_lock' in write mode. Also get rid of the 'tree_mod_seq_lock'
+lock, since it is now redundant.
+
+Fixes: bd989ba359f2ac ("Btrfs: add tree modification log functions")
+Fixes: 097b8a7c9e48e2 ("Btrfs: join tree mod log code with the code holding back delayed refs")
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ctree.c             |    8 ++------
+ fs/btrfs/ctree.h             |    6 ++----
+ fs/btrfs/delayed-ref.c       |    8 ++++----
+ fs/btrfs/disk-io.c           |    1 -
+ fs/btrfs/tests/btrfs-tests.c |    1 -
+ 5 files changed, 8 insertions(+), 16 deletions(-)
+
+--- a/fs/btrfs/ctree.c
++++ b/fs/btrfs/ctree.c
+@@ -337,12 +337,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_
+                          struct seq_list *elem)
+ {
+       write_lock(&fs_info->tree_mod_log_lock);
+-      spin_lock(&fs_info->tree_mod_seq_lock);
+       if (!elem->seq) {
+               elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+               list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+       }
+-      spin_unlock(&fs_info->tree_mod_seq_lock);
+       write_unlock(&fs_info->tree_mod_log_lock);
+       return elem->seq;
+@@ -362,7 +360,7 @@ void btrfs_put_tree_mod_seq(struct btrfs
+       if (!seq_putting)
+               return;
+-      spin_lock(&fs_info->tree_mod_seq_lock);
++      write_lock(&fs_info->tree_mod_log_lock);
+       list_del(&elem->list);
+       elem->seq = 0;
+@@ -373,19 +371,17 @@ void btrfs_put_tree_mod_seq(struct btrfs
+                                * blocker with lower sequence number exists, we
+                                * cannot remove anything from the log
+                                */
+-                              spin_unlock(&fs_info->tree_mod_seq_lock);
++                              write_unlock(&fs_info->tree_mod_log_lock);
+                               return;
+                       }
+                       min_seq = cur_elem->seq;
+               }
+       }
+-      spin_unlock(&fs_info->tree_mod_seq_lock);
+       /*
+        * anything that's lower than the lowest existing (read: blocked)
+        * sequence number can be removed from the tree.
+        */
+-      write_lock(&fs_info->tree_mod_log_lock);
+       tm_root = &fs_info->tree_mod_log;
+       for (node = rb_first(tm_root); node; node = next) {
+               next = rb_next(node);
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -893,14 +893,12 @@ struct btrfs_fs_info {
+       struct list_head delayed_iputs;
+       struct mutex cleaner_delayed_iput_mutex;
+-      /* this protects tree_mod_seq_list */
+-      spinlock_t tree_mod_seq_lock;
+       atomic64_t tree_mod_seq;
+-      struct list_head tree_mod_seq_list;
+-      /* this protects tree_mod_log */
++      /* this protects tree_mod_log and tree_mod_seq_list */
+       rwlock_t tree_mod_log_lock;
+       struct rb_root tree_mod_log;
++      struct list_head tree_mod_seq_list;
+       atomic_t async_delalloc_pages;
+--- a/fs/btrfs/delayed-ref.c
++++ b/fs/btrfs/delayed-ref.c
+@@ -301,7 +301,7 @@ void btrfs_merge_delayed_refs(struct btr
+       if (head->is_data)
+               return;
+-      spin_lock(&fs_info->tree_mod_seq_lock);
++      read_lock(&fs_info->tree_mod_log_lock);
+       if (!list_empty(&fs_info->tree_mod_seq_list)) {
+               struct seq_list *elem;
+@@ -309,7 +309,7 @@ void btrfs_merge_delayed_refs(struct btr
+                                       struct seq_list, list);
+               seq = elem->seq;
+       }
+-      spin_unlock(&fs_info->tree_mod_seq_lock);
++      read_unlock(&fs_info->tree_mod_log_lock);
+ again:
+       for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+@@ -326,7 +326,7 @@ int btrfs_check_delayed_seq(struct btrfs
+       struct seq_list *elem;
+       int ret = 0;
+-      spin_lock(&fs_info->tree_mod_seq_lock);
++      read_lock(&fs_info->tree_mod_log_lock);
+       if (!list_empty(&fs_info->tree_mod_seq_list)) {
+               elem = list_first_entry(&fs_info->tree_mod_seq_list,
+                                       struct seq_list, list);
+@@ -339,7 +339,7 @@ int btrfs_check_delayed_seq(struct btrfs
+               }
+       }
+-      spin_unlock(&fs_info->tree_mod_seq_lock);
++      read_unlock(&fs_info->tree_mod_log_lock);
+       return ret;
+ }
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -2645,7 +2645,6 @@ int open_ctree(struct super_block *sb,
+       spin_lock_init(&fs_info->fs_roots_radix_lock);
+       spin_lock_init(&fs_info->delayed_iput_lock);
+       spin_lock_init(&fs_info->defrag_inodes_lock);
+-      spin_lock_init(&fs_info->tree_mod_seq_lock);
+       spin_lock_init(&fs_info->super_lock);
+       spin_lock_init(&fs_info->qgroup_op_lock);
+       spin_lock_init(&fs_info->buffer_lock);
+--- a/fs/btrfs/tests/btrfs-tests.c
++++ b/fs/btrfs/tests/btrfs-tests.c
+@@ -102,7 +102,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_
+       spin_lock_init(&fs_info->qgroup_op_lock);
+       spin_lock_init(&fs_info->super_lock);
+       spin_lock_init(&fs_info->fs_roots_radix_lock);
+-      spin_lock_init(&fs_info->tree_mod_seq_lock);
+       mutex_init(&fs_info->qgroup_ioctl_lock);
+       mutex_init(&fs_info->qgroup_rescan_lock);
+       rwlock_init(&fs_info->tree_mod_log_lock);
diff --git a/queue-4.19/btrfs-set-trans-drity-in-btrfs_commit_transaction.patch b/queue-4.19/btrfs-set-trans-drity-in-btrfs_commit_transaction.patch
new file mode 100644 (file)
index 0000000..a3e1201
--- /dev/null
@@ -0,0 +1,96 @@
+From d62b23c94952e78211a383b7d90ef0afbd9a3717 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Fri, 17 Jan 2020 08:57:51 -0500
+Subject: btrfs: set trans->drity in btrfs_commit_transaction
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit d62b23c94952e78211a383b7d90ef0afbd9a3717 upstream.
+
+If we abort a transaction we have the following sequence
+
+if (!trans->dirty && list_empty(&trans->new_bgs))
+       return;
+WRITE_ONCE(trans->transaction->aborted, err);
+
+The idea being if we didn't modify anything with our trans handle then
+we don't really need to abort the whole transaction, maybe the other
+trans handles are fine and we can carry on.
+
+However in the case of create_snapshot we add a pending_snapshot object
+to our transaction and then commit the transaction.  We don't actually
+modify anything.  sync() behaves the same way, attach to an existing
+transaction and commit it.  This means that if we have an IO error in
+the right places we could abort the committing transaction with our
+trans->dirty being not set and thus not set transaction->aborted.
+
+This is a problem because in the create_snapshot() case we depend on
+pending->error being set to something, or btrfs_commit_transaction
+returning an error.
+
+If we are not the trans handle that gets to commit the transaction, and
+we're waiting on the commit to happen we get our return value from
+cur_trans->aborted.  If this was not set to anything because sync() hit
+an error in the transaction commit before it could modify anything then
+cur_trans->aborted would be 0.  Thus we'd return 0 from
+btrfs_commit_transaction() in create_snapshot.
+
+This is a problem because we then try to do things with
+pending_snapshot->snap, which will be NULL because we didn't create the
+snapshot, and then we'll get a NULL pointer dereference like the
+following
+
+"BUG: kernel NULL pointer dereference, address: 00000000000001f0"
+RIP: 0010:btrfs_orphan_cleanup+0x2d/0x330
+Call Trace:
+ ? btrfs_mksubvol.isra.31+0x3f2/0x510
+ btrfs_mksubvol.isra.31+0x4bc/0x510
+ ? __sb_start_write+0xfa/0x200
+ ? mnt_want_write_file+0x24/0x50
+ btrfs_ioctl_snap_create_transid+0x16c/0x1a0
+ btrfs_ioctl_snap_create_v2+0x11e/0x1a0
+ btrfs_ioctl+0x1534/0x2c10
+ ? free_debug_processing+0x262/0x2a3
+ do_vfs_ioctl+0xa6/0x6b0
+ ? do_sys_open+0x188/0x220
+ ? syscall_trace_enter+0x1f8/0x330
+ ksys_ioctl+0x60/0x90
+ __x64_sys_ioctl+0x16/0x20
+ do_syscall_64+0x4a/0x1b0
+
+In order to fix this we need to make sure anybody who calls
+commit_transaction has trans->dirty set so that they properly set the
+trans->transaction->aborted value properly so any waiters know bad
+things happened.
+
+This was found while I was running generic/475 with my modified
+fsstress, it reproduced within a few runs.  I ran with this patch all
+night and didn't see the problem again.
+
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/transaction.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -1936,6 +1936,14 @@ int btrfs_commit_transaction(struct btrf
+       struct btrfs_transaction *prev_trans = NULL;
+       int ret;
++      /*
++       * Some places just start a transaction to commit it.  We need to make
++       * sure that if this commit fails that the abort code actually marks the
++       * transaction as failed, so set trans->dirty to make the abort code do
++       * the right thing.
++       */
++      trans->dirty = true;
++
+       /* Stop the commit early if ->aborted is set */
+       if (unlikely(READ_ONCE(cur_trans->aborted))) {
+               ret = cur_trans->aborted;
diff --git a/queue-4.19/drm-rect-avoid-division-by-zero.patch b/queue-4.19/drm-rect-avoid-division-by-zero.patch
new file mode 100644 (file)
index 0000000..012d065
--- /dev/null
@@ -0,0 +1,47 @@
+From 433480c1afd44f3e1e664b85063d98cefeefa0ed Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala@linux.intel.com>
+Date: Fri, 22 Nov 2019 19:56:20 +0200
+Subject: drm/rect: Avoid division by zero
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Ville Syrjälä <ville.syrjala@linux.intel.com>
+
+commit 433480c1afd44f3e1e664b85063d98cefeefa0ed upstream.
+
+Check for zero width/height destination rectangle in
+drm_rect_clip_scaled() to avoid a division by zero.
+
+Cc: stable@vger.kernel.org
+Fixes: f96bdf564f3e ("drm/rect: Handle rounding errors in drm_rect_clip_scaled, v3.")
+Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
+Cc: Benjamin Gaignard <benjamin.gaignard@st.com>
+Cc: Daniel Vetter <daniel@ffwll.ch>
+Testcase: igt/kms_selftest/drm_rect_clip_scaled_div_by_zero
+Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20191122175623.13565-2-ville.syrjala@linux.intel.com
+Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
+Reviewed-by: Benjamin Gaignard <benjamin.gaignard@st.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/gpu/drm/drm_rect.c |    7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/drm_rect.c
++++ b/drivers/gpu/drm/drm_rect.c
+@@ -52,7 +52,12 @@ EXPORT_SYMBOL(drm_rect_intersect);
+ static u32 clip_scaled(u32 src, u32 dst, u32 clip)
+ {
+-      u64 tmp = mul_u32_u32(src, dst - clip);
++      u64 tmp;
++
++      if (dst == 0)
++              return 0;
++
++      tmp = mul_u32_u32(src, dst - clip);
+       /*
+        * Round toward 1.0 when clipping so that we don't accidentally
diff --git a/queue-4.19/eventfd-track-eventfd_signal-recursion-depth.patch b/queue-4.19/eventfd-track-eventfd_signal-recursion-depth.patch
new file mode 100644 (file)
index 0000000..2431e3f
--- /dev/null
@@ -0,0 +1,102 @@
+From b5e683d5cab8cd433b06ae178621f083cabd4f63 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Sun, 2 Feb 2020 08:23:03 -0700
+Subject: eventfd: track eventfd_signal() recursion depth
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit b5e683d5cab8cd433b06ae178621f083cabd4f63 upstream.
+
+eventfd use cases from aio and io_uring can deadlock due to circular
+or recursive calling, when eventfd_signal() tries to grab the waitqueue
+lock. On top of that, it's also possible to construct notification
+chains that are deep enough that we could blow the stack.
+
+Add a percpu counter that tracks the percpu recursion depth, warn if we
+exceed it. The counter is also exposed so that users of eventfd_signal()
+can do the right thing if it's non-zero in the context where it is
+called.
+
+Cc: stable@vger.kernel.org # 4.19+
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/eventfd.c            |   15 +++++++++++++++
+ include/linux/eventfd.h |   14 ++++++++++++++
+ 2 files changed, 29 insertions(+)
+
+--- a/fs/eventfd.c
++++ b/fs/eventfd.c
+@@ -22,6 +22,8 @@
+ #include <linux/proc_fs.h>
+ #include <linux/seq_file.h>
++DEFINE_PER_CPU(int, eventfd_wake_count);
++
+ struct eventfd_ctx {
+       struct kref kref;
+       wait_queue_head_t wqh;
+@@ -55,12 +57,25 @@ __u64 eventfd_signal(struct eventfd_ctx
+ {
+       unsigned long flags;
++      /*
++       * Deadlock or stack overflow issues can happen if we recurse here
++       * through waitqueue wakeup handlers. If the caller users potentially
++       * nested waitqueues with custom wakeup handlers, then it should
++       * check eventfd_signal_count() before calling this function. If
++       * it returns true, the eventfd_signal() call should be deferred to a
++       * safe context.
++       */
++      if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
++              return 0;
++
+       spin_lock_irqsave(&ctx->wqh.lock, flags);
++      this_cpu_inc(eventfd_wake_count);
+       if (ULLONG_MAX - ctx->count < n)
+               n = ULLONG_MAX - ctx->count;
+       ctx->count += n;
+       if (waitqueue_active(&ctx->wqh))
+               wake_up_locked_poll(&ctx->wqh, EPOLLIN);
++      this_cpu_dec(eventfd_wake_count);
+       spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+       return n;
+--- a/include/linux/eventfd.h
++++ b/include/linux/eventfd.h
+@@ -12,6 +12,8 @@
+ #include <linux/fcntl.h>
+ #include <linux/wait.h>
+ #include <linux/err.h>
++#include <linux/percpu-defs.h>
++#include <linux/percpu.h>
+ /*
+  * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
+@@ -40,6 +42,13 @@ __u64 eventfd_signal(struct eventfd_ctx
+ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
+                                 __u64 *cnt);
++DECLARE_PER_CPU(int, eventfd_wake_count);
++
++static inline bool eventfd_signal_count(void)
++{
++      return this_cpu_read(eventfd_wake_count);
++}
++
+ #else /* CONFIG_EVENTFD */
+ /*
+@@ -68,6 +77,11 @@ static inline int eventfd_ctx_remove_wai
+       return -ENOSYS;
+ }
++static inline bool eventfd_signal_count(void)
++{
++      return false;
++}
++
+ #endif
+ #endif /* _LINUX_EVENTFD_H */
diff --git a/queue-4.19/gfs2-fix-o_sync-write-handling.patch b/queue-4.19/gfs2-fix-o_sync-write-handling.patch
new file mode 100644 (file)
index 0000000..752e0ee
--- /dev/null
@@ -0,0 +1,111 @@
+From 6e5e41e2dc4e4413296d5a4af54ac92d7cd52317 Mon Sep 17 00:00:00 2001
+From: Andreas Gruenbacher <agruenba@redhat.com>
+Date: Tue, 14 Jan 2020 17:12:18 +0100
+Subject: gfs2: fix O_SYNC write handling
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit 6e5e41e2dc4e4413296d5a4af54ac92d7cd52317 upstream.
+
+In gfs2_file_write_iter, for direct writes, the error checking in the buffered
+write fallback case is incomplete.  This can cause inode write errors to go
+undetected.  Fix and clean up gfs2_file_write_iter along the way.
+
+Based on a proposed fix by Christoph Hellwig <hch@lst.de>.
+
+Fixes: 967bcc91b044 ("gfs2: iomap direct I/O support")
+Cc: stable@vger.kernel.org # v4.19+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/gfs2/file.c |   51 +++++++++++++++++++++------------------------------
+ 1 file changed, 21 insertions(+), 30 deletions(-)
+
+--- a/fs/gfs2/file.c
++++ b/fs/gfs2/file.c
+@@ -780,7 +780,7 @@ static ssize_t gfs2_file_write_iter(stru
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file_inode(file);
+       struct gfs2_inode *ip = GFS2_I(inode);
+-      ssize_t written = 0, ret;
++      ssize_t ret;
+       ret = gfs2_rsqa_alloc(ip);
+       if (ret)
+@@ -812,55 +812,46 @@ static ssize_t gfs2_file_write_iter(stru
+       if (iocb->ki_flags & IOCB_DIRECT) {
+               struct address_space *mapping = file->f_mapping;
+-              loff_t pos, endbyte;
+-              ssize_t buffered;
++              ssize_t buffered, ret2;
+-              written = gfs2_file_direct_write(iocb, from);
+-              if (written < 0 || !iov_iter_count(from))
++              ret = gfs2_file_direct_write(iocb, from);
++              if (ret < 0 || !iov_iter_count(from))
+                       goto out_unlock;
++              iocb->ki_flags |= IOCB_DSYNC;
+               current->backing_dev_info = inode_to_bdi(inode);
+-              ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
++              buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+               current->backing_dev_info = NULL;
+-              if (unlikely(ret < 0))
++              if (unlikely(buffered <= 0))
+                       goto out_unlock;
+-              buffered = ret;
+               /*
+                * We need to ensure that the page cache pages are written to
+                * disk and invalidated to preserve the expected O_DIRECT
+-               * semantics.
++               * semantics.  If the writeback or invalidate fails, only report
++               * the direct I/O range as we don't know if the buffered pages
++               * made it to disk.
+                */
+-              pos = iocb->ki_pos;
+-              endbyte = pos + buffered - 1;
+-              ret = filemap_write_and_wait_range(mapping, pos, endbyte);
+-              if (!ret) {
+-                      iocb->ki_pos += buffered;
+-                      written += buffered;
+-                      invalidate_mapping_pages(mapping,
+-                                               pos >> PAGE_SHIFT,
+-                                               endbyte >> PAGE_SHIFT);
+-              } else {
+-                      /*
+-                       * We don't know how much we wrote, so just return
+-                       * the number of bytes which were direct-written
+-                       */
+-              }
++              iocb->ki_pos += buffered;
++              ret2 = generic_write_sync(iocb, buffered);
++              invalidate_mapping_pages(mapping,
++                              (iocb->ki_pos - buffered) >> PAGE_SHIFT,
++                              (iocb->ki_pos - 1) >> PAGE_SHIFT);
++              if (!ret || ret2 > 0)
++                      ret += ret2;
+       } else {
+               current->backing_dev_info = inode_to_bdi(inode);
+               ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+               current->backing_dev_info = NULL;
+-              if (likely(ret > 0))
++              if (likely(ret > 0)) {
+                       iocb->ki_pos += ret;
++                      ret = generic_write_sync(iocb, ret);
++              }
+       }
+ out_unlock:
+       inode_unlock(inode);
+-      if (likely(ret > 0)) {
+-              /* Handle various SYNC-type writes */
+-              ret = generic_write_sync(iocb, ret);
+-      }
+-      return written ? written : ret;
++      return ret;
+ }
+ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
diff --git a/queue-4.19/gfs2-move-setting-current-backing_dev_info.patch b/queue-4.19/gfs2-move-setting-current-backing_dev_info.patch
new file mode 100644 (file)
index 0000000..4b45487
--- /dev/null
@@ -0,0 +1,80 @@
+From 4c0e8dda608a51855225c611b5c6b442f95fbc56 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Wed, 15 Jan 2020 16:38:29 +0100
+Subject: gfs2: move setting current->backing_dev_info
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 4c0e8dda608a51855225c611b5c6b442f95fbc56 upstream.
+
+Set current->backing_dev_info just around the buffered write calls to
+prepare for the next fix.
+
+Fixes: 967bcc91b044 ("gfs2: iomap direct I/O support")
+Cc: stable@vger.kernel.org # v4.19+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/gfs2/file.c |   21 ++++++++++-----------
+ 1 file changed, 10 insertions(+), 11 deletions(-)
+
+--- a/fs/gfs2/file.c
++++ b/fs/gfs2/file.c
+@@ -800,18 +800,15 @@ static ssize_t gfs2_file_write_iter(stru
+       inode_lock(inode);
+       ret = generic_write_checks(iocb, from);
+       if (ret <= 0)
+-              goto out;
+-
+-      /* We can write back this queue in page reclaim */
+-      current->backing_dev_info = inode_to_bdi(inode);
++              goto out_unlock;
+       ret = file_remove_privs(file);
+       if (ret)
+-              goto out2;
++              goto out_unlock;
+       ret = file_update_time(file);
+       if (ret)
+-              goto out2;
++              goto out_unlock;
+       if (iocb->ki_flags & IOCB_DIRECT) {
+               struct address_space *mapping = file->f_mapping;
+@@ -820,11 +817,13 @@ static ssize_t gfs2_file_write_iter(stru
+               written = gfs2_file_direct_write(iocb, from);
+               if (written < 0 || !iov_iter_count(from))
+-                      goto out2;
++                      goto out_unlock;
++              current->backing_dev_info = inode_to_bdi(inode);
+               ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
++              current->backing_dev_info = NULL;
+               if (unlikely(ret < 0))
+-                      goto out2;
++                      goto out_unlock;
+               buffered = ret;
+               /*
+@@ -848,14 +847,14 @@ static ssize_t gfs2_file_write_iter(stru
+                        */
+               }
+       } else {
++              current->backing_dev_info = inode_to_bdi(inode);
+               ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
++              current->backing_dev_info = NULL;
+               if (likely(ret > 0))
+                       iocb->ki_pos += ret;
+       }
+-out2:
+-      current->backing_dev_info = NULL;
+-out:
++out_unlock:
+       inode_unlock(inode);
+       if (likely(ret > 0)) {
+               /* Handle various SYNC-type writes */
diff --git a/queue-4.19/iwlwifi-don-t-throw-error-when-trying-to-remove-igtk.patch b/queue-4.19/iwlwifi-don-t-throw-error-when-trying-to-remove-igtk.patch
new file mode 100644 (file)
index 0000000..0bbcdf9
--- /dev/null
@@ -0,0 +1,56 @@
+From 197288d5ba8a5289f22d3aeb4fca3824bfd9b4af Mon Sep 17 00:00:00 2001
+From: Luca Coelho <luciano.coelho@intel.com>
+Date: Fri, 31 Jan 2020 15:45:25 +0200
+Subject: iwlwifi: don't throw error when trying to remove IGTK
+
+From: Luca Coelho <luciano.coelho@intel.com>
+
+commit 197288d5ba8a5289f22d3aeb4fca3824bfd9b4af upstream.
+
+The IGTK keys are only removed by mac80211 after it has already
+removed the AP station.  This causes the driver to throw an error
+because mac80211 is trying to remove the IGTK when the station doesn't
+exist anymore.
+
+The firmware is aware that the station has been removed and can deal
+with it the next time we try to add an IGTK for a station, so we
+shouldn't try to remove the key if the station ID is
+IWL_MVM_INVALID_STA.  Do this by removing the check for mvm_sta before
+calling iwl_mvm_send_sta_igtk() and check return from that function
+gracefully if the station ID is invalid.
+
+Cc: stable@vger.kernel.org # 4.12+
+Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
+Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/net/wireless/intel/iwlwifi/mvm/sta.c |   10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+--- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
++++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
+@@ -3045,6 +3045,10 @@ static int iwl_mvm_send_sta_igtk(struct
+       igtk_cmd.sta_id = cpu_to_le32(sta_id);
+       if (remove_key) {
++              /* This is a valid situation for IGTK */
++              if (sta_id == IWL_MVM_INVALID_STA)
++                      return 0;
++
+               igtk_cmd.ctrl_flags |= cpu_to_le32(STA_KEY_NOT_VALID);
+       } else {
+               struct ieee80211_key_seq seq;
+@@ -3352,9 +3356,9 @@ int iwl_mvm_remove_sta_key(struct iwl_mv
+       IWL_DEBUG_WEP(mvm, "mvm remove dynamic key: idx=%d sta=%d\n",
+                     keyconf->keyidx, sta_id);
+-      if (mvm_sta && (keyconf->cipher == WLAN_CIPHER_SUITE_AES_CMAC ||
+-                      keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_128 ||
+-                      keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_256))
++      if (keyconf->cipher == WLAN_CIPHER_SUITE_AES_CMAC ||
++          keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_128 ||
++          keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_256)
+               return iwl_mvm_send_sta_igtk(mvm, keyconf, sta_id, true);
+       if (!__test_and_clear_bit(keyconf->hw_key_idx, mvm->fw_key_table)) {
diff --git a/queue-4.19/jbd2_seq_info_next-should-increase-position-index.patch b/queue-4.19/jbd2_seq_info_next-should-increase-position-index.patch
new file mode 100644 (file)
index 0000000..fa79fc4
--- /dev/null
@@ -0,0 +1,39 @@
+From 1a8e9cf40c9a6a2e40b1e924b13ed303aeea4418 Mon Sep 17 00:00:00 2001
+From: Vasily Averin <vvs@virtuozzo.com>
+Date: Thu, 23 Jan 2020 12:05:10 +0300
+Subject: jbd2_seq_info_next should increase position index
+
+From: Vasily Averin <vvs@virtuozzo.com>
+
+commit 1a8e9cf40c9a6a2e40b1e924b13ed303aeea4418 upstream.
+
+if seq_file .next function does not change position index,
+read after some lseek can generate unexpected output.
+
+Script below generates endless output
+ $ q=;while read -r r;do echo "$((++q)) $r";done </proc/fs/jbd2/DEV/info
+
+https://bugzilla.kernel.org/show_bug.cgi?id=206283
+
+Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface")
+Cc: stable@kernel.org
+Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Link: https://lore.kernel.org/r/d13805e5-695e-8ac3-b678-26ca2313629f@virtuozzo.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/jbd2/journal.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/jbd2/journal.c
++++ b/fs/jbd2/journal.c
+@@ -1002,6 +1002,7 @@ static void *jbd2_seq_info_start(struct
+ static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
+ {
++      (*pos)++;
+       return NULL;
+ }
diff --git a/queue-4.19/kvm-ppc-book3s-hv-uninit-vcpu-if-vcore-creation-fails.patch b/queue-4.19/kvm-ppc-book3s-hv-uninit-vcpu-if-vcore-creation-fails.patch
new file mode 100644 (file)
index 0000000..f737a36
--- /dev/null
@@ -0,0 +1,44 @@
+From 1a978d9d3e72ddfa40ac60d26301b154247ee0bc Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Wed, 18 Dec 2019 13:54:46 -0800
+Subject: KVM: PPC: Book3S HV: Uninit vCPU if vcore creation fails
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit 1a978d9d3e72ddfa40ac60d26301b154247ee0bc upstream.
+
+Call kvm_vcpu_uninit() if vcore creation fails to avoid leaking any
+resources allocated by kvm_vcpu_init(), i.e. the vcpu->run page.
+
+Fixes: 371fefd6f2dc4 ("KVM: PPC: Allow book3s_hv guests to use SMT processor modes")
+Cc: stable@vger.kernel.org
+Reviewed-by: Greg Kurz <groug@kaod.org>
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Acked-by: Paul Mackerras <paulus@ozlabs.org>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/powerpc/kvm/book3s_hv.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/arch/powerpc/kvm/book3s_hv.c
++++ b/arch/powerpc/kvm/book3s_hv.c
+@@ -2065,7 +2065,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu
+       mutex_unlock(&kvm->lock);
+       if (!vcore)
+-              goto free_vcpu;
++              goto uninit_vcpu;
+       spin_lock(&vcore->lock);
+       ++vcore->num_threads;
+@@ -2082,6 +2082,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu
+       return vcpu;
++uninit_vcpu:
++      kvm_vcpu_uninit(vcpu);
+ free_vcpu:
+       kmem_cache_free(kvm_vcpu_cache, vcpu);
+ out:
diff --git a/queue-4.19/kvm-ppc-book3s-pr-free-shared-page-if-mmu-initialization-fails.patch b/queue-4.19/kvm-ppc-book3s-pr-free-shared-page-if-mmu-initialization-fails.patch
new file mode 100644 (file)
index 0000000..34c842f
--- /dev/null
@@ -0,0 +1,41 @@
+From cb10bf9194f4d2c5d830eddca861f7ca0fecdbb4 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Wed, 18 Dec 2019 13:54:47 -0800
+Subject: KVM: PPC: Book3S PR: Free shared page if mmu initialization fails
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit cb10bf9194f4d2c5d830eddca861f7ca0fecdbb4 upstream.
+
+Explicitly free the shared page if kvmppc_mmu_init() fails during
+kvmppc_core_vcpu_create(), as the page is freed only in
+kvmppc_core_vcpu_free(), which is not reached via kvm_vcpu_uninit().
+
+Fixes: 96bc451a15329 ("KVM: PPC: Introduce shared page")
+Cc: stable@vger.kernel.org
+Reviewed-by: Greg Kurz <groug@kaod.org>
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Acked-by: Paul Mackerras <paulus@ozlabs.org>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/powerpc/kvm/book3s_pr.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/arch/powerpc/kvm/book3s_pr.c
++++ b/arch/powerpc/kvm/book3s_pr.c
+@@ -1772,10 +1772,12 @@ static struct kvm_vcpu *kvmppc_core_vcpu
+       err = kvmppc_mmu_init(vcpu);
+       if (err < 0)
+-              goto uninit_vcpu;
++              goto free_shared_page;
+       return vcpu;
++free_shared_page:
++      free_page((unsigned long)vcpu->arch.shared);
+ uninit_vcpu:
+       kvm_vcpu_uninit(vcpu);
+ free_shadow_vcpu:
diff --git a/queue-4.19/kvm-x86-fix-potential-put_fpu-w-o-load_fpu-on-mpx-platform.patch b/queue-4.19/kvm-x86-fix-potential-put_fpu-w-o-load_fpu-on-mpx-platform.patch
new file mode 100644 (file)
index 0000000..cea097d
--- /dev/null
@@ -0,0 +1,55 @@
+From f958bd2314d117f8c29f4821401bc1925bc2e5ef Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Mon, 9 Dec 2019 12:19:31 -0800
+Subject: KVM: x86: Fix potential put_fpu() w/o load_fpu() on MPX platform
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit f958bd2314d117f8c29f4821401bc1925bc2e5ef upstream.
+
+Unlike most state managed by XSAVE, MPX is initialized to zero on INIT.
+Because INITs are usually recognized in the context of a VCPU_RUN call,
+kvm_vcpu_reset() puts the guest's FPU so that the FPU state is resident
+in memory, zeros the MPX state, and reloads FPU state to hardware.  But,
+in the unlikely event that an INIT is recognized during
+kvm_arch_vcpu_ioctl_get_mpstate() via kvm_apic_accept_events(),
+kvm_vcpu_reset() will call kvm_put_guest_fpu() without a preceding
+kvm_load_guest_fpu() and corrupt the guest's FPU state (and possibly
+userspace's FPU state as well).
+
+Given that MPX is being removed from the kernel[*], fix the bug with the
+simple-but-ugly approach of loading the guest's FPU during
+KVM_GET_MP_STATE.
+
+[*] See commit f240652b6032b ("x86/mpx: Remove MPX APIs").
+
+Fixes: f775b13eedee2 ("x86,kvm: move qemu/guest FPU switching out to vcpu_run")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -8235,6 +8235,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(stru
+                                   struct kvm_mp_state *mp_state)
+ {
+       vcpu_load(vcpu);
++      if (kvm_mpx_supported())
++              kvm_load_guest_fpu(vcpu);
+       kvm_apic_accept_events(vcpu);
+       if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
+@@ -8243,6 +8245,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(stru
+       else
+               mp_state->mp_state = vcpu->arch.mp_state;
++      if (kvm_mpx_supported())
++              kvm_put_guest_fpu(vcpu);
+       vcpu_put(vcpu);
+       return 0;
+ }
diff --git a/queue-4.19/kvm-x86-protect-dr-based-index-computations-from-spectre-v1-l1tf-attacks.patch b/queue-4.19/kvm-x86-protect-dr-based-index-computations-from-spectre-v1-l1tf-attacks.patch
new file mode 100644 (file)
index 0000000..77c4645
--- /dev/null
@@ -0,0 +1,57 @@
+From ea740059ecb37807ba47b84b33d1447435a8d868 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:52 -0800
+Subject: KVM: x86: Protect DR-based index computations from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit ea740059ecb37807ba47b84b33d1447435a8d868 upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in __kvm_set_dr() and
+kvm_get_dr().
+Both kvm_get_dr() and kvm_set_dr() (a wrapper of __kvm_set_dr()) are
+exported symbols so KVM should treat them conservatively from a security
+perspective.
+
+Fixes: 020df0794f57 ("KVM: move DR register access handling into generic code")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c |    8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -961,9 +961,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu
+ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+ {
++      size_t size = ARRAY_SIZE(vcpu->arch.db);
++
+       switch (dr) {
+       case 0 ... 3:
+-              vcpu->arch.db[dr] = val;
++              vcpu->arch.db[array_index_nospec(dr, size)] = val;
+               if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+                       vcpu->arch.eff_db[dr] = val;
+               break;
+@@ -1000,9 +1002,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr);
+ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+ {
++      size_t size = ARRAY_SIZE(vcpu->arch.db);
++
+       switch (dr) {
+       case 0 ... 3:
+-              *val = vcpu->arch.db[dr];
++              *val = vcpu->arch.db[array_index_nospec(dr, size)];
+               break;
+       case 4:
+               /* fall through */
diff --git a/queue-4.19/kvm-x86-protect-ioapic_read_indirect-from-spectre-v1-l1tf-attacks.patch b/queue-4.19/kvm-x86-protect-ioapic_read_indirect-from-spectre-v1-l1tf-attacks.patch
new file mode 100644 (file)
index 0000000..9373c0d
--- /dev/null
@@ -0,0 +1,58 @@
+From 8c86405f606ca8508b8d9280680166ca26723695 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:44 -0800
+Subject: KVM: x86: Protect ioapic_read_indirect() from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 8c86405f606ca8508b8d9280680166ca26723695 upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in ioapic_read_indirect().
+This function contains index computations based on the
+(attacker-controlled) IOREGSEL register.
+
+Fixes: a2c118bfab8b ("KVM: Fix bounds checking in ioapic indirect register reads (CVE-2013-1798)")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/ioapic.c |   14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/kvm/ioapic.c
++++ b/arch/x86/kvm/ioapic.c
+@@ -36,6 +36,7 @@
+ #include <linux/io.h>
+ #include <linux/slab.h>
+ #include <linux/export.h>
++#include <linux/nospec.h>
+ #include <asm/processor.h>
+ #include <asm/page.h>
+ #include <asm/current.h>
+@@ -73,13 +74,14 @@ static unsigned long ioapic_read_indirec
+       default:
+               {
+                       u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
+-                      u64 redir_content;
++                      u64 redir_content = ~0ULL;
+-                      if (redir_index < IOAPIC_NUM_PINS)
+-                              redir_content =
+-                                      ioapic->redirtbl[redir_index].bits;
+-                      else
+-                              redir_content = ~0ULL;
++                      if (redir_index < IOAPIC_NUM_PINS) {
++                              u32 index = array_index_nospec(
++                                      redir_index, IOAPIC_NUM_PINS);
++
++                              redir_content = ioapic->redirtbl[index].bits;
++                      }
+                       result = (ioapic->ioregsel & 0x1) ?
+                           (redir_content >> 32) & 0xffffffff :
diff --git a/queue-4.19/kvm-x86-protect-ioapic_write_indirect-from-spectre-v1-l1tf-attacks.patch b/queue-4.19/kvm-x86-protect-ioapic_write_indirect-from-spectre-v1-l1tf-attacks.patch
new file mode 100644 (file)
index 0000000..8169f68
--- /dev/null
@@ -0,0 +1,40 @@
+From 670564559ca35b439c8d8861fc399451ddf95137 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:45 -0800
+Subject: KVM: x86: Protect ioapic_write_indirect() from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 670564559ca35b439c8d8861fc399451ddf95137 upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in ioapic_write_indirect().
+This function contains index computations based on the
+(attacker-controlled) IOREGSEL register.
+
+This patch depends on patch
+"KVM: x86: Protect ioapic_read_indirect() from Spectre-v1/L1TF attacks".
+
+Fixes: 70f93dae32ac ("KVM: Use temporary variable to shorten lines.")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/ioapic.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/kvm/ioapic.c
++++ b/arch/x86/kvm/ioapic.c
+@@ -297,6 +297,7 @@ static void ioapic_write_indirect(struct
+               ioapic_debug("change redir index %x val %x\n", index, val);
+               if (index >= IOAPIC_NUM_PINS)
+                       return;
++              index = array_index_nospec(index, IOAPIC_NUM_PINS);
+               e = &ioapic->redirtbl[index];
+               mask_before = e->fields.mask;
+               /* Preserve read-only fields */
diff --git a/queue-4.19/kvm-x86-protect-kvm_hv_msr__crash_data-from-spectre-v1-l1tf-attacks.patch b/queue-4.19/kvm-x86-protect-kvm_hv_msr__crash_data-from-spectre-v1-l1tf-attacks.patch
new file mode 100644 (file)
index 0000000..fe82166
--- /dev/null
@@ -0,0 +1,59 @@
+From 8618793750071d66028584a83ed0b4fa7eb4f607 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:42 -0800
+Subject: KVM: x86: Protect kvm_hv_msr_[get|set]_crash_data() from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 8618793750071d66028584a83ed0b4fa7eb4f607 upstream.
+
+This fixes Spectre-v1/L1TF vulnerabilities in kvm_hv_msr_get_crash_data()
+and kvm_hv_msr_set_crash_data().
+These functions contain index computations that use the
+(attacker-controlled) MSR number.
+
+Fixes: e7d9513b60e8 ("kvm/x86: added hyper-v crash msrs into kvm hyperv context")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/hyperv.c |   10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kvm/hyperv.c
++++ b/arch/x86/kvm/hyperv.c
+@@ -792,11 +792,12 @@ static int kvm_hv_msr_get_crash_data(str
+                                    u32 index, u64 *pdata)
+ {
+       struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
++      size_t size = ARRAY_SIZE(hv->hv_crash_param);
+-      if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
++      if (WARN_ON_ONCE(index >= size))
+               return -EINVAL;
+-      *pdata = hv->hv_crash_param[index];
++      *pdata = hv->hv_crash_param[array_index_nospec(index, size)];
+       return 0;
+ }
+@@ -835,11 +836,12 @@ static int kvm_hv_msr_set_crash_data(str
+                                    u32 index, u64 data)
+ {
+       struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
++      size_t size = ARRAY_SIZE(hv->hv_crash_param);
+-      if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
++      if (WARN_ON_ONCE(index >= size))
+               return -EINVAL;
+-      hv->hv_crash_param[index] = data;
++      hv->hv_crash_param[array_index_nospec(index, size)] = data;
+       return 0;
+ }
diff --git a/queue-4.19/kvm-x86-protect-kvm_lapic_reg_write-from-spectre-v1-l1tf-attacks.patch b/queue-4.19/kvm-x86-protect-kvm_lapic_reg_write-from-spectre-v1-l1tf-attacks.patch
new file mode 100644 (file)
index 0000000..d328dd9
--- /dev/null
@@ -0,0 +1,54 @@
+From 4bf79cb089f6b1c6c632492c0271054ce52ad766 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:46 -0800
+Subject: KVM: x86: Protect kvm_lapic_reg_write() from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 4bf79cb089f6b1c6c632492c0271054ce52ad766 upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in kvm_lapic_reg_write().
+This function contains index computations based on the
+(attacker-controlled) MSR number.
+
+Fixes: 0105d1a52640 ("KVM: x2apic interface to lapic")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/lapic.c |   13 +++++++++----
+ 1 file changed, 9 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -1862,15 +1862,20 @@ int kvm_lapic_reg_write(struct kvm_lapic
+       case APIC_LVTTHMR:
+       case APIC_LVTPC:
+       case APIC_LVT1:
+-      case APIC_LVTERR:
++      case APIC_LVTERR: {
+               /* TODO: Check vector */
++              size_t size;
++              u32 index;
++
+               if (!kvm_apic_sw_enabled(apic))
+                       val |= APIC_LVT_MASKED;
+-
+-              val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
++              size = ARRAY_SIZE(apic_lvt_mask);
++              index = array_index_nospec(
++                              (reg - APIC_LVTT) >> 4, size);
++              val &= apic_lvt_mask[index];
+               kvm_lapic_set_reg(apic, reg, val);
+-
+               break;
++      }
+       case APIC_LVTT:
+               if (!kvm_apic_sw_enabled(apic))
diff --git a/queue-4.19/kvm-x86-protect-msr-based-index-computations-from-spectre-v1-l1tf-attacks-in-x86.c.patch b/queue-4.19/kvm-x86-protect-msr-based-index-computations-from-spectre-v1-l1tf-attacks-in-x86.c.patch
new file mode 100644 (file)
index 0000000..f8d2c6c
--- /dev/null
@@ -0,0 +1,54 @@
+From 6ec4c5eee1750d5d17951c4e1960d953376a0dda Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:49 -0800
+Subject: KVM: x86: Protect MSR-based index computations from Spectre-v1/L1TF attacks in x86.c
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 6ec4c5eee1750d5d17951c4e1960d953376a0dda upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in set_msr_mce() and
+get_msr_mce().
+Both functions contain index computations based on the
+(attacker-controlled) MSR number.
+
+Fixes: 890ca9aefa78 ("KVM: Add MCE support")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c |   10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -2273,7 +2273,10 @@ static int set_msr_mce(struct kvm_vcpu *
+       default:
+               if (msr >= MSR_IA32_MC0_CTL &&
+                   msr < MSR_IA32_MCx_CTL(bank_num)) {
+-                      u32 offset = msr - MSR_IA32_MC0_CTL;
++                      u32 offset = array_index_nospec(
++                              msr - MSR_IA32_MC0_CTL,
++                              MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
++
+                       /* only 0 or all 1s can be written to IA32_MCi_CTL
+                        * some Linux kernels though clear bit 10 in bank 4 to
+                        * workaround a BIOS/GART TBL issue on AMD K8s, ignore
+@@ -2685,7 +2688,10 @@ static int get_msr_mce(struct kvm_vcpu *
+       default:
+               if (msr >= MSR_IA32_MC0_CTL &&
+                   msr < MSR_IA32_MCx_CTL(bank_num)) {
+-                      u32 offset = msr - MSR_IA32_MC0_CTL;
++                      u32 offset = array_index_nospec(
++                              msr - MSR_IA32_MC0_CTL,
++                              MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
++
+                       data = vcpu->arch.mce_banks[offset];
+                       break;
+               }
diff --git a/queue-4.19/kvm-x86-protect-msr-based-index-computations-in-fixed_msr_to_seg_unit-from-spectre-v1-l1tf-attacks.patch b/queue-4.19/kvm-x86-protect-msr-based-index-computations-in-fixed_msr_to_seg_unit-from-spectre-v1-l1tf-attacks.patch
new file mode 100644 (file)
index 0000000..c756368
--- /dev/null
@@ -0,0 +1,47 @@
+From 25a5edea71b7c154b6a0b8cec14c711cafa31d26 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:47 -0800
+Subject: KVM: x86: Protect MSR-based index computations in fixed_msr_to_seg_unit() from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 25a5edea71b7c154b6a0b8cec14c711cafa31d26 upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in fixed_msr_to_seg_unit().
+This function contains index computations based on the
+(attacker-controlled) MSR number.
+
+Fixes: de9aef5e1ad6 ("KVM: MTRR: introduce fixed_mtrr_segment table")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/mtrr.c |    8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/mtrr.c
++++ b/arch/x86/kvm/mtrr.c
+@@ -194,11 +194,15 @@ static bool fixed_msr_to_seg_unit(u32 ms
+               break;
+       case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
+               *seg = 1;
+-              *unit = msr - MSR_MTRRfix16K_80000;
++              *unit = array_index_nospec(
++                      msr - MSR_MTRRfix16K_80000,
++                      MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
+               break;
+       case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
+               *seg = 2;
+-              *unit = msr - MSR_MTRRfix4K_C0000;
++              *unit = array_index_nospec(
++                      msr - MSR_MTRRfix4K_C0000,
++                      MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
+               break;
+       default:
+               return false;
diff --git a/queue-4.19/kvm-x86-protect-msr-based-index-computations-in-pmu.h-from-spectre-v1-l1tf-attacks.patch b/queue-4.19/kvm-x86-protect-msr-based-index-computations-in-pmu.h-from-spectre-v1-l1tf-attacks.patch
new file mode 100644 (file)
index 0000000..b477bca
--- /dev/null
@@ -0,0 +1,69 @@
+From 13c5183a4e643cc2b03a22d0e582c8e17bb7457d Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:48 -0800
+Subject: KVM: x86: Protect MSR-based index computations in pmu.h from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 13c5183a4e643cc2b03a22d0e582c8e17bb7457d upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in the get_gp_pmc() and
+get_fixed_pmc() functions.
+They both contain index computations based on the (attacker-controlled)
+MSR number.
+
+Fixes: 25462f7f5295 ("KVM: x86/vPMU: Define kvm_pmu_ops to support vPMU function dispatch")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/pmu.h |   18 ++++++++++++++----
+ 1 file changed, 14 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kvm/pmu.h
++++ b/arch/x86/kvm/pmu.h
+@@ -2,6 +2,8 @@
+ #ifndef __KVM_X86_PMU_H
+ #define __KVM_X86_PMU_H
++#include <linux/nospec.h>
++
+ #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
+ #define pmu_to_vcpu(pmu)  (container_of((pmu), struct kvm_vcpu, arch.pmu))
+ #define pmc_to_pmu(pmc)   (&(pmc)->vcpu->arch.pmu)
+@@ -86,8 +88,12 @@ static inline bool pmc_is_enabled(struct
+ static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
+                                        u32 base)
+ {
+-      if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
+-              return &pmu->gp_counters[msr - base];
++      if (msr >= base && msr < base + pmu->nr_arch_gp_counters) {
++              u32 index = array_index_nospec(msr - base,
++                                             pmu->nr_arch_gp_counters);
++
++              return &pmu->gp_counters[index];
++      }
+       return NULL;
+ }
+@@ -97,8 +103,12 @@ static inline struct kvm_pmc *get_fixed_
+ {
+       int base = MSR_CORE_PERF_FIXED_CTR0;
+-      if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
+-              return &pmu->fixed_counters[msr - base];
++      if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) {
++              u32 index = array_index_nospec(msr - base,
++                                             pmu->nr_arch_fixed_counters);
++
++              return &pmu->fixed_counters[index];
++      }
+       return NULL;
+ }
diff --git a/queue-4.19/kvm-x86-protect-pmu_intel.c-from-spectre-v1-l1tf-attacks.patch b/queue-4.19/kvm-x86-protect-pmu_intel.c-from-spectre-v1-l1tf-attacks.patch
new file mode 100644 (file)
index 0000000..fec4a1c
--- /dev/null
@@ -0,0 +1,76 @@
+From 66061740f1a487f4ed54fde75e724709f805da53 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:53 -0800
+Subject: KVM: x86: Protect pmu_intel.c from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 66061740f1a487f4ed54fde75e724709f805da53 upstream.
+
+This fixes Spectre-v1/L1TF vulnerabilities in intel_find_fixed_event()
+and intel_rdpmc_ecx_to_pmc().
+kvm_rdpmc() (ancestor of intel_find_fixed_event()) and
+reprogram_fixed_counter() (ancestor of intel_rdpmc_ecx_to_pmc()) are
+exported symbols so KVM should treat them conservatively from a security
+perspective.
+
+Fixes: 25462f7f5295 ("KVM: x86/vPMU: Define kvm_pmu_ops to support vPMU function dispatch")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/pmu_intel.c |   24 ++++++++++++++++--------
+ 1 file changed, 16 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/kvm/pmu_intel.c
++++ b/arch/x86/kvm/pmu_intel.c
+@@ -87,10 +87,14 @@ static unsigned intel_find_arch_event(st
+ static unsigned intel_find_fixed_event(int idx)
+ {
+-      if (idx >= ARRAY_SIZE(fixed_pmc_events))
++      u32 event;
++      size_t size = ARRAY_SIZE(fixed_pmc_events);
++
++      if (idx >= size)
+               return PERF_COUNT_HW_MAX;
+-      return intel_arch_events[fixed_pmc_events[idx]].event_type;
++      event = fixed_pmc_events[array_index_nospec(idx, size)];
++      return intel_arch_events[event].event_type;
+ }
+ /* check if a PMC is enabled by comparing it with globl_ctrl bits. */
+@@ -131,16 +135,20 @@ static struct kvm_pmc *intel_msr_idx_to_
+       struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+       bool fixed = idx & (1u << 30);
+       struct kvm_pmc *counters;
++      unsigned int num_counters;
+       idx &= ~(3u << 30);
+-      if (!fixed && idx >= pmu->nr_arch_gp_counters)
+-              return NULL;
+-      if (fixed && idx >= pmu->nr_arch_fixed_counters)
++      if (fixed) {
++              counters = pmu->fixed_counters;
++              num_counters = pmu->nr_arch_fixed_counters;
++      } else {
++              counters = pmu->gp_counters;
++              num_counters = pmu->nr_arch_gp_counters;
++      }
++      if (idx >= num_counters)
+               return NULL;
+-      counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
+       *mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP];
+-
+-      return &counters[idx];
++      return &counters[array_index_nospec(idx, num_counters)];
+ }
+ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
diff --git a/queue-4.19/kvm-x86-protect-x86_decode_insn-from-spectre-v1-l1tf-attacks.patch b/queue-4.19/kvm-x86-protect-x86_decode_insn-from-spectre-v1-l1tf-attacks.patch
new file mode 100644 (file)
index 0000000..0411c76
--- /dev/null
@@ -0,0 +1,48 @@
+From 3c9053a2cae7ba2ba73766a34cea41baa70f57f7 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:41 -0800
+Subject: KVM: x86: Protect x86_decode_insn from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 3c9053a2cae7ba2ba73766a34cea41baa70f57f7 upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in x86_decode_insn().
+kvm_emulate_instruction() (an ancestor of x86_decode_insn()) is an exported
+symbol, so KVM should treat it conservatively from a security perspective.
+
+Fixes: 045a282ca415 ("KVM: emulator: implement fninit, fnstsw, fnstcw")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/emulate.c |   11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -5269,10 +5269,15 @@ done_prefixes:
+                       }
+                       break;
+               case Escape:
+-                      if (ctxt->modrm > 0xbf)
+-                              opcode = opcode.u.esc->high[ctxt->modrm - 0xc0];
+-                      else
++                      if (ctxt->modrm > 0xbf) {
++                              size_t size = ARRAY_SIZE(opcode.u.esc->high);
++                              u32 index = array_index_nospec(
++                                      ctxt->modrm - 0xc0, size);
++
++                              opcode = opcode.u.esc->high[index];
++                      } else {
+                               opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
++                      }
+                       break;
+               case InstrDual:
+                       if ((ctxt->modrm >> 6) == 3)
diff --git a/queue-4.19/kvm-x86-refactor-picdev_write-to-prevent-spectre-v1-l1tf-attacks.patch b/queue-4.19/kvm-x86-refactor-picdev_write-to-prevent-spectre-v1-l1tf-attacks.patch
new file mode 100644 (file)
index 0000000..eb7e20d
--- /dev/null
@@ -0,0 +1,45 @@
+From 14e32321f3606e4b0970200b6e5e47ee6f1e6410 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:43 -0800
+Subject: KVM: x86: Refactor picdev_write() to prevent Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 14e32321f3606e4b0970200b6e5e47ee6f1e6410 upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in picdev_write().
+It replaces index computations based on the (attacked-controlled) port
+number with constants through a minor refactoring.
+
+Fixes: 85f455f7ddbe ("KVM: Add support for in-kernel PIC emulation")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/i8259.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/i8259.c
++++ b/arch/x86/kvm/i8259.c
+@@ -460,10 +460,14 @@ static int picdev_write(struct kvm_pic *
+       switch (addr) {
+       case 0x20:
+       case 0x21:
++              pic_lock(s);
++              pic_ioport_write(&s->pics[0], addr, data);
++              pic_unlock(s);
++              break;
+       case 0xa0:
+       case 0xa1:
+               pic_lock(s);
+-              pic_ioport_write(&s->pics[addr >> 7], addr, data);
++              pic_ioport_write(&s->pics[1], addr, data);
+               pic_unlock(s);
+               break;
+       case 0x4d0:
diff --git a/queue-4.19/kvm-x86-refactor-prefix-decoding-to-prevent-spectre-v1-l1tf-attacks.patch b/queue-4.19/kvm-x86-refactor-prefix-decoding-to-prevent-spectre-v1-l1tf-attacks.patch
new file mode 100644 (file)
index 0000000..528a8c4
--- /dev/null
@@ -0,0 +1,57 @@
+From 125ffc5e0a56a3eded608dc51e09d5ebf72cf652 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:50 -0800
+Subject: KVM: x86: Refactor prefix decoding to prevent Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 125ffc5e0a56a3eded608dc51e09d5ebf72cf652 upstream.
+
+This fixes Spectre-v1/L1TF vulnerabilities in
+vmx_read_guest_seg_selector(), vmx_read_guest_seg_base(),
+vmx_read_guest_seg_limit() and vmx_read_guest_seg_ar().  When
+invoked from emulation, these functions contain index computations
+based on the (attacker-influenced) segment value.  Using constants
+prevents the attack.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/emulate.c |   16 ++++++++++++++--
+ 1 file changed, 14 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -5164,16 +5164,28 @@ int x86_decode_insn(struct x86_emulate_c
+                               ctxt->ad_bytes = def_ad_bytes ^ 6;
+                       break;
+               case 0x26:      /* ES override */
++                      has_seg_override = true;
++                      ctxt->seg_override = VCPU_SREG_ES;
++                      break;
+               case 0x2e:      /* CS override */
++                      has_seg_override = true;
++                      ctxt->seg_override = VCPU_SREG_CS;
++                      break;
+               case 0x36:      /* SS override */
++                      has_seg_override = true;
++                      ctxt->seg_override = VCPU_SREG_SS;
++                      break;
+               case 0x3e:      /* DS override */
+                       has_seg_override = true;
+-                      ctxt->seg_override = (ctxt->b >> 3) & 3;
++                      ctxt->seg_override = VCPU_SREG_DS;
+                       break;
+               case 0x64:      /* FS override */
++                      has_seg_override = true;
++                      ctxt->seg_override = VCPU_SREG_FS;
++                      break;
+               case 0x65:      /* GS override */
+                       has_seg_override = true;
+-                      ctxt->seg_override = ctxt->b & 7;
++                      ctxt->seg_override = VCPU_SREG_GS;
+                       break;
+               case 0x40 ... 0x4f: /* REX */
+                       if (mode != X86EMUL_MODE_PROT64)
diff --git a/queue-4.19/media-rc-ensure-lirc-is-initialized-before-registering-input-device.patch b/queue-4.19/media-rc-ensure-lirc-is-initialized-before-registering-input-device.patch
new file mode 100644 (file)
index 0000000..bba1252
--- /dev/null
@@ -0,0 +1,145 @@
+From 080d89f522e2baddb4fbbd1af4b67b5f92537ef8 Mon Sep 17 00:00:00 2001
+From: Sean Young <sean@mess.org>
+Date: Thu, 21 Nov 2019 11:10:47 +0100
+Subject: media: rc: ensure lirc is initialized before registering input device
+
+From: Sean Young <sean@mess.org>
+
+commit 080d89f522e2baddb4fbbd1af4b67b5f92537ef8 upstream.
+
+Once rc_open is called on the input device, lirc events can be delivered.
+Ensure lirc is ready to do so else we might get this:
+
+Registered IR keymap rc-hauppauge
+rc rc0: Hauppauge WinTV PVR-350 as
+/devices/pci0000:00/0000:00:1e.0/0000:04:00.0/i2c-0/0-0018/rc/rc0
+input: Hauppauge WinTV PVR-350 as
+/devices/pci0000:00/0000:00:1e.0/0000:04:00.0/i2c-0/0-0018/rc/rc0/input9
+BUG: kernel NULL pointer dereference, address: 0000000000000038
+PGD 0 P4D 0
+Oops: 0000 [#1] SMP PTI
+CPU: 1 PID: 17 Comm: kworker/1:0 Not tainted 5.3.11-300.fc31.x86_64 #1
+Hardware name:  /DG43NB, BIOS NBG4310H.86A.0096.2009.0903.1845 09/03/2009
+Workqueue: events ir_work [ir_kbd_i2c]
+RIP: 0010:ir_lirc_scancode_event+0x3d/0xb0
+Code: a6 b4 07 00 00 49 81 c6 b8 07 00 00 55 53 e8 ba a7 9d ff 4c 89
+e7 49 89 45 00 e8 5e 7a 25 00 49 8b 1e 48 89 c5 4c 39 f3 74 58 <8b> 43
+38 8b 53 40 89 c1 2b 4b 3c 39 ca 72 41 21 d0 49 8b 7d 00 49
+RSP: 0018:ffffaae2000b3d88 EFLAGS: 00010017
+RAX: 0000000000000002 RBX: 0000000000000000 RCX: 0000000000000019
+RDX: 0000000000000001 RSI: 006e801b1f26ce6a RDI: ffff9e39797c37b4
+RBP: 0000000000000002 R08: 0000000000000001 R09: 0000000000000001
+R10: 0000000000000001 R11: 0000000000000001 R12: ffff9e39797c37b4
+R13: ffffaae2000b3db8 R14: ffff9e39797c37b8 R15: ffff9e39797c33d8
+FS:  0000000000000000(0000) GS:ffff9e397b680000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 0000000000000038 CR3: 0000000035844000 CR4: 00000000000006e0
+Call Trace:
+ir_do_keydown+0x8e/0x2b0
+rc_keydown+0x52/0xc0
+ir_work+0xb8/0x130 [ir_kbd_i2c]
+process_one_work+0x19d/0x340
+worker_thread+0x50/0x3b0
+kthread+0xfb/0x130
+? process_one_work+0x340/0x340
+? kthread_park+0x80/0x80
+ret_from_fork+0x35/0x40
+Modules linked in: rc_hauppauge tuner msp3400 saa7127 saa7115 ivtv(+)
+tveeprom cx2341x v4l2_common videodev mc i2c_algo_bit ir_kbd_i2c
+ip_tables firewire_ohci e1000e serio_raw firewire_core ata_generic
+crc_itu_t pata_acpi pata_jmicron fuse
+CR2: 0000000000000038
+---[ end trace c67c2697a99fa74b ]---
+RIP: 0010:ir_lirc_scancode_event+0x3d/0xb0
+Code: a6 b4 07 00 00 49 81 c6 b8 07 00 00 55 53 e8 ba a7 9d ff 4c 89
+e7 49 89 45 00 e8 5e 7a 25 00 49 8b 1e 48 89 c5 4c 39 f3 74 58 <8b> 43
+38 8b 53 40 89 c1 2b 4b 3c 39 ca 72 41 21 d0 49 8b 7d 00 49
+RSP: 0018:ffffaae2000b3d88 EFLAGS: 00010017
+RAX: 0000000000000002 RBX: 0000000000000000 RCX: 0000000000000019
+RDX: 0000000000000001 RSI: 006e801b1f26ce6a RDI: ffff9e39797c37b4
+RBP: 0000000000000002 R08: 0000000000000001 R09: 0000000000000001
+R10: 0000000000000001 R11: 0000000000000001 R12: ffff9e39797c37b4
+R13: ffffaae2000b3db8 R14: ffff9e39797c37b8 R15: ffff9e39797c33d8
+FS:  0000000000000000(0000) GS:ffff9e397b680000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 0000000000000038 CR3: 0000000035844000 CR4: 00000000000006e0
+rc rc0: lirc_dev: driver ir_kbd_i2c registered at minor = 0, scancode
+receiver, no transmitter
+tuner-simple 0-0061: creating new instance
+tuner-simple 0-0061: type set to 2 (Philips NTSC (FI1236,FM1236 and
+compatibles))
+ivtv0: Registered device video0 for encoder MPG (4096 kB)
+ivtv0: Registered device video32 for encoder YUV (2048 kB)
+ivtv0: Registered device vbi0 for encoder VBI (1024 kB)
+ivtv0: Registered device video24 for encoder PCM (320 kB)
+ivtv0: Registered device radio0 for encoder radio
+ivtv0: Registered device video16 for decoder MPG (1024 kB)
+ivtv0: Registered device vbi8 for decoder VBI (64 kB)
+ivtv0: Registered device vbi16 for decoder VOUT
+ivtv0: Registered device video48 for decoder YUV (1024 kB)
+
+Cc: stable@vger.kernel.org
+Tested-by: Nick French <nickfrench@gmail.com>
+Reported-by: Nick French <nickfrench@gmail.com>
+Signed-off-by: Sean Young <sean@mess.org>
+Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/media/rc/rc-main.c |   27 ++++++++++++++++-----------
+ 1 file changed, 16 insertions(+), 11 deletions(-)
+
+--- a/drivers/media/rc/rc-main.c
++++ b/drivers/media/rc/rc-main.c
+@@ -1874,23 +1874,28 @@ int rc_register_device(struct rc_dev *de
+       dev->registered = true;
+-      if (dev->driver_type != RC_DRIVER_IR_RAW_TX) {
+-              rc = rc_setup_rx_device(dev);
+-              if (rc)
+-                      goto out_dev;
+-      }
+-
+-      /* Ensure that the lirc kfifo is setup before we start the thread */
++      /*
++       * once the the input device is registered in rc_setup_rx_device,
++       * userspace can open the input device and rc_open() will be called
++       * as a result. This results in driver code being allowed to submit
++       * keycodes with rc_keydown, so lirc must be registered first.
++       */
+       if (dev->allowed_protocols != RC_PROTO_BIT_CEC) {
+               rc = ir_lirc_register(dev);
+               if (rc < 0)
+-                      goto out_rx;
++                      goto out_dev;
++      }
++
++      if (dev->driver_type != RC_DRIVER_IR_RAW_TX) {
++              rc = rc_setup_rx_device(dev);
++              if (rc)
++                      goto out_lirc;
+       }
+       if (dev->driver_type == RC_DRIVER_IR_RAW) {
+               rc = ir_raw_event_register(dev);
+               if (rc < 0)
+-                      goto out_lirc;
++                      goto out_rx;
+       }
+       dev_dbg(&dev->dev, "Registered rc%u (driver: %s)\n", dev->minor,
+@@ -1898,11 +1903,11 @@ int rc_register_device(struct rc_dev *de
+       return 0;
++out_rx:
++      rc_free_rx_device(dev);
+ out_lirc:
+       if (dev->allowed_protocols != RC_PROTO_BIT_CEC)
+               ir_lirc_unregister(dev);
+-out_rx:
+-      rc_free_rx_device(dev);
+ out_dev:
+       device_del(&dev->dev);
+ out_rx_free:
diff --git a/queue-4.19/mwifiex-fix-unbalanced-locking-in-mwifiex_process_country_ie.patch b/queue-4.19/mwifiex-fix-unbalanced-locking-in-mwifiex_process_country_ie.patch
new file mode 100644 (file)
index 0000000..e740b6e
--- /dev/null
@@ -0,0 +1,35 @@
+From 65b1aae0d9d5962faccc06bdb8e91a2a0b09451c Mon Sep 17 00:00:00 2001
+From: Brian Norris <briannorris@chromium.org>
+Date: Mon, 6 Jan 2020 14:42:12 -0800
+Subject: mwifiex: fix unbalanced locking in mwifiex_process_country_ie()
+
+From: Brian Norris <briannorris@chromium.org>
+
+commit 65b1aae0d9d5962faccc06bdb8e91a2a0b09451c upstream.
+
+We called rcu_read_lock(), so we need to call rcu_read_unlock() before
+we return.
+
+Fixes: 3d94a4a8373b ("mwifiex: fix possible heap overflow in mwifiex_process_country_ie()")
+Cc: stable@vger.kernel.org
+Cc: huangwen <huangwenabc@gmail.com>
+Cc: Ganapathi Bhat <ganapathi.bhat@nxp.com>
+Signed-off-by: Brian Norris <briannorris@chromium.org>
+Acked-by: Ganapathi Bhat <ganapathi.bhat@nxp.com>
+Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/net/wireless/marvell/mwifiex/sta_ioctl.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c
++++ b/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c
+@@ -232,6 +232,7 @@ static int mwifiex_process_country_ie(st
+       if (country_ie_len >
+           (IEEE80211_COUNTRY_STRING_LEN + MWIFIEX_MAX_TRIPLET_802_11D)) {
++              rcu_read_unlock();
+               mwifiex_dbg(priv->adapter, ERROR,
+                           "11D: country_ie_len overflow!, deauth AP\n");
+               return -EINVAL;
diff --git a/queue-4.19/nfs-directory-page-cache-pages-need-to-be-locked-when-read.patch b/queue-4.19/nfs-directory-page-cache-pages-need-to-be-locked-when-read.patch
new file mode 100644 (file)
index 0000000..523e26a
--- /dev/null
@@ -0,0 +1,112 @@
+From 114de38225d9b300f027e2aec9afbb6e0def154b Mon Sep 17 00:00:00 2001
+From: Trond Myklebust <trondmy@gmail.com>
+Date: Sun, 2 Feb 2020 17:53:54 -0500
+Subject: NFS: Directory page cache pages need to be locked when read
+
+From: Trond Myklebust <trondmy@gmail.com>
+
+commit 114de38225d9b300f027e2aec9afbb6e0def154b upstream.
+
+When a NFS directory page cache page is removed from the page cache,
+its contents are freed through a call to nfs_readdir_clear_array().
+To prevent the removal of the page cache entry until after we've
+finished reading it, we must take the page lock.
+
+Fixes: 11de3b11e08c ("NFS: Fix a memory leak in nfs_readdir")
+Cc: stable@vger.kernel.org # v2.6.37+
+Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
+Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/nfs/dir.c |   30 +++++++++++++++++++-----------
+ 1 file changed, 19 insertions(+), 11 deletions(-)
+
+--- a/fs/nfs/dir.c
++++ b/fs/nfs/dir.c
+@@ -701,8 +701,6 @@ int nfs_readdir_filler(nfs_readdir_descr
+ static
+ void cache_page_release(nfs_readdir_descriptor_t *desc)
+ {
+-      if (!desc->page->mapping)
+-              nfs_readdir_clear_array(desc->page);
+       put_page(desc->page);
+       desc->page = NULL;
+ }
+@@ -716,19 +714,28 @@ struct page *get_cache_page(nfs_readdir_
+ /*
+  * Returns 0 if desc->dir_cookie was found on page desc->page_index
++ * and locks the page to prevent removal from the page cache.
+  */
+ static
+-int find_cache_page(nfs_readdir_descriptor_t *desc)
++int find_and_lock_cache_page(nfs_readdir_descriptor_t *desc)
+ {
+       int res;
+       desc->page = get_cache_page(desc);
+       if (IS_ERR(desc->page))
+               return PTR_ERR(desc->page);
+-
+-      res = nfs_readdir_search_array(desc);
++      res = lock_page_killable(desc->page);
+       if (res != 0)
+-              cache_page_release(desc);
++              goto error;
++      res = -EAGAIN;
++      if (desc->page->mapping != NULL) {
++              res = nfs_readdir_search_array(desc);
++              if (res == 0)
++                      return 0;
++      }
++      unlock_page(desc->page);
++error:
++      cache_page_release(desc);
+       return res;
+ }
+@@ -743,7 +750,7 @@ int readdir_search_pagecache(nfs_readdir
+               desc->last_cookie = 0;
+       }
+       do {
+-              res = find_cache_page(desc);
++              res = find_and_lock_cache_page(desc);
+       } while (res == -EAGAIN);
+       return res;
+ }
+@@ -782,7 +789,6 @@ int nfs_do_filldir(nfs_readdir_descripto
+               desc->eof = true;
+       kunmap(desc->page);
+-      cache_page_release(desc);
+       dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
+                       (unsigned long long)*desc->dir_cookie, res);
+       return res;
+@@ -828,13 +834,13 @@ int uncached_readdir(nfs_readdir_descrip
+       status = nfs_do_filldir(desc);
++ out_release:
++      nfs_readdir_clear_array(desc->page);
++      cache_page_release(desc);
+  out:
+       dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
+                       __func__, status);
+       return status;
+- out_release:
+-      cache_page_release(desc);
+-      goto out;
+ }
+ /* The file offset position represents the dirent entry number.  A
+@@ -899,6 +905,8 @@ static int nfs_readdir(struct file *file
+                       break;
+               res = nfs_do_filldir(desc);
++              unlock_page(desc->page);
++              cache_page_release(desc);
+               if (res < 0)
+                       break;
+       } while (!desc->eof);
diff --git a/queue-4.19/nfs-fix-memory-leaks-and-corruption-in-readdir.patch b/queue-4.19/nfs-fix-memory-leaks-and-corruption-in-readdir.patch
new file mode 100644 (file)
index 0000000..87c7753
--- /dev/null
@@ -0,0 +1,81 @@
+From 4b310319c6a8ce708f1033d57145e2aa027a883c Mon Sep 17 00:00:00 2001
+From: Trond Myklebust <trondmy@gmail.com>
+Date: Sun, 2 Feb 2020 17:53:53 -0500
+Subject: NFS: Fix memory leaks and corruption in readdir
+
+From: Trond Myklebust <trondmy@gmail.com>
+
+commit 4b310319c6a8ce708f1033d57145e2aa027a883c upstream.
+
+nfs_readdir_xdr_to_array() must not exit without having initialised
+the array, so that the page cache deletion routines can safely
+call nfs_readdir_clear_array().
+Furthermore, we should ensure that if we exit nfs_readdir_filler()
+with an error, we free up any page contents to prevent a leak
+if we try to fill the page again.
+
+Fixes: 11de3b11e08c ("NFS: Fix a memory leak in nfs_readdir")
+Cc: stable@vger.kernel.org # v2.6.37+
+Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
+Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/nfs/dir.c |   17 +++++++++++++++--
+ 1 file changed, 15 insertions(+), 2 deletions(-)
+
+--- a/fs/nfs/dir.c
++++ b/fs/nfs/dir.c
+@@ -162,6 +162,17 @@ typedef struct {
+       bool eof;
+ } nfs_readdir_descriptor_t;
++static
++void nfs_readdir_init_array(struct page *page)
++{
++      struct nfs_cache_array *array;
++
++      array = kmap_atomic(page);
++      memset(array, 0, sizeof(struct nfs_cache_array));
++      array->eof_index = -1;
++      kunmap_atomic(array);
++}
++
+ /*
+  * we are freeing strings created by nfs_add_to_readdir_array()
+  */
+@@ -174,6 +185,7 @@ void nfs_readdir_clear_array(struct page
+       array = kmap_atomic(page);
+       for (i = 0; i < array->size; i++)
+               kfree(array->array[i].string.name);
++      array->size = 0;
+       kunmap_atomic(array);
+ }
+@@ -610,6 +622,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir
+       int status = -ENOMEM;
+       unsigned int array_size = ARRAY_SIZE(pages);
++      nfs_readdir_init_array(page);
++
+       entry.prev_cookie = 0;
+       entry.cookie = desc->last_cookie;
+       entry.eof = 0;
+@@ -626,8 +640,6 @@ int nfs_readdir_xdr_to_array(nfs_readdir
+       }
+       array = kmap(page);
+-      memset(array, 0, sizeof(struct nfs_cache_array));
+-      array->eof_index = -1;
+       status = nfs_readdir_alloc_pages(pages, array_size);
+       if (status < 0)
+@@ -681,6 +693,7 @@ int nfs_readdir_filler(nfs_readdir_descr
+       unlock_page(page);
+       return 0;
+  error:
++      nfs_readdir_clear_array(page);
+       unlock_page(page);
+       return ret;
+ }
diff --git a/queue-4.19/scsi-qla2xxx-fix-unbound-nvme-response-length.patch b/queue-4.19/scsi-qla2xxx-fix-unbound-nvme-response-length.patch
new file mode 100644 (file)
index 0000000..1af8734
--- /dev/null
@@ -0,0 +1,78 @@
+From 00fe717ee1ea3c2979db4f94b1533c57aed8dea9 Mon Sep 17 00:00:00 2001
+From: Arun Easi <aeasi@marvell.com>
+Date: Thu, 23 Jan 2020 20:50:14 -0800
+Subject: scsi: qla2xxx: Fix unbound NVME response length
+
+From: Arun Easi <aeasi@marvell.com>
+
+commit 00fe717ee1ea3c2979db4f94b1533c57aed8dea9 upstream.
+
+On certain cases when response length is less than 32, NVME response data
+is supplied inline in IOCB. This is indicated by some combination of state
+flags. There was an instance when a high, and incorrect, response length
+was indicated causing driver to overrun buffers. Fix this by checking and
+limiting the response payload length.
+
+Fixes: 7401bc18d1ee3 ("scsi: qla2xxx: Add FC-NVMe command handling")
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20200124045014.23554-1-hmadhani@marvell.com
+Signed-off-by: Arun Easi <aeasi@marvell.com>
+Signed-off-by: Himanshu Madhani <hmadhani@marvell.com>
+Reviewed-by: Ewan D. Milne <emilne@redhat.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/scsi/qla2xxx/qla_dbg.c |    6 ------
+ drivers/scsi/qla2xxx/qla_dbg.h |    6 ++++++
+ drivers/scsi/qla2xxx/qla_isr.c |   12 ++++++++++++
+ 3 files changed, 18 insertions(+), 6 deletions(-)
+
+--- a/drivers/scsi/qla2xxx/qla_dbg.c
++++ b/drivers/scsi/qla2xxx/qla_dbg.c
+@@ -2520,12 +2520,6 @@ qla83xx_fw_dump_failed:
+ /*                         Driver Debug Functions.                          */
+ /****************************************************************************/
+-static inline int
+-ql_mask_match(uint32_t level)
+-{
+-      return (level & ql2xextended_error_logging) == level;
+-}
+-
+ /*
+  * This function is for formatting and logging debug information.
+  * It is to be used when vha is available. It formats the message
+--- a/drivers/scsi/qla2xxx/qla_dbg.h
++++ b/drivers/scsi/qla2xxx/qla_dbg.h
+@@ -374,3 +374,9 @@ extern int qla24xx_dump_ram(struct qla_h
+ extern void qla24xx_pause_risc(struct device_reg_24xx __iomem *,
+       struct qla_hw_data *);
+ extern int qla24xx_soft_reset(struct qla_hw_data *);
++
++static inline int
++ql_mask_match(uint level)
++{
++      return (level & ql2xextended_error_logging) == level;
++}
+--- a/drivers/scsi/qla2xxx/qla_isr.c
++++ b/drivers/scsi/qla2xxx/qla_isr.c
+@@ -1876,6 +1876,18 @@ static void qla24xx_nvme_iocb_entry(scsi
+               inbuf = (uint32_t *)&sts->nvme_ersp_data;
+               outbuf = (uint32_t *)fd->rspaddr;
+               iocb->u.nvme.rsp_pyld_len = le16_to_cpu(sts->nvme_rsp_pyld_len);
++              if (unlikely(iocb->u.nvme.rsp_pyld_len >
++                  sizeof(struct nvme_fc_ersp_iu))) {
++                      if (ql_mask_match(ql_dbg_io)) {
++                              WARN_ONCE(1, "Unexpected response payload length %u.\n",
++                                  iocb->u.nvme.rsp_pyld_len);
++                              ql_log(ql_log_warn, fcport->vha, 0x5100,
++                                  "Unexpected response payload length %u.\n",
++                                  iocb->u.nvme.rsp_pyld_len);
++                      }
++                      iocb->u.nvme.rsp_pyld_len =
++                          sizeof(struct nvme_fc_ersp_iu);
++              }
+               iter = iocb->u.nvme.rsp_pyld_len >> 2;
+               for (; iter; iter--)
+                       *outbuf++ = swab32(*inbuf++);
index b69acfbdac9439b4aa1b2997bddd785b1a757ab5..c6feaab650db22c71776df54734c0a2bc3bbef3c 100644 (file)
@@ -99,3 +99,39 @@ crypto-pcrypt-do-not-clear-may_sleep-flag-in-original-request.patch
 crypto-atmel-aes-fix-counter-overflow-in-ctr-mode.patch
 crypto-api-fix-race-condition-in-crypto_spawn_alg.patch
 crypto-picoxcell-adjust-the-position-of-tasklet_init-and-fix-missed-tasklet_kill.patch
+scsi-qla2xxx-fix-unbound-nvme-response-length.patch
+nfs-fix-memory-leaks-and-corruption-in-readdir.patch
+nfs-directory-page-cache-pages-need-to-be-locked-when-read.patch
+jbd2_seq_info_next-should-increase-position-index.patch
+btrfs-fix-missing-hole-after-hole-punching-and-fsync-when-using-no_holes.patch
+btrfs-set-trans-drity-in-btrfs_commit_transaction.patch
+btrfs-fix-race-between-adding-and-putting-tree-mod-seq-elements-and-nodes.patch
+arm-tegra-enable-pllp-bypass-during-tegra124-lp1.patch
+iwlwifi-don-t-throw-error-when-trying-to-remove-igtk.patch
+mwifiex-fix-unbalanced-locking-in-mwifiex_process_country_ie.patch
+sunrpc-expiry_time-should-be-seconds-not-timeval.patch
+gfs2-move-setting-current-backing_dev_info.patch
+gfs2-fix-o_sync-write-handling.patch
+drm-rect-avoid-division-by-zero.patch
+media-rc-ensure-lirc-is-initialized-before-registering-input-device.patch
+tools-kvm_stat-fix-kvm_exit-filter-name.patch
+xen-balloon-support-xend-based-toolstack-take-two.patch
+watchdog-fix-uaf-in-reboot-notifier-handling-in-watchdog-core-code.patch
+bcache-add-readahead-cache-policy-options-via-sysfs-interface.patch
+eventfd-track-eventfd_signal-recursion-depth.patch
+aio-prevent-potential-eventfd-recursion-on-poll.patch
+kvm-x86-refactor-picdev_write-to-prevent-spectre-v1-l1tf-attacks.patch
+kvm-x86-refactor-prefix-decoding-to-prevent-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-pmu_intel.c-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-dr-based-index-computations-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-kvm_lapic_reg_write-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-kvm_hv_msr__crash_data-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-ioapic_write_indirect-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-msr-based-index-computations-in-pmu.h-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-ioapic_read_indirect-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-msr-based-index-computations-from-spectre-v1-l1tf-attacks-in-x86.c.patch
+kvm-x86-protect-x86_decode_insn-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-msr-based-index-computations-in-fixed_msr_to_seg_unit-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-fix-potential-put_fpu-w-o-load_fpu-on-mpx-platform.patch
+kvm-ppc-book3s-hv-uninit-vcpu-if-vcore-creation-fails.patch
+kvm-ppc-book3s-pr-free-shared-page-if-mmu-initialization-fails.patch
diff --git a/queue-4.19/sunrpc-expiry_time-should-be-seconds-not-timeval.patch b/queue-4.19/sunrpc-expiry_time-should-be-seconds-not-timeval.patch
new file mode 100644 (file)
index 0000000..34b8f9e
--- /dev/null
@@ -0,0 +1,54 @@
+From 3d96208c30f84d6edf9ab4fac813306ac0d20c10 Mon Sep 17 00:00:00 2001
+From: Roberto Bergantinos Corpas <rbergant@redhat.com>
+Date: Tue, 4 Feb 2020 11:32:56 +0100
+Subject: sunrpc: expiry_time should be seconds not timeval
+
+From: Roberto Bergantinos Corpas <rbergant@redhat.com>
+
+commit 3d96208c30f84d6edf9ab4fac813306ac0d20c10 upstream.
+
+When upcalling gssproxy, cache_head.expiry_time is set as a
+timeval, not seconds since boot. As such, RPC cache expiry
+logic will not clean expired objects created under
+auth.rpcsec.context cache.
+
+This has proven to cause kernel memory leaks on field. Using
+64 bit variants of getboottime/timespec
+
+Expiration times have worked this way since 2010's c5b29f885afe "sunrpc:
+use seconds since boot in expiry cache".  The gssproxy code introduced
+in 2012 added gss_proxy_save_rsc and introduced the bug.  That's a while
+for this to lurk, but it required a bit of an extreme case to make it
+obvious.
+
+Signed-off-by: Roberto Bergantinos Corpas <rbergant@redhat.com>
+Cc: stable@vger.kernel.org
+Fixes: 030d794bf498 "SUNRPC: Use gssproxy upcall for server..."
+Tested-By: Frank Sorenson <sorenson@redhat.com>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/sunrpc/auth_gss/svcauth_gss.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/net/sunrpc/auth_gss/svcauth_gss.c
++++ b/net/sunrpc/auth_gss/svcauth_gss.c
+@@ -1224,6 +1224,7 @@ static int gss_proxy_save_rsc(struct cac
+               dprintk("RPC:       No creds found!\n");
+               goto out;
+       } else {
++              struct timespec64 boot;
+               /* steal creds */
+               rsci.cred = ud->creds;
+@@ -1244,6 +1245,9 @@ static int gss_proxy_save_rsc(struct cac
+                                               &expiry, GFP_KERNEL);
+               if (status)
+                       goto out;
++
++              getboottime64(&boot);
++              expiry -= boot.tv_sec;
+       }
+       rsci.h.expiry_time = expiry;
diff --git a/queue-4.19/tools-kvm_stat-fix-kvm_exit-filter-name.patch b/queue-4.19/tools-kvm_stat-fix-kvm_exit-filter-name.patch
new file mode 100644 (file)
index 0000000..27ac792
--- /dev/null
@@ -0,0 +1,73 @@
+From 5fcf3a55a62afb0760ccb6f391d62f20bce4a42f Mon Sep 17 00:00:00 2001
+From: Gavin Shan <gshan@redhat.com>
+Date: Tue, 10 Dec 2019 15:48:29 +1100
+Subject: tools/kvm_stat: Fix kvm_exit filter name
+
+From: Gavin Shan <gshan@redhat.com>
+
+commit 5fcf3a55a62afb0760ccb6f391d62f20bce4a42f upstream.
+
+The filter name is fixed to "exit_reason" for some kvm_exit events, no
+matter what architect we have. Actually, the filter name ("exit_reason")
+is only applicable to x86, meaning it's broken on other architects
+including aarch64.
+
+This fixes the issue by providing various kvm_exit filter names, depending
+on architect we're on. Afterwards, the variable filter name is picked and
+applied through ioctl(fd, SET_FILTER).
+
+Reported-by: Andrew Jones <drjones@redhat.com>
+Signed-off-by: Gavin Shan <gshan@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ tools/kvm/kvm_stat/kvm_stat |    8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/tools/kvm/kvm_stat/kvm_stat
++++ b/tools/kvm/kvm_stat/kvm_stat
+@@ -271,6 +271,7 @@ class ArchX86(Arch):
+     def __init__(self, exit_reasons):
+         self.sc_perf_evt_open = 298
+         self.ioctl_numbers = IOCTL_NUMBERS
++        self.exit_reason_field = 'exit_reason'
+         self.exit_reasons = exit_reasons
+     def debugfs_is_child(self, field):
+@@ -290,6 +291,7 @@ class ArchPPC(Arch):
+         # numbers depend on the wordsize.
+         char_ptr_size = ctypes.sizeof(ctypes.c_char_p)
+         self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16
++        self.exit_reason_field = 'exit_nr'
+         self.exit_reasons = {}
+     def debugfs_is_child(self, field):
+@@ -301,6 +303,7 @@ class ArchA64(Arch):
+     def __init__(self):
+         self.sc_perf_evt_open = 241
+         self.ioctl_numbers = IOCTL_NUMBERS
++        self.exit_reason_field = 'esr_ec'
+         self.exit_reasons = AARCH64_EXIT_REASONS
+     def debugfs_is_child(self, field):
+@@ -312,6 +315,7 @@ class ArchS390(Arch):
+     def __init__(self):
+         self.sc_perf_evt_open = 331
+         self.ioctl_numbers = IOCTL_NUMBERS
++        self.exit_reason_field = None
+         self.exit_reasons = None
+     def debugfs_is_child(self, field):
+@@ -542,8 +546,8 @@ class TracepointProvider(Provider):
+         """
+         filters = {}
+         filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS)
+-        if ARCH.exit_reasons:
+-            filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons)
++        if ARCH.exit_reason_field and ARCH.exit_reasons:
++            filters['kvm_exit'] = (ARCH.exit_reason_field, ARCH.exit_reasons)
+         return filters
+     def _get_available_fields(self):
diff --git a/queue-4.19/watchdog-fix-uaf-in-reboot-notifier-handling-in-watchdog-core-code.patch b/queue-4.19/watchdog-fix-uaf-in-reboot-notifier-handling-in-watchdog-core-code.patch
new file mode 100644 (file)
index 0000000..ca35261
--- /dev/null
@@ -0,0 +1,197 @@
+From 69503e585192fdd84b240f18a0873d20e18a2e0a Mon Sep 17 00:00:00 2001
+From: Vladis Dronov <vdronov@redhat.com>
+Date: Wed, 8 Jan 2020 13:53:47 +0100
+Subject: watchdog: fix UAF in reboot notifier handling in watchdog core code
+
+From: Vladis Dronov <vdronov@redhat.com>
+
+commit 69503e585192fdd84b240f18a0873d20e18a2e0a upstream.
+
+After the commit 44ea39420fc9 ("drivers/watchdog: make use of
+devm_register_reboot_notifier()") the struct notifier_block reboot_nb in
+the struct watchdog_device is removed from the reboot notifiers chain at
+the time watchdog's chardev is closed. But at least in i6300esb.c case
+reboot_nb is embedded in the struct esb_dev which can be freed on its
+device removal and before the chardev is closed, thus UAF at reboot:
+
+[    7.728581] esb_probe: esb_dev.watchdog_device ffff91316f91ab28
+ts# uname -r                            note the address ^^^
+5.5.0-rc5-ae6088-wdog
+ts# ./openwdog0 &
+[1] 696
+ts# opened /dev/watchdog0, sleeping 10s...
+ts# echo 1 > /sys/devices/pci0000\:00/0000\:00\:09.0/remove
+[  178.086079] devres:rel_nodes: dev ffff91317668a0b0 data ffff91316f91ab28
+           esb_dev.watchdog_device.reboot_nb memory is freed here ^^^
+ts# ...woken up
+[  181.459010] devres:rel_nodes: dev ffff913171781000 data ffff913174a1dae8
+[  181.460195] devm_unreg_reboot_notifier: res ffff913174a1dae8 nb ffff91316f91ab78
+                                     attempt to use memory already freed ^^^
+[  181.461063] devm_unreg_reboot_notifier: nb->call 6b6b6b6b6b6b6b6b
+[  181.461243] devm_unreg_reboot_notifier: nb->next 6b6b6b6b6b6b6b6b
+                freed memory is filled with a slub poison ^^^
+[1]+  Done                    ./openwdog0
+ts# reboot
+[  229.921862] systemd-shutdown[1]: Rebooting.
+[  229.939265] notifier_call_chain: nb ffffffff9c6c2f20 nb->next ffffffff9c6d50c0
+[  229.943080] notifier_call_chain: nb ffffffff9c6d50c0 nb->next 6b6b6b6b6b6b6b6b
+[  229.946054] notifier_call_chain: nb 6b6b6b6b6b6b6b6b INVAL
+[  229.957584] general protection fault: 0000 [#1] SMP
+[  229.958770] CPU: 0 PID: 1 Comm: systemd-shutdow Not tainted 5.5.0-rc5-ae6088-wdog
+[  229.960224] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...
+[  229.963288] RIP: 0010:notifier_call_chain+0x66/0xd0
+[  229.969082] RSP: 0018:ffffb20dc0013d88 EFLAGS: 00010246
+[  229.970812] RAX: 000000000000002e RBX: 6b6b6b6b6b6b6b6b RCX: 00000000000008b3
+[  229.972929] RDX: 0000000000000000 RSI: 0000000000000096 RDI: ffffffff9ccc46ac
+[  229.975028] RBP: 0000000000000001 R08: 0000000000000000 R09: 00000000000008b3
+[  229.977039] R10: 0000000000000001 R11: ffffffff9c26c740 R12: 0000000000000000
+[  229.979155] R13: 6b6b6b6b6b6b6b6b R14: 0000000000000000 R15: 00000000fffffffa
+...   slub_debug=FZP poison ^^^
+[  229.989089] Call Trace:
+[  229.990157]  blocking_notifier_call_chain+0x43/0x59
+[  229.991401]  kernel_restart_prepare+0x14/0x30
+[  229.992607]  kernel_restart+0x9/0x30
+[  229.993800]  __do_sys_reboot+0x1d2/0x210
+[  230.000149]  do_syscall_64+0x3d/0x130
+[  230.001277]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
+[  230.002639] RIP: 0033:0x7f5461bdd177
+[  230.016402] Modules linked in: i6300esb
+[  230.050261] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
+
+Fix the crash by reverting 44ea39420fc9 so unregister_reboot_notifier()
+is called when watchdog device is removed. This also makes handling of
+the reboot notifier unified with the handling of the restart handler,
+which is freed with unregister_restart_handler() in the same place.
+
+Fixes: 44ea39420fc9 ("drivers/watchdog: make use of devm_register_reboot_notifier()")
+Cc: stable@vger.kernel.org # v4.15+
+Signed-off-by: Vladis Dronov <vdronov@redhat.com>
+Reviewed-by: Guenter Roeck <linux@roeck-us.net>
+Link: https://lore.kernel.org/r/20200108125347.6067-1-vdronov@redhat.com
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/watchdog/watchdog_core.c |   35 +++++++++++++++++++++++++++++++++++
+ drivers/watchdog/watchdog_dev.c  |   36 +-----------------------------------
+ 2 files changed, 36 insertions(+), 35 deletions(-)
+
+--- a/drivers/watchdog/watchdog_core.c
++++ b/drivers/watchdog/watchdog_core.c
+@@ -138,6 +138,25 @@ int watchdog_init_timeout(struct watchdo
+ }
+ EXPORT_SYMBOL_GPL(watchdog_init_timeout);
++static int watchdog_reboot_notifier(struct notifier_block *nb,
++                                  unsigned long code, void *data)
++{
++      struct watchdog_device *wdd;
++
++      wdd = container_of(nb, struct watchdog_device, reboot_nb);
++      if (code == SYS_DOWN || code == SYS_HALT) {
++              if (watchdog_active(wdd)) {
++                      int ret;
++
++                      ret = wdd->ops->stop(wdd);
++                      if (ret)
++                              return NOTIFY_BAD;
++              }
++      }
++
++      return NOTIFY_DONE;
++}
++
+ static int watchdog_restart_notifier(struct notifier_block *nb,
+                                    unsigned long action, void *data)
+ {
+@@ -226,6 +245,19 @@ static int __watchdog_register_device(st
+               }
+       }
++      if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) {
++              wdd->reboot_nb.notifier_call = watchdog_reboot_notifier;
++
++              ret = register_reboot_notifier(&wdd->reboot_nb);
++              if (ret) {
++                      pr_err("watchdog%d: Cannot register reboot notifier (%d)\n",
++                             wdd->id, ret);
++                      watchdog_dev_unregister(wdd);
++                      ida_simple_remove(&watchdog_ida, id);
++                      return ret;
++              }
++      }
++
+       if (wdd->ops->restart) {
+               wdd->restart_nb.notifier_call = watchdog_restart_notifier;
+@@ -271,6 +303,9 @@ static void __watchdog_unregister_device
+       if (wdd->ops->restart)
+               unregister_restart_handler(&wdd->restart_nb);
++      if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status))
++              unregister_reboot_notifier(&wdd->reboot_nb);
++
+       watchdog_dev_unregister(wdd);
+       ida_simple_remove(&watchdog_ida, wdd->id);
+ }
+--- a/drivers/watchdog/watchdog_dev.c
++++ b/drivers/watchdog/watchdog_dev.c
+@@ -42,7 +42,6 @@
+ #include <linux/miscdevice.h> /* For handling misc devices */
+ #include <linux/module.h>     /* For module stuff/... */
+ #include <linux/mutex.h>      /* For mutexes */
+-#include <linux/reboot.h>     /* For reboot notifier */
+ #include <linux/slab.h>               /* For memory functions */
+ #include <linux/types.h>      /* For standard types (like size_t) */
+ #include <linux/watchdog.h>   /* For watchdog specific items */
+@@ -1048,25 +1047,6 @@ static void watchdog_cdev_unregister(str
+       put_device(&wd_data->dev);
+ }
+-static int watchdog_reboot_notifier(struct notifier_block *nb,
+-                                  unsigned long code, void *data)
+-{
+-      struct watchdog_device *wdd;
+-
+-      wdd = container_of(nb, struct watchdog_device, reboot_nb);
+-      if (code == SYS_DOWN || code == SYS_HALT) {
+-              if (watchdog_active(wdd)) {
+-                      int ret;
+-
+-                      ret = wdd->ops->stop(wdd);
+-                      if (ret)
+-                              return NOTIFY_BAD;
+-              }
+-      }
+-
+-      return NOTIFY_DONE;
+-}
+-
+ /*
+  *    watchdog_dev_register: register a watchdog device
+  *    @wdd: watchdog device
+@@ -1085,22 +1065,8 @@ int watchdog_dev_register(struct watchdo
+               return ret;
+       ret = watchdog_register_pretimeout(wdd);
+-      if (ret) {
++      if (ret)
+               watchdog_cdev_unregister(wdd);
+-              return ret;
+-      }
+-
+-      if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) {
+-              wdd->reboot_nb.notifier_call = watchdog_reboot_notifier;
+-
+-              ret = devm_register_reboot_notifier(&wdd->wd_data->dev,
+-                                                  &wdd->reboot_nb);
+-              if (ret) {
+-                      pr_err("watchdog%d: Cannot register reboot notifier (%d)\n",
+-                             wdd->id, ret);
+-                      watchdog_dev_unregister(wdd);
+-              }
+-      }
+       return ret;
+ }
diff --git a/queue-4.19/xen-balloon-support-xend-based-toolstack-take-two.patch b/queue-4.19/xen-balloon-support-xend-based-toolstack-take-two.patch
new file mode 100644 (file)
index 0000000..9895c98
--- /dev/null
@@ -0,0 +1,47 @@
+From eda4eabf86fd6806eaabc23fb90dd056fdac037b Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Fri, 17 Jan 2020 14:49:31 +0100
+Subject: xen/balloon: Support xend-based toolstack take two
+
+From: Juergen Gross <jgross@suse.com>
+
+commit eda4eabf86fd6806eaabc23fb90dd056fdac037b upstream.
+
+Commit 3aa6c19d2f38be ("xen/balloon: Support xend-based toolstack")
+tried to fix a regression with running on rather ancient Xen versions.
+Unfortunately the fix was based on the assumption that xend would
+just use another Xenstore node, but in reality only some downstream
+versions of xend are doing that. The upstream xend does not write
+that Xenstore node at all, so the problem must be fixed in another
+way.
+
+The easiest way to achieve that is to fall back to the behavior
+before commit 96edd61dcf4436 ("xen/balloon: don't online new memory
+initially") in case the static memory maximum can't be read.
+
+This is achieved by setting static_max to the current number of
+memory pages known by the system resulting in target_diff becoming
+zero.
+
+Fixes: 3aa6c19d2f38be ("xen/balloon: Support xend-based toolstack")
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: <stable@vger.kernel.org> # 4.13
+Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/xen/xen-balloon.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/xen/xen-balloon.c
++++ b/drivers/xen/xen-balloon.c
+@@ -83,7 +83,7 @@ static void watch_target(struct xenbus_w
+                                 "%llu", &static_max) == 1))
+                       static_max >>= PAGE_SHIFT - 10;
+               else
+-                      static_max = new_target;
++                      static_max = balloon_stats.current_pages;
+               target_diff = (xen_pv_domain() || xen_initial_domain()) ? 0
+                               : static_max - balloon_stats.target_pages;