--- /dev/null
+From 01d7a356872eec22ef34a33a5f9cfa917d145468 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Mon, 3 Feb 2020 10:33:42 -0700
+Subject: aio: prevent potential eventfd recursion on poll
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 01d7a356872eec22ef34a33a5f9cfa917d145468 upstream.
+
+If we have nested or circular eventfd wakeups, then we can deadlock if
+we run them inline from our poll waitqueue wakeup handler. It's also
+possible to have very long chains of notifications, to the extent where
+we could risk blowing the stack.
+
+Check the eventfd recursion count before calling eventfd_signal(). If
+it's non-zero, then punt the signaling to async context. This is always
+safe, as it takes us out-of-line in terms of stack and locking context.
+
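+For illustration, here is a minimal userspace sketch (hypothetical, with
+error handling omitted) of one way to build such a circular chain using
+the raw aio ABI: two IOCB_CMD_POLL requests are cross-wired through
+IOCB_FLAG_RESFD so that completing either poll signals the eventfd
+polled by the other, and the completion of an aio fsync performs the
+initial in-kernel eventfd_signal() that kicks the chain off:
+
+  #include <fcntl.h>
+  #include <linux/aio_abi.h>
+  #include <poll.h>
+  #include <string.h>
+  #include <sys/eventfd.h>
+  #include <sys/syscall.h>
+  #include <unistd.h>
+
+  static void prep_poll(struct iocb *cb, int fd, int resfd)
+  {
+          memset(cb, 0, sizeof(*cb));
+          cb->aio_lio_opcode = IOCB_CMD_POLL;
+          cb->aio_fildes = fd;
+          cb->aio_buf = POLLIN;        /* events to wait for */
+          cb->aio_flags = IOCB_FLAG_RESFD;
+          cb->aio_resfd = resfd;       /* eventfd signaled on completion */
+  }
+
+  int main(void)
+  {
+          aio_context_t ctx = 0;
+          struct iocb polls[2], sync_cb, *cbs[3];
+          int efd[2] = { eventfd(0, 0), eventfd(0, 0) };
+          int fd = open("/tmp/f", O_CREAT | O_RDWR, 0600);
+
+          syscall(__NR_io_setup, 8, &ctx);
+          prep_poll(&polls[0], efd[0], efd[1]); /* efd[0] ready -> signal efd[1] */
+          prep_poll(&polls[1], efd[1], efd[0]); /* efd[1] ready -> signal efd[0] */
+
+          memset(&sync_cb, 0, sizeof(sync_cb));
+          sync_cb.aio_lio_opcode = IOCB_CMD_FSYNC;
+          sync_cb.aio_fildes = fd;
+          sync_cb.aio_flags = IOCB_FLAG_RESFD;
+          sync_cb.aio_resfd = efd[0];  /* completion signals efd[0] */
+
+          cbs[0] = &polls[0];
+          cbs[1] = &polls[1];
+          cbs[2] = &sync_cb;
+          syscall(__NR_io_submit, ctx, 3, cbs);
+          pause();
+          return 0;
+  }
+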
+Cc: stable@vger.kernel.org # 4.19+
+Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/aio.c | 20 ++++++++++++++++++--
+ 1 file changed, 18 insertions(+), 2 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -1610,6 +1610,14 @@ static int aio_fsync(struct fsync_iocb *
+ return 0;
+ }
+
++static void aio_poll_put_work(struct work_struct *work)
++{
++ struct poll_iocb *req = container_of(work, struct poll_iocb, work);
++ struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
++
++ iocb_put(iocb);
++}
++
+ static void aio_poll_complete_work(struct work_struct *work)
+ {
+ struct poll_iocb *req = container_of(work, struct poll_iocb, work);
+@@ -1674,6 +1682,8 @@ static int aio_poll_wake(struct wait_que
+ list_del_init(&req->wait.entry);
+
+ if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
++ struct kioctx *ctx = iocb->ki_ctx;
++
+ /*
+ * Try to complete the iocb inline if we can. Use
+ * irqsave/irqrestore because not all filesystems (e.g. fuse)
+@@ -1683,8 +1693,14 @@ static int aio_poll_wake(struct wait_que
+ list_del(&iocb->ki_list);
+ iocb->ki_res.res = mangle_poll(mask);
+ req->done = true;
+- spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
+- iocb_put(iocb);
++ if (iocb->ki_eventfd && eventfd_signal_count()) {
++ iocb = NULL;
++ INIT_WORK(&req->work, aio_poll_put_work);
++ schedule_work(&req->work);
++ }
++ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
++ if (iocb)
++ iocb_put(iocb);
+ } else {
+ schedule_work(&req->work);
+ }
--- /dev/null
+From 1a3388d506bf5b45bb283e6a4c4706cfb4897333 Mon Sep 17 00:00:00 2001
+From: Stephen Warren <swarren@nvidia.com>
+Date: Thu, 3 Oct 2019 14:50:31 -0600
+Subject: ARM: tegra: Enable PLLP bypass during Tegra124 LP1
+
+From: Stephen Warren <swarren@nvidia.com>
+
+commit 1a3388d506bf5b45bb283e6a4c4706cfb4897333 upstream.
+
+For a little over a year, U-Boot has configured the flow controller to
+perform automatic RAM re-repair on off->on power transitions of the CPU
+rail[1]. This is mandatory for correct operation of Tegra124. However,
+RAM re-repair relies on certain clocks, which the kernel must enable and
+leave running. PLLP is one of those clocks. This clock is shut down
+during LP1 in order to save power. Enable bypass (which I believe routes
+osc_div_clk, essentially the crystal clock, to the PLL output) so that
+this clock signal toggles even though the PLL is not active. This is
+required so that LP1 power mode (system suspend) operates correctly.
+
+The bypass configuration must then be undone when resuming from LP1, so
+that all peripheral clocks run at the expected rate. Without this, many
+peripherals won't work correctly; for example, the UART baud rate would
+be incorrect.
+
+NVIDIA's downstream kernel code only does this if not compiled for
+Tegra30, so the added code is made conditional upon the chip ID.
+NVIDIA's downstream code also makes this change conditional upon the
+active CPU cluster; since the upstream kernel currently doesn't support
+cluster switching, this patch doesn't test the active CPU cluster ID.
+
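+As a rough C rendering of the change (illustrative only - the real code
+runs in the assembly suspend/resume paths shown below; the register
+offset and bit positions are taken from that assembly and the constants
+in sleep-tegra30.S):
+
+  #include <linux/io.h>
+
+  #define CLK_RESET_PLLP_BASE 0xa0        /* as in sleep-tegra30.S */
+  #define TEGRA30 0x30                    /* chip ID */
+  #define PLLP_BASE_BYPASS (1u << 31)     /* route osc to PLL output */
+  #define PLLP_BASE_ENABLE (1u << 30)     /* PLL active */
+
+  static void pllp_lp1_enter(void __iomem *clk_base, u8 chip_id)
+  {
+          u32 val = readl(clk_base + CLK_RESET_PLLP_BASE);
+
+          /* Bypass PLLP to the crystal clock before disabling the PLL,
+           * so its output keeps toggling for RAM re-repair. */
+          if (chip_id != TEGRA30)
+                  val |= PLLP_BASE_BYPASS;
+          val &= ~PLLP_BASE_ENABLE;
+          writel(val, clk_base + CLK_RESET_PLLP_BASE);
+  }
+
+  static void pllp_lp1_exit(void __iomem *clk_base, u8 chip_id)
+  {
+          u32 val;
+
+          /* Undo the bypass so peripherals see PLLP rates again. */
+          if (chip_id == TEGRA30)
+                  return;
+          val = readl(clk_base + CLK_RESET_PLLP_BASE);
+          val &= ~PLLP_BASE_BYPASS;
+          writel(val, clk_base + CLK_RESET_PLLP_BASE);
+  }
+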
+[1] 3cc7942a4ae5 ARM: tegra: implement RAM repair
+
+Reported-by: Jonathan Hunter <jonathanh@nvidia.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Stephen Warren <swarren@nvidia.com>
+Signed-off-by: Thierry Reding <treding@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm/mach-tegra/sleep-tegra30.S | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+--- a/arch/arm/mach-tegra/sleep-tegra30.S
++++ b/arch/arm/mach-tegra/sleep-tegra30.S
+@@ -370,6 +370,14 @@ _pll_m_c_x_done:
+ pll_locked r1, r0, CLK_RESET_PLLC_BASE
+ pll_locked r1, r0, CLK_RESET_PLLX_BASE
+
++ tegra_get_soc_id TEGRA_APB_MISC_BASE, r1
++ cmp r1, #TEGRA30
++ beq 1f
++ ldr r1, [r0, #CLK_RESET_PLLP_BASE]
++ bic r1, r1, #(1<<31) @ disable PllP bypass
++ str r1, [r0, #CLK_RESET_PLLP_BASE]
++1:
++
+ mov32 r7, TEGRA_TMRUS_BASE
+ ldr r1, [r7]
+ add r1, r1, #LOCK_DELAY
+@@ -630,7 +638,10 @@ tegra30_switch_cpu_to_clk32k:
+ str r0, [r4, #PMC_PLLP_WB0_OVERRIDE]
+
+ /* disable PLLP, PLLA, PLLC and PLLX */
++ tegra_get_soc_id TEGRA_APB_MISC_BASE, r1
++ cmp r1, #TEGRA30
+ ldr r0, [r5, #CLK_RESET_PLLP_BASE]
++ orrne r0, r0, #(1 << 31) @ enable PllP bypass on fast cluster
+ bic r0, r0, #(1 << 30)
+ str r0, [r5, #CLK_RESET_PLLP_BASE]
+ ldr r0, [r5, #CLK_RESET_PLLA_BASE]
--- /dev/null
+From 038ba8cc1bffc51250add4a9b9249d4331576d8f Mon Sep 17 00:00:00 2001
+From: Coly Li <colyli@suse.de>
+Date: Sat, 1 Feb 2020 22:42:33 +0800
+Subject: bcache: add readahead cache policy options via sysfs interface
+
+From: Coly Li <colyli@suse.de>
+
+commit 038ba8cc1bffc51250add4a9b9249d4331576d8f upstream.
+
+Back in 2007 high performance SSDs were still expensive, so in order to
+save more space for real workload or meta data, the readahead I/Os for
+non-meta data were bypassed and not cached on SSD.
+
+Nowadays SSD prices have dropped a lot and people can find larger size
+SSDs at a more comfortable price. It is unnecessary to always bypass
+normal readahead I/Os to save SSD space any more.
+
+This patch adds options for the readahead data cache policy via the
+sysfs file /sys/block/bcache<N>/readahead_cache_policy. The options are:
+- "all": cache all readahead data I/Os.
+- "meta-only": only cache meta data, and bypass other regular I/Os.
+
+If users want bcache to continue to only cache readahead requests for
+metadata and bypass regular data readahead, they can set "meta-only" in
+this sysfs file. By default, bcache now goes back to caching all
+readahead requests.
+
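+For example, a small C helper along these lines can select the policy (a
+sketch; the sysfs path follows the description above and the bcache
+device number varies per system):
+
+  #include <fcntl.h>
+  #include <string.h>
+  #include <unistd.h>
+
+  static int set_readahead_policy(const char *policy)
+  {
+          const char *path = "/sys/block/bcache0/readahead_cache_policy";
+          int fd = open(path, O_WRONLY);
+          ssize_t n;
+
+          if (fd < 0)
+                  return -1;
+          n = write(fd, policy, strlen(policy)); /* "all" or "meta-only" */
+          close(fd);
+          return n < 0 ? -1 : 0;
+  }
+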
+Cc: stable@vger.kernel.org
+Signed-off-by: Coly Li <colyli@suse.de>
+Acked-by: Eric Wheeler <bcache@linux.ewheeler.net>
+Cc: Michael Lyle <mlyle@lyle.org>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/bcache/bcache.h | 3 +++
+ drivers/md/bcache/request.c | 17 ++++++++++++-----
+ drivers/md/bcache/sysfs.c | 22 ++++++++++++++++++++++
+ 3 files changed, 37 insertions(+), 5 deletions(-)
+
+--- a/drivers/md/bcache/bcache.h
++++ b/drivers/md/bcache/bcache.h
+@@ -329,6 +329,9 @@ struct cached_dev {
+ */
+ atomic_t has_dirty;
+
++#define BCH_CACHE_READA_ALL 0
++#define BCH_CACHE_READA_META_ONLY 1
++ unsigned int cache_readahead_policy;
+ struct bch_ratelimit writeback_rate;
+ struct delayed_work writeback_rate_update;
+
+--- a/drivers/md/bcache/request.c
++++ b/drivers/md/bcache/request.c
+@@ -391,13 +391,20 @@ static bool check_should_bypass(struct c
+ goto skip;
+
+ /*
+- * Flag for bypass if the IO is for read-ahead or background,
+- * unless the read-ahead request is for metadata
++ * If the bio is for read-ahead or background IO, whether to bypass
++ * it or not depends on the following cases:
++ * - If the IO is for meta data, always cache it and do not bypass
++ * - If the IO is not meta data, check dc->cache_readahead_policy:
++ * BCH_CACHE_READA_ALL: cache it and do not bypass
++ * BCH_CACHE_READA_META_ONLY: do not cache it and bypass
++ * That is, read-ahead requests for metadata always get cached
++ * (eg, for gfs2 or xfs).
+ */
+- if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
+- !(bio->bi_opf & (REQ_META|REQ_PRIO)))
+- goto skip;
++ if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))) {
++ if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
++ (dc->cache_readahead_policy != BCH_CACHE_READA_ALL))
++ goto skip;
++ }
+
+ if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
+ bio_sectors(bio) & (c->sb.block_size - 1)) {
+--- a/drivers/md/bcache/sysfs.c
++++ b/drivers/md/bcache/sysfs.c
+@@ -27,6 +27,12 @@ static const char * const bch_cache_mode
+ NULL
+ };
+
++static const char * const bch_reada_cache_policies[] = {
++ "all",
++ "meta-only",
++ NULL
++};
++
+ /* Default is 0 ("auto") */
+ static const char * const bch_stop_on_failure_modes[] = {
+ "auto",
+@@ -100,6 +106,7 @@ rw_attribute(congested_write_threshold_u
+ rw_attribute(sequential_cutoff);
+ rw_attribute(data_csum);
+ rw_attribute(cache_mode);
++rw_attribute(readahead_cache_policy);
+ rw_attribute(stop_when_cache_set_failed);
+ rw_attribute(writeback_metadata);
+ rw_attribute(writeback_running);
+@@ -167,6 +174,11 @@ SHOW(__bch_cached_dev)
+ bch_cache_modes,
+ BDEV_CACHE_MODE(&dc->sb));
+
++ if (attr == &sysfs_readahead_cache_policy)
++ return bch_snprint_string_list(buf, PAGE_SIZE,
++ bch_reada_cache_policies,
++ dc->cache_readahead_policy);
++
+ if (attr == &sysfs_stop_when_cache_set_failed)
+ return bch_snprint_string_list(buf, PAGE_SIZE,
+ bch_stop_on_failure_modes,
+@@ -352,6 +364,15 @@ STORE(__cached_dev)
+ }
+ }
+
++ if (attr == &sysfs_readahead_cache_policy) {
++ v = __sysfs_match_string(bch_reada_cache_policies, -1, buf);
++ if (v < 0)
++ return v;
++
++ if ((unsigned int) v != dc->cache_readahead_policy)
++ dc->cache_readahead_policy = v;
++ }
++
+ if (attr == &sysfs_stop_when_cache_set_failed) {
+ v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf);
+ if (v < 0)
+@@ -466,6 +487,7 @@ static struct attribute *bch_cached_dev_
+ &sysfs_data_csum,
+ #endif
+ &sysfs_cache_mode,
++ &sysfs_readahead_cache_policy,
+ &sysfs_stop_when_cache_set_failed,
+ &sysfs_writeback_metadata,
+ &sysfs_writeback_running,
--- /dev/null
+From 5750c37523a2c8cbb450b9ef31e21c2ba876b05e Mon Sep 17 00:00:00 2001
+From: Nikolay Borisov <nborisov@suse.com>
+Date: Mon, 27 Jan 2020 11:59:26 +0200
+Subject: btrfs: Correctly handle empty trees in find_first_clear_extent_bit
+
+From: Nikolay Borisov <nborisov@suse.com>
+
+commit 5750c37523a2c8cbb450b9ef31e21c2ba876b05e upstream.
+
+Raviu reported that running his regular fs_trim segfaulted with the
+following backtrace:
+
+[ 237.525947] assertion failed: prev, in ../fs/btrfs/extent_io.c:1595
+[ 237.525984] ------------[ cut here ]------------
+[ 237.525985] kernel BUG at ../fs/btrfs/ctree.h:3117!
+[ 237.525992] invalid opcode: 0000 [#1] SMP PTI
+[ 237.525998] CPU: 4 PID: 4423 Comm: fstrim Tainted: G U OE 5.4.14-8-vanilla #1
+[ 237.526001] Hardware name: ASUSTeK COMPUTER INC.
+[ 237.526044] RIP: 0010:assfail.constprop.58+0x18/0x1a [btrfs]
+[ 237.526079] Call Trace:
+[ 237.526120] find_first_clear_extent_bit+0x13d/0x150 [btrfs]
+[ 237.526148] btrfs_trim_fs+0x211/0x3f0 [btrfs]
+[ 237.526184] btrfs_ioctl_fitrim+0x103/0x170 [btrfs]
+[ 237.526219] btrfs_ioctl+0x129a/0x2ed0 [btrfs]
+[ 237.526227] ? filemap_map_pages+0x190/0x3d0
+[ 237.526232] ? do_filp_open+0xaf/0x110
+[ 237.526238] ? _copy_to_user+0x22/0x30
+[ 237.526242] ? cp_new_stat+0x150/0x180
+[ 237.526247] ? do_vfs_ioctl+0xa4/0x640
+[ 237.526278] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs]
+[ 237.526283] do_vfs_ioctl+0xa4/0x640
+[ 237.526288] ? __do_sys_newfstat+0x3c/0x60
+[ 237.526292] ksys_ioctl+0x70/0x80
+[ 237.526297] __x64_sys_ioctl+0x16/0x20
+[ 237.526303] do_syscall_64+0x5a/0x1c0
+[ 237.526310] entry_SYSCALL_64_after_hwframe+0x49/0xbe
+
+That was due to btrfs_fs_device::alloc_tree being empty. Initially I
+thought this wasn't possible and as a precaution had put the assert in
+find_first_clear_extent_bit. Turns out this is indeed possible and can
+happen when a file system with SINGLE data/metadata profile has a 2nd
+device added. Until balance is run or a new chunk is allocated on this
+device it will be completely empty.
+
+In this case find_first_clear_extent_bit should return the full range
+[0, -1ULL] and let the caller handle it, i.e. for trim the end will be
+capped at the size of the actual device.
+
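+For context, the failing operation is just the FITRIM ioctl that fstrim
+issues; a minimal sketch of it (assuming a filesystem set up as
+described above and mounted at the given path):
+
+  #include <fcntl.h>
+  #include <linux/fs.h>      /* FITRIM, struct fstrim_range */
+  #include <stdint.h>
+  #include <sys/ioctl.h>
+  #include <unistd.h>
+
+  static int trim_fs(const char *mountpoint)
+  {
+          struct fstrim_range range = {
+                  .start = 0,
+                  .len = UINT64_MAX,    /* trim the whole filesystem */
+                  .minlen = 0,
+          };
+          int fd = open(mountpoint, O_RDONLY);
+          int ret;
+
+          if (fd < 0)
+                  return -1;
+          /* Ends up in find_first_clear_extent_bit() per the trace. */
+          ret = ioctl(fd, FITRIM, &range);
+          close(fd);
+          return ret;
+  }
+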
+Link: https://lore.kernel.org/linux-btrfs/izW2WNyvy1dEDweBICizKnd2KDwDiDyY2EYQr4YCwk7pkuIpthx-JRn65MPBde00ND6V0_Lh8mW0kZwzDiLDv25pUYWxkskWNJnVP0kgdMA=@protonmail.com/
+Fixes: 45bfcfc168f8 ("btrfs: Implement find_first_clear_extent_bit")
+CC: stable@vger.kernel.org # 5.2+
+Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/extent_io.c | 32 ++++++++++++++++++--------------
+ fs/btrfs/tests/extent-io-tests.c | 9 +++++++++
+ 2 files changed, 27 insertions(+), 14 deletions(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -1583,21 +1583,25 @@ void find_first_clear_extent_bit(struct
+ /* Find first extent with bits cleared */
+ while (1) {
+ node = __etree_search(tree, start, &next, &prev, NULL, NULL);
+- if (!node) {
++ if (!node && !next && !prev) {
++ /*
++ * Tree is completely empty, send full range and let
++ * caller deal with it
++ */
++ *start_ret = 0;
++ *end_ret = -1;
++ goto out;
++ } else if (!node && !next) {
++ /*
++ * We are past the last allocated chunk, set start at
++ * the end of the last extent.
++ */
++ state = rb_entry(prev, struct extent_state, rb_node);
++ *start_ret = state->end + 1;
++ *end_ret = -1;
++ goto out;
++ } else if (!node) {
+ node = next;
+- if (!node) {
+- /*
+- * We are past the last allocated chunk,
+- * set start at the end of the last extent. The
+- * device alloc tree should never be empty so
+- * prev is always set.
+- */
+- ASSERT(prev);
+- state = rb_entry(prev, struct extent_state, rb_node);
+- *start_ret = state->end + 1;
+- *end_ret = -1;
+- goto out;
+- }
+ }
+ /*
+ * At this point 'node' either contains 'start' or start is
+--- a/fs/btrfs/tests/extent-io-tests.c
++++ b/fs/btrfs/tests/extent-io-tests.c
+@@ -441,8 +441,17 @@ static int test_find_first_clear_extent_
+ int ret = -EINVAL;
+
+ test_msg("running find_first_clear_extent_bit test");
++
+ extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL);
+
++ /* Test correct handling of empty tree */
++ find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED);
++ if (start != 0 || end != -1) {
++ test_err(
++ "error getting a range from completely empty tree: start %llu end %llu",
++ start, end);
++ goto out;
++ }
+ /*
+ * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between
+ * 4M-32M
--- /dev/null
+From 889bfa39086e86b52fcfaa04d72c95eaeb12f9a5 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Fri, 17 Jan 2020 09:12:45 -0500
+Subject: btrfs: drop log root for dropped roots
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 889bfa39086e86b52fcfaa04d72c95eaeb12f9a5 upstream.
+
+If we fsync on a subvolume and create a log root for that volume, and
+then later delete that subvolume we'll never clean up its log root. Fix
+this by making switch_commit_roots free the log for any dropped roots we
+encounter. The extra churn is because we need a btrfs_trans_handle, not
+the btrfs_transaction.
+
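+The scenario is simple to produce from userspace; a sketch (paths and
+names are illustrative, error handling omitted):
+
+  #include <fcntl.h>
+  #include <linux/btrfs.h>
+  #include <string.h>
+  #include <sys/ioctl.h>
+  #include <unistd.h>
+
+  int main(void)
+  {
+          struct btrfs_ioctl_vol_args args = { .fd = 0 };
+          int dir = open("/mnt", O_RDONLY);    /* btrfs mount point */
+          int fd;
+
+          strcpy(args.name, "subvol");
+          ioctl(dir, BTRFS_IOC_SUBVOL_CREATE, &args);
+
+          fd = open("/mnt/subvol/file", O_CREAT | O_RDWR, 0600);
+          write(fd, "x", 1);
+          fsync(fd);       /* creates a log root for the subvolume */
+          close(fd);
+
+          /* Before this fix, the subvolume's log root leaked here. */
+          ioctl(dir, BTRFS_IOC_SNAP_DESTROY, &args);
+          return 0;
+  }
+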
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/transaction.c | 22 ++++++++++++----------
+ 1 file changed, 12 insertions(+), 10 deletions(-)
+
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -77,13 +77,14 @@ void btrfs_put_transaction(struct btrfs_
+ }
+ }
+
+-static noinline void switch_commit_roots(struct btrfs_transaction *trans)
++static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
+ {
++ struct btrfs_transaction *cur_trans = trans->transaction;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *root, *tmp;
+
+ down_write(&fs_info->commit_root_sem);
+- list_for_each_entry_safe(root, tmp, &trans->switch_commits,
++ list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
+ dirty_list) {
+ list_del_init(&root->dirty_list);
+ free_extent_buffer(root->commit_root);
+@@ -95,16 +96,17 @@ static noinline void switch_commit_roots
+ }
+
+ /* We can free old roots now. */
+- spin_lock(&trans->dropped_roots_lock);
+- while (!list_empty(&trans->dropped_roots)) {
+- root = list_first_entry(&trans->dropped_roots,
++ spin_lock(&cur_trans->dropped_roots_lock);
++ while (!list_empty(&cur_trans->dropped_roots)) {
++ root = list_first_entry(&cur_trans->dropped_roots,
+ struct btrfs_root, root_list);
+ list_del_init(&root->root_list);
+- spin_unlock(&trans->dropped_roots_lock);
++ spin_unlock(&cur_trans->dropped_roots_lock);
++ btrfs_free_log(trans, root);
+ btrfs_drop_and_free_fs_root(fs_info, root);
+- spin_lock(&trans->dropped_roots_lock);
++ spin_lock(&cur_trans->dropped_roots_lock);
+ }
+- spin_unlock(&trans->dropped_roots_lock);
++ spin_unlock(&cur_trans->dropped_roots_lock);
+ up_write(&fs_info->commit_root_sem);
+ }
+
+@@ -1359,7 +1361,7 @@ static int qgroup_account_snapshot(struc
+ ret = commit_cowonly_roots(trans);
+ if (ret)
+ goto out;
+- switch_commit_roots(trans->transaction);
++ switch_commit_roots(trans);
+ ret = btrfs_write_and_wait_transaction(trans);
+ if (ret)
+ btrfs_handle_fs_error(fs_info, ret,
+@@ -2245,7 +2247,7 @@ int btrfs_commit_transaction(struct btrf
+ list_add_tail(&fs_info->chunk_root->dirty_list,
+ &cur_trans->switch_commits);
+
+- switch_commit_roots(cur_trans);
++ switch_commit_roots(trans);
+
+ ASSERT(list_empty(&cur_trans->dirty_bgs));
+ ASSERT(list_empty(&cur_trans->io_bgs));
--- /dev/null
+From b5e4ff9d465da1233a2d9a47ebce487c70d8f4ab Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 15 Jan 2020 13:21:35 +0000
+Subject: Btrfs: fix infinite loop during fsync after rename operations
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit b5e4ff9d465da1233a2d9a47ebce487c70d8f4ab upstream.
+
+Recently fsstress (from fstests) sporadically started to trigger an
+infinite loop during fsync operations. This turned out to be because
+support for the rename exchange and whiteout operations was added to
+fsstress in fstests. These operations, unlike any others in fsstress,
+cause file names to be reused, hence triggering this issue. However,
+rename exchange and rename whiteout operations are not strictly needed
+to trigger the issue; simple rename operations and file creations are
+enough.
+
+The issue boils down to this: when we are logging inodes that conflict
+(inodes that had the name of any inode we need to log during the fsync
+operation), we keep logging them even if they were already logged
+before, and after that we check if there's any other inode that
+conflicts with them and then add it again to the list of inodes to log.
+Skipping already logged inodes fixes the issue.
+
+Consider the following example:
+
+ $ mkfs.btrfs -f /dev/sdb
+ $ mount /dev/sdb /mnt
+
+ $ mkdir /mnt/testdir # inode 257
+
+ $ touch /mnt/testdir/zz # inode 258
+ $ ln /mnt/testdir/zz /mnt/testdir/zz_link
+
+ $ touch /mnt/testdir/a # inode 259
+
+ $ sync
+
+ # The following 3 renames achieve the same result as a rename exchange
+ # operation (<rename_exchange> /mnt/testdir/zz_link to /mnt/testdir/a).
+
+ $ mv /mnt/testdir/a /mnt/testdir/tmp
+ $ mv /mnt/testdir/zz_link /mnt/testdir/a
+ $ mv /mnt/testdir/tmp /mnt/testdir/zz_link
+
+ # The following rename and file creation give the same result as a
+ # rename whiteout operation (<rename_whiteout> zz to a2).
+
+ $ mv /mnt/testdir/zz /mnt/testdir/a2
+ $ touch /mnt/testdir/zz # inode 260
+
+ $ xfs_io -c fsync /mnt/testdir/zz
+ --> results in the infinite loop
+
+The following steps happen:
+
+1) When logging inode 260, we find that its reference named "zz" was
+ used by inode 258 in the previous transaction (through the commit
+ root), so inode 258 is added to the list of conflicting indoes that
+ need to be logged;
+
+2) After logging inode 258, we find that its reference named "a" was
+ used by inode 259 in the previous transaction, and therefore we add
+ inode 259 to the list of conflicting inodes to be logged;
+
+3) After logging inode 259, we find that its reference named "zz_link"
+ was used by inode 258 in the previous transaction - we add inode 258
+ to the list of conflicting inodes to log, again - we had already
+ logged it before at step 3. After logging it again, we find again
+ that inode 259 conflicts with him, and we add again 259 to the list,
+ etc - we end up repeating all the previous steps.
+
+So fix this by skipping logging of conflicting inodes that were already
+logged.
+
+Fixes: 6b5fc433a7ad67 ("Btrfs: fix fsync after succession of renames of different files")
+CC: stable@vger.kernel.org # 5.1+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/tree-log.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 44 insertions(+)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -4855,6 +4855,50 @@ static int log_conflicting_inodes(struct
+ continue;
+ }
+ /*
++ * If the inode was already logged skip it - otherwise we can
++ * hit an infinite loop. Example:
++ *
++ * From the commit root (previous transaction) we have the
++ * following inodes:
++ *
++ * inode 257 a directory
++ * inode 258 with references "zz" and "zz_link" on inode 257
++ * inode 259 with reference "a" on inode 257
++ *
++ * And in the current (uncommitted) transaction we have:
++ *
++ * inode 257 a directory, unchanged
++ * inode 258 with references "a" and "a2" on inode 257
++ * inode 259 with reference "zz_link" on inode 257
++ * inode 261 with reference "zz" on inode 257
++ *
++ * When logging inode 261 the following infinite loop could
++ * happen if we don't skip already logged inodes:
++ *
++ * - we detect inode 258 as a conflicting inode, with inode 261
++ * on reference "zz", and log it;
++ *
++ * - we detect inode 259 as a conflicting inode, with inode 258
++ * on reference "a", and log it;
++ *
++ * - we detect inode 258 as a conflicting inode, with inode 259
++ * on reference "zz_link", and log it - again! After this we
++ * repeat the above steps forever.
++ */
++ spin_lock(&BTRFS_I(inode)->lock);
++ /*
++ * Check the inode's logged_trans only instead of
++ * btrfs_inode_in_log(). This is because the last_log_commit of
++ * the inode is not updated when we only log that it exists
++ * and it has the full sync bit set (see btrfs_log_inode()).
++ */
++ if (BTRFS_I(inode)->logged_trans == trans->transid) {
++ spin_unlock(&BTRFS_I(inode)->lock);
++ btrfs_add_delayed_iput(inode);
++ continue;
++ }
++ spin_unlock(&BTRFS_I(inode)->lock);
++ /*
+ * We are safe logging the other inode without acquiring its
+ * lock as long as we log with the LOG_INODE_EXISTS mode. We
+ * are safe against concurrent renames of the other inode as
--- /dev/null
+From 0e56315ca147b3e60c7bf240233a301d3c7fb508 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 19 Nov 2019 12:07:33 +0000
+Subject: Btrfs: fix missing hole after hole punching and fsync when using NO_HOLES
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 0e56315ca147b3e60c7bf240233a301d3c7fb508 upstream.
+
+When using the NO_HOLES feature, if we punch a hole into a file and then
+fsync it, there are cases where a subsequent fsync will miss the fact that
+a hole was punched, resulting in the holes not existing after replaying
+the log tree.
+
+Essentially these cases all imply that tree-log.c:copy_items() is not
+invoked for the leafs that delimit holes, because nothing changed those
+leafs in the current transaction. And it's precisely copy_items() where
+we currently detect and log holes, which works as long as the holes are
+between file extent items in the input leaf, or between the beginning of
+the input leaf and the previous leaf, or between the last item in the
+leaf and the next leaf.
+
+First example where we miss a hole:
+
+ *) The extent items of the inode span multiple leafs;
+
+ *) The punched hole covers a range that affects only the extent items of
+ the first leaf;
+
+ *) The fsync operation is done in full mode (BTRFS_INODE_NEEDS_FULL_SYNC
+ is set in the inode's runtime flags).
+
+ That results in the hole not existing after replaying the log tree.
+
+ For example, if the fs/subvolume tree has the following layout for a
+ particular inode:
+
+ Leaf N, generation 10:
+
+ [ ... INODE_ITEM INODE_REF EXTENT_ITEM (0 64K) EXTENT_ITEM (64K 128K) ]
+
+ Leaf N + 1, generation 10:
+
+ [ EXTENT_ITEM (128K 64K) ... ]
+
+ If at transaction 11 we punch a hole covering the range [0, 128K[, we end
+ up dropping the two extent items from leaf N, but we don't touch the other
+ leaf, so we end up in the following state:
+
+ Leaf N, generation 11:
+
+ [ ... INODE_ITEM INODE_REF ]
+
+ Leaf N + 1, generation 10:
+
+ [ EXTENT_ITEM (128K 64K) ... ]
+
+ A full fsync after punching the hole will only process leaf N because it
+ was modified in the current transaction, but not leaf N + 1, since it
+ was not modified in the current transaction (generation 10 and not 11).
+ As a result the fsync will not log any holes, because it didn't process
+ any leaf with extent items.
+
+Second example where we will miss a hole:
+
+ *) An inode has its items spanning 5 (or more) leafs;
+
+ *) A hole is punched and it covers only the extent items of the 3rd
+    leaf. This results in deleting the entire leaf and not touching any
+    of the other leafs.
+
+ So the only leaf that is modified in the current transaction, when
+ punching the hole, is the first leaf, which contains the inode item.
+ During the full fsync, the only leaf that is passed to copy_items()
+ is that first leaf, and that's not enough for the hole detection
+ code in copy_items() to determine there's a hole between the last
+ file extent item in the 2nd leaf and the first file extent item in
+ the 3rd leaf (which was the 4th leaf before punching the hole).
+
+Fix this by scanning all leafs and punching holes as necessary when
+doing a full fsync (less common than a non-full fsync) when the NO_HOLES
+feature is enabled. The lack of explicit file extent items to mark holes
+makes it necessary to scan existing extents to determine if holes exist.
+
+A test case for fstests follows soon.
+
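+For reference, the kind of sequence that exercises the first case boils
+down to the following sketch (sizes as in the example above; the file is
+assumed to already have extents spanning multiple leafs):
+
+  #define _GNU_SOURCE
+  #include <fcntl.h>
+  #include <unistd.h>
+
+  int main(void)
+  {
+          int fd = open("/mnt/foo", O_RDWR);
+
+          /* Punch out [0, 128K) so only the first leaf is touched,
+           * then fsync; before this fix a full fsync logged no hole. */
+          fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                    0, 128 * 1024);
+          fsync(fd);
+          /* A power failure followed by log replay would then lose
+           * the hole. */
+          close(fd);
+          return 0;
+  }
+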
+Fixes: 16e7549f045d33 ("Btrfs: incompatible format change to remove hole extents")
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/tree-log.c | 388 +++++++++++++---------------------------------------
+ 1 file changed, 100 insertions(+), 288 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -3953,7 +3953,7 @@ static int log_csums(struct btrfs_trans_
+ static noinline int copy_items(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *dst_path,
+- struct btrfs_path *src_path, u64 *last_extent,
++ struct btrfs_path *src_path,
+ int start_slot, int nr, int inode_only,
+ u64 logged_isize)
+ {
+@@ -3964,7 +3964,6 @@ static noinline int copy_items(struct bt
+ struct btrfs_file_extent_item *extent;
+ struct btrfs_inode_item *inode_item;
+ struct extent_buffer *src = src_path->nodes[0];
+- struct btrfs_key first_key, last_key, key;
+ int ret;
+ struct btrfs_key *ins_keys;
+ u32 *ins_sizes;
+@@ -3972,9 +3971,6 @@ static noinline int copy_items(struct bt
+ int i;
+ struct list_head ordered_sums;
+ int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
+- bool has_extents = false;
+- bool need_find_last_extent = true;
+- bool done = false;
+
+ INIT_LIST_HEAD(&ordered_sums);
+
+@@ -3983,8 +3979,6 @@ static noinline int copy_items(struct bt
+ if (!ins_data)
+ return -ENOMEM;
+
+- first_key.objectid = (u64)-1;
+-
+ ins_sizes = (u32 *)ins_data;
+ ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+
+@@ -4005,9 +3999,6 @@ static noinline int copy_items(struct bt
+
+ src_offset = btrfs_item_ptr_offset(src, start_slot + i);
+
+- if (i == nr - 1)
+- last_key = ins_keys[i];
+-
+ if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
+ inode_item = btrfs_item_ptr(dst_path->nodes[0],
+ dst_path->slots[0],
+@@ -4021,20 +4012,6 @@ static noinline int copy_items(struct bt
+ src_offset, ins_sizes[i]);
+ }
+
+- /*
+- * We set need_find_last_extent here in case we know we were
+- * processing other items and then walk into the first extent in
+- * the inode. If we don't hit an extent then nothing changes,
+- * we'll do the last search the next time around.
+- */
+- if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
+- has_extents = true;
+- if (first_key.objectid == (u64)-1)
+- first_key = ins_keys[i];
+- } else {
+- need_find_last_extent = false;
+- }
+-
+ /* take a reference on file data extents so that truncates
+ * or deletes of this inode don't have to relog the inode
+ * again
+@@ -4100,167 +4077,6 @@ static noinline int copy_items(struct bt
+ kfree(sums);
+ }
+
+- if (!has_extents)
+- return ret;
+-
+- if (need_find_last_extent && *last_extent == first_key.offset) {
+- /*
+- * We don't have any leafs between our current one and the one
+- * we processed before that can have file extent items for our
+- * inode (and have a generation number smaller than our current
+- * transaction id).
+- */
+- need_find_last_extent = false;
+- }
+-
+- /*
+- * Because we use btrfs_search_forward we could skip leaves that were
+- * not modified and then assume *last_extent is valid when it really
+- * isn't. So back up to the previous leaf and read the end of the last
+- * extent before we go and fill in holes.
+- */
+- if (need_find_last_extent) {
+- u64 len;
+-
+- ret = btrfs_prev_leaf(inode->root, src_path);
+- if (ret < 0)
+- return ret;
+- if (ret)
+- goto fill_holes;
+- if (src_path->slots[0])
+- src_path->slots[0]--;
+- src = src_path->nodes[0];
+- btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
+- if (key.objectid != btrfs_ino(inode) ||
+- key.type != BTRFS_EXTENT_DATA_KEY)
+- goto fill_holes;
+- extent = btrfs_item_ptr(src, src_path->slots[0],
+- struct btrfs_file_extent_item);
+- if (btrfs_file_extent_type(src, extent) ==
+- BTRFS_FILE_EXTENT_INLINE) {
+- len = btrfs_file_extent_ram_bytes(src, extent);
+- *last_extent = ALIGN(key.offset + len,
+- fs_info->sectorsize);
+- } else {
+- len = btrfs_file_extent_num_bytes(src, extent);
+- *last_extent = key.offset + len;
+- }
+- }
+-fill_holes:
+- /* So we did prev_leaf, now we need to move to the next leaf, but a few
+- * things could have happened
+- *
+- * 1) A merge could have happened, so we could currently be on a leaf
+- * that holds what we were copying in the first place.
+- * 2) A split could have happened, and now not all of the items we want
+- * are on the same leaf.
+- *
+- * So we need to adjust how we search for holes, we need to drop the
+- * path and re-search for the first extent key we found, and then walk
+- * forward until we hit the last one we copied.
+- */
+- if (need_find_last_extent) {
+- /* btrfs_prev_leaf could return 1 without releasing the path */
+- btrfs_release_path(src_path);
+- ret = btrfs_search_slot(NULL, inode->root, &first_key,
+- src_path, 0, 0);
+- if (ret < 0)
+- return ret;
+- ASSERT(ret == 0);
+- src = src_path->nodes[0];
+- i = src_path->slots[0];
+- } else {
+- i = start_slot;
+- }
+-
+- /*
+- * Ok so here we need to go through and fill in any holes we may have
+- * to make sure that holes are punched for those areas in case they had
+- * extents previously.
+- */
+- while (!done) {
+- u64 offset, len;
+- u64 extent_end;
+-
+- if (i >= btrfs_header_nritems(src_path->nodes[0])) {
+- ret = btrfs_next_leaf(inode->root, src_path);
+- if (ret < 0)
+- return ret;
+- ASSERT(ret == 0);
+- src = src_path->nodes[0];
+- i = 0;
+- need_find_last_extent = true;
+- }
+-
+- btrfs_item_key_to_cpu(src, &key, i);
+- if (!btrfs_comp_cpu_keys(&key, &last_key))
+- done = true;
+- if (key.objectid != btrfs_ino(inode) ||
+- key.type != BTRFS_EXTENT_DATA_KEY) {
+- i++;
+- continue;
+- }
+- extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
+- if (btrfs_file_extent_type(src, extent) ==
+- BTRFS_FILE_EXTENT_INLINE) {
+- len = btrfs_file_extent_ram_bytes(src, extent);
+- extent_end = ALIGN(key.offset + len,
+- fs_info->sectorsize);
+- } else {
+- len = btrfs_file_extent_num_bytes(src, extent);
+- extent_end = key.offset + len;
+- }
+- i++;
+-
+- if (*last_extent == key.offset) {
+- *last_extent = extent_end;
+- continue;
+- }
+- offset = *last_extent;
+- len = key.offset - *last_extent;
+- ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
+- offset, 0, 0, len, 0, len, 0, 0, 0);
+- if (ret)
+- break;
+- *last_extent = extent_end;
+- }
+-
+- /*
+- * Check if there is a hole between the last extent found in our leaf
+- * and the first extent in the next leaf. If there is one, we need to
+- * log an explicit hole so that at replay time we can punch the hole.
+- */
+- if (ret == 0 &&
+- key.objectid == btrfs_ino(inode) &&
+- key.type == BTRFS_EXTENT_DATA_KEY &&
+- i == btrfs_header_nritems(src_path->nodes[0])) {
+- ret = btrfs_next_leaf(inode->root, src_path);
+- need_find_last_extent = true;
+- if (ret > 0) {
+- ret = 0;
+- } else if (ret == 0) {
+- btrfs_item_key_to_cpu(src_path->nodes[0], &key,
+- src_path->slots[0]);
+- if (key.objectid == btrfs_ino(inode) &&
+- key.type == BTRFS_EXTENT_DATA_KEY &&
+- *last_extent < key.offset) {
+- const u64 len = key.offset - *last_extent;
+-
+- ret = btrfs_insert_file_extent(trans, log,
+- btrfs_ino(inode),
+- *last_extent, 0,
+- 0, len, 0, len,
+- 0, 0, 0);
+- *last_extent += len;
+- }
+- }
+- }
+- /*
+- * Need to let the callers know we dropped the path so they should
+- * re-search.
+- */
+- if (!ret && need_find_last_extent)
+- ret = 1;
+ return ret;
+ }
+
+@@ -4425,7 +4241,7 @@ static int btrfs_log_prealloc_extents(st
+ const u64 i_size = i_size_read(&inode->vfs_inode);
+ const u64 ino = btrfs_ino(inode);
+ struct btrfs_path *dst_path = NULL;
+- u64 last_extent = (u64)-1;
++ bool dropped_extents = false;
+ int ins_nr = 0;
+ int start_slot;
+ int ret;
+@@ -4447,8 +4263,7 @@ static int btrfs_log_prealloc_extents(st
+ if (slot >= btrfs_header_nritems(leaf)) {
+ if (ins_nr > 0) {
+ ret = copy_items(trans, inode, dst_path, path,
+- &last_extent, start_slot,
+- ins_nr, 1, 0);
++ start_slot, ins_nr, 1, 0);
+ if (ret < 0)
+ goto out;
+ ins_nr = 0;
+@@ -4472,8 +4287,7 @@ static int btrfs_log_prealloc_extents(st
+ path->slots[0]++;
+ continue;
+ }
+- if (last_extent == (u64)-1) {
+- last_extent = key.offset;
++ if (!dropped_extents) {
+ /*
+ * Avoid logging extent items logged in past fsync calls
+ * and leading to duplicate keys in the log tree.
+@@ -4487,6 +4301,7 @@ static int btrfs_log_prealloc_extents(st
+ } while (ret == -EAGAIN);
+ if (ret)
+ goto out;
++ dropped_extents = true;
+ }
+ if (ins_nr == 0)
+ start_slot = slot;
+@@ -4501,7 +4316,7 @@ static int btrfs_log_prealloc_extents(st
+ }
+ }
+ if (ins_nr > 0) {
+- ret = copy_items(trans, inode, dst_path, path, &last_extent,
++ ret = copy_items(trans, inode, dst_path, path,
+ start_slot, ins_nr, 1, 0);
+ if (ret > 0)
+ ret = 0;
+@@ -4688,13 +4503,8 @@ static int btrfs_log_all_xattrs(struct b
+
+ if (slot >= nritems) {
+ if (ins_nr > 0) {
+- u64 last_extent = 0;
+-
+ ret = copy_items(trans, inode, dst_path, path,
+- &last_extent, start_slot,
+- ins_nr, 1, 0);
+- /* can't be 1, extent items aren't processed */
+- ASSERT(ret <= 0);
++ start_slot, ins_nr, 1, 0);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+@@ -4718,13 +4528,8 @@ static int btrfs_log_all_xattrs(struct b
+ cond_resched();
+ }
+ if (ins_nr > 0) {
+- u64 last_extent = 0;
+-
+ ret = copy_items(trans, inode, dst_path, path,
+- &last_extent, start_slot,
+- ins_nr, 1, 0);
+- /* can't be 1, extent items aren't processed */
+- ASSERT(ret <= 0);
++ start_slot, ins_nr, 1, 0);
+ if (ret < 0)
+ return ret;
+ }
+@@ -4733,100 +4538,119 @@ static int btrfs_log_all_xattrs(struct b
+ }
+
+ /*
+- * If the no holes feature is enabled we need to make sure any hole between the
+- * last extent and the i_size of our inode is explicitly marked in the log. This
+- * is to make sure that doing something like:
+- *
+- * 1) create file with 128Kb of data
+- * 2) truncate file to 64Kb
+- * 3) truncate file to 256Kb
+- * 4) fsync file
+- * 5) <crash/power failure>
+- * 6) mount fs and trigger log replay
+- *
+- * Will give us a file with a size of 256Kb, the first 64Kb of data match what
+- * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
+- * file correspond to a hole. The presence of explicit holes in a log tree is
+- * what guarantees that log replay will remove/adjust file extent items in the
+- * fs/subvol tree.
+- *
+- * Here we do not need to care about holes between extents, that is already done
+- * by copy_items(). We also only need to do this in the full sync path, where we
+- * lookup for extents from the fs/subvol tree only. In the fast path case, we
+- * lookup the list of modified extent maps and if any represents a hole, we
+- * insert a corresponding extent representing a hole in the log tree.
++ * When using the NO_HOLES feature if we punched a hole that causes the
++ * deletion of entire leafs or all the extent items of the first leaf (the one
++ * that contains the inode item and references) we may end up not processing
++ * any extents, because there are no leafs with a generation matching the
++ * current transaction that have extent items for our inode. So we need to find
++ * if any holes exist and then log them. We also need to log holes after any
++ * truncate operation that changes the inode's size.
+ */
+-static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
+- struct btrfs_root *root,
+- struct btrfs_inode *inode,
+- struct btrfs_path *path)
++static int btrfs_log_holes(struct btrfs_trans_handle *trans,
++ struct btrfs_root *root,
++ struct btrfs_inode *inode,
++ struct btrfs_path *path)
+ {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+- int ret;
+ struct btrfs_key key;
+- u64 hole_start;
+- u64 hole_size;
+- struct extent_buffer *leaf;
+- struct btrfs_root *log = root->log_root;
+ const u64 ino = btrfs_ino(inode);
+ const u64 i_size = i_size_read(&inode->vfs_inode);
++ u64 prev_extent_end = 0;
++ int ret;
+
+- if (!btrfs_fs_incompat(fs_info, NO_HOLES))
++ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
+ return 0;
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+- key.offset = (u64)-1;
++ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+- ASSERT(ret != 0);
+ if (ret < 0)
+ return ret;
+
+- ASSERT(path->slots[0] > 0);
+- path->slots[0]--;
+- leaf = path->nodes[0];
+- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+-
+- if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
+- /* inode does not have any extents */
+- hole_start = 0;
+- hole_size = i_size;
+- } else {
++ while (true) {
+ struct btrfs_file_extent_item *extent;
++ struct extent_buffer *leaf = path->nodes[0];
+ u64 len;
+
+- /*
+- * If there's an extent beyond i_size, an explicit hole was
+- * already inserted by copy_items().
+- */
+- if (key.offset >= i_size)
+- return 0;
++ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
++ ret = btrfs_next_leaf(root, path);
++ if (ret < 0)
++ return ret;
++ if (ret > 0) {
++ ret = 0;
++ break;
++ }
++ leaf = path->nodes[0];
++ }
++
++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
++ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
++ break;
++
++ /* We have a hole, log it. */
++ if (prev_extent_end < key.offset) {
++ const u64 hole_len = key.offset - prev_extent_end;
++
++ /*
++ * Release the path to avoid deadlocks with other code
++ * paths that search the root while holding locks on
++ * leafs from the log root.
++ */
++ btrfs_release_path(path);
++ ret = btrfs_insert_file_extent(trans, root->log_root,
++ ino, prev_extent_end, 0,
++ 0, hole_len, 0, hole_len,
++ 0, 0, 0);
++ if (ret < 0)
++ return ret;
++
++ /*
++ * Search for the same key again in the root. Since it's
++ * an extent item and we are holding the inode lock, the
++ * key must still exist. If it doesn't just emit warning
++ * and return an error to fall back to a transaction
++ * commit.
++ */
++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
++ if (ret < 0)
++ return ret;
++ if (WARN_ON(ret > 0))
++ return -ENOENT;
++ leaf = path->nodes[0];
++ }
+
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
+-
+ if (btrfs_file_extent_type(leaf, extent) ==
+- BTRFS_FILE_EXTENT_INLINE)
+- return 0;
++ BTRFS_FILE_EXTENT_INLINE) {
++ len = btrfs_file_extent_ram_bytes(leaf, extent);
++ prev_extent_end = ALIGN(key.offset + len,
++ fs_info->sectorsize);
++ } else {
++ len = btrfs_file_extent_num_bytes(leaf, extent);
++ prev_extent_end = key.offset + len;
++ }
+
+- len = btrfs_file_extent_num_bytes(leaf, extent);
+- /* Last extent goes beyond i_size, no need to log a hole. */
+- if (key.offset + len > i_size)
+- return 0;
+- hole_start = key.offset + len;
+- hole_size = i_size - hole_start;
++ path->slots[0]++;
++ cond_resched();
+ }
+- btrfs_release_path(path);
+
+- /* Last extent ends at i_size. */
+- if (hole_size == 0)
+- return 0;
++ if (prev_extent_end < i_size) {
++ u64 hole_len;
+
+- hole_size = ALIGN(hole_size, fs_info->sectorsize);
+- ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
+- hole_size, 0, hole_size, 0, 0, 0);
+- return ret;
++ btrfs_release_path(path);
++ hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
++ ret = btrfs_insert_file_extent(trans, root->log_root,
++ ino, prev_extent_end, 0, 0,
++ hole_len, 0, hole_len,
++ 0, 0, 0);
++ if (ret < 0)
++ return ret;
++ }
++
++ return 0;
+ }
+
+ /*
+@@ -5129,7 +4953,6 @@ static int btrfs_log_inode(struct btrfs_
+ struct btrfs_key min_key;
+ struct btrfs_key max_key;
+ struct btrfs_root *log = root->log_root;
+- u64 last_extent = 0;
+ int err = 0;
+ int ret;
+ int nritems;
+@@ -5307,7 +5130,7 @@ again:
+ ins_start_slot = path->slots[0];
+ }
+ ret = copy_items(trans, inode, dst_path, path,
+- &last_extent, ins_start_slot,
++ ins_start_slot,
+ ins_nr, inode_only,
+ logged_isize);
+ if (ret < 0) {
+@@ -5330,17 +5153,13 @@ again:
+ if (ins_nr == 0)
+ goto next_slot;
+ ret = copy_items(trans, inode, dst_path, path,
+- &last_extent, ins_start_slot,
++ ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ }
+ ins_nr = 0;
+- if (ret) {
+- btrfs_release_path(path);
+- continue;
+- }
+ goto next_slot;
+ }
+
+@@ -5353,18 +5172,13 @@ again:
+ goto next_slot;
+ }
+
+- ret = copy_items(trans, inode, dst_path, path, &last_extent,
++ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ }
+- if (ret) {
+- ins_nr = 0;
+- btrfs_release_path(path);
+- continue;
+- }
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+ next_slot:
+@@ -5378,13 +5192,12 @@ next_slot:
+ }
+ if (ins_nr) {
+ ret = copy_items(trans, inode, dst_path, path,
+- &last_extent, ins_start_slot,
++ ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ }
+- ret = 0;
+ ins_nr = 0;
+ }
+ btrfs_release_path(path);
+@@ -5399,14 +5212,13 @@ next_key:
+ }
+ }
+ if (ins_nr) {
+- ret = copy_items(trans, inode, dst_path, path, &last_extent,
++ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ }
+- ret = 0;
+ ins_nr = 0;
+ }
+
+@@ -5419,7 +5231,7 @@ next_key:
+ if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
+- err = btrfs_log_trailing_hole(trans, root, inode, path);
++ err = btrfs_log_holes(trans, root, inode, path);
+ if (err)
+ goto out_unlock;
+ }
--- /dev/null
+From 7227ff4de55d931bbdc156c8ef0ce4f100c78a5b Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 22 Jan 2020 12:23:20 +0000
+Subject: Btrfs: fix race between adding and putting tree mod seq elements and nodes
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 7227ff4de55d931bbdc156c8ef0ce4f100c78a5b upstream.
+
+There is a race between adding and removing elements to the tree mod log
+list and rbtree that can lead to use-after-free problems.
+
+Consider the following example that explains how/why the problem happens:
+
+1) Task A has mod log element with sequence number 200. It currently is
+ the only element in the mod log list;
+
+2) Task A calls btrfs_put_tree_mod_seq() because it no longer needs to
+   access the tree mod log. When it enters the function, it initializes
+   'min_seq' to (u64)-1. Then it acquires the lock 'tree_mod_seq_lock'
+   before checking if there are other elements in the mod seq list.
+   Since the list is empty, 'min_seq' remains set to (u64)-1. Then it
+   unlocks the lock 'tree_mod_seq_lock';
+
+3) Before task A acquires the lock 'tree_mod_log_lock', task B adds
+ itself to the mod seq list through btrfs_get_tree_mod_seq() and gets a
+ sequence number of 201;
+
+4) Some other task, call it task C, modifies a btree and, because there
+   are elements in the mod seq list, it adds a tree mod elem to the tree
+   mod log rbtree. That node added to the mod log rbtree is assigned
+   a sequence number of 202;
+
+5) Task B, which is doing fiemap and resolving indirect back references,
+   calls btrfs' get_old_root(), with 'time_seq' == 201, which in turn
+   calls tree_mod_log_search() - the search returns the mod log node
+   from the rbtree with sequence number 202, created by task C;
+
+6) Task A now acquires the lock 'tree_mod_log_lock', starts iterating
+ the mod log rbtree and finds the node with sequence number 202. Since
+ 202 is less than the previously computed 'min_seq', (u64)-1, it
+ removes the node and frees it;
+
+7) Task B still has a pointer to the node with sequence number 202, and
+   it dereferences that pointer, both directly and through the call to
+   __tree_mod_log_rewind(), resulting in a use-after-free problem.
+
+This issue can be triggered sporadically with the test case generic/561
+from fstests, and it happens more frequently with a higher number of
+duperemove processes. When it happens to me, it either freezes the VM or
+it produces a trace like the following before crashing:
+
+ [ 1245.321140] general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
+ [ 1245.321200] CPU: 1 PID: 26997 Comm: pool Not tainted 5.5.0-rc6-btrfs-next-52 #1
+ [ 1245.321235] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014
+ [ 1245.321287] RIP: 0010:rb_next+0x16/0x50
+ [ 1245.321307] Code: ....
+ [ 1245.321372] RSP: 0018:ffffa151c4d039b0 EFLAGS: 00010202
+ [ 1245.321388] RAX: 6b6b6b6b6b6b6b6b RBX: ffff8ae221363c80 RCX: 6b6b6b6b6b6b6b6b
+ [ 1245.321409] RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff8ae221363c80
+ [ 1245.321439] RBP: ffff8ae20fcc4688 R08: 0000000000000002 R09: 0000000000000000
+ [ 1245.321475] R10: ffff8ae20b120910 R11: 00000000243f8bb1 R12: 0000000000000038
+ [ 1245.321506] R13: ffff8ae221363c80 R14: 000000000000075f R15: ffff8ae223f762b8
+ [ 1245.321539] FS: 00007fdee1ec7700(0000) GS:ffff8ae236c80000(0000) knlGS:0000000000000000
+ [ 1245.321591] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ [ 1245.321614] CR2: 00007fded4030c48 CR3: 000000021da16003 CR4: 00000000003606e0
+ [ 1245.321642] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+ [ 1245.321668] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+ [ 1245.321706] Call Trace:
+ [ 1245.321798] __tree_mod_log_rewind+0xbf/0x280 [btrfs]
+ [ 1245.321841] btrfs_search_old_slot+0x105/0xd00 [btrfs]
+ [ 1245.321877] resolve_indirect_refs+0x1eb/0xc60 [btrfs]
+ [ 1245.321912] find_parent_nodes+0x3dc/0x11b0 [btrfs]
+ [ 1245.321947] btrfs_check_shared+0x115/0x1c0 [btrfs]
+ [ 1245.321980] ? extent_fiemap+0x59d/0x6d0 [btrfs]
+ [ 1245.322029] extent_fiemap+0x59d/0x6d0 [btrfs]
+ [ 1245.322066] do_vfs_ioctl+0x45a/0x750
+ [ 1245.322081] ksys_ioctl+0x70/0x80
+ [ 1245.322092] ? trace_hardirqs_off_thunk+0x1a/0x1c
+ [ 1245.322113] __x64_sys_ioctl+0x16/0x20
+ [ 1245.322126] do_syscall_64+0x5c/0x280
+ [ 1245.322139] entry_SYSCALL_64_after_hwframe+0x49/0xbe
+ [ 1245.322155] RIP: 0033:0x7fdee3942dd7
+ [ 1245.322177] Code: ....
+ [ 1245.322258] RSP: 002b:00007fdee1ec6c88 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
+ [ 1245.322294] RAX: ffffffffffffffda RBX: 00007fded40210d8 RCX: 00007fdee3942dd7
+ [ 1245.322314] RDX: 00007fded40210d8 RSI: 00000000c020660b RDI: 0000000000000004
+ [ 1245.322337] RBP: 0000562aa89e7510 R08: 0000000000000000 R09: 00007fdee1ec6d44
+ [ 1245.322369] R10: 0000000000000073 R11: 0000000000000246 R12: 00007fdee1ec6d48
+ [ 1245.322390] R13: 00007fdee1ec6d40 R14: 00007fded40210d0 R15: 00007fdee1ec6d50
+ [ 1245.322423] Modules linked in: ....
+ [ 1245.323443] ---[ end trace 01de1e9ec5dff3cd ]---
+
+Fix this by ensuring that btrfs_put_tree_mod_seq() computes the minimum
+sequence number and iterates the rbtree while holding the lock
+'tree_mod_log_lock' in write mode. Also get rid of the 'tree_mod_seq_lock'
+lock, since it is now redundant.
+
+Fixes: bd989ba359f2ac ("Btrfs: add tree modification log functions")
+Fixes: 097b8a7c9e48e2 ("Btrfs: join tree mod log code with the code holding back delayed refs")
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ctree.c | 8 ++------
+ fs/btrfs/ctree.h | 6 ++----
+ fs/btrfs/delayed-ref.c | 8 ++++----
+ fs/btrfs/disk-io.c | 1 -
+ fs/btrfs/tests/btrfs-tests.c | 1 -
+ 5 files changed, 8 insertions(+), 16 deletions(-)
+
+--- a/fs/btrfs/ctree.c
++++ b/fs/btrfs/ctree.c
+@@ -330,12 +330,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_
+ struct seq_list *elem)
+ {
+ write_lock(&fs_info->tree_mod_log_lock);
+- spin_lock(&fs_info->tree_mod_seq_lock);
+ if (!elem->seq) {
+ elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+ list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+ }
+- spin_unlock(&fs_info->tree_mod_seq_lock);
+ write_unlock(&fs_info->tree_mod_log_lock);
+
+ return elem->seq;
+@@ -355,7 +353,7 @@ void btrfs_put_tree_mod_seq(struct btrfs
+ if (!seq_putting)
+ return;
+
+- spin_lock(&fs_info->tree_mod_seq_lock);
++ write_lock(&fs_info->tree_mod_log_lock);
+ list_del(&elem->list);
+ elem->seq = 0;
+
+@@ -366,19 +364,17 @@ void btrfs_put_tree_mod_seq(struct btrfs
+ * blocker with lower sequence number exists, we
+ * cannot remove anything from the log
+ */
+- spin_unlock(&fs_info->tree_mod_seq_lock);
++ write_unlock(&fs_info->tree_mod_log_lock);
+ return;
+ }
+ min_seq = cur_elem->seq;
+ }
+ }
+- spin_unlock(&fs_info->tree_mod_seq_lock);
+
+ /*
+ * anything that's lower than the lowest existing (read: blocked)
+ * sequence number can be removed from the tree.
+ */
+- write_lock(&fs_info->tree_mod_log_lock);
+ tm_root = &fs_info->tree_mod_log;
+ for (node = rb_first(tm_root); node; node = next) {
+ next = rb_next(node);
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -671,14 +671,12 @@ struct btrfs_fs_info {
+ atomic_t nr_delayed_iputs;
+ wait_queue_head_t delayed_iputs_wait;
+
+- /* this protects tree_mod_seq_list */
+- spinlock_t tree_mod_seq_lock;
+ atomic64_t tree_mod_seq;
+- struct list_head tree_mod_seq_list;
+
+- /* this protects tree_mod_log */
++ /* this protects tree_mod_log and tree_mod_seq_list */
+ rwlock_t tree_mod_log_lock;
+ struct rb_root tree_mod_log;
++ struct list_head tree_mod_seq_list;
+
+ atomic_t async_delalloc_pages;
+
+--- a/fs/btrfs/delayed-ref.c
++++ b/fs/btrfs/delayed-ref.c
+@@ -492,7 +492,7 @@ void btrfs_merge_delayed_refs(struct btr
+ if (head->is_data)
+ return;
+
+- spin_lock(&fs_info->tree_mod_seq_lock);
++ read_lock(&fs_info->tree_mod_log_lock);
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ struct seq_list *elem;
+
+@@ -500,7 +500,7 @@ void btrfs_merge_delayed_refs(struct btr
+ struct seq_list, list);
+ seq = elem->seq;
+ }
+- spin_unlock(&fs_info->tree_mod_seq_lock);
++ read_unlock(&fs_info->tree_mod_log_lock);
+
+ again:
+ for (node = rb_first_cached(&head->ref_tree); node;
+@@ -518,7 +518,7 @@ int btrfs_check_delayed_seq(struct btrfs
+ struct seq_list *elem;
+ int ret = 0;
+
+- spin_lock(&fs_info->tree_mod_seq_lock);
++ read_lock(&fs_info->tree_mod_log_lock);
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ elem = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct seq_list, list);
+@@ -531,7 +531,7 @@ int btrfs_check_delayed_seq(struct btrfs
+ }
+ }
+
+- spin_unlock(&fs_info->tree_mod_seq_lock);
++ read_unlock(&fs_info->tree_mod_log_lock);
+ return ret;
+ }
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -2652,7 +2652,6 @@ int open_ctree(struct super_block *sb,
+ spin_lock_init(&fs_info->fs_roots_radix_lock);
+ spin_lock_init(&fs_info->delayed_iput_lock);
+ spin_lock_init(&fs_info->defrag_inodes_lock);
+- spin_lock_init(&fs_info->tree_mod_seq_lock);
+ spin_lock_init(&fs_info->super_lock);
+ spin_lock_init(&fs_info->buffer_lock);
+ spin_lock_init(&fs_info->unused_bgs_lock);
+--- a/fs/btrfs/tests/btrfs-tests.c
++++ b/fs/btrfs/tests/btrfs-tests.c
+@@ -121,7 +121,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_
+ spin_lock_init(&fs_info->qgroup_lock);
+ spin_lock_init(&fs_info->super_lock);
+ spin_lock_init(&fs_info->fs_roots_radix_lock);
+- spin_lock_init(&fs_info->tree_mod_seq_lock);
+ mutex_init(&fs_info->qgroup_ioctl_lock);
+ mutex_init(&fs_info->qgroup_rescan_lock);
+ rwlock_init(&fs_info->tree_mod_log_lock);
--- /dev/null
+From 42ffb0bf584ae5b6b38f72259af1e0ee417ac77f Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Thu, 23 Jan 2020 15:33:02 -0500
+Subject: btrfs: flush write bio if we loop in extent_write_cache_pages
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 42ffb0bf584ae5b6b38f72259af1e0ee417ac77f upstream.
+
+There exists a deadlock with range_cyclic that has existed forever. If
+we loop around with a bio already built, we could deadlock with a writer
+that has locked the page we're attempting to write but is waiting on a
+page in our bio to be written out. The task traces are as follows
+
+ PID: 1329874 TASK: ffff889ebcdf3800 CPU: 33 COMMAND: "kworker/u113:5"
+ #0 [ffffc900297bb658] __schedule at ffffffff81a4c33f
+ #1 [ffffc900297bb6e0] schedule at ffffffff81a4c6e3
+ #2 [ffffc900297bb6f8] io_schedule at ffffffff81a4ca42
+ #3 [ffffc900297bb708] __lock_page at ffffffff811f145b
+ #4 [ffffc900297bb798] __process_pages_contig at ffffffff814bc502
+ #5 [ffffc900297bb8c8] lock_delalloc_pages at ffffffff814bc684
+ #6 [ffffc900297bb900] find_lock_delalloc_range at ffffffff814be9ff
+ #7 [ffffc900297bb9a0] writepage_delalloc at ffffffff814bebd0
+ #8 [ffffc900297bba18] __extent_writepage at ffffffff814bfbf2
+ #9 [ffffc900297bba98] extent_write_cache_pages at ffffffff814bffbd
+
+ PID: 2167901 TASK: ffff889dc6a59c00 CPU: 14 COMMAND:
+ "aio-dio-invalid"
+ #0 [ffffc9003b50bb18] __schedule at ffffffff81a4c33f
+ #1 [ffffc9003b50bba0] schedule at ffffffff81a4c6e3
+ #2 [ffffc9003b50bbb8] io_schedule at ffffffff81a4ca42
+ #3 [ffffc9003b50bbc8] wait_on_page_bit at ffffffff811f24d6
+ #4 [ffffc9003b50bc60] prepare_pages at ffffffff814b05a7
+ #5 [ffffc9003b50bcd8] btrfs_buffered_write at ffffffff814b1359
+ #6 [ffffc9003b50bdb0] btrfs_file_write_iter at ffffffff814b5933
+ #7 [ffffc9003b50be38] new_sync_write at ffffffff8128f6a8
+ #8 [ffffc9003b50bec8] vfs_write at ffffffff81292b9d
+ #9 [ffffc9003b50bf00] ksys_pwrite64 at ffffffff81293032
+
+I used drgn to find the respective pages we were stuck on
+
+page_entry.page 0xffffea00fbfc7500 index 8148 bit 15 pid 2167901
+page_entry.page 0xffffea00f9bb7400 index 7680 bit 0 pid 1329874
+
+As you can see the kworker is waiting for bit 0 (PG_locked) on index
+7680, and aio-dio-invalid is waiting for bit 15 (PG_writeback) on index
+8148. aio-dio-invalid has page 7680 locked, and the kworker epd looks
+like the following
+
+ crash> struct extent_page_data ffffc900297bbbb0
+ struct extent_page_data {
+ bio = 0xffff889f747ed830,
+ tree = 0xffff889eed6ba448,
+ extent_locked = 0,
+ sync_io = 0
+ }
+
+Probably worth mentioning as well that it waits for writeback of the
+page to complete while holding a lock on it (at prepare_pages()).
+
+Using drgn I walked the bio pages looking for page
+0xffffea00fbfc7500 which is the one we're waiting for writeback on
+
+ bio = Object(prog, 'struct bio', address=0xffff889f747ed830)
+ for i in range(0, bio.bi_vcnt.value_()):
+ bv = bio.bi_io_vec[i]
+ if bv.bv_page.value_() == 0xffffea00fbfc7500:
+ print("FOUND IT")
+
+which validated what I suspected.
+
+The fix for this is simple, flush the epd before we loop back around to
+the beginning of the file during writeout.
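+
+As a sketch of the resulting control flow (abbreviated from the diff
+below, not a complete function), the retry path now flushes any
+partially built bio before rescanning from index 0:
+
+	/* range_cyclic wrapped around: rescan from the start */
+	scanned = 1;
+	index = 0;
+
+	/*
+	 * Flush the partially built bio first, so a writer blocked on
+	 * one of its pages can finish and unlock the page we are about
+	 * to wait on.
+	 */
+	ret = flush_write_bio(epd);
+	if (!ret)
+		goto retry;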
+
+Fixes: b293f02e1423 ("Btrfs: Add writepages support")
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/extent_io.c | 11 ++++++++++-
+ 1 file changed, 10 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -4185,7 +4185,16 @@ retry:
+ */
+ scanned = 1;
+ index = 0;
+- goto retry;
++
++ /*
++ * If we're looping we could run into a page that is locked by a
++ * writer and that writer could be waiting on writeback for a
++ * page in our current bio, and thus deadlock, so flush the
++ * write bio here.
++ */
++ ret = flush_write_bio(epd);
++ if (!ret)
++ goto retry;
+ }
+
+ if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
--- /dev/null
+From 831d2fa25ab8e27592b1b0268dae6f2dfaf7cc43 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 16 Dec 2019 18:26:56 +0000
+Subject: Btrfs: make deduplication with range including the last block work
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 831d2fa25ab8e27592b1b0268dae6f2dfaf7cc43 upstream.
+
+Since btrfs was migrated to use the generic VFS helpers for clone and
+deduplication, it stopped allowing for the last block of a file to be
+deduplicated when the source file size is not sector size aligned (when
+eof is somewhere in the middle of the last block). There are two reasons
+for that:
+
+1) The generic code always rounds down, to a multiple of the block size,
+ the range's length for deduplications. This means we end up never
+ deduplicating the last block when the eof is not block size aligned,
+ even for the safe case where the destination range's end offset matches
+ the destination file's size. That rounding down operation is done at
+ generic_remap_check_len();
+
+2) Because of that, the btrfs specific code no longer expects any
+   non-aligned range lengths for deduplication and therefore does not
+   work if such a non-aligned length is given.
+
+This patch addresses that second part, and it depends on a patch that
+fixes generic_remap_check_len(), in the VFS, which was submitted earlier
+and has the following subject:
+
+ "fs: allow deduplication of eof block into the end of the destination file"
+
+These two patches address reports from users that started seeing lower
+deduplication rates due to the last block never being deduplicated when
+the file size is not aligned to the filesystem's block size.
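+
+For illustration, ALIGN() rounds the length up to the next block
+boundary, so a dedup request covering a partial tail block is extended
+to the full block internally. A minimal sketch, assuming the classic
+power-of-two round-up macro and a 4096-byte block size:
+
+	#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))
+
+	/* len = 10000, bs = 4096 -> ALIGN(len, bs) == 12288 */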
+
+Link: https://lore.kernel.org/linux-btrfs/2019-1576167349.500456@svIo.N5dq.dFFD/
+CC: stable@vger.kernel.org # 5.1+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ioctl.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -3244,6 +3244,7 @@ static void btrfs_double_extent_lock(str
+ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
+ struct inode *dst, u64 dst_loff)
+ {
++ const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+ int ret;
+
+ /*
+@@ -3251,7 +3252,7 @@ static int btrfs_extent_same_range(struc
+ * source range to serialize with relocation.
+ */
+ btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
+- ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1);
++ ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
+ btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
+
+ return ret;
--- /dev/null
+From d62b23c94952e78211a383b7d90ef0afbd9a3717 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Fri, 17 Jan 2020 08:57:51 -0500
+Subject: btrfs: set trans->dirty in btrfs_commit_transaction
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit d62b23c94952e78211a383b7d90ef0afbd9a3717 upstream.
+
+If we abort a transaction we have the following sequence
+
+if (!trans->dirty && list_empty(&trans->new_bgs))
+ return;
+WRITE_ONCE(trans->transaction->aborted, err);
+
+The idea being if we didn't modify anything with our trans handle then
+we don't really need to abort the whole transaction, maybe the other
+trans handles are fine and we can carry on.
+
+However in the case of create_snapshot we add a pending_snapshot object
+to our transaction and then commit the transaction. We don't actually
+modify anything. sync() behaves the same way: it attaches to an
+existing transaction and commits it. This means that if we hit an IO
+error in the right place we could abort the committing transaction
+with trans->dirty not set, and thus never set transaction->aborted.
+
+This is a problem because in the create_snapshot() case we depend on
+pending->error being set to something, or btrfs_commit_transaction
+returning an error.
+
+If we are not the trans handle that gets to commit the transaction, and
+we're waiting on the commit to happen we get our return value from
+cur_trans->aborted. If this was not set to anything because sync() hit
+an error in the transaction commit before it could modify anything then
+cur_trans->aborted would be 0. Thus we'd return 0 from
+btrfs_commit_transaction() in create_snapshot.
+
+This is a problem because we then try to do things with
+pending_snapshot->snap, which will be NULL because we didn't create the
+snapshot, and then we'll get a NULL pointer dereference like the
+following
+
+"BUG: kernel NULL pointer dereference, address: 00000000000001f0"
+RIP: 0010:btrfs_orphan_cleanup+0x2d/0x330
+Call Trace:
+ ? btrfs_mksubvol.isra.31+0x3f2/0x510
+ btrfs_mksubvol.isra.31+0x4bc/0x510
+ ? __sb_start_write+0xfa/0x200
+ ? mnt_want_write_file+0x24/0x50
+ btrfs_ioctl_snap_create_transid+0x16c/0x1a0
+ btrfs_ioctl_snap_create_v2+0x11e/0x1a0
+ btrfs_ioctl+0x1534/0x2c10
+ ? free_debug_processing+0x262/0x2a3
+ do_vfs_ioctl+0xa6/0x6b0
+ ? do_sys_open+0x188/0x220
+ ? syscall_trace_enter+0x1f8/0x330
+ ksys_ioctl+0x60/0x90
+ __x64_sys_ioctl+0x16/0x20
+ do_syscall_64+0x4a/0x1b0
+
+In order to fix this we need to make sure anybody who calls
+commit_transaction has trans->dirty set, so that on failure the
+trans->transaction->aborted value is properly set and any waiters
+know bad things happened.
+
+This was found while I was running generic/475 with my modified
+fsstress, it reproduced within a few runs. I ran with this patch all
+night and didn't see the problem again.
+
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/transaction.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -1949,6 +1949,14 @@ int btrfs_commit_transaction(struct btrf
+ struct btrfs_transaction *prev_trans = NULL;
+ int ret;
+
++ /*
++ * Some places just start a transaction to commit it. We need to make
++ * sure that if this commit fails that the abort code actually marks the
++ * transaction as failed, so set trans->dirty to make the abort code do
++ * the right thing.
++ */
++ trans->dirty = true;
++
+ /* Stop the commit early if ->aborted is set */
+ if (unlikely(READ_ONCE(cur_trans->aborted))) {
+ ret = cur_trans->aborted;
--- /dev/null
+From 2c1fb9d86f6820abbfaa38a6836157c76ccb4e7b Mon Sep 17 00:00:00 2001
+From: Claudiu Beznea <claudiu.beznea@microchip.com>
+Date: Wed, 18 Dec 2019 14:28:25 +0200
+Subject: drm: atmel-hlcdc: enable clock before configuring timing engine
+
+From: Claudiu Beznea <claudiu.beznea@microchip.com>
+
+commit 2c1fb9d86f6820abbfaa38a6836157c76ccb4e7b upstream.
+
+Changing the pixel clock source without having this clock source
+enabled will block the timing engine and the operations that follow
+(in this case, setting the ATMEL_HLCDC_CFG(5) settings in
+atmel_hlcdc_crtc_mode_set_nofb() will fail). It is recommended
+(although not stated in the datasheet) to actually enable the pixel
+clock source before making any changes to the timing engine (only the
+SAM9X60 datasheet specifies that the peripheral clock and pixel clock
+must be enabled before using the LCD controller).
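+
+The resulting pattern is the usual clk API bracket around the timing
+engine setup (a condensed sketch of the mode_set path, not the full
+driver code):
+
+	ret = clk_prepare_enable(crtc->dc->hlcdc->sys_clk);
+	if (ret)
+		return;
+
+	/* ... program the timing engine, select the pixel clock ... */
+
+	clk_disable_unprepare(crtc->dc->hlcdc->sys_clk);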
+
+Fixes: 1a396789f65a ("drm: add Atmel HLCDC Display Controller support")
+Signed-off-by: Claudiu Beznea <claudiu.beznea@microchip.com>
+Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
+Cc: Boris Brezillon <boris.brezillon@free-electrons.com>
+Cc: <stable@vger.kernel.org> # v4.0+
+Link: https://patchwork.freedesktop.org/patch/msgid/1576672109-22707-3-git-send-email-claudiu.beznea@microchip.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
++++ b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
+@@ -73,7 +73,11 @@ static void atmel_hlcdc_crtc_mode_set_no
+ unsigned long prate;
+ unsigned int mask = ATMEL_HLCDC_CLKDIV_MASK | ATMEL_HLCDC_CLKPOL;
+ unsigned int cfg = 0;
+- int div;
++ int div, ret;
++
++ ret = clk_prepare_enable(crtc->dc->hlcdc->sys_clk);
++ if (ret)
++ return;
+
+ vm.vfront_porch = adj->crtc_vsync_start - adj->crtc_vdisplay;
+ vm.vback_porch = adj->crtc_vtotal - adj->crtc_vsync_end;
+@@ -147,6 +151,8 @@ static void atmel_hlcdc_crtc_mode_set_no
+ ATMEL_HLCDC_VSPSU | ATMEL_HLCDC_VSPHO |
+ ATMEL_HLCDC_GUARDTIME_MASK | ATMEL_HLCDC_MODE_MASK,
+ cfg);
++
++ clk_disable_unprepare(crtc->dc->hlcdc->sys_clk);
+ }
+
+ static enum drm_mode_status
--- /dev/null
+From 51a19d150b520f6cb42143f3bdffacd3c33d7ac5 Mon Sep 17 00:00:00 2001
+From: Peter Rosin <peda@axentia.se>
+Date: Wed, 18 Dec 2019 14:28:28 +0200
+Subject: drm: atmel-hlcdc: prefer a lower pixel-clock than requested
+
+From: Peter Rosin <peda@axentia.se>
+
+commit 51a19d150b520f6cb42143f3bdffacd3c33d7ac5 upstream.
+
+The intention was to only select a higher pixel-clock rate than the
+requested one if a slight overclocking would result in a rate
+significantly closer to the requested rate than the conservative
+lower pixel-clock rate. The patch being fixed has the logic the other
+way around and actually prefers the higher frequency. Fix that.
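+
+With div = DIV_ROUND_UP(prate, mode_rate) giving the conservative
+(lower) rate and div_low = prate / mode_rate giving the overclocked
+(higher) rate, the higher rate should only win when its error is at
+least ten times smaller (illustrative arithmetic matching the fixed
+condition):
+
+	/* overclock only if it is at least 10x closer to the target */
+	if (div_low >= 2 &&
+	    10 * (prate / div_low - mode_rate) < (mode_rate - prate / div))
+		div = div_low;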
+
+Signed-off-by: Peter Rosin <peda@axentia.se>
+Signed-off-by: Claudiu Beznea <claudiu.beznea@microchip.com>
+Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
+Fixes: 9946a3a9dbed ("drm/atmel-hlcdc: allow selecting a higher pixel-clock than requested")
+Reported-by: Claudiu Beznea <claudiu.beznea@microchip.com>
+Tested-by: Claudiu Beznea <claudiu.beznea@microchip.com>
+Cc: Boris Brezillon <boris.brezillon@bootlin.com>
+Cc: <stable@vger.kernel.org> # v4.20+
+Link: https://patchwork.freedesktop.org/patch/msgid/1576672109-22707-6-git-send-email-claudiu.beznea@microchip.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
++++ b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
+@@ -121,8 +121,8 @@ static void atmel_hlcdc_crtc_mode_set_no
+ int div_low = prate / mode_rate;
+
+ if (div_low >= 2 &&
+- ((prate / div_low - mode_rate) <
+- 10 * (mode_rate - prate / div)))
++ (10 * (prate / div_low - mode_rate) <
++ (mode_rate - prate / div)))
+ /*
+ * At least 10 times better when using a higher
+ * frequency than requested, instead of a lower.
--- /dev/null
+From 07acf4bafe81dd37eff3fbcfbbdbc48084bc202b Mon Sep 17 00:00:00 2001
+From: Claudiu Beznea <claudiu.beznea@microchip.com>
+Date: Wed, 18 Dec 2019 14:28:24 +0200
+Subject: drm: atmel-hlcdc: use double rate for pixel clock only if supported
+
+From: Claudiu Beznea <claudiu.beznea@microchip.com>
+
+commit 07acf4bafe81dd37eff3fbcfbbdbc48084bc202b upstream.
+
+The doubled system clock should be used as the pixel clock source
+only if this is supported, as indicated by the value of
+atmel_hlcdc_crtc::dc::desc::fixed_clksrc.
+
+Fixes: a6eca2abdd42 ("drm: atmel-hlcdc: add config option for clock selection")
+Signed-off-by: Claudiu Beznea <claudiu.beznea@microchip.com>
+Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
+Cc: Boris Brezillon <bbrezillon@kernel.org>
+Cc: <stable@vger.kernel.org> # v5.3+
+Link: https://patchwork.freedesktop.org/patch/msgid/1576672109-22707-2-git-send-email-claudiu.beznea@microchip.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
++++ b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
+@@ -95,14 +95,14 @@ static void atmel_hlcdc_crtc_mode_set_no
+ (adj->crtc_hdisplay - 1) |
+ ((adj->crtc_vdisplay - 1) << 16));
+
++ prate = clk_get_rate(crtc->dc->hlcdc->sys_clk);
++ mode_rate = adj->crtc_clock * 1000;
+ if (!crtc->dc->desc->fixed_clksrc) {
++ prate *= 2;
+ cfg |= ATMEL_HLCDC_CLKSEL;
+ mask |= ATMEL_HLCDC_CLKSEL;
+ }
+
+- prate = 2 * clk_get_rate(crtc->dc->hlcdc->sys_clk);
+- mode_rate = adj->crtc_clock * 1000;
+-
+ div = DIV_ROUND_UP(prate, mode_rate);
+ if (div < 2) {
+ div = 2;
--- /dev/null
+From 433480c1afd44f3e1e664b85063d98cefeefa0ed Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala@linux.intel.com>
+Date: Fri, 22 Nov 2019 19:56:20 +0200
+Subject: drm/rect: Avoid division by zero
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Ville Syrjälä <ville.syrjala@linux.intel.com>
+
+commit 433480c1afd44f3e1e664b85063d98cefeefa0ed upstream.
+
+Check for zero width/height destination rectangle in
+drm_rect_clip_scaled() to avoid a division by zero.
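+
+The helper scales the remaining destination span back into source
+coordinates, which divides by the destination size; a zero-sized
+destination now short-circuits to zero. A simplified sketch (the real
+helper also picks the rounding direction to preserve up/downscaling):
+
+	static u32 clip_scaled(u32 src, u32 dst, u32 clip)
+	{
+		if (dst == 0)
+			return 0;	/* fully clipped: no source left */
+
+		return mul_u32_u32(src, dst - clip) / dst;
+	}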
+
+Cc: stable@vger.kernel.org
+Fixes: f96bdf564f3e ("drm/rect: Handle rounding errors in drm_rect_clip_scaled, v3.")
+Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
+Cc: Benjamin Gaignard <benjamin.gaignard@st.com>
+Cc: Daniel Vetter <daniel@ffwll.ch>
+Testcase: igt/kms_selftest/drm_rect_clip_scaled_div_by_zero
+Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20191122175623.13565-2-ville.syrjala@linux.intel.com
+Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
+Reviewed-by: Benjamin Gaignard <benjamin.gaignard@st.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/gpu/drm/drm_rect.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/drm_rect.c
++++ b/drivers/gpu/drm/drm_rect.c
+@@ -54,7 +54,12 @@ EXPORT_SYMBOL(drm_rect_intersect);
+
+ static u32 clip_scaled(u32 src, u32 dst, u32 clip)
+ {
+- u64 tmp = mul_u32_u32(src, dst - clip);
++ u64 tmp;
++
++ if (dst == 0)
++ return 0;
++
++ tmp = mul_u32_u32(src, dst - clip);
+
+ /*
+ * Round toward 1.0 when clipping so that we don't accidentally
--- /dev/null
+From b5e683d5cab8cd433b06ae178621f083cabd4f63 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Sun, 2 Feb 2020 08:23:03 -0700
+Subject: eventfd: track eventfd_signal() recursion depth
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit b5e683d5cab8cd433b06ae178621f083cabd4f63 upstream.
+
+eventfd use cases from aio and io_uring can deadlock due to circular
+or recursive calling, when eventfd_signal() tries to grab the waitqueue
+lock. On top of that, it's also possible to construct notification
+chains that are deep enough that we could blow the stack.
+
+Add a percpu counter that tracks the per-cpu recursion depth, and warn
+if we recurse. The counter is also exposed so that users of eventfd_signal()
+can do the right thing if it's non-zero in the context where it is
+called.
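+
+The guard itself is a small per-cpu counter bracketing the wakeup (a
+condensed sketch of the mechanism; see the diff below for the real
+placement inside eventfd_signal()):
+
+	DEFINE_PER_CPU(int, eventfd_wake_count);
+
+	if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+		return 0;	/* recursing - caller must defer */
+
+	this_cpu_inc(eventfd_wake_count);
+	wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+	this_cpu_dec(eventfd_wake_count);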
+
+Cc: stable@vger.kernel.org # 4.19+
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/eventfd.c | 15 +++++++++++++++
+ include/linux/eventfd.h | 14 ++++++++++++++
+ 2 files changed, 29 insertions(+)
+
+--- a/fs/eventfd.c
++++ b/fs/eventfd.c
+@@ -24,6 +24,8 @@
+ #include <linux/seq_file.h>
+ #include <linux/idr.h>
+
++DEFINE_PER_CPU(int, eventfd_wake_count);
++
+ static DEFINE_IDA(eventfd_ida);
+
+ struct eventfd_ctx {
+@@ -60,12 +62,25 @@ __u64 eventfd_signal(struct eventfd_ctx
+ {
+ unsigned long flags;
+
++ /*
++ * Deadlock or stack overflow issues can happen if we recurse here
++ * through waitqueue wakeup handlers. If the caller uses potentially
++ * nested waitqueues with custom wakeup handlers, then it should
++ * check eventfd_signal_count() before calling this function. If
++ * it returns true, the eventfd_signal() call should be deferred to a
++ * safe context.
++ */
++ if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
++ return 0;
++
+ spin_lock_irqsave(&ctx->wqh.lock, flags);
++ this_cpu_inc(eventfd_wake_count);
+ if (ULLONG_MAX - ctx->count < n)
+ n = ULLONG_MAX - ctx->count;
+ ctx->count += n;
+ if (waitqueue_active(&ctx->wqh))
+ wake_up_locked_poll(&ctx->wqh, EPOLLIN);
++ this_cpu_dec(eventfd_wake_count);
+ spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+
+ return n;
+--- a/include/linux/eventfd.h
++++ b/include/linux/eventfd.h
+@@ -12,6 +12,8 @@
+ #include <linux/fcntl.h>
+ #include <linux/wait.h>
+ #include <linux/err.h>
++#include <linux/percpu-defs.h>
++#include <linux/percpu.h>
+
+ /*
+ * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
+@@ -40,6 +42,13 @@ __u64 eventfd_signal(struct eventfd_ctx
+ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
+ __u64 *cnt);
+
++DECLARE_PER_CPU(int, eventfd_wake_count);
++
++static inline bool eventfd_signal_count(void)
++{
++ return this_cpu_read(eventfd_wake_count);
++}
++
+ #else /* CONFIG_EVENTFD */
+
+ /*
+@@ -68,6 +77,11 @@ static inline int eventfd_ctx_remove_wai
+ return -ENOSYS;
+ }
+
++static inline bool eventfd_signal_count(void)
++{
++ return false;
++}
++
+ #endif
+
+ #endif /* _LINUX_EVENTFD_H */
--- /dev/null
+From 547c556f4db7c09447ecf5f833ab6aaae0c5ab58 Mon Sep 17 00:00:00 2001
+From: Eric Biggers <ebiggers@google.com>
+Date: Tue, 31 Dec 2019 12:11:49 -0600
+Subject: ext4: fix deadlock allocating crypto bounce page from mempool
+
+From: Eric Biggers <ebiggers@google.com>
+
+commit 547c556f4db7c09447ecf5f833ab6aaae0c5ab58 upstream.
+
+ext4_writepages() on an encrypted file has to encrypt the data, but it
+can't modify the pagecache pages in-place, so it encrypts the data into
+bounce pages and writes those instead. All bounce pages are allocated
+from a mempool using GFP_NOFS.
+
+This is not correct use of a mempool, and it can deadlock. This is
+because GFP_NOFS includes __GFP_DIRECT_RECLAIM, which enables the "never
+fail" mode for mempool_alloc() where a failed allocation will fall back
+to waiting for one of the preallocated elements in the pool.
+
+But since this mode is used for all of a bio's pages and not just the
+first, it can deadlock waiting for pages already in the bio to be freed.
+
+This deadlock can be reproduced by patching mempool_alloc() to pretend
+that pool->alloc() always fails (so that it always falls back to the
+preallocations), and then creating an encrypted file of size > 128 KiB.
+
+Fix it by only using GFP_NOFS for the first page in the bio. For
+subsequent pages just use GFP_NOWAIT, and if any of those fail, just
+submit the bio and start a new one.
+
+This will need to be fixed in f2fs too, but that's less straightforward.
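+
+The allocation policy boils down to: a guaranteed allocation only
+while the bio is empty, opportunistic afterwards (a condensed sketch
+of the logic, not the full ext4_bio_write_page()):
+
+	gfp_t gfp_flags = GFP_NOFS;
+
+	/* only the bio's first bounce page may block on the mempool */
+	if (io->io_bio)
+		gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
+
+	/* on -ENOMEM: submit the current bio, retry with GFP_NOFS */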
+
+Fixes: c9af28fdd449 ("ext4 crypto: don't let data integrity writebacks fail with ENOMEM")
+Cc: stable@vger.kernel.org
+Signed-off-by: Eric Biggers <ebiggers@google.com>
+Link: https://lore.kernel.org/r/20191231181149.47619-1-ebiggers@kernel.org
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/page-io.c | 19 ++++++++++++++-----
+ 1 file changed, 14 insertions(+), 5 deletions(-)
+
+--- a/fs/ext4/page-io.c
++++ b/fs/ext4/page-io.c
+@@ -478,17 +478,26 @@ int ext4_bio_write_page(struct ext4_io_s
+ gfp_t gfp_flags = GFP_NOFS;
+ unsigned int enc_bytes = round_up(len, i_blocksize(inode));
+
++ /*
++ * Since bounce page allocation uses a mempool, we can only use
++ * a waiting mask (i.e. request guaranteed allocation) on the
++ * first page of the bio. Otherwise it can deadlock.
++ */
++ if (io->io_bio)
++ gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
+ retry_encrypt:
+ bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes,
+ 0, gfp_flags);
+ if (IS_ERR(bounce_page)) {
+ ret = PTR_ERR(bounce_page);
+- if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
+- if (io->io_bio) {
++ if (ret == -ENOMEM &&
++ (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
++ gfp_flags = GFP_NOFS;
++ if (io->io_bio)
+ ext4_io_submit(io);
+- congestion_wait(BLK_RW_ASYNC, HZ/50);
+- }
+- gfp_flags |= __GFP_NOFAIL;
++ else
++ gfp_flags |= __GFP_NOFAIL;
++ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ goto retry_encrypt;
+ }
+ bounce_page = NULL;
--- /dev/null
+From ec772f01307a2c06ebf6cdd221e6b518a71ddae7 Mon Sep 17 00:00:00 2001
+From: Eric Biggers <ebiggers@google.com>
+Date: Thu, 23 Jan 2020 20:12:34 -0800
+Subject: ext4: fix race conditions in ->d_compare() and ->d_hash()
+
+From: Eric Biggers <ebiggers@google.com>
+
+commit ec772f01307a2c06ebf6cdd221e6b518a71ddae7 upstream.
+
+Since ->d_compare() and ->d_hash() can be called in RCU-walk mode,
+->d_parent and ->d_inode can be concurrently modified, and in
+particular, ->d_inode may be changed to NULL. For ext4_d_hash() this
+resulted in a reproducible NULL dereference if a lookup is done in a
+directory being deleted, e.g. with:
+
+ int main()
+ {
+ if (fork()) {
+ for (;;) {
+ mkdir("subdir", 0700);
+ rmdir("subdir");
+ }
+ } else {
+ for (;;)
+ access("subdir/file", 0);
+ }
+ }
+
+... or by running the 't_encrypted_d_revalidate' program from xfstests.
+Both repros work in any directory on a filesystem with the encoding
+feature, even if the directory doesn't actually have the casefold flag.
+
+I couldn't reproduce a crash in ext4_d_compare(), but it appears that a
+similar crash is possible there.
+
+Fix these bugs by reading ->d_parent and ->d_inode using READ_ONCE() and
+falling back to the case sensitive behavior if the inode is NULL.
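+
+The pattern for RCU-walk-safe helpers is to snapshot the racy
+pointers once and then only use the snapshots (abbreviated from the
+fixed ext4_d_compare() below):
+
+	const struct dentry *parent = READ_ONCE(dentry->d_parent);
+	const struct inode *inode = READ_ONCE(parent->d_inode);
+
+	if (!inode || !IS_CASEFOLDED(inode) ||
+	    !EXT4_SB(inode->i_sb)->s_encoding) {
+		/* fall back to the case-sensitive comparison */
+	}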
+
+Reported-by: Al Viro <viro@zeniv.linux.org.uk>
+Fixes: b886ee3e778e ("ext4: Support case-insensitive file name lookups")
+Cc: <stable@vger.kernel.org> # v5.2+
+Signed-off-by: Eric Biggers <ebiggers@google.com>
+Link: https://lore.kernel.org/r/20200124041234.159740-1-ebiggers@kernel.org
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/dir.c | 9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+--- a/fs/ext4/dir.c
++++ b/fs/ext4/dir.c
+@@ -673,9 +673,11 @@ static int ext4_d_compare(const struct d
+ const char *str, const struct qstr *name)
+ {
+ struct qstr qstr = {.name = str, .len = len };
+- struct inode *inode = dentry->d_parent->d_inode;
++ const struct dentry *parent = READ_ONCE(dentry->d_parent);
++ const struct inode *inode = READ_ONCE(parent->d_inode);
+
+- if (!IS_CASEFOLDED(inode) || !EXT4_SB(inode->i_sb)->s_encoding) {
++ if (!inode || !IS_CASEFOLDED(inode) ||
++ !EXT4_SB(inode->i_sb)->s_encoding) {
+ if (len != name->len)
+ return -1;
+ return memcmp(str, name->name, len);
+@@ -688,10 +690,11 @@ static int ext4_d_hash(const struct dent
+ {
+ const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb);
+ const struct unicode_map *um = sbi->s_encoding;
++ const struct inode *inode = READ_ONCE(dentry->d_inode);
+ unsigned char *norm;
+ int len, ret = 0;
+
+- if (!IS_CASEFOLDED(dentry->d_inode) || !um)
++ if (!inode || !IS_CASEFOLDED(inode) || !um)
+ return 0;
+
+ norm = kmalloc(PATH_MAX, GFP_ATOMIC);
--- /dev/null
+From 7582026f6f3588ecebd281965c8a71aff6fb6158 Mon Sep 17 00:00:00 2001
+From: Abhi Das <adas@redhat.com>
+Date: Tue, 4 Feb 2020 14:14:56 -0600
+Subject: gfs2: fix gfs2_find_jhead that returns uninitialized jhead with seq 0
+
+From: Abhi Das <adas@redhat.com>
+
+commit 7582026f6f3588ecebd281965c8a71aff6fb6158 upstream.
+
+When the first log header in a journal happens to have a sequence
+number of 0, a bug in gfs2_find_jhead() causes it to prematurely exit,
+and return an uninitialized jhead with seq 0. This can cause failures
+in the caller. For instance, a mount fails in one test case.
+
+The correct behavior is for it to continue searching through the journal
+to find the correct journal head with the highest sequence number.
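+
+With the head seeded at sequence 0, a strict '>' comparison silently
+ignores a genuine log header whose sequence is also 0, while '>='
+records it (an abbreviated sketch of the page-scan loop, with the
+early exit condensed to a return):
+
+	if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) {
+		if (lh.lh_sequence >= head->lh_sequence)
+			*head = lh;	/* newest header seen so far */
+		else
+			return true;	/* sequence dropped: head found */
+	}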
+
+Fixes: f4686c26ecc3 ("gfs2: read journal in large chunks")
+Cc: stable@vger.kernel.org # v5.2+
+Signed-off-by: Abhi Das <adas@redhat.com>
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/gfs2/lops.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/gfs2/lops.c
++++ b/fs/gfs2/lops.c
+@@ -421,7 +421,7 @@ static bool gfs2_jhead_pg_srch(struct gf
+
+ for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) {
+ if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) {
+- if (lh.lh_sequence > head->lh_sequence)
++ if (lh.lh_sequence >= head->lh_sequence)
+ *head = lh;
+ else {
+ ret = true;
--- /dev/null
+From 6e5e41e2dc4e4413296d5a4af54ac92d7cd52317 Mon Sep 17 00:00:00 2001
+From: Andreas Gruenbacher <agruenba@redhat.com>
+Date: Tue, 14 Jan 2020 17:12:18 +0100
+Subject: gfs2: fix O_SYNC write handling
+
+From: Andreas Gruenbacher <agruenba@redhat.com>
+
+commit 6e5e41e2dc4e4413296d5a4af54ac92d7cd52317 upstream.
+
+In gfs2_file_write_iter, for direct writes, the error checking in the buffered
+write fallback case is incomplete. This can cause inode write errors to go
+undetected. Fix and clean up gfs2_file_write_iter along the way.
+
+Based on a proposed fix by Christoph Hellwig <hch@lst.de>.
+
+Fixes: 967bcc91b044 ("gfs2: iomap direct I/O support")
+Cc: stable@vger.kernel.org # v4.19+
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/gfs2/file.c | 51 +++++++++++++++++++++------------------------------
+ 1 file changed, 21 insertions(+), 30 deletions(-)
+
+--- a/fs/gfs2/file.c
++++ b/fs/gfs2/file.c
+@@ -833,7 +833,7 @@ static ssize_t gfs2_file_write_iter(stru
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct gfs2_inode *ip = GFS2_I(inode);
+- ssize_t written = 0, ret;
++ ssize_t ret;
+
+ ret = gfs2_rsqa_alloc(ip);
+ if (ret)
+@@ -865,55 +865,46 @@ static ssize_t gfs2_file_write_iter(stru
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct address_space *mapping = file->f_mapping;
+- loff_t pos, endbyte;
+- ssize_t buffered;
++ ssize_t buffered, ret2;
+
+- written = gfs2_file_direct_write(iocb, from);
+- if (written < 0 || !iov_iter_count(from))
++ ret = gfs2_file_direct_write(iocb, from);
++ if (ret < 0 || !iov_iter_count(from))
+ goto out_unlock;
+
++ iocb->ki_flags |= IOCB_DSYNC;
+ current->backing_dev_info = inode_to_bdi(inode);
+- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
++ buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+ current->backing_dev_info = NULL;
+- if (unlikely(ret < 0))
++ if (unlikely(buffered <= 0))
+ goto out_unlock;
+- buffered = ret;
+
+ /*
+ * We need to ensure that the page cache pages are written to
+ * disk and invalidated to preserve the expected O_DIRECT
+- * semantics.
++ * semantics. If the writeback or invalidate fails, only report
++ * the direct I/O range as we don't know if the buffered pages
++ * made it to disk.
+ */
+- pos = iocb->ki_pos;
+- endbyte = pos + buffered - 1;
+- ret = filemap_write_and_wait_range(mapping, pos, endbyte);
+- if (!ret) {
+- iocb->ki_pos += buffered;
+- written += buffered;
+- invalidate_mapping_pages(mapping,
+- pos >> PAGE_SHIFT,
+- endbyte >> PAGE_SHIFT);
+- } else {
+- /*
+- * We don't know how much we wrote, so just return
+- * the number of bytes which were direct-written
+- */
+- }
++ iocb->ki_pos += buffered;
++ ret2 = generic_write_sync(iocb, buffered);
++ invalidate_mapping_pages(mapping,
++ (iocb->ki_pos - buffered) >> PAGE_SHIFT,
++ (iocb->ki_pos - 1) >> PAGE_SHIFT);
++ if (!ret || ret2 > 0)
++ ret += ret2;
+ } else {
+ current->backing_dev_info = inode_to_bdi(inode);
+ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+ current->backing_dev_info = NULL;
+- if (likely(ret > 0))
++ if (likely(ret > 0)) {
+ iocb->ki_pos += ret;
++ ret = generic_write_sync(iocb, ret);
++ }
+ }
+
+ out_unlock:
+ inode_unlock(inode);
+- if (likely(ret > 0)) {
+- /* Handle various SYNC-type writes */
+- ret = generic_write_sync(iocb, ret);
+- }
+- return written ? written : ret;
++ return ret;
+ }
+
+ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
--- /dev/null
+From 4c0e8dda608a51855225c611b5c6b442f95fbc56 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Wed, 15 Jan 2020 16:38:29 +0100
+Subject: gfs2: move setting current->backing_dev_info
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 4c0e8dda608a51855225c611b5c6b442f95fbc56 upstream.
+
+Set current->backing_dev_info just around the buffered write calls to
+prepare for the next fix.
+
+Fixes: 967bcc91b044 ("gfs2: iomap direct I/O support")
+Cc: stable@vger.kernel.org # v4.19+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/gfs2/file.c | 21 ++++++++++-----------
+ 1 file changed, 10 insertions(+), 11 deletions(-)
+
+--- a/fs/gfs2/file.c
++++ b/fs/gfs2/file.c
+@@ -853,18 +853,15 @@ static ssize_t gfs2_file_write_iter(stru
+ inode_lock(inode);
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+- goto out;
+-
+- /* We can write back this queue in page reclaim */
+- current->backing_dev_info = inode_to_bdi(inode);
++ goto out_unlock;
+
+ ret = file_remove_privs(file);
+ if (ret)
+- goto out2;
++ goto out_unlock;
+
+ ret = file_update_time(file);
+ if (ret)
+- goto out2;
++ goto out_unlock;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct address_space *mapping = file->f_mapping;
+@@ -873,11 +870,13 @@ static ssize_t gfs2_file_write_iter(stru
+
+ written = gfs2_file_direct_write(iocb, from);
+ if (written < 0 || !iov_iter_count(from))
+- goto out2;
++ goto out_unlock;
+
++ current->backing_dev_info = inode_to_bdi(inode);
+ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
++ current->backing_dev_info = NULL;
+ if (unlikely(ret < 0))
+- goto out2;
++ goto out_unlock;
+ buffered = ret;
+
+ /*
+@@ -901,14 +900,14 @@ static ssize_t gfs2_file_write_iter(stru
+ */
+ }
+ } else {
++ current->backing_dev_info = inode_to_bdi(inode);
+ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
++ current->backing_dev_info = NULL;
+ if (likely(ret > 0))
+ iocb->ki_pos += ret;
+ }
+
+-out2:
+- current->backing_dev_info = NULL;
+-out:
++out_unlock:
+ inode_unlock(inode);
+ if (likely(ret > 0)) {
+ /* Handle various SYNC-type writes */
--- /dev/null
+From 197288d5ba8a5289f22d3aeb4fca3824bfd9b4af Mon Sep 17 00:00:00 2001
+From: Luca Coelho <luciano.coelho@intel.com>
+Date: Fri, 31 Jan 2020 15:45:25 +0200
+Subject: iwlwifi: don't throw error when trying to remove IGTK
+
+From: Luca Coelho <luciano.coelho@intel.com>
+
+commit 197288d5ba8a5289f22d3aeb4fca3824bfd9b4af upstream.
+
+The IGTK keys are only removed by mac80211 after it has already
+removed the AP station. This causes the driver to throw an error
+because mac80211 is trying to remove the IGTK when the station doesn't
+exist anymore.
+
+The firmware is aware that the station has been removed and can deal
+with it the next time we try to add an IGTK for a station, so we
+shouldn't try to remove the key if the station ID is
+IWL_MVM_INVALID_STA. Do this by removing the check for mvm_sta before
+calling iwl_mvm_send_sta_igtk() and returning gracefully from that
+function if the station ID is invalid.
+
+Cc: stable@vger.kernel.org # 4.12+
+Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
+Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+--- a/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
++++ b/drivers/net/wireless/intel/iwlwifi/mvm/sta.c
+@@ -3321,6 +3321,10 @@ static int iwl_mvm_send_sta_igtk(struct
+ igtk_cmd.sta_id = cpu_to_le32(sta_id);
+
+ if (remove_key) {
++ /* This is a valid situation for IGTK */
++ if (sta_id == IWL_MVM_INVALID_STA)
++ return 0;
++
+ igtk_cmd.ctrl_flags |= cpu_to_le32(STA_KEY_NOT_VALID);
+ } else {
+ struct ieee80211_key_seq seq;
+@@ -3575,9 +3579,9 @@ int iwl_mvm_remove_sta_key(struct iwl_mv
+ IWL_DEBUG_WEP(mvm, "mvm remove dynamic key: idx=%d sta=%d\n",
+ keyconf->keyidx, sta_id);
+
+- if (mvm_sta && (keyconf->cipher == WLAN_CIPHER_SUITE_AES_CMAC ||
+- keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_128 ||
+- keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_256))
++ if (keyconf->cipher == WLAN_CIPHER_SUITE_AES_CMAC ||
++ keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_128 ||
++ keyconf->cipher == WLAN_CIPHER_SUITE_BIP_GMAC_256)
+ return iwl_mvm_send_sta_igtk(mvm, keyconf, sta_id, true);
+
+ if (!__test_and_clear_bit(keyconf->hw_key_idx, mvm->fw_key_table)) {
--- /dev/null
+From 1a8e9cf40c9a6a2e40b1e924b13ed303aeea4418 Mon Sep 17 00:00:00 2001
+From: Vasily Averin <vvs@virtuozzo.com>
+Date: Thu, 23 Jan 2020 12:05:10 +0300
+Subject: jbd2_seq_info_next should increase position index
+
+From: Vasily Averin <vvs@virtuozzo.com>
+
+commit 1a8e9cf40c9a6a2e40b1e924b13ed303aeea4418 upstream.
+
+If a seq_file .next function does not change the position index, a
+read after some lseek can generate unexpected output.
+
+The script below generates endless output:
+ $ q=;while read -r r;do echo "$((++q)) $r";done </proc/fs/jbd2/DEV/info
+
+https://bugzilla.kernel.org/show_bug.cgi?id=206283
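+
+The seq_file contract is that .next must advance *pos even when there
+is nothing more to emit; for this single-record file the compliant
+iterator is simply (mirroring the diff below):
+
+	static void *jbd2_seq_info_next(struct seq_file *seq, void *v,
+					loff_t *pos)
+	{
+		(*pos)++;	/* always advance, even at EOF */
+		return NULL;	/* only one record to show */
+	}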
+
+Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface")
+Cc: stable@kernel.org
+Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Link: https://lore.kernel.org/r/d13805e5-695e-8ac3-b678-26ca2313629f@virtuozzo.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/jbd2/journal.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/jbd2/journal.c
++++ b/fs/jbd2/journal.c
+@@ -981,6 +981,7 @@ static void *jbd2_seq_info_start(struct
+
+ static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
+ {
++ (*pos)++;
+ return NULL;
+ }
+
--- /dev/null
+From 1a978d9d3e72ddfa40ac60d26301b154247ee0bc Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Wed, 18 Dec 2019 13:54:46 -0800
+Subject: KVM: PPC: Book3S HV: Uninit vCPU if vcore creation fails
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit 1a978d9d3e72ddfa40ac60d26301b154247ee0bc upstream.
+
+Call kvm_vcpu_uninit() if vcore creation fails to avoid leaking any
+resources allocated by kvm_vcpu_init(), i.e. the vcpu->run page.
+
+Fixes: 371fefd6f2dc4 ("KVM: PPC: Allow book3s_hv guests to use SMT processor modes")
+Cc: stable@vger.kernel.org
+Reviewed-by: Greg Kurz <groug@kaod.org>
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Acked-by: Paul Mackerras <paulus@ozlabs.org>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/powerpc/kvm/book3s_hv.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/arch/powerpc/kvm/book3s_hv.c
++++ b/arch/powerpc/kvm/book3s_hv.c
+@@ -2354,7 +2354,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu
+ mutex_unlock(&kvm->lock);
+
+ if (!vcore)
+- goto free_vcpu;
++ goto uninit_vcpu;
+
+ spin_lock(&vcore->lock);
+ ++vcore->num_threads;
+@@ -2371,6 +2371,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu
+
+ return vcpu;
+
++uninit_vcpu:
++ kvm_vcpu_uninit(vcpu);
+ free_vcpu:
+ kmem_cache_free(kvm_vcpu_cache, vcpu);
+ out:
--- /dev/null
+From cb10bf9194f4d2c5d830eddca861f7ca0fecdbb4 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Wed, 18 Dec 2019 13:54:47 -0800
+Subject: KVM: PPC: Book3S PR: Free shared page if mmu initialization fails
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit cb10bf9194f4d2c5d830eddca861f7ca0fecdbb4 upstream.
+
+Explicitly free the shared page if kvmppc_mmu_init() fails during
+kvmppc_core_vcpu_create(), as the page is freed only in
+kvmppc_core_vcpu_free(), which is not reached via kvm_vcpu_uninit().
+
+Fixes: 96bc451a15329 ("KVM: PPC: Introduce shared page")
+Cc: stable@vger.kernel.org
+Reviewed-by: Greg Kurz <groug@kaod.org>
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Acked-by: Paul Mackerras <paulus@ozlabs.org>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/powerpc/kvm/book3s_pr.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/arch/powerpc/kvm/book3s_pr.c
++++ b/arch/powerpc/kvm/book3s_pr.c
+@@ -1769,10 +1769,12 @@ static struct kvm_vcpu *kvmppc_core_vcpu
+
+ err = kvmppc_mmu_init(vcpu);
+ if (err < 0)
+- goto uninit_vcpu;
++ goto free_shared_page;
+
+ return vcpu;
+
++free_shared_page:
++ free_page((unsigned long)vcpu->arch.shared);
+ uninit_vcpu:
+ kvm_vcpu_uninit(vcpu);
+ free_shadow_vcpu:
--- /dev/null
+From a47970ed74a535b1accb4bc73643fd5a93993c3e Mon Sep 17 00:00:00 2001
+From: John Allen <john.allen@amd.com>
+Date: Thu, 19 Dec 2019 14:17:59 -0600
+Subject: kvm/svm: PKU not currently supported
+
+From: John Allen <john.allen@amd.com>
+
+commit a47970ed74a535b1accb4bc73643fd5a93993c3e upstream.
+
+The current SVM implementation does not have support for handling PKU.
+Guests running on a host with future AMD cpus that support the feature
+will read garbage from the PKRU register and will hit segmentation
+faults on boot, as memory that should not be protected is getting
+marked as protected. Ensure that cpuid from SVM does not advertise the
+feature.
+
+Signed-off-by: John Allen <john.allen@amd.com>
+Cc: stable@vger.kernel.org
+Fixes: 0556cbdc2fbc ("x86/pkeys: Don't check if PKRU is zero before writing it")
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/kvm_host.h | 1 +
+ arch/x86/kvm/cpuid.c | 4 +++-
+ arch/x86/kvm/svm.c | 6 ++++++
+ arch/x86/kvm/vmx/capabilities.h | 5 +++++
+ arch/x86/kvm/vmx/vmx.c | 1 +
+ 5 files changed, 16 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -1128,6 +1128,7 @@ struct kvm_x86_ops {
+ bool (*xsaves_supported)(void);
+ bool (*umip_emulated)(void);
+ bool (*pt_supported)(void);
++ bool (*pku_supported)(void);
+
+ int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
+ void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
+--- a/arch/x86/kvm/cpuid.c
++++ b/arch/x86/kvm/cpuid.c
+@@ -352,6 +352,7 @@ static inline void do_cpuid_7_mask(struc
+ unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
+ unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
+ unsigned f_la57;
++ unsigned f_pku = kvm_x86_ops->pku_supported() ? F(PKU) : 0;
+
+ /* cpuid 7.0.ebx */
+ const u32 kvm_cpuid_7_0_ebx_x86_features =
+@@ -363,7 +364,7 @@ static inline void do_cpuid_7_mask(struc
+
+ /* cpuid 7.0.ecx*/
+ const u32 kvm_cpuid_7_0_ecx_x86_features =
+- F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
++ F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) |
+ F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
+ F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
+ F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/;
+@@ -392,6 +393,7 @@ static inline void do_cpuid_7_mask(struc
+ /* Set LA57 based on hardware capability. */
+ entry->ecx |= f_la57;
+ entry->ecx |= f_umip;
++ entry->ecx |= f_pku;
+ /* PKU is not yet implemented for shadow paging. */
+ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
+ entry->ecx &= ~F(PKU);
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -5986,6 +5986,11 @@ static bool svm_has_wbinvd_exit(void)
+ return true;
+ }
+
++static bool svm_pku_supported(void)
++{
++ return false;
++}
++
+ #define PRE_EX(exit) { .exit_code = (exit), \
+ .stage = X86_ICPT_PRE_EXCEPT, }
+ #define POST_EX(exit) { .exit_code = (exit), \
+@@ -7278,6 +7283,7 @@ static struct kvm_x86_ops svm_x86_ops __
+ .xsaves_supported = svm_xsaves_supported,
+ .umip_emulated = svm_umip_emulated,
+ .pt_supported = svm_pt_supported,
++ .pku_supported = svm_pku_supported,
+
+ .set_supported_cpuid = svm_set_supported_cpuid,
+
+--- a/arch/x86/kvm/vmx/capabilities.h
++++ b/arch/x86/kvm/vmx/capabilities.h
+@@ -145,6 +145,11 @@ static inline bool vmx_umip_emulated(voi
+ SECONDARY_EXEC_DESC;
+ }
+
++static inline bool vmx_pku_supported(void)
++{
++ return boot_cpu_has(X86_FEATURE_PKU);
++}
++
+ static inline bool cpu_has_vmx_rdtscp(void)
+ {
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+--- a/arch/x86/kvm/vmx/vmx.c
++++ b/arch/x86/kvm/vmx/vmx.c
+@@ -7865,6 +7865,7 @@ static struct kvm_x86_ops vmx_x86_ops __
+ .xsaves_supported = vmx_xsaves_supported,
+ .umip_emulated = vmx_umip_emulated,
+ .pt_supported = vmx_pt_supported,
++ .pku_supported = vmx_pku_supported,
+
+ .request_immediate_exit = vmx_request_immediate_exit,
+
--- /dev/null
+From f958bd2314d117f8c29f4821401bc1925bc2e5ef Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+Date: Mon, 9 Dec 2019 12:19:31 -0800
+Subject: KVM: x86: Fix potential put_fpu() w/o load_fpu() on MPX platform
+
+From: Sean Christopherson <sean.j.christopherson@intel.com>
+
+commit f958bd2314d117f8c29f4821401bc1925bc2e5ef upstream.
+
+Unlike most state managed by XSAVE, MPX is initialized to zero on INIT.
+Because INITs are usually recognized in the context of a VCPU_RUN call,
+kvm_vcpu_reset() puts the guest's FPU so that the FPU state is resident
+in memory, zeros the MPX state, and reloads FPU state to hardware. But,
+in the unlikely event that an INIT is recognized during
+kvm_arch_vcpu_ioctl_get_mpstate() via kvm_apic_accept_events(),
+kvm_vcpu_reset() will call kvm_put_guest_fpu() without a preceding
+kvm_load_guest_fpu() and corrupt the guest's FPU state (and possibly
+userspace's FPU state as well).
+
+Given that MPX is being removed from the kernel[*], fix the bug with the
+simple-but-ugly approach of loading the guest's FPU during
+KVM_GET_MP_STATE.
+
+[*] See commit f240652b6032b ("x86/mpx: Remove MPX APIs").
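+
+The fix brackets the mp_state read with a guest-FPU load/put pair on
+MPX-capable hosts (condensed from the diff below):
+
+	vcpu_load(vcpu);
+	if (kvm_mpx_supported())
+		kvm_load_guest_fpu(vcpu);
+
+	kvm_apic_accept_events(vcpu);	/* may recognize an INIT */
+	/* ... read mp_state ... */
+
+	if (kvm_mpx_supported())
+		kvm_put_guest_fpu(vcpu);
+	vcpu_put(vcpu);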
+
+Fixes: f775b13eedee2 ("x86,kvm: move qemu/guest FPU switching out to vcpu_run")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -8698,6 +8698,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(stru
+ struct kvm_mp_state *mp_state)
+ {
+ vcpu_load(vcpu);
++ if (kvm_mpx_supported())
++ kvm_load_guest_fpu(vcpu);
+
+ kvm_apic_accept_events(vcpu);
+ if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
+@@ -8706,6 +8708,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(stru
+ else
+ mp_state->mp_state = vcpu->arch.mp_state;
+
++ if (kvm_mpx_supported())
++ kvm_put_guest_fpu(vcpu);
+ vcpu_put(vcpu);
+ return 0;
+ }
--- /dev/null
+From ea740059ecb37807ba47b84b33d1447435a8d868 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:52 -0800
+Subject: KVM: x86: Protect DR-based index computations from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit ea740059ecb37807ba47b84b33d1447435a8d868 upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in __kvm_set_dr() and
+kvm_get_dr().
+Both kvm_get_dr() and kvm_set_dr() (a wrapper of __kvm_set_dr()) are
+exported symbols so KVM should treat them conservatively from a security
+perspective.
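+
+The mitigation pattern used throughout this series is to clamp the
+index with array_index_nospec() after the bounds check, so it cannot
+be used speculatively out of range (sketch of the fixed debug-register
+write, from the diff below):
+
+	size_t size = ARRAY_SIZE(vcpu->arch.db);
+
+	switch (dr) {
+	case 0 ... 3:
+		vcpu->arch.db[array_index_nospec(dr, size)] = val;
+		break;
+	/* ... */
+	}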
+
+Fixes: 020df0794f57 ("KVM: move DR register access handling into generic code")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -1054,9 +1054,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu
+
+ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+ {
++ size_t size = ARRAY_SIZE(vcpu->arch.db);
++
+ switch (dr) {
+ case 0 ... 3:
+- vcpu->arch.db[dr] = val;
++ vcpu->arch.db[array_index_nospec(dr, size)] = val;
+ if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+ vcpu->arch.eff_db[dr] = val;
+ break;
+@@ -1093,9 +1095,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr);
+
+ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+ {
++ size_t size = ARRAY_SIZE(vcpu->arch.db);
++
+ switch (dr) {
+ case 0 ... 3:
+- *val = vcpu->arch.db[dr];
++ *val = vcpu->arch.db[array_index_nospec(dr, size)];
+ break;
+ case 4:
+ /* fall through */
--- /dev/null
+From 8c86405f606ca8508b8d9280680166ca26723695 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:44 -0800
+Subject: KVM: x86: Protect ioapic_read_indirect() from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 8c86405f606ca8508b8d9280680166ca26723695 upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in ioapic_read_indirect().
+This function contains index computations based on the
+(attacker-controlled) IOREGSEL register.
+
+Fixes: a2c118bfab8b ("KVM: Fix bounds checking in ioapic indirect register reads (CVE-2013-1798)")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/ioapic.c | 14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/kvm/ioapic.c
++++ b/arch/x86/kvm/ioapic.c
+@@ -36,6 +36,7 @@
+ #include <linux/io.h>
+ #include <linux/slab.h>
+ #include <linux/export.h>
++#include <linux/nospec.h>
+ #include <asm/processor.h>
+ #include <asm/page.h>
+ #include <asm/current.h>
+@@ -68,13 +69,14 @@ static unsigned long ioapic_read_indirec
+ default:
+ {
+ u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
+- u64 redir_content;
++ u64 redir_content = ~0ULL;
+
+- if (redir_index < IOAPIC_NUM_PINS)
+- redir_content =
+- ioapic->redirtbl[redir_index].bits;
+- else
+- redir_content = ~0ULL;
++ if (redir_index < IOAPIC_NUM_PINS) {
++ u32 index = array_index_nospec(
++ redir_index, IOAPIC_NUM_PINS);
++
++ redir_content = ioapic->redirtbl[index].bits;
++ }
+
+ result = (ioapic->ioregsel & 0x1) ?
+ (redir_content >> 32) & 0xffffffff :
--- /dev/null
+From 670564559ca35b439c8d8861fc399451ddf95137 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:45 -0800
+Subject: KVM: x86: Protect ioapic_write_indirect() from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 670564559ca35b439c8d8861fc399451ddf95137 upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in ioapic_write_indirect().
+This function contains index computations based on the
+(attacker-controlled) IOREGSEL register.
+
+This patch depends on patch
+"KVM: x86: Protect ioapic_read_indirect() from Spectre-v1/L1TF attacks".
+
+Fixes: 70f93dae32ac ("KVM: Use temporary variable to shorten lines.")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/ioapic.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/kvm/ioapic.c
++++ b/arch/x86/kvm/ioapic.c
+@@ -291,6 +291,7 @@ static void ioapic_write_indirect(struct
+
+ if (index >= IOAPIC_NUM_PINS)
+ return;
++ index = array_index_nospec(index, IOAPIC_NUM_PINS);
+ e = &ioapic->redirtbl[index];
+ mask_before = e->fields.mask;
+ /* Preserve read-only fields */
--- /dev/null
+From 8618793750071d66028584a83ed0b4fa7eb4f607 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:42 -0800
+Subject: KVM: x86: Protect kvm_hv_msr_[get|set]_crash_data() from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 8618793750071d66028584a83ed0b4fa7eb4f607 upstream.
+
+This fixes Spectre-v1/L1TF vulnerabilities in kvm_hv_msr_get_crash_data()
+and kvm_hv_msr_set_crash_data().
+These functions contain index computations that use the
+(attacker-controlled) MSR number.
+
+Fixes: e7d9513b60e8 ("kvm/x86: added hyper-v crash msrs into kvm hyperv context")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/hyperv.c | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kvm/hyperv.c
++++ b/arch/x86/kvm/hyperv.c
+@@ -809,11 +809,12 @@ static int kvm_hv_msr_get_crash_data(str
+ u32 index, u64 *pdata)
+ {
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
++ size_t size = ARRAY_SIZE(hv->hv_crash_param);
+
+- if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
++ if (WARN_ON_ONCE(index >= size))
+ return -EINVAL;
+
+- *pdata = hv->hv_crash_param[index];
++ *pdata = hv->hv_crash_param[array_index_nospec(index, size)];
+ return 0;
+ }
+
+@@ -852,11 +853,12 @@ static int kvm_hv_msr_set_crash_data(str
+ u32 index, u64 data)
+ {
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
++ size_t size = ARRAY_SIZE(hv->hv_crash_param);
+
+- if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
++ if (WARN_ON_ONCE(index >= size))
+ return -EINVAL;
+
+- hv->hv_crash_param[index] = data;
++ hv->hv_crash_param[array_index_nospec(index, size)] = data;
+ return 0;
+ }
+
--- /dev/null
+From 4bf79cb089f6b1c6c632492c0271054ce52ad766 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:46 -0800
+Subject: KVM: x86: Protect kvm_lapic_reg_write() from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 4bf79cb089f6b1c6c632492c0271054ce52ad766 upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in kvm_lapic_reg_write().
+This function contains index computations based on the
+(attacker-controlled) MSR number.
+
+Fixes: 0105d1a52640 ("KVM: x2apic interface to lapic")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/lapic.c | 13 +++++++++----
+ 1 file changed, 9 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kvm/lapic.c
++++ b/arch/x86/kvm/lapic.c
+@@ -1926,15 +1926,20 @@ int kvm_lapic_reg_write(struct kvm_lapic
+ case APIC_LVTTHMR:
+ case APIC_LVTPC:
+ case APIC_LVT1:
+- case APIC_LVTERR:
++ case APIC_LVTERR: {
+ /* TODO: Check vector */
++ size_t size;
++ u32 index;
++
+ if (!kvm_apic_sw_enabled(apic))
+ val |= APIC_LVT_MASKED;
+-
+- val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
++ size = ARRAY_SIZE(apic_lvt_mask);
++ index = array_index_nospec(
++ (reg - APIC_LVTT) >> 4, size);
++ val &= apic_lvt_mask[index];
+ kvm_lapic_set_reg(apic, reg, val);
+-
+ break;
++ }
+
+ case APIC_LVTT:
+ if (!kvm_apic_sw_enabled(apic))
--- /dev/null
+From 6ec4c5eee1750d5d17951c4e1960d953376a0dda Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:49 -0800
+Subject: KVM: x86: Protect MSR-based index computations from Spectre-v1/L1TF attacks in x86.c
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 6ec4c5eee1750d5d17951c4e1960d953376a0dda upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in set_msr_mce() and
+get_msr_mce().
+Both functions contain index computations based on the
+(attacker-controlled) MSR number.
+
+Fixes: 890ca9aefa78 ("KVM: Add MCE support")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/x86.c | 10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -2494,7 +2494,10 @@ static int set_msr_mce(struct kvm_vcpu *
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MCx_CTL(bank_num)) {
+- u32 offset = msr - MSR_IA32_MC0_CTL;
++ u32 offset = array_index_nospec(
++ msr - MSR_IA32_MC0_CTL,
++ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
++
+ /* only 0 or all 1s can be written to IA32_MCi_CTL
+ * some Linux kernels though clear bit 10 in bank 4 to
+ * workaround a BIOS/GART TBL issue on AMD K8s, ignore
+@@ -2921,7 +2924,10 @@ static int get_msr_mce(struct kvm_vcpu *
+ default:
+ if (msr >= MSR_IA32_MC0_CTL &&
+ msr < MSR_IA32_MCx_CTL(bank_num)) {
+- u32 offset = msr - MSR_IA32_MC0_CTL;
++ u32 offset = array_index_nospec(
++ msr - MSR_IA32_MC0_CTL,
++ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
++
+ data = vcpu->arch.mce_banks[offset];
+ break;
+ }
--- /dev/null
+From 25a5edea71b7c154b6a0b8cec14c711cafa31d26 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:47 -0800
+Subject: KVM: x86: Protect MSR-based index computations in fixed_msr_to_seg_unit() from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 25a5edea71b7c154b6a0b8cec14c711cafa31d26 upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in fixed_msr_to_seg_unit().
+This function contains index computations based on the
+(attacker-controlled) MSR number.
+
+Fixes: de9aef5e1ad6 ("KVM: MTRR: introduce fixed_mtrr_segment table")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/mtrr.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/mtrr.c
++++ b/arch/x86/kvm/mtrr.c
+@@ -192,11 +192,15 @@ static bool fixed_msr_to_seg_unit(u32 ms
+ break;
+ case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
+ *seg = 1;
+- *unit = msr - MSR_MTRRfix16K_80000;
++ *unit = array_index_nospec(
++ msr - MSR_MTRRfix16K_80000,
++ MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
+ break;
+ case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
+ *seg = 2;
+- *unit = msr - MSR_MTRRfix4K_C0000;
++ *unit = array_index_nospec(
++ msr - MSR_MTRRfix4K_C0000,
++ MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
+ break;
+ default:
+ return false;
--- /dev/null
+From 13c5183a4e643cc2b03a22d0e582c8e17bb7457d Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:48 -0800
+Subject: KVM: x86: Protect MSR-based index computations in pmu.h from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 13c5183a4e643cc2b03a22d0e582c8e17bb7457d upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in the get_gp_pmc() and
+get_fixed_pmc() functions.
+They both contain index computations based on the (attacker-controlled)
+MSR number.
+
+Fixes: 25462f7f5295 ("KVM: x86/vPMU: Define kvm_pmu_ops to support vPMU function dispatch")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/pmu.h | 18 ++++++++++++++----
+ 1 file changed, 14 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kvm/pmu.h
++++ b/arch/x86/kvm/pmu.h
+@@ -2,6 +2,8 @@
+ #ifndef __KVM_X86_PMU_H
+ #define __KVM_X86_PMU_H
+
++#include <linux/nospec.h>
++
+ #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
+ #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
+ #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
+@@ -86,8 +88,12 @@ static inline bool pmc_is_enabled(struct
+ static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
+ u32 base)
+ {
+- if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
+- return &pmu->gp_counters[msr - base];
++ if (msr >= base && msr < base + pmu->nr_arch_gp_counters) {
++ u32 index = array_index_nospec(msr - base,
++ pmu->nr_arch_gp_counters);
++
++ return &pmu->gp_counters[index];
++ }
+
+ return NULL;
+ }
+@@ -97,8 +103,12 @@ static inline struct kvm_pmc *get_fixed_
+ {
+ int base = MSR_CORE_PERF_FIXED_CTR0;
+
+- if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
+- return &pmu->fixed_counters[msr - base];
++ if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) {
++ u32 index = array_index_nospec(msr - base,
++ pmu->nr_arch_fixed_counters);
++
++ return &pmu->fixed_counters[index];
++ }
+
+ return NULL;
+ }
--- /dev/null
+From 66061740f1a487f4ed54fde75e724709f805da53 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:53 -0800
+Subject: KVM: x86: Protect pmu_intel.c from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 66061740f1a487f4ed54fde75e724709f805da53 upstream.
+
+This fixes Spectre-v1/L1TF vulnerabilities in intel_find_fixed_event()
+and intel_rdpmc_ecx_to_pmc().
+kvm_rdpmc() (ancestor of intel_find_fixed_event()) and
+reprogram_fixed_counter() (ancestor of intel_rdpmc_ecx_to_pmc()) are
+exported symbols so KVM should treat them conservatively from a security
+perspective.
+
+Fixes: 25462f7f5295 ("KVM: x86/vPMU: Define kvm_pmu_ops to support vPMU function dispatch")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx/pmu_intel.c | 24 ++++++++++++++++--------
+ 1 file changed, 16 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/kvm/vmx/pmu_intel.c
++++ b/arch/x86/kvm/vmx/pmu_intel.c
+@@ -84,10 +84,14 @@ static unsigned intel_find_arch_event(st
+
+ static unsigned intel_find_fixed_event(int idx)
+ {
+- if (idx >= ARRAY_SIZE(fixed_pmc_events))
++ u32 event;
++ size_t size = ARRAY_SIZE(fixed_pmc_events);
++
++ if (idx >= size)
+ return PERF_COUNT_HW_MAX;
+
+- return intel_arch_events[fixed_pmc_events[idx]].event_type;
++ event = fixed_pmc_events[array_index_nospec(idx, size)];
++ return intel_arch_events[event].event_type;
+ }
+
+ /* check if a PMC is enabled by comparing it with globl_ctrl bits. */
+@@ -128,16 +132,20 @@ static struct kvm_pmc *intel_msr_idx_to_
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ bool fixed = idx & (1u << 30);
+ struct kvm_pmc *counters;
++ unsigned int num_counters;
+
+ idx &= ~(3u << 30);
+- if (!fixed && idx >= pmu->nr_arch_gp_counters)
+- return NULL;
+- if (fixed && idx >= pmu->nr_arch_fixed_counters)
++ if (fixed) {
++ counters = pmu->fixed_counters;
++ num_counters = pmu->nr_arch_fixed_counters;
++ } else {
++ counters = pmu->gp_counters;
++ num_counters = pmu->nr_arch_gp_counters;
++ }
++ if (idx >= num_counters)
+ return NULL;
+- counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
+ *mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP];
+-
+- return &counters[idx];
++ return &counters[array_index_nospec(idx, num_counters)];
+ }
+
+ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
--- /dev/null
+From 3c9053a2cae7ba2ba73766a34cea41baa70f57f7 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:41 -0800
+Subject: KVM: x86: Protect x86_decode_insn from Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 3c9053a2cae7ba2ba73766a34cea41baa70f57f7 upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in x86_decode_insn().
+kvm_emulate_instruction() (an ancestor of x86_decode_insn()) is an exported
+symbol, so KVM should treat it conservatively from a security perspective.
+
+Fixes: 045a282ca415 ("KVM: emulator: implement fninit, fnstsw, fnstcw")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/emulate.c | 11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -5317,10 +5317,15 @@ done_prefixes:
+ }
+ break;
+ case Escape:
+- if (ctxt->modrm > 0xbf)
+- opcode = opcode.u.esc->high[ctxt->modrm - 0xc0];
+- else
++ if (ctxt->modrm > 0xbf) {
++ size_t size = ARRAY_SIZE(opcode.u.esc->high);
++ u32 index = array_index_nospec(
++ ctxt->modrm - 0xc0, size);
++
++ opcode = opcode.u.esc->high[index];
++ } else {
+ opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
++ }
+ break;
+ case InstrDual:
+ if ((ctxt->modrm >> 6) == 3)
--- /dev/null
+From 14e32321f3606e4b0970200b6e5e47ee6f1e6410 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:43 -0800
+Subject: KVM: x86: Refactor picdev_write() to prevent Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 14e32321f3606e4b0970200b6e5e47ee6f1e6410 upstream.
+
+This fixes a Spectre-v1/L1TF vulnerability in picdev_write().
+It replaces index computations based on the (attacker-controlled) port
+number with constants through a minor refactoring.
+
+Fixes: 85f455f7ddbe ("KVM: Add support for in-kernel PIC emulation")
+
+Signed-off-by: Nick Finco <nifi@google.com>
+Signed-off-by: Marios Pomonis <pomonis@google.com>
+Reviewed-by: Andrew Honig <ahonig@google.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
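+
+A small standalone demonstration of why the old code indexed with
+addr >> 7: the master PIC ports 0x20/0x21 shift to 0 and the slave PIC
+ports 0xa0/0xa1 shift to 1. Under Spectre-v1 speculation, however, a
+mispredicted switch could execute the indexing with an out-of-range
+port, so the refactoring replaces the computation with the constants
+0 and 1.
+
+	#include <stdio.h>
+
+	int main(void)
+	{
+		unsigned int ports[] = { 0x20, 0x21, 0xa0, 0xa1 };
+		int i;
+
+		for (i = 0; i < 4; i++)
+			printf("port 0x%02x -> pics[%u]\n", ports[i], ports[i] >> 7);
+		/* pics[0] for 0x20/0x21, pics[1] for 0xa0/0xa1 */
+		return 0;
+	}
+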
+ arch/x86/kvm/i8259.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/i8259.c
++++ b/arch/x86/kvm/i8259.c
+@@ -460,10 +460,14 @@ static int picdev_write(struct kvm_pic *
+ switch (addr) {
+ case 0x20:
+ case 0x21:
++ pic_lock(s);
++ pic_ioport_write(&s->pics[0], addr, data);
++ pic_unlock(s);
++ break;
+ case 0xa0:
+ case 0xa1:
+ pic_lock(s);
+- pic_ioport_write(&s->pics[addr >> 7], addr, data);
++ pic_ioport_write(&s->pics[1], addr, data);
+ pic_unlock(s);
+ break;
+ case 0x4d0:
--- /dev/null
+From 125ffc5e0a56a3eded608dc51e09d5ebf72cf652 Mon Sep 17 00:00:00 2001
+From: Marios Pomonis <pomonis@google.com>
+Date: Wed, 11 Dec 2019 12:47:50 -0800
+Subject: KVM: x86: Refactor prefix decoding to prevent Spectre-v1/L1TF attacks
+
+From: Marios Pomonis <pomonis@google.com>
+
+commit 125ffc5e0a56a3eded608dc51e09d5ebf72cf652 upstream.
+
+This fixes Spectre-v1/L1TF vulnerabilities in
+vmx_read_guest_seg_selector(), vmx_read_guest_seg_base(),
+vmx_read_guest_seg_limit() and vmx_read_guest_seg_ar(). When
+invoked from emulation, these functions contain index computations
+based on the (attacker-influenced) segment value. Using constants
+prevents the attack.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
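+
+For reference, a standalone check of the arithmetic the old code used:
+(b >> 3) & 3 maps the legacy override prefixes 0x26/0x2e/0x36/0x3e to
+segment indices 0..3 (ES/CS/SS/DS), and b & 7 maps 0x64/0x65 to 4/5
+(FS/GS). The refactoring replaces these computations with the
+VCPU_SREG_* constants so no attacker-influenced index survives into
+later lookups.
+
+	#include <stdio.h>
+
+	int main(void)
+	{
+		unsigned int legacy[] = { 0x26, 0x2e, 0x36, 0x3e };	/* ES CS SS DS */
+		unsigned int fsgs[] = { 0x64, 0x65 };			/* FS GS */
+		int i;
+
+		for (i = 0; i < 4; i++)
+			printf("prefix 0x%02x -> seg %u\n", legacy[i], (legacy[i] >> 3) & 3);
+		for (i = 0; i < 2; i++)
+			printf("prefix 0x%02x -> seg %u\n", fsgs[i], fsgs[i] & 7);
+		return 0;
+	}
+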
+ arch/x86/kvm/emulate.c | 16 ++++++++++++++--
+ 1 file changed, 14 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/emulate.c
++++ b/arch/x86/kvm/emulate.c
+@@ -5212,16 +5212,28 @@ int x86_decode_insn(struct x86_emulate_c
+ ctxt->ad_bytes = def_ad_bytes ^ 6;
+ break;
+ case 0x26: /* ES override */
++ has_seg_override = true;
++ ctxt->seg_override = VCPU_SREG_ES;
++ break;
+ case 0x2e: /* CS override */
++ has_seg_override = true;
++ ctxt->seg_override = VCPU_SREG_CS;
++ break;
+ case 0x36: /* SS override */
++ has_seg_override = true;
++ ctxt->seg_override = VCPU_SREG_SS;
++ break;
+ case 0x3e: /* DS override */
+ has_seg_override = true;
+- ctxt->seg_override = (ctxt->b >> 3) & 3;
++ ctxt->seg_override = VCPU_SREG_DS;
+ break;
+ case 0x64: /* FS override */
++ has_seg_override = true;
++ ctxt->seg_override = VCPU_SREG_FS;
++ break;
+ case 0x65: /* GS override */
+ has_seg_override = true;
+- ctxt->seg_override = ctxt->b & 7;
++ ctxt->seg_override = VCPU_SREG_GS;
+ break;
+ case 0x40 ... 0x4f: /* REX */
+ if (mode != X86EMUL_MODE_PROT64)
--- /dev/null
+From 1b257870a78b0a9ce98fdfb052c58542022ffb5b Mon Sep 17 00:00:00 2001
+From: Johan Hovold <johan@kernel.org>
+Date: Fri, 3 Jan 2020 17:35:13 +0100
+Subject: media: iguanair: fix endpoint sanity check
+
+From: Johan Hovold <johan@kernel.org>
+
+commit 1b257870a78b0a9ce98fdfb052c58542022ffb5b upstream.
+
+Make sure to use the current alternate setting, which need not be the
+first one by index, when verifying the endpoint descriptors and
+initialising the URBs.
+
+Failing to do so could cause the driver to misbehave or trigger a WARN()
+in usb_submit_urb() that kernels with panic_on_warn set would choke on.
+
+Fixes: 26ff63137c45 ("[media] Add support for the IguanaWorks USB IR Transceiver")
+Fixes: ab1cbdf159be ("media: iguanair: add sanity checks")
+Cc: stable <stable@vger.kernel.org> # 3.6
+Cc: Oliver Neukum <oneukum@suse.com>
+Signed-off-by: Johan Hovold <johan@kernel.org>
+Signed-off-by: Sean Young <sean@mess.org>
+Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
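+
+For context, a hedged sketch of the pattern (not the driver itself):
+intf->altsetting is the array of all alternate settings, so it points at
+whichever setting happens to be first by index, while
+intf->cur_altsetting is the setting currently selected on the interface;
+endpoint sanity checks and URB setup must use the latter.
+
+	static int check_endpoints(struct usb_interface *intf)
+	{
+		struct usb_host_interface *alt = intf->cur_altsetting;
+
+		if (alt->desc.bNumEndpoints < 2)	/* active setting, not [0] */
+			return -ENODEV;
+		return 0;
+	}
+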
+ drivers/media/rc/iguanair.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/media/rc/iguanair.c
++++ b/drivers/media/rc/iguanair.c
+@@ -413,7 +413,7 @@ static int iguanair_probe(struct usb_int
+ int ret, pipein, pipeout;
+ struct usb_host_interface *idesc;
+
+- idesc = intf->altsetting;
++ idesc = intf->cur_altsetting;
+ if (idesc->desc.bNumEndpoints < 2)
+ return -ENODEV;
+
--- /dev/null
+From 080d89f522e2baddb4fbbd1af4b67b5f92537ef8 Mon Sep 17 00:00:00 2001
+From: Sean Young <sean@mess.org>
+Date: Thu, 21 Nov 2019 11:10:47 +0100
+Subject: media: rc: ensure lirc is initialized before registering input device
+
+From: Sean Young <sean@mess.org>
+
+commit 080d89f522e2baddb4fbbd1af4b67b5f92537ef8 upstream.
+
+Once rc_open is called on the input device, lirc events can be delivered.
+Ensure lirc is ready to do so; otherwise we might get this:
+
+Registered IR keymap rc-hauppauge
+rc rc0: Hauppauge WinTV PVR-350 as
+/devices/pci0000:00/0000:00:1e.0/0000:04:00.0/i2c-0/0-0018/rc/rc0
+input: Hauppauge WinTV PVR-350 as
+/devices/pci0000:00/0000:00:1e.0/0000:04:00.0/i2c-0/0-0018/rc/rc0/input9
+BUG: kernel NULL pointer dereference, address: 0000000000000038
+PGD 0 P4D 0
+Oops: 0000 [#1] SMP PTI
+CPU: 1 PID: 17 Comm: kworker/1:0 Not tainted 5.3.11-300.fc31.x86_64 #1
+Hardware name: /DG43NB, BIOS NBG4310H.86A.0096.2009.0903.1845 09/03/2009
+Workqueue: events ir_work [ir_kbd_i2c]
+RIP: 0010:ir_lirc_scancode_event+0x3d/0xb0
+Code: a6 b4 07 00 00 49 81 c6 b8 07 00 00 55 53 e8 ba a7 9d ff 4c 89
+e7 49 89 45 00 e8 5e 7a 25 00 49 8b 1e 48 89 c5 4c 39 f3 74 58 <8b> 43
+38 8b 53 40 89 c1 2b 4b 3c 39 ca 72 41 21 d0 49 8b 7d 00 49
+RSP: 0018:ffffaae2000b3d88 EFLAGS: 00010017
+RAX: 0000000000000002 RBX: 0000000000000000 RCX: 0000000000000019
+RDX: 0000000000000001 RSI: 006e801b1f26ce6a RDI: ffff9e39797c37b4
+RBP: 0000000000000002 R08: 0000000000000001 R09: 0000000000000001
+R10: 0000000000000001 R11: 0000000000000001 R12: ffff9e39797c37b4
+R13: ffffaae2000b3db8 R14: ffff9e39797c37b8 R15: ffff9e39797c33d8
+FS: 0000000000000000(0000) GS:ffff9e397b680000(0000) knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 0000000000000038 CR3: 0000000035844000 CR4: 00000000000006e0
+Call Trace:
+ir_do_keydown+0x8e/0x2b0
+rc_keydown+0x52/0xc0
+ir_work+0xb8/0x130 [ir_kbd_i2c]
+process_one_work+0x19d/0x340
+worker_thread+0x50/0x3b0
+kthread+0xfb/0x130
+? process_one_work+0x340/0x340
+? kthread_park+0x80/0x80
+ret_from_fork+0x35/0x40
+Modules linked in: rc_hauppauge tuner msp3400 saa7127 saa7115 ivtv(+)
+tveeprom cx2341x v4l2_common videodev mc i2c_algo_bit ir_kbd_i2c
+ip_tables firewire_ohci e1000e serio_raw firewire_core ata_generic
+crc_itu_t pata_acpi pata_jmicron fuse
+CR2: 0000000000000038
+---[ end trace c67c2697a99fa74b ]---
+RIP: 0010:ir_lirc_scancode_event+0x3d/0xb0
+Code: a6 b4 07 00 00 49 81 c6 b8 07 00 00 55 53 e8 ba a7 9d ff 4c 89
+e7 49 89 45 00 e8 5e 7a 25 00 49 8b 1e 48 89 c5 4c 39 f3 74 58 <8b> 43
+38 8b 53 40 89 c1 2b 4b 3c 39 ca 72 41 21 d0 49 8b 7d 00 49
+RSP: 0018:ffffaae2000b3d88 EFLAGS: 00010017
+RAX: 0000000000000002 RBX: 0000000000000000 RCX: 0000000000000019
+RDX: 0000000000000001 RSI: 006e801b1f26ce6a RDI: ffff9e39797c37b4
+RBP: 0000000000000002 R08: 0000000000000001 R09: 0000000000000001
+R10: 0000000000000001 R11: 0000000000000001 R12: ffff9e39797c37b4
+R13: ffffaae2000b3db8 R14: ffff9e39797c37b8 R15: ffff9e39797c33d8
+FS: 0000000000000000(0000) GS:ffff9e397b680000(0000) knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 0000000000000038 CR3: 0000000035844000 CR4: 00000000000006e0
+rc rc0: lirc_dev: driver ir_kbd_i2c registered at minor = 0, scancode
+receiver, no transmitter
+tuner-simple 0-0061: creating new instance
+tuner-simple 0-0061: type set to 2 (Philips NTSC (FI1236,FM1236 and
+compatibles))
+ivtv0: Registered device video0 for encoder MPG (4096 kB)
+ivtv0: Registered device video32 for encoder YUV (2048 kB)
+ivtv0: Registered device vbi0 for encoder VBI (1024 kB)
+ivtv0: Registered device video24 for encoder PCM (320 kB)
+ivtv0: Registered device radio0 for encoder radio
+ivtv0: Registered device video16 for decoder MPG (1024 kB)
+ivtv0: Registered device vbi8 for decoder VBI (64 kB)
+ivtv0: Registered device vbi16 for decoder VOUT
+ivtv0: Registered device video48 for decoder YUV (1024 kB)
+
+Cc: stable@vger.kernel.org
+Tested-by: Nick French <nickfrench@gmail.com>
+Reported-by: Nick French <nickfrench@gmail.com>
+Signed-off-by: Sean Young <sean@mess.org>
+Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
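+
+The ordering rule the patch enforces, as a minimal sketch with made-up
+names (this is not the rc-core API): if registering B can immediately
+trigger callbacks into A, then A must be fully set up first, and the
+error path must unwind in the reverse order of registration.
+
+	static int register_pipeline(struct rc_dev *dev)
+	{
+		int rc;
+
+		rc = setup_consumer(dev);	/* lirc: must be ready first */
+		if (rc)
+			return rc;
+
+		rc = setup_producer(dev);	/* input: may deliver events now */
+		if (rc) {
+			teardown_consumer(dev);	/* unwind in reverse */
+			return rc;
+		}
+		return 0;
+	}
+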
+ drivers/media/rc/rc-main.c | 27 ++++++++++++++++-----------
+ 1 file changed, 16 insertions(+), 11 deletions(-)
+
+--- a/drivers/media/rc/rc-main.c
++++ b/drivers/media/rc/rc-main.c
+@@ -1891,23 +1891,28 @@ int rc_register_device(struct rc_dev *de
+
+ dev->registered = true;
+
+- if (dev->driver_type != RC_DRIVER_IR_RAW_TX) {
+- rc = rc_setup_rx_device(dev);
+- if (rc)
+- goto out_dev;
+- }
+-
+- /* Ensure that the lirc kfifo is setup before we start the thread */
++ /*
++	 * once the input device is registered in rc_setup_rx_device,
++ * userspace can open the input device and rc_open() will be called
++ * as a result. This results in driver code being allowed to submit
++ * keycodes with rc_keydown, so lirc must be registered first.
++ */
+ if (dev->allowed_protocols != RC_PROTO_BIT_CEC) {
+ rc = ir_lirc_register(dev);
+ if (rc < 0)
+- goto out_rx;
++ goto out_dev;
++ }
++
++ if (dev->driver_type != RC_DRIVER_IR_RAW_TX) {
++ rc = rc_setup_rx_device(dev);
++ if (rc)
++ goto out_lirc;
+ }
+
+ if (dev->driver_type == RC_DRIVER_IR_RAW) {
+ rc = ir_raw_event_register(dev);
+ if (rc < 0)
+- goto out_lirc;
++ goto out_rx;
+ }
+
+ dev_dbg(&dev->dev, "Registered rc%u (driver: %s)\n", dev->minor,
+@@ -1915,11 +1920,11 @@ int rc_register_device(struct rc_dev *de
+
+ return 0;
+
++out_rx:
++ rc_free_rx_device(dev);
+ out_lirc:
+ if (dev->allowed_protocols != RC_PROTO_BIT_CEC)
+ ir_lirc_unregister(dev);
+-out_rx:
+- rc_free_rx_device(dev);
+ out_dev:
+ device_del(&dev->dev);
+ out_rx_free:
--- /dev/null
+From 65b1aae0d9d5962faccc06bdb8e91a2a0b09451c Mon Sep 17 00:00:00 2001
+From: Brian Norris <briannorris@chromium.org>
+Date: Mon, 6 Jan 2020 14:42:12 -0800
+Subject: mwifiex: fix unbalanced locking in mwifiex_process_country_ie()
+
+From: Brian Norris <briannorris@chromium.org>
+
+commit 65b1aae0d9d5962faccc06bdb8e91a2a0b09451c upstream.
+
+We called rcu_read_lock(), so we need to call rcu_read_unlock() before
+we return.
+
+Fixes: 3d94a4a8373b ("mwifiex: fix possible heap overflow in mwifiex_process_country_ie()")
+Cc: stable@vger.kernel.org
+Cc: huangwen <huangwenabc@gmail.com>
+Cc: Ganapathi Bhat <ganapathi.bhat@nxp.com>
+Signed-off-by: Brian Norris <briannorris@chromium.org>
+Acked-by: Ganapathi Bhat <ganapathi.bhat@nxp.com>
+Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
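+
+The rule being restored, as a minimal sketch with illustrative names
+(not the mwifiex structures): once rcu_read_lock() has been taken,
+every return path out of the section must pair it with
+rcu_read_unlock(), including early error returns.
+
+	static int parse_country_ie(size_t ie_len, size_t max_len)
+	{
+		rcu_read_lock();
+		if (ie_len > max_len) {
+			rcu_read_unlock();	/* the unlock this fix adds */
+			return -EINVAL;
+		}
+		/* ... walk RCU-protected data ... */
+		rcu_read_unlock();
+		return 0;
+	}
+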
+ drivers/net/wireless/marvell/mwifiex/sta_ioctl.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c
++++ b/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c
+@@ -232,6 +232,7 @@ static int mwifiex_process_country_ie(st
+
+ if (country_ie_len >
+ (IEEE80211_COUNTRY_STRING_LEN + MWIFIEX_MAX_TRIPLET_802_11D)) {
++ rcu_read_unlock();
+ mwifiex_dbg(priv->adapter, ERROR,
+ "11D: country_ie_len overflow!, deauth AP\n");
+ return -EINVAL;
--- /dev/null
+From 114de38225d9b300f027e2aec9afbb6e0def154b Mon Sep 17 00:00:00 2001
+From: Trond Myklebust <trondmy@gmail.com>
+Date: Sun, 2 Feb 2020 17:53:54 -0500
+Subject: NFS: Directory page cache pages need to be locked when read
+
+From: Trond Myklebust <trondmy@gmail.com>
+
+commit 114de38225d9b300f027e2aec9afbb6e0def154b upstream.
+
+When a NFS directory page cache page is removed from the page cache,
+its contents are freed through a call to nfs_readdir_clear_array().
+To prevent the removal of the page cache entry until after we've
+finished reading it, we must take the page lock.
+
+Fixes: 11de3b11e08c ("NFS: Fix a memory leak in nfs_readdir")
+Cc: stable@vger.kernel.org # v2.6.37+
+Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
+Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/nfs/dir.c | 30 +++++++++++++++++++-----------
+ 1 file changed, 19 insertions(+), 11 deletions(-)
+
+--- a/fs/nfs/dir.c
++++ b/fs/nfs/dir.c
+@@ -702,8 +702,6 @@ int nfs_readdir_filler(void *data, struc
+ static
+ void cache_page_release(nfs_readdir_descriptor_t *desc)
+ {
+- if (!desc->page->mapping)
+- nfs_readdir_clear_array(desc->page);
+ put_page(desc->page);
+ desc->page = NULL;
+ }
+@@ -717,19 +715,28 @@ struct page *get_cache_page(nfs_readdir_
+
+ /*
+ * Returns 0 if desc->dir_cookie was found on page desc->page_index
++ * and locks the page to prevent removal from the page cache.
+ */
+ static
+-int find_cache_page(nfs_readdir_descriptor_t *desc)
++int find_and_lock_cache_page(nfs_readdir_descriptor_t *desc)
+ {
+ int res;
+
+ desc->page = get_cache_page(desc);
+ if (IS_ERR(desc->page))
+ return PTR_ERR(desc->page);
+-
+- res = nfs_readdir_search_array(desc);
++ res = lock_page_killable(desc->page);
+ if (res != 0)
+- cache_page_release(desc);
++ goto error;
++ res = -EAGAIN;
++ if (desc->page->mapping != NULL) {
++ res = nfs_readdir_search_array(desc);
++ if (res == 0)
++ return 0;
++ }
++ unlock_page(desc->page);
++error:
++ cache_page_release(desc);
+ return res;
+ }
+
+@@ -744,7 +751,7 @@ int readdir_search_pagecache(nfs_readdir
+ desc->last_cookie = 0;
+ }
+ do {
+- res = find_cache_page(desc);
++ res = find_and_lock_cache_page(desc);
+ } while (res == -EAGAIN);
+ return res;
+ }
+@@ -783,7 +790,6 @@ int nfs_do_filldir(nfs_readdir_descripto
+ desc->eof = true;
+
+ kunmap(desc->page);
+- cache_page_release(desc);
+ dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
+ (unsigned long long)*desc->dir_cookie, res);
+ return res;
+@@ -829,13 +835,13 @@ int uncached_readdir(nfs_readdir_descrip
+
+ status = nfs_do_filldir(desc);
+
++ out_release:
++ nfs_readdir_clear_array(desc->page);
++ cache_page_release(desc);
+ out:
+ dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
+ __func__, status);
+ return status;
+- out_release:
+- cache_page_release(desc);
+- goto out;
+ }
+
+ /* The file offset position represents the dirent entry number. A
+@@ -900,6 +906,8 @@ static int nfs_readdir(struct file *file
+ break;
+
+ res = nfs_do_filldir(desc);
++ unlock_page(desc->page);
++ cache_page_release(desc);
+ if (res < 0)
+ break;
+ } while (!desc->eof);
--- /dev/null
+From 4b310319c6a8ce708f1033d57145e2aa027a883c Mon Sep 17 00:00:00 2001
+From: Trond Myklebust <trondmy@gmail.com>
+Date: Sun, 2 Feb 2020 17:53:53 -0500
+Subject: NFS: Fix memory leaks and corruption in readdir
+
+From: Trond Myklebust <trondmy@gmail.com>
+
+commit 4b310319c6a8ce708f1033d57145e2aa027a883c upstream.
+
+nfs_readdir_xdr_to_array() must not exit without having initialised
+the array, so that the page cache deletion routines can safely
+call nfs_readdir_clear_array().
+Furthermore, we should ensure that if we exit nfs_readdir_filler()
+with an error, we free up any page contents to prevent a leak
+if we try to fill the page again.
+
+Fixes: 11de3b11e08c ("NFS: Fix a memory leak in nfs_readdir")
+Cc: stable@vger.kernel.org # v2.6.37+
+Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
+Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
+Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/nfs/dir.c | 17 +++++++++++++++--
+ 1 file changed, 15 insertions(+), 2 deletions(-)
+
+--- a/fs/nfs/dir.c
++++ b/fs/nfs/dir.c
+@@ -162,6 +162,17 @@ typedef struct {
+ bool eof;
+ } nfs_readdir_descriptor_t;
+
++static
++void nfs_readdir_init_array(struct page *page)
++{
++ struct nfs_cache_array *array;
++
++ array = kmap_atomic(page);
++ memset(array, 0, sizeof(struct nfs_cache_array));
++ array->eof_index = -1;
++ kunmap_atomic(array);
++}
++
+ /*
+ * we are freeing strings created by nfs_add_to_readdir_array()
+ */
+@@ -174,6 +185,7 @@ void nfs_readdir_clear_array(struct page
+ array = kmap_atomic(page);
+ for (i = 0; i < array->size; i++)
+ kfree(array->array[i].string.name);
++ array->size = 0;
+ kunmap_atomic(array);
+ }
+
+@@ -610,6 +622,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir
+ int status = -ENOMEM;
+ unsigned int array_size = ARRAY_SIZE(pages);
+
++ nfs_readdir_init_array(page);
++
+ entry.prev_cookie = 0;
+ entry.cookie = desc->last_cookie;
+ entry.eof = 0;
+@@ -626,8 +640,6 @@ int nfs_readdir_xdr_to_array(nfs_readdir
+ }
+
+ array = kmap(page);
+- memset(array, 0, sizeof(struct nfs_cache_array));
+- array->eof_index = -1;
+
+ status = nfs_readdir_alloc_pages(pages, array_size);
+ if (status < 0)
+@@ -682,6 +694,7 @@ int nfs_readdir_filler(void *data, struc
+ unlock_page(page);
+ return 0;
+ error:
++ nfs_readdir_clear_array(page);
+ unlock_page(page);
+ return ret;
+ }
--- /dev/null
+From 28c7d86bb6172ffbb1a1237c6388e77f9fe5f181 Mon Sep 17 00:00:00 2001
+From: Trond Myklebust <trondmy@gmail.com>
+Date: Mon, 6 Jan 2020 13:18:03 -0500
+Subject: nfsd: fix filecache lookup
+
+From: Trond Myklebust <trondmy@gmail.com>
+
+commit 28c7d86bb6172ffbb1a1237c6388e77f9fe5f181 upstream.
+
+If the lookup keeps finding an nfsd_file with an unhashed open file,
+then retry once only.
+
+Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
+Cc: stable@vger.kernel.org
+Fixes: 65294c1f2c5e "nfsd: add a new struct file caching facility to nfsd"
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/nfsd/filecache.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/fs/nfsd/filecache.c
++++ b/fs/nfsd/filecache.c
+@@ -791,6 +791,7 @@ nfsd_file_acquire(struct svc_rqst *rqstp
+ struct nfsd_file *nf, *new;
+ struct inode *inode;
+ unsigned int hashval;
++ bool retry = true;
+
+ /* FIXME: skip this if fh_dentry is already set? */
+ status = fh_verify(rqstp, fhp, S_IFREG,
+@@ -826,6 +827,11 @@ wait_for_construction:
+
+ /* Did construction of this file fail? */
+ if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
++ if (!retry) {
++ status = nfserr_jukebox;
++ goto out;
++ }
++ retry = false;
+ nfsd_file_put_noref(nf);
+ goto retry;
+ }
--- /dev/null
+From 9dc086f1e9ef39dd823bd27954b884b2062f9e70 Mon Sep 17 00:00:00 2001
+From: Michael Ellerman <mpe@ellerman.id.au>
+Date: Fri, 7 Feb 2020 22:15:46 +1100
+Subject: powerpc/futex: Fix incorrect user access blocking
+
+From: Michael Ellerman <mpe@ellerman.id.au>
+
+commit 9dc086f1e9ef39dd823bd27954b884b2062f9e70 upstream.
+
+The early versions of our kernel user access prevention (KUAP) were
+written by Russell and Christophe, and didn't have separate
+read/write access.
+
+At some point I picked up the series and added the read/write access,
+but I failed to update the usages in futex.h to correctly allow read
+and write.
+
+However we didn't notice because of another bug which was causing the
+low-level code to always enable read and write. That bug was fixed
+recently in commit 1d8f739b07bd ("powerpc/kuap: Fix set direction in
+allow/prevent_user_access()").
+
+futex_atomic_cmpxchg_inatomic() is passed the user address as %3 and
+does:
+
+ 1: lwarx %1, 0, %3
+ cmpw 0, %1, %4
+ bne- 3f
+ 2: stwcx. %5, 0, %3
+
+Which clearly loads and stores from/to %3. The logic in
+arch_futex_atomic_op_inuser() is similar, so fix both of them to use
+allow_read_write_user().
+
+Without this fix, and with PPC_KUAP_DEBUG=y, we see eg:
+
+ Bug: Read fault blocked by AMR!
+ WARNING: CPU: 94 PID: 149215 at arch/powerpc/include/asm/book3s/64/kup-radix.h:126 __do_page_fault+0x600/0xf30
+ CPU: 94 PID: 149215 Comm: futex_requeue_p Tainted: G W 5.5.0-rc7-gcc9x-g4c25df5640ae #1
+ ...
+ NIP [c000000000070680] __do_page_fault+0x600/0xf30
+ LR [c00000000007067c] __do_page_fault+0x5fc/0xf30
+ Call Trace:
+ [c00020138e5637e0] [c00000000007067c] __do_page_fault+0x5fc/0xf30 (unreliable)
+ [c00020138e5638c0] [c00000000000ada8] handle_page_fault+0x10/0x30
+ --- interrupt: 301 at cmpxchg_futex_value_locked+0x68/0xd0
+ LR = futex_lock_pi_atomic+0xe0/0x1f0
+ [c00020138e563bc0] [c000000000217b50] futex_lock_pi_atomic+0x80/0x1f0 (unreliable)
+ [c00020138e563c30] [c00000000021b668] futex_requeue+0x438/0xb60
+ [c00020138e563d60] [c00000000021c6cc] do_futex+0x1ec/0x2b0
+ [c00020138e563d90] [c00000000021c8b8] sys_futex+0x128/0x200
+ [c00020138e563e20] [c00000000000b7ac] system_call+0x5c/0x68
+
+Fixes: de78a9c42a79 ("powerpc: Add a framework for Kernel Userspace Access Protection")
+Cc: stable@vger.kernel.org # v5.2+
+Reported-by: syzbot+e808452bad7c375cbee6@syzkaller-ppc64.appspotmail.com
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Reviewed-by: Christophe Leroy <christophe.leroy@c-s.fr>
+Link: https://lore.kernel.org/r/20200207122145.11928-1-mpe@ellerman.id.au
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/powerpc/include/asm/futex.h | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+--- a/arch/powerpc/include/asm/futex.h
++++ b/arch/powerpc/include/asm/futex.h
+@@ -35,7 +35,7 @@ static inline int arch_futex_atomic_op_i
+ {
+ int oldval = 0, ret;
+
+- allow_write_to_user(uaddr, sizeof(*uaddr));
++ allow_read_write_user(uaddr, uaddr, sizeof(*uaddr));
+ pagefault_disable();
+
+ switch (op) {
+@@ -62,7 +62,7 @@ static inline int arch_futex_atomic_op_i
+
+ *oval = oldval;
+
+- prevent_write_to_user(uaddr, sizeof(*uaddr));
++ prevent_read_write_user(uaddr, uaddr, sizeof(*uaddr));
+ return ret;
+ }
+
+@@ -76,7 +76,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval,
+ if (!access_ok(uaddr, sizeof(u32)))
+ return -EFAULT;
+
+- allow_write_to_user(uaddr, sizeof(*uaddr));
++ allow_read_write_user(uaddr, uaddr, sizeof(*uaddr));
++
+ __asm__ __volatile__ (
+ PPC_ATOMIC_ENTRY_BARRIER
+ "1: lwarx %1,0,%3 # futex_atomic_cmpxchg_inatomic\n\
+@@ -97,7 +98,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval,
+ : "cc", "memory");
+
+ *uval = prev;
+- prevent_write_to_user(uaddr, sizeof(*uaddr));
++ prevent_read_write_user(uaddr, uaddr, sizeof(*uaddr));
++
+ return ret;
+ }
+
--- /dev/null
+From 00fe717ee1ea3c2979db4f94b1533c57aed8dea9 Mon Sep 17 00:00:00 2001
+From: Arun Easi <aeasi@marvell.com>
+Date: Thu, 23 Jan 2020 20:50:14 -0800
+Subject: scsi: qla2xxx: Fix unbound NVME response length
+
+From: Arun Easi <aeasi@marvell.com>
+
+commit 00fe717ee1ea3c2979db4f94b1533c57aed8dea9 upstream.
+
+In certain cases, when the response length is less than 32, NVME
+response data is supplied inline in the IOCB. This is indicated by some
+combination of state flags. There was an instance when a high, and
+incorrect, response length was indicated, causing the driver to overrun
+buffers. Fix this by checking and limiting the response payload length.
+
+Fixes: 7401bc18d1ee3 ("scsi: qla2xxx: Add FC-NVMe command handling")
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20200124045014.23554-1-hmadhani@marvell.com
+Signed-off-by: Arun Easi <aeasi@marvell.com>
+Signed-off-by: Himanshu Madhani <hmadhani@marvell.com>
+Reviewed-by: Ewan D. Milne <emilne@redhat.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
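+
+The general pattern, as a hedged standalone sketch (names are made up,
+not the qla2xxx structures): a length reported by the hardware is
+untrusted input and must be clamped to the destination buffer size
+before any copy.
+
+	#include <string.h>
+
+	static void copy_rsp_payload(unsigned char *dst, size_t dst_size,
+				     const unsigned char *src, size_t hw_len)
+	{
+		size_t len = hw_len;	/* length claimed by the adapter */
+
+		if (len > dst_size)	/* the bound this fix adds */
+			len = dst_size;
+		memcpy(dst, src, len);
+	}
+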
+ drivers/scsi/qla2xxx/qla_dbg.c | 6 ------
+ drivers/scsi/qla2xxx/qla_dbg.h | 6 ++++++
+ drivers/scsi/qla2xxx/qla_isr.c | 12 ++++++++++++
+ 3 files changed, 18 insertions(+), 6 deletions(-)
+
+--- a/drivers/scsi/qla2xxx/qla_dbg.c
++++ b/drivers/scsi/qla2xxx/qla_dbg.c
+@@ -2519,12 +2519,6 @@ qla83xx_fw_dump_failed:
+ /* Driver Debug Functions. */
+ /****************************************************************************/
+
+-static inline int
+-ql_mask_match(uint level)
+-{
+- return (level & ql2xextended_error_logging) == level;
+-}
+-
+ /*
+ * This function is for formatting and logging debug information.
+ * It is to be used when vha is available. It formats the message
+--- a/drivers/scsi/qla2xxx/qla_dbg.h
++++ b/drivers/scsi/qla2xxx/qla_dbg.h
+@@ -374,3 +374,9 @@ extern int qla24xx_dump_ram(struct qla_h
+ extern void qla24xx_pause_risc(struct device_reg_24xx __iomem *,
+ struct qla_hw_data *);
+ extern int qla24xx_soft_reset(struct qla_hw_data *);
++
++static inline int
++ql_mask_match(uint level)
++{
++ return (level & ql2xextended_error_logging) == level;
++}
+--- a/drivers/scsi/qla2xxx/qla_isr.c
++++ b/drivers/scsi/qla2xxx/qla_isr.c
+@@ -1897,6 +1897,18 @@ static void qla24xx_nvme_iocb_entry(scsi
+ inbuf = (uint32_t *)&sts->nvme_ersp_data;
+ outbuf = (uint32_t *)fd->rspaddr;
+ iocb->u.nvme.rsp_pyld_len = le16_to_cpu(sts->nvme_rsp_pyld_len);
++ if (unlikely(iocb->u.nvme.rsp_pyld_len >
++ sizeof(struct nvme_fc_ersp_iu))) {
++ if (ql_mask_match(ql_dbg_io)) {
++ WARN_ONCE(1, "Unexpected response payload length %u.\n",
++ iocb->u.nvme.rsp_pyld_len);
++ ql_log(ql_log_warn, fcport->vha, 0x5100,
++ "Unexpected response payload length %u.\n",
++ iocb->u.nvme.rsp_pyld_len);
++ }
++ iocb->u.nvme.rsp_pyld_len =
++ sizeof(struct nvme_fc_ersp_iu);
++ }
+ iter = iocb->u.nvme.rsp_pyld_len >> 2;
+ for (; iter; iter--)
+ *outbuf++ = swab32(*inbuf++);
crypto-atmel-aes-fix-counter-overflow-in-ctr-mode.patch
crypto-api-fix-race-condition-in-crypto_spawn_alg.patch
crypto-picoxcell-adjust-the-position-of-tasklet_init-and-fix-missed-tasklet_kill.patch
+powerpc-futex-fix-incorrect-user-access-blocking.patch
+scsi-qla2xxx-fix-unbound-nvme-response-length.patch
+nfs-fix-memory-leaks-and-corruption-in-readdir.patch
+nfs-directory-page-cache-pages-need-to-be-locked-when-read.patch
+nfsd-fix-filecache-lookup.patch
+jbd2_seq_info_next-should-increase-position-index.patch
+ext4-fix-deadlock-allocating-crypto-bounce-page-from-mempool.patch
+ext4-fix-race-conditions-in-d_compare-and-d_hash.patch
+btrfs-fix-missing-hole-after-hole-punching-and-fsync-when-using-no_holes.patch
+btrfs-make-deduplication-with-range-including-the-last-block-work.patch
+btrfs-fix-infinite-loop-during-fsync-after-rename-operations.patch
+btrfs-set-trans-drity-in-btrfs_commit_transaction.patch
+btrfs-drop-log-root-for-dropped-roots.patch
+btrfs-fix-race-between-adding-and-putting-tree-mod-seq-elements-and-nodes.patch
+btrfs-flush-write-bio-if-we-loop-in-extent_write_cache_pages.patch
+btrfs-correctly-handle-empty-trees-in-find_first_clear_extent_bit.patch
+arm-tegra-enable-pllp-bypass-during-tegra124-lp1.patch
+iwlwifi-don-t-throw-error-when-trying-to-remove-igtk.patch
+mwifiex-fix-unbalanced-locking-in-mwifiex_process_country_ie.patch
+sunrpc-expiry_time-should-be-seconds-not-timeval.patch
+gfs2-fix-gfs2_find_jhead-that-returns-uninitialized-jhead-with-seq-0.patch
+gfs2-move-setting-current-backing_dev_info.patch
+gfs2-fix-o_sync-write-handling.patch
+drm-atmel-hlcdc-use-double-rate-for-pixel-clock-only-if-supported.patch
+drm-atmel-hlcdc-enable-clock-before-configuring-timing-engine.patch
+drm-atmel-hlcdc-prefer-a-lower-pixel-clock-than-requested.patch
+drm-rect-avoid-division-by-zero.patch
+media-iguanair-fix-endpoint-sanity-check.patch
+media-rc-ensure-lirc-is-initialized-before-registering-input-device.patch
+tools-kvm_stat-fix-kvm_exit-filter-name.patch
+xen-balloon-support-xend-based-toolstack-take-two.patch
+watchdog-fix-uaf-in-reboot-notifier-handling-in-watchdog-core-code.patch
+bcache-add-readahead-cache-policy-options-via-sysfs-interface.patch
+eventfd-track-eventfd_signal-recursion-depth.patch
+aio-prevent-potential-eventfd-recursion-on-poll.patch
+kvm-x86-refactor-picdev_write-to-prevent-spectre-v1-l1tf-attacks.patch
+kvm-x86-refactor-prefix-decoding-to-prevent-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-pmu_intel.c-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-dr-based-index-computations-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-kvm_lapic_reg_write-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-kvm_hv_msr__crash_data-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-ioapic_write_indirect-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-msr-based-index-computations-in-pmu.h-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-ioapic_read_indirect-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-msr-based-index-computations-from-spectre-v1-l1tf-attacks-in-x86.c.patch
+kvm-x86-protect-x86_decode_insn-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-protect-msr-based-index-computations-in-fixed_msr_to_seg_unit-from-spectre-v1-l1tf-attacks.patch
+kvm-x86-fix-potential-put_fpu-w-o-load_fpu-on-mpx-platform.patch
+kvm-ppc-book3s-hv-uninit-vcpu-if-vcore-creation-fails.patch
+kvm-ppc-book3s-pr-free-shared-page-if-mmu-initialization-fails.patch
+kvm-svm-pku-not-currently-supported.patch
--- /dev/null
+From 3d96208c30f84d6edf9ab4fac813306ac0d20c10 Mon Sep 17 00:00:00 2001
+From: Roberto Bergantinos Corpas <rbergant@redhat.com>
+Date: Tue, 4 Feb 2020 11:32:56 +0100
+Subject: sunrpc: expiry_time should be seconds not timeval
+
+From: Roberto Bergantinos Corpas <rbergant@redhat.com>
+
+commit 3d96208c30f84d6edf9ab4fac813306ac0d20c10 upstream.
+
+When upcalling gssproxy, cache_head.expiry_time is set as a
+timeval, not seconds since boot. As such, RPC cache expiry
+logic will not clean expired objects created under
+auth.rpcsec.context cache.
+
+This has proven to cause kernel memory leaks in the field. Fix it by
+using the 64-bit variants of getboottime/timespec to convert the expiry
+time to seconds since boot.
+
+Expiration times have worked this way since 2010's c5b29f885afe "sunrpc:
+use seconds since boot in expiry cache". The gssproxy code introduced
+in 2012 added gss_proxy_save_rsc and introduced the bug. That's a while
+for this to lurk, but it required a bit of an extreme case to make it
+obvious.
+
+Signed-off-by: Roberto Bergantinos Corpas <rbergant@redhat.com>
+Cc: stable@vger.kernel.org
+Fixes: 030d794bf498 "SUNRPC: Use gssproxy upcall for server..."
+Tested-By: Frank Sorenson <sorenson@redhat.com>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
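+
+A minimal kernel-style sketch of the conversion (illustrative, not the
+svcauth_gss code verbatim): the RPC cache compares expiry_time against
+seconds since boot, while gssproxy hands back wall-clock seconds, so
+the wall-clock time of boot must be subtracted.
+
+	static time64_t wallclock_to_boot_relative(time64_t wall_expiry)
+	{
+		struct timespec64 boot;
+
+		getboottime64(&boot);		/* wall-clock time of boot */
+		return wall_expiry - boot.tv_sec;
+	}
+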
+ net/sunrpc/auth_gss/svcauth_gss.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/net/sunrpc/auth_gss/svcauth_gss.c
++++ b/net/sunrpc/auth_gss/svcauth_gss.c
+@@ -1245,6 +1245,7 @@ static int gss_proxy_save_rsc(struct cac
+ dprintk("RPC: No creds found!\n");
+ goto out;
+ } else {
++ struct timespec64 boot;
+
+ /* steal creds */
+ rsci.cred = ud->creds;
+@@ -1265,6 +1266,9 @@ static int gss_proxy_save_rsc(struct cac
+ &expiry, GFP_KERNEL);
+ if (status)
+ goto out;
++
++ getboottime64(&boot);
++ expiry -= boot.tv_sec;
+ }
+
+ rsci.h.expiry_time = expiry;
--- /dev/null
+From 5fcf3a55a62afb0760ccb6f391d62f20bce4a42f Mon Sep 17 00:00:00 2001
+From: Gavin Shan <gshan@redhat.com>
+Date: Tue, 10 Dec 2019 15:48:29 +1100
+Subject: tools/kvm_stat: Fix kvm_exit filter name
+
+From: Gavin Shan <gshan@redhat.com>
+
+commit 5fcf3a55a62afb0760ccb6f391d62f20bce4a42f upstream.
+
+The filter name is fixed to "exit_reason" for some kvm_exit events, no
+matter what architecture we are on. Actually, the filter name
+("exit_reason") is only applicable to x86, meaning it's broken on other
+architectures, including aarch64.
+
+This fixes the issue by providing per-architecture kvm_exit filter
+names. The appropriate filter name is then picked and applied through
+ioctl(fd, SET_FILTER).
+
+Reported-by: Andrew Jones <drjones@redhat.com>
+Signed-off-by: Gavin Shan <gshan@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ tools/kvm/kvm_stat/kvm_stat | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/tools/kvm/kvm_stat/kvm_stat
++++ b/tools/kvm/kvm_stat/kvm_stat
+@@ -270,6 +270,7 @@ class ArchX86(Arch):
+ def __init__(self, exit_reasons):
+ self.sc_perf_evt_open = 298
+ self.ioctl_numbers = IOCTL_NUMBERS
++ self.exit_reason_field = 'exit_reason'
+ self.exit_reasons = exit_reasons
+
+ def debugfs_is_child(self, field):
+@@ -289,6 +290,7 @@ class ArchPPC(Arch):
+ # numbers depend on the wordsize.
+ char_ptr_size = ctypes.sizeof(ctypes.c_char_p)
+ self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16
++ self.exit_reason_field = 'exit_nr'
+ self.exit_reasons = {}
+
+ def debugfs_is_child(self, field):
+@@ -300,6 +302,7 @@ class ArchA64(Arch):
+ def __init__(self):
+ self.sc_perf_evt_open = 241
+ self.ioctl_numbers = IOCTL_NUMBERS
++ self.exit_reason_field = 'esr_ec'
+ self.exit_reasons = AARCH64_EXIT_REASONS
+
+ def debugfs_is_child(self, field):
+@@ -311,6 +314,7 @@ class ArchS390(Arch):
+ def __init__(self):
+ self.sc_perf_evt_open = 331
+ self.ioctl_numbers = IOCTL_NUMBERS
++ self.exit_reason_field = None
+ self.exit_reasons = None
+
+ def debugfs_is_child(self, field):
+@@ -541,8 +545,8 @@ class TracepointProvider(Provider):
+ """
+ filters = {}
+ filters['kvm_userspace_exit'] = ('reason', USERSPACE_EXIT_REASONS)
+- if ARCH.exit_reasons:
+- filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons)
++ if ARCH.exit_reason_field and ARCH.exit_reasons:
++ filters['kvm_exit'] = (ARCH.exit_reason_field, ARCH.exit_reasons)
+ return filters
+
+ def _get_available_fields(self):
--- /dev/null
+From 69503e585192fdd84b240f18a0873d20e18a2e0a Mon Sep 17 00:00:00 2001
+From: Vladis Dronov <vdronov@redhat.com>
+Date: Wed, 8 Jan 2020 13:53:47 +0100
+Subject: watchdog: fix UAF in reboot notifier handling in watchdog core code
+
+From: Vladis Dronov <vdronov@redhat.com>
+
+commit 69503e585192fdd84b240f18a0873d20e18a2e0a upstream.
+
+After the commit 44ea39420fc9 ("drivers/watchdog: make use of
+devm_register_reboot_notifier()") the struct notifier_block reboot_nb in
+the struct watchdog_device is removed from the reboot notifiers chain at
+the time watchdog's chardev is closed. But at least in i6300esb.c case
+reboot_nb is embedded in the struct esb_dev which can be freed on its
+device removal and before the chardev is closed, thus UAF at reboot:
+
+[ 7.728581] esb_probe: esb_dev.watchdog_device ffff91316f91ab28
+ts# uname -r note the address ^^^
+5.5.0-rc5-ae6088-wdog
+ts# ./openwdog0 &
+[1] 696
+ts# opened /dev/watchdog0, sleeping 10s...
+ts# echo 1 > /sys/devices/pci0000\:00/0000\:00\:09.0/remove
+[ 178.086079] devres:rel_nodes: dev ffff91317668a0b0 data ffff91316f91ab28
+ esb_dev.watchdog_device.reboot_nb memory is freed here ^^^
+ts# ...woken up
+[ 181.459010] devres:rel_nodes: dev ffff913171781000 data ffff913174a1dae8
+[ 181.460195] devm_unreg_reboot_notifier: res ffff913174a1dae8 nb ffff91316f91ab78
+ attempt to use memory already freed ^^^
+[ 181.461063] devm_unreg_reboot_notifier: nb->call 6b6b6b6b6b6b6b6b
+[ 181.461243] devm_unreg_reboot_notifier: nb->next 6b6b6b6b6b6b6b6b
+ freed memory is filled with a slub poison ^^^
+[1]+ Done ./openwdog0
+ts# reboot
+[ 229.921862] systemd-shutdown[1]: Rebooting.
+[ 229.939265] notifier_call_chain: nb ffffffff9c6c2f20 nb->next ffffffff9c6d50c0
+[ 229.943080] notifier_call_chain: nb ffffffff9c6d50c0 nb->next 6b6b6b6b6b6b6b6b
+[ 229.946054] notifier_call_chain: nb 6b6b6b6b6b6b6b6b INVAL
+[ 229.957584] general protection fault: 0000 [#1] SMP
+[ 229.958770] CPU: 0 PID: 1 Comm: systemd-shutdow Not tainted 5.5.0-rc5-ae6088-wdog
+[ 229.960224] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...
+[ 229.963288] RIP: 0010:notifier_call_chain+0x66/0xd0
+[ 229.969082] RSP: 0018:ffffb20dc0013d88 EFLAGS: 00010246
+[ 229.970812] RAX: 000000000000002e RBX: 6b6b6b6b6b6b6b6b RCX: 00000000000008b3
+[ 229.972929] RDX: 0000000000000000 RSI: 0000000000000096 RDI: ffffffff9ccc46ac
+[ 229.975028] RBP: 0000000000000001 R08: 0000000000000000 R09: 00000000000008b3
+[ 229.977039] R10: 0000000000000001 R11: ffffffff9c26c740 R12: 0000000000000000
+[ 229.979155] R13: 6b6b6b6b6b6b6b6b R14: 0000000000000000 R15: 00000000fffffffa
+... slub_debug=FZP poison ^^^
+[ 229.989089] Call Trace:
+[ 229.990157] blocking_notifier_call_chain+0x43/0x59
+[ 229.991401] kernel_restart_prepare+0x14/0x30
+[ 229.992607] kernel_restart+0x9/0x30
+[ 229.993800] __do_sys_reboot+0x1d2/0x210
+[ 230.000149] do_syscall_64+0x3d/0x130
+[ 230.001277] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+[ 230.002639] RIP: 0033:0x7f5461bdd177
+[ 230.016402] Modules linked in: i6300esb
+[ 230.050261] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
+
+Fix the crash by reverting 44ea39420fc9 so unregister_reboot_notifier()
+is called when the watchdog device is removed. This also unifies
+handling of the reboot notifier with that of the restart handler, which
+is freed with unregister_restart_handler() in the same place.
+
+Fixes: 44ea39420fc9 ("drivers/watchdog: make use of devm_register_reboot_notifier()")
+Cc: stable@vger.kernel.org # v4.15+
+Signed-off-by: Vladis Dronov <vdronov@redhat.com>
+Reviewed-by: Guenter Roeck <linux@roeck-us.net>
+Link: https://lore.kernel.org/r/20200108125347.6067-1-vdronov@redhat.com
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
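+
+The lifetime rule at the heart of the bug, as an illustrative sketch
+(my_dev and its remove path are made-up names): a notifier_block
+embedded in a structure must be removed from its chain no later than
+that structure is freed, so its unregistration cannot be tied to a
+different, longer-lived object such as the chardev.
+
+	struct my_dev {
+		struct notifier_block reboot_nb;	/* embedded */
+		/* ... */
+	};
+
+	static void my_dev_remove(struct my_dev *d)
+	{
+		/* must happen before the free below */
+		unregister_reboot_notifier(&d->reboot_nb);
+		kfree(d);
+	}
+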
+ drivers/watchdog/watchdog_core.c | 35 +++++++++++++++++++++++++++++++++++
+ drivers/watchdog/watchdog_dev.c | 36 +-----------------------------------
+ 2 files changed, 36 insertions(+), 35 deletions(-)
+
+--- a/drivers/watchdog/watchdog_core.c
++++ b/drivers/watchdog/watchdog_core.c
+@@ -147,6 +147,25 @@ int watchdog_init_timeout(struct watchdo
+ }
+ EXPORT_SYMBOL_GPL(watchdog_init_timeout);
+
++static int watchdog_reboot_notifier(struct notifier_block *nb,
++ unsigned long code, void *data)
++{
++ struct watchdog_device *wdd;
++
++ wdd = container_of(nb, struct watchdog_device, reboot_nb);
++ if (code == SYS_DOWN || code == SYS_HALT) {
++ if (watchdog_active(wdd)) {
++ int ret;
++
++ ret = wdd->ops->stop(wdd);
++ if (ret)
++ return NOTIFY_BAD;
++ }
++ }
++
++ return NOTIFY_DONE;
++}
++
+ static int watchdog_restart_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+ {
+@@ -235,6 +254,19 @@ static int __watchdog_register_device(st
+ }
+ }
+
++ if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) {
++ wdd->reboot_nb.notifier_call = watchdog_reboot_notifier;
++
++ ret = register_reboot_notifier(&wdd->reboot_nb);
++ if (ret) {
++ pr_err("watchdog%d: Cannot register reboot notifier (%d)\n",
++ wdd->id, ret);
++ watchdog_dev_unregister(wdd);
++ ida_simple_remove(&watchdog_ida, id);
++ return ret;
++ }
++ }
++
+ if (wdd->ops->restart) {
+ wdd->restart_nb.notifier_call = watchdog_restart_notifier;
+
+@@ -289,6 +321,9 @@ static void __watchdog_unregister_device
+ if (wdd->ops->restart)
+ unregister_restart_handler(&wdd->restart_nb);
+
++ if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status))
++ unregister_reboot_notifier(&wdd->reboot_nb);
++
+ watchdog_dev_unregister(wdd);
+ ida_simple_remove(&watchdog_ida, wdd->id);
+ }
+--- a/drivers/watchdog/watchdog_dev.c
++++ b/drivers/watchdog/watchdog_dev.c
+@@ -38,7 +38,6 @@
+ #include <linux/miscdevice.h> /* For handling misc devices */
+ #include <linux/module.h> /* For module stuff/... */
+ #include <linux/mutex.h> /* For mutexes */
+-#include <linux/reboot.h> /* For reboot notifier */
+ #include <linux/slab.h> /* For memory functions */
+ #include <linux/types.h> /* For standard types (like size_t) */
+ #include <linux/watchdog.h> /* For watchdog specific items */
+@@ -1077,25 +1076,6 @@ static void watchdog_cdev_unregister(str
+ put_device(&wd_data->dev);
+ }
+
+-static int watchdog_reboot_notifier(struct notifier_block *nb,
+- unsigned long code, void *data)
+-{
+- struct watchdog_device *wdd;
+-
+- wdd = container_of(nb, struct watchdog_device, reboot_nb);
+- if (code == SYS_DOWN || code == SYS_HALT) {
+- if (watchdog_active(wdd)) {
+- int ret;
+-
+- ret = wdd->ops->stop(wdd);
+- if (ret)
+- return NOTIFY_BAD;
+- }
+- }
+-
+- return NOTIFY_DONE;
+-}
+-
+ /*
+ * watchdog_dev_register: register a watchdog device
+ * @wdd: watchdog device
+@@ -1114,22 +1094,8 @@ int watchdog_dev_register(struct watchdo
+ return ret;
+
+ ret = watchdog_register_pretimeout(wdd);
+- if (ret) {
++ if (ret)
+ watchdog_cdev_unregister(wdd);
+- return ret;
+- }
+-
+- if (test_bit(WDOG_STOP_ON_REBOOT, &wdd->status)) {
+- wdd->reboot_nb.notifier_call = watchdog_reboot_notifier;
+-
+- ret = devm_register_reboot_notifier(&wdd->wd_data->dev,
+- &wdd->reboot_nb);
+- if (ret) {
+- pr_err("watchdog%d: Cannot register reboot notifier (%d)\n",
+- wdd->id, ret);
+- watchdog_dev_unregister(wdd);
+- }
+- }
+
+ return ret;
+ }
--- /dev/null
+From eda4eabf86fd6806eaabc23fb90dd056fdac037b Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Fri, 17 Jan 2020 14:49:31 +0100
+Subject: xen/balloon: Support xend-based toolstack take two
+
+From: Juergen Gross <jgross@suse.com>
+
+commit eda4eabf86fd6806eaabc23fb90dd056fdac037b upstream.
+
+Commit 3aa6c19d2f38be ("xen/balloon: Support xend-based toolstack")
+tried to fix a regression with running on rather ancient Xen versions.
+Unfortunately the fix was based on the assumption that xend would
+just use another Xenstore node, but in reality only some downstream
+versions of xend are doing that. The upstream xend does not write
+that Xenstore node at all, so the problem must be fixed in another
+way.
+
+The easiest way to achieve that is to fall back to the behavior
+before commit 96edd61dcf4436 ("xen/balloon: don't online new memory
+initially") in case the static memory maximum can't be read.
+
+This is achieved by setting static_max to the current number of
+memory pages known by the system resulting in target_diff becoming
+zero.
+
+Fixes: 3aa6c19d2f38be ("xen/balloon: Support xend-based toolstack")
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Cc: <stable@vger.kernel.org> # 4.13
+Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/xen/xen-balloon.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/xen/xen-balloon.c
++++ b/drivers/xen/xen-balloon.c
+@@ -94,7 +94,7 @@ static void watch_target(struct xenbus_w
+ "%llu", &static_max) == 1))
+ static_max >>= PAGE_SHIFT - 10;
+ else
+- static_max = new_target;
++ static_max = balloon_stats.current_pages;
+
+ target_diff = (xen_pv_domain() || xen_initial_domain()) ? 0
+ : static_max - balloon_stats.target_pages;