5.16-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>    Sun, 23 Jan 2022 17:18:27 +0000 (18:18 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>    Sun, 23 Jan 2022 17:18:27 +0000 (18:18 +0100)
added patches:
btrfs-add-extent-allocator-hook-to-decide-to-allocate-chunk-or-not.patch
btrfs-check-the-root-node-for-uptodate-before-returning-it.patch
btrfs-fix-deadlock-between-quota-enable-and-other-quota-operations.patch
btrfs-respect-the-max-size-in-the-header-when-activating-swap-file.patch
btrfs-zoned-cache-reported-zone-during-mount.patch
btrfs-zoned-fix-chunk-allocation-condition-for-zoned-allocator.patch
btrfs-zoned-unset-dedicated-block-group-on-allocation-failure.patch
ext4-fix-a-possible-abba-deadlock-due-to-busy-pa.patch
ext4-fix-fast-commit-may-miss-tracking-range-for-falloc_fl_zero_range.patch
ext4-initialize-err_blk-before-calling-__ext4_get_inode_loc.patch
ext4-make-sure-quota-gets-properly-shutdown-on-error.patch
ext4-make-sure-to-reset-inode-lockdep-class-when-quota-enabling-fails.patch

13 files changed:
queue-5.16/btrfs-add-extent-allocator-hook-to-decide-to-allocate-chunk-or-not.patch [new file with mode: 0644]
queue-5.16/btrfs-check-the-root-node-for-uptodate-before-returning-it.patch [new file with mode: 0644]
queue-5.16/btrfs-fix-deadlock-between-quota-enable-and-other-quota-operations.patch [new file with mode: 0644]
queue-5.16/btrfs-respect-the-max-size-in-the-header-when-activating-swap-file.patch [new file with mode: 0644]
queue-5.16/btrfs-zoned-cache-reported-zone-during-mount.patch [new file with mode: 0644]
queue-5.16/btrfs-zoned-fix-chunk-allocation-condition-for-zoned-allocator.patch [new file with mode: 0644]
queue-5.16/btrfs-zoned-unset-dedicated-block-group-on-allocation-failure.patch [new file with mode: 0644]
queue-5.16/ext4-fix-a-possible-abba-deadlock-due-to-busy-pa.patch [new file with mode: 0644]
queue-5.16/ext4-fix-fast-commit-may-miss-tracking-range-for-falloc_fl_zero_range.patch [new file with mode: 0644]
queue-5.16/ext4-initialize-err_blk-before-calling-__ext4_get_inode_loc.patch [new file with mode: 0644]
queue-5.16/ext4-make-sure-quota-gets-properly-shutdown-on-error.patch [new file with mode: 0644]
queue-5.16/ext4-make-sure-to-reset-inode-lockdep-class-when-quota-enabling-fails.patch [new file with mode: 0644]
queue-5.16/series

diff --git a/queue-5.16/btrfs-add-extent-allocator-hook-to-decide-to-allocate-chunk-or-not.patch b/queue-5.16/btrfs-add-extent-allocator-hook-to-decide-to-allocate-chunk-or-not.patch
new file mode 100644 (file)
index 0000000..1d24507
--- /dev/null
@@ -0,0 +1,59 @@
+From 50475cd57706359d6cc652be88369dace7a4c2eb Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Wed, 8 Dec 2021 00:35:48 +0900
+Subject: btrfs: add extent allocator hook to decide to allocate chunk or not
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 50475cd57706359d6cc652be88369dace7a4c2eb upstream.
+
+Introduce a new hook for an extent allocator policy. With the new
+hook, a policy can decide to allocate a new block group or not. If
+not, it will return -ENOSPC, so btrfs_reserve_extent() will cut the
+allocation size in half and retry the allocation if min_alloc_size is
+large enough.
+
+The hook is a placeholder for now and will be replaced with the real
+implementation in the next patch.
+
+CC: stable@vger.kernel.org # 5.16
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent-tree.c |   17 +++++++++++++++++
+ 1 file changed, 17 insertions(+)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -3947,6 +3947,19 @@ static void found_extent(struct find_fre
+       }
+ }
++static bool can_allocate_chunk(struct btrfs_fs_info *fs_info,
++                             struct find_free_extent_ctl *ffe_ctl)
++{
++      switch (ffe_ctl->policy) {
++      case BTRFS_EXTENT_ALLOC_CLUSTERED:
++              return true;
++      case BTRFS_EXTENT_ALLOC_ZONED:
++              return true;
++      default:
++              BUG();
++      }
++}
++
+ static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
+ {
+       switch (ffe_ctl->policy) {
+@@ -4034,6 +4047,10 @@ static int find_free_extent_update_loop(
+                       struct btrfs_trans_handle *trans;
+                       int exist = 0;
++                      /* Check if allocation policy allows to create a new chunk */
++                      if (!can_allocate_chunk(fs_info, ffe_ctl))
++                              return -ENOSPC;
++
+                       trans = current->journal_info;
+                       if (trans)
+                               exist = 1;
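
The commit message above describes the fallback in btrfs_reserve_extent(): when the allocation
hook rejects a new chunk with -ENOSPC, the requested size is cut in half and the allocation is
retried as long as it does not drop below min_alloc_size. The following is a minimal,
self-contained userspace C sketch of that retry pattern only; the function names and the
pretend allocator are invented for the example and are not kernel code.

#include <errno.h>
#include <stdio.h>

/* Pretend allocator: only requests of at most 'avail' bytes succeed. */
static int try_alloc(unsigned long long num_bytes, unsigned long long avail)
{
	return num_bytes <= avail ? 0 : -ENOSPC;
}

/* Halve-and-retry loop, mirroring the behaviour described above. */
static int reserve_extent(unsigned long long num_bytes,
			  unsigned long long min_alloc_size,
			  unsigned long long avail)
{
	for (;;) {
		int ret = try_alloc(num_bytes, avail);

		if (ret != -ENOSPC || num_bytes == min_alloc_size)
			return ret;
		/* Cut the allocation size in half, never below the minimum. */
		num_bytes >>= 1;
		if (num_bytes < min_alloc_size)
			num_bytes = min_alloc_size;
		printf("retrying with %llu bytes\n", num_bytes);
	}
}

int main(void)
{
	/* 1 MiB request, 64 KiB minimum, 100 KiB actually available. */
	return reserve_extent(1ULL << 20, 64ULL << 10, 100ULL << 10) ? 1 : 0;
}
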
diff --git a/queue-5.16/btrfs-check-the-root-node-for-uptodate-before-returning-it.patch b/queue-5.16/btrfs-check-the-root-node-for-uptodate-before-returning-it.patch
new file mode 100644 (file)
index 0000000..cb978f9
--- /dev/null
@@ -0,0 +1,68 @@
+From 120de408e4b97504a2d9b5ca534b383de2c73d49 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 24 Nov 2021 14:14:24 -0500
+Subject: btrfs: check the root node for uptodate before returning it
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 120de408e4b97504a2d9b5ca534b383de2c73d49 upstream.
+
+Now that we clear the extent buffer uptodate if we fail to write it out
+we need to check to see if our root node is uptodate before we search
+down it.  Otherwise we could return stale data (or potentially corrupt
+data that was caught by the write verification step) and think that the
+path is OK to search down.
+
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ctree.c |   19 +++++++++++++++----
+ 1 file changed, 15 insertions(+), 4 deletions(-)
+
+--- a/fs/btrfs/ctree.c
++++ b/fs/btrfs/ctree.c
+@@ -1570,12 +1570,9 @@ static struct extent_buffer *btrfs_searc
+ {
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct extent_buffer *b;
+-      int root_lock;
++      int root_lock = 0;
+       int level = 0;
+-      /* We try very hard to do read locks on the root */
+-      root_lock = BTRFS_READ_LOCK;
+-
+       if (p->search_commit_root) {
+               /*
+                * The commit roots are read only so we always do read locks,
+@@ -1613,6 +1610,9 @@ static struct extent_buffer *btrfs_searc
+               goto out;
+       }
++      /* We try very hard to do read locks on the root */
++      root_lock = BTRFS_READ_LOCK;
++
+       /*
+        * If the level is set to maximum, we can skip trying to get the read
+        * lock.
+@@ -1639,6 +1639,17 @@ static struct extent_buffer *btrfs_searc
+       level = btrfs_header_level(b);
+ out:
++      /*
++       * The root may have failed to write out at some point, and thus is no
++       * longer valid, return an error in this case.
++       */
++      if (!extent_buffer_uptodate(b)) {
++              if (root_lock)
++                      btrfs_tree_unlock_rw(b, root_lock);
++              free_extent_buffer(b);
++              return ERR_PTR(-EIO);
++      }
++
+       p->nodes[level] = b;
+       if (!p->skip_locking)
+               p->locks[level] = root_lock;
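
The fix above returns ERR_PTR(-EIO) instead of handing back a root node that failed to write
out. For readers unfamiliar with that idiom, here is a small userspace sketch of the
error-pointer convention the patch relies on: a negative errno is encoded in the returned
pointer so callers can tell an error apart from a valid object. The helpers mirror the kernel's
ERR_PTR/IS_ERR/PTR_ERR, but the buffer type and lookup function are invented for the example.

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct buffer { int uptodate; };

static struct buffer *lookup_root(struct buffer *root)
{
	/* Like the patched search path: reject a stale (not uptodate) root. */
	if (!root->uptodate)
		return ERR_PTR(-EIO);
	return root;
}

int main(void)
{
	struct buffer stale = { .uptodate = 0 };
	struct buffer *b = lookup_root(&stale);

	if (IS_ERR(b))
		printf("lookup failed: %ld\n", PTR_ERR(b));
	return 0;
}
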
diff --git a/queue-5.16/btrfs-fix-deadlock-between-quota-enable-and-other-quota-operations.patch b/queue-5.16/btrfs-fix-deadlock-between-quota-enable-and-other-quota-operations.patch
new file mode 100644 (file)
index 0000000..ac9b5cf
--- /dev/null
@@ -0,0 +1,128 @@
+From 232796df8c1437c41d308d161007f0715bac0a54 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 27 Oct 2021 18:30:25 +0100
+Subject: btrfs: fix deadlock between quota enable and other quota operations
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 232796df8c1437c41d308d161007f0715bac0a54 upstream.
+
+When enabling quotas, we attempt to commit a transaction while holding the
+mutex fs_info->qgroup_ioctl_lock. This can result in a deadlock with other
+quota operations such as:
+
+- qgroup creation and deletion, ioctl BTRFS_IOC_QGROUP_CREATE;
+
+- adding and removing qgroup relations, ioctl BTRFS_IOC_QGROUP_ASSIGN.
+
+This is because these operations join a transaction and after that they
+attempt to lock the mutex fs_info->qgroup_ioctl_lock. Acquiring that mutex
+after joining or starting a transaction is a pattern followed everywhere
+in qgroups, so the quota enablement operation is the one at fault here,
+and should not commit a transaction while holding that mutex.
+
+Fix this by making the transaction commit while not holding the mutex.
+We are safe from two concurrent tasks trying to enable quotas because
+we are serialized by the rw semaphore fs_info->subvol_sem at
+btrfs_ioctl_quota_ctl(), which is the only call site for enabling
+quotas.
+
+When this deadlock happens, it produces a trace like the following:
+
+  INFO: task syz-executor:25604 blocked for more than 143 seconds.
+  Not tainted 5.15.0-rc6 #4
+  "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+  task:syz-executor state:D stack:24800 pid:25604 ppid: 24873 flags:0x00004004
+  Call Trace:
+  context_switch kernel/sched/core.c:4940 [inline]
+  __schedule+0xcd9/0x2530 kernel/sched/core.c:6287
+  schedule+0xd3/0x270 kernel/sched/core.c:6366
+  btrfs_commit_transaction+0x994/0x2e90 fs/btrfs/transaction.c:2201
+  btrfs_quota_enable+0x95c/0x1790 fs/btrfs/qgroup.c:1120
+  btrfs_ioctl_quota_ctl fs/btrfs/ioctl.c:4229 [inline]
+  btrfs_ioctl+0x637e/0x7b70 fs/btrfs/ioctl.c:5010
+  vfs_ioctl fs/ioctl.c:51 [inline]
+  __do_sys_ioctl fs/ioctl.c:874 [inline]
+  __se_sys_ioctl fs/ioctl.c:860 [inline]
+  __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860
+  do_syscall_x64 arch/x86/entry/common.c:50 [inline]
+  do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
+  entry_SYSCALL_64_after_hwframe+0x44/0xae
+  RIP: 0033:0x7f86920b2c4d
+  RSP: 002b:00007f868f61ac58 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
+  RAX: ffffffffffffffda RBX: 00007f86921d90a0 RCX: 00007f86920b2c4d
+  RDX: 0000000020005e40 RSI: 00000000c0109428 RDI: 0000000000000008
+  RBP: 00007f869212bd80 R08: 0000000000000000 R09: 0000000000000000
+  R10: 0000000000000000 R11: 0000000000000246 R12: 00007f86921d90a0
+  R13: 00007fff6d233e4f R14: 00007fff6d233ff0 R15: 00007f868f61adc0
+  INFO: task syz-executor:25628 blocked for more than 143 seconds.
+  Not tainted 5.15.0-rc6 #4
+  "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+  task:syz-executor state:D stack:29080 pid:25628 ppid: 24873 flags:0x00004004
+  Call Trace:
+  context_switch kernel/sched/core.c:4940 [inline]
+  __schedule+0xcd9/0x2530 kernel/sched/core.c:6287
+  schedule+0xd3/0x270 kernel/sched/core.c:6366
+  schedule_preempt_disabled+0xf/0x20 kernel/sched/core.c:6425
+  __mutex_lock_common kernel/locking/mutex.c:669 [inline]
+  __mutex_lock+0xc96/0x1680 kernel/locking/mutex.c:729
+  btrfs_remove_qgroup+0xb7/0x7d0 fs/btrfs/qgroup.c:1548
+  btrfs_ioctl_qgroup_create fs/btrfs/ioctl.c:4333 [inline]
+  btrfs_ioctl+0x683c/0x7b70 fs/btrfs/ioctl.c:5014
+  vfs_ioctl fs/ioctl.c:51 [inline]
+  __do_sys_ioctl fs/ioctl.c:874 [inline]
+  __se_sys_ioctl fs/ioctl.c:860 [inline]
+  __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860
+  do_syscall_x64 arch/x86/entry/common.c:50 [inline]
+  do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
+  entry_SYSCALL_64_after_hwframe+0x44/0xae
+
+Reported-by: Hao Sun <sunhao.th@gmail.com>
+Link: https://lore.kernel.org/linux-btrfs/CACkBjsZQF19bQ1C6=yetF3BvL10OSORpFUcWXTP6HErshDB4dQ@mail.gmail.com/
+Fixes: 340f1aa27f36 ("btrfs: qgroups: Move transaction management inside btrfs_quota_enable/disable")
+CC: stable@vger.kernel.org # 4.19
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c |   19 +++++++++++++++++++
+ 1 file changed, 19 insertions(+)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -940,6 +940,14 @@ int btrfs_quota_enable(struct btrfs_fs_i
+       int ret = 0;
+       int slot;
++      /*
++       * We need to have subvol_sem write locked, to prevent races between
++       * concurrent tasks trying to enable quotas, because we will unlock
++       * and relock qgroup_ioctl_lock before setting fs_info->quota_root
++       * and before setting BTRFS_FS_QUOTA_ENABLED.
++       */
++      lockdep_assert_held_write(&fs_info->subvol_sem);
++
+       mutex_lock(&fs_info->qgroup_ioctl_lock);
+       if (fs_info->quota_root)
+               goto out;
+@@ -1117,8 +1125,19 @@ out_add_root:
+               goto out_free_path;
+       }
++      mutex_unlock(&fs_info->qgroup_ioctl_lock);
++      /*
++       * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
++       * a deadlock with tasks concurrently doing other qgroup operations, such
++       * adding/removing qgroups or adding/deleting qgroup relations for example,
++       * because all qgroup operations first start or join a transaction and then
++       * lock the qgroup_ioctl_lock mutex.
++       * We are safe from a concurrent task trying to enable quotas, by calling
++       * this function, since we are serialized by fs_info->subvol_sem.
++       */
+       ret = btrfs_commit_transaction(trans);
+       trans = NULL;
++      mutex_lock(&fs_info->qgroup_ioctl_lock);
+       if (ret)
+               goto out_free_path;
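
The fix above moves the blocking transaction commit outside qgroup_ioctl_lock while relying on
the outer subvol_sem to keep concurrent quota-enable callers serialized. Below is a small
pthreads sketch of that lock-ordering pattern only; the lock names follow the commit message,
but the program and its steps are invented for the example and are not kernel code.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t subvol_sem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t qgroup_ioctl_lock = PTHREAD_MUTEX_INITIALIZER;

static void commit_transaction(void)
{
	/* Stand-in for the blocking btrfs_commit_transaction() call. */
	printf("committing transaction without qgroup_ioctl_lock held\n");
}

static void quota_enable(void)
{
	pthread_rwlock_wrlock(&subvol_sem);    /* serializes quota enablers */
	pthread_mutex_lock(&qgroup_ioctl_lock);

	/* ... set up the quota tree ... */

	pthread_mutex_unlock(&qgroup_ioctl_lock);
	commit_transaction();                   /* may block on other tasks */
	pthread_mutex_lock(&qgroup_ioctl_lock);

	/* ... mark quotas enabled ... */

	pthread_mutex_unlock(&qgroup_ioctl_lock);
	pthread_rwlock_unlock(&subvol_sem);
}

int main(void)
{
	quota_enable();
	return 0;
}
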
diff --git a/queue-5.16/btrfs-respect-the-max-size-in-the-header-when-activating-swap-file.patch b/queue-5.16/btrfs-respect-the-max-size-in-the-header-when-activating-swap-file.patch
new file mode 100644 (file)
index 0000000..8d468e8
--- /dev/null
@@ -0,0 +1,65 @@
+From c2f822635df873c510bda6fb7fd1b10b7c31be2d Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 16 Dec 2021 15:00:32 +0000
+Subject: btrfs: respect the max size in the header when activating swap file
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit c2f822635df873c510bda6fb7fd1b10b7c31be2d upstream.
+
+If we extended the size of a swapfile after its header was created (by the
+mkswap utility) and then try to activate it, we will map the entire file
+when activating the swap file, instead of limiting to the max size defined
+in the swap file's header.
+
+Currently test case generic/643 from fstests fails because we do not
+respect that size limit defined in the swap file's header.
+
+So fix this by not mapping file ranges beyond the max size defined in the
+swap header.
+
+This is the same type of bug that iomap used to have, and was fixed in
+commit 36ca7943ac18ae ("mm/swap: consider max pages in
+iomap_swapfile_add_extent").
+
+Fixes: ed46ff3d423780 ("Btrfs: support swap files")
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-and-tested-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/inode.c |   11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -10595,9 +10595,19 @@ static int btrfs_add_swap_extent(struct
+                                struct btrfs_swap_info *bsi)
+ {
+       unsigned long nr_pages;
++      unsigned long max_pages;
+       u64 first_ppage, first_ppage_reported, next_ppage;
+       int ret;
++      /*
++       * Our swapfile may have had its size extended after the swap header was
++       * written. In that case activating the swapfile should not go beyond
++       * the max size set in the swap header.
++       */
++      if (bsi->nr_pages >= sis->max)
++              return 0;
++
++      max_pages = sis->max - bsi->nr_pages;
+       first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
+       next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
+                               PAGE_SIZE) >> PAGE_SHIFT;
+@@ -10605,6 +10615,7 @@ static int btrfs_add_swap_extent(struct
+       if (first_ppage >= next_ppage)
+               return 0;
+       nr_pages = next_ppage - first_ppage;
++      nr_pages = min(nr_pages, max_pages);
+       first_ppage_reported = first_ppage;
+       if (bsi->start == 0)
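
The capping arithmetic the patch adds is small enough to show on its own: never map more pages
than the swap header allows (sis->max), taking into account pages already added
(bsi->nr_pages). The sketch below is a self-contained userspace illustration of just that
arithmetic; the variable names follow the patch, but the surrounding program is invented.

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long sis_max = 1024;      /* max pages from the swap header */
	unsigned long bsi_nr_pages = 1000; /* pages already mapped */
	unsigned long extent_pages = 100;  /* pages in the next file extent */
	unsigned long max_pages, nr_pages;

	if (bsi_nr_pages >= sis_max) {
		printf("swap header limit already reached, stop\n");
		return 0;
	}

	max_pages = sis_max - bsi_nr_pages;         /* room left under the header */
	nr_pages = min_ul(extent_pages, max_pages); /* cap the extent */
	printf("mapping %lu of %lu pages\n", nr_pages, extent_pages);
	return 0;
}
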
diff --git a/queue-5.16/btrfs-zoned-cache-reported-zone-during-mount.patch b/queue-5.16/btrfs-zoned-cache-reported-zone-during-mount.patch
new file mode 100644 (file)
index 0000000..c3fdb57
--- /dev/null
@@ -0,0 +1,291 @@
+From 16beac87e95e2fb278b552397c8260637f8a63f7 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Thu, 11 Nov 2021 14:14:38 +0900
+Subject: btrfs: zoned: cache reported zone during mount
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 16beac87e95e2fb278b552397c8260637f8a63f7 upstream.
+
+When mounting a device, we are reporting the zones twice: once for
+checking the zone attributes in btrfs_get_dev_zone_info and once for
+loading block groups' zone info in
+btrfs_load_block_group_zone_info(). With a lot of block groups, that
+leads to a lot of REPORT ZONE commands and slows down the mount
+process.
+
+This patch introduces a zone info cache in struct
+btrfs_zoned_device_info. The cache is populated while in
+btrfs_get_dev_zone_info() and used for
+btrfs_load_block_group_zone_info() to reduce the number of REPORT ZONE
+commands. The zone cache is then released after loading the block
+groups, as it is not very effective during run time.
+
+Benchmark: Mount an HDD with 57,007 block groups
+Before patch: 171.368 seconds
+After patch: 64.064 seconds
+
+While it still takes a minute due to the slowness of loading all the
+block groups, the patch reduces the mount time to roughly a third.
+
+Link: https://lore.kernel.org/linux-btrfs/CAHQ7scUiLtcTqZOMMY5kbWUBOhGRwKo6J6wYPT5WY+C=cD49nQ@mail.gmail.com/
+Fixes: 5b316468983d ("btrfs: get zone information of zoned block devices")
+CC: stable@vger.kernel.org
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/dev-replace.c |    2 -
+ fs/btrfs/disk-io.c     |    2 +
+ fs/btrfs/volumes.c     |    2 -
+ fs/btrfs/zoned.c       |   86 +++++++++++++++++++++++++++++++++++++++++++------
+ fs/btrfs/zoned.h       |    8 +++-
+ 5 files changed, 87 insertions(+), 13 deletions(-)
+
+--- a/fs/btrfs/dev-replace.c
++++ b/fs/btrfs/dev-replace.c
+@@ -322,7 +322,7 @@ static int btrfs_init_dev_replace_tgtdev
+       set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+       device->fs_devices = fs_info->fs_devices;
+-      ret = btrfs_get_dev_zone_info(device);
++      ret = btrfs_get_dev_zone_info(device, false);
+       if (ret)
+               goto error;
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -3571,6 +3571,8 @@ int __cold open_ctree(struct super_block
+               goto fail_sysfs;
+       }
++      btrfs_free_zone_cache(fs_info);
++
+       if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
+           !btrfs_check_rw_degradable(fs_info, NULL)) {
+               btrfs_warn(fs_info,
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -2643,7 +2643,7 @@ int btrfs_init_new_device(struct btrfs_f
+       device->fs_info = fs_info;
+       device->bdev = bdev;
+-      ret = btrfs_get_dev_zone_info(device);
++      ret = btrfs_get_dev_zone_info(device, false);
+       if (ret)
+               goto error_free_device;
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -5,6 +5,7 @@
+ #include <linux/blkdev.h>
+ #include <linux/sched/mm.h>
+ #include <linux/atomic.h>
++#include <linux/vmalloc.h>
+ #include "ctree.h"
+ #include "volumes.h"
+ #include "zoned.h"
+@@ -213,6 +214,8 @@ static int emulate_report_zones(struct b
+ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
+                              struct blk_zone *zones, unsigned int *nr_zones)
+ {
++      struct btrfs_zoned_device_info *zinfo = device->zone_info;
++      u32 zno;
+       int ret;
+       if (!*nr_zones)
+@@ -224,6 +227,34 @@ static int btrfs_get_dev_zones(struct bt
+               return 0;
+       }
++      /* Check cache */
++      if (zinfo->zone_cache) {
++              unsigned int i;
++
++              ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
++              zno = pos >> zinfo->zone_size_shift;
++              /*
++               * We cannot report zones beyond the zone end. So, it is OK to
++               * cap *nr_zones to at the end.
++               */
++              *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
++
++              for (i = 0; i < *nr_zones; i++) {
++                      struct blk_zone *zone_info;
++
++                      zone_info = &zinfo->zone_cache[zno + i];
++                      if (!zone_info->len)
++                              break;
++              }
++
++              if (i == *nr_zones) {
++                      /* Cache hit on all the zones */
++                      memcpy(zones, zinfo->zone_cache + zno,
++                             sizeof(*zinfo->zone_cache) * *nr_zones);
++                      return 0;
++              }
++      }
++
+       ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
+                                 copy_zone_info_cb, zones);
+       if (ret < 0) {
+@@ -237,6 +268,11 @@ static int btrfs_get_dev_zones(struct bt
+       if (!ret)
+               return -EIO;
++      /* Populate cache */
++      if (zinfo->zone_cache)
++              memcpy(zinfo->zone_cache + zno, zones,
++                     sizeof(*zinfo->zone_cache) * *nr_zones);
++
+       return 0;
+ }
+@@ -300,7 +336,7 @@ int btrfs_get_dev_zone_info_all_devices(
+               if (!device->bdev)
+                       continue;
+-              ret = btrfs_get_dev_zone_info(device);
++              ret = btrfs_get_dev_zone_info(device, true);
+               if (ret)
+                       break;
+       }
+@@ -309,7 +345,7 @@ int btrfs_get_dev_zone_info_all_devices(
+       return ret;
+ }
+-int btrfs_get_dev_zone_info(struct btrfs_device *device)
++int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
+ {
+       struct btrfs_fs_info *fs_info = device->fs_info;
+       struct btrfs_zoned_device_info *zone_info = NULL;
+@@ -339,6 +375,8 @@ int btrfs_get_dev_zone_info(struct btrfs
+       if (!zone_info)
+               return -ENOMEM;
++      device->zone_info = zone_info;
++
+       if (!bdev_is_zoned(bdev)) {
+               if (!fs_info->zone_size) {
+                       ret = calculate_emulated_zone_size(fs_info);
+@@ -407,6 +445,23 @@ int btrfs_get_dev_zone_info(struct btrfs
+               goto out;
+       }
++      /*
++       * Enable zone cache only for a zoned device. On a non-zoned device, we
++       * fill the zone info with emulated CONVENTIONAL zones, so no need to
++       * use the cache.
++       */
++      if (populate_cache && bdev_is_zoned(device->bdev)) {
++              zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
++                                              zone_info->nr_zones);
++              if (!zone_info->zone_cache) {
++                      btrfs_err_in_rcu(device->fs_info,
++                              "zoned: failed to allocate zone cache for %s",
++                              rcu_str_deref(device->name));
++                      ret = -ENOMEM;
++                      goto out;
++              }
++      }
++
+       /* Get zones type */
+       nactive = 0;
+       while (sector < nr_sectors) {
+@@ -505,8 +560,6 @@ int btrfs_get_dev_zone_info(struct btrfs
+       kfree(zones);
+-      device->zone_info = zone_info;
+-
+       switch (bdev_zoned_model(bdev)) {
+       case BLK_ZONED_HM:
+               model = "host-managed zoned";
+@@ -539,11 +592,7 @@ int btrfs_get_dev_zone_info(struct btrfs
+ out:
+       kfree(zones);
+ out_free_zone_info:
+-      bitmap_free(zone_info->active_zones);
+-      bitmap_free(zone_info->empty_zones);
+-      bitmap_free(zone_info->seq_zones);
+-      kfree(zone_info);
+-      device->zone_info = NULL;
++      btrfs_destroy_dev_zone_info(device);
+       return ret;
+ }
+@@ -558,6 +607,7 @@ void btrfs_destroy_dev_zone_info(struct
+       bitmap_free(zone_info->active_zones);
+       bitmap_free(zone_info->seq_zones);
+       bitmap_free(zone_info->empty_zones);
++      vfree(zone_info->zone_cache);
+       kfree(zone_info);
+       device->zone_info = NULL;
+ }
+@@ -1975,3 +2025,21 @@ void btrfs_clear_data_reloc_bg(struct bt
+               fs_info->data_reloc_bg = 0;
+       spin_unlock(&fs_info->relocation_bg_lock);
+ }
++
++void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
++{
++      struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
++      struct btrfs_device *device;
++
++      if (!btrfs_is_zoned(fs_info))
++              return;
++
++      mutex_lock(&fs_devices->device_list_mutex);
++      list_for_each_entry(device, &fs_devices->devices, dev_list) {
++              if (device->zone_info) {
++                      vfree(device->zone_info->zone_cache);
++                      device->zone_info->zone_cache = NULL;
++              }
++      }
++      mutex_unlock(&fs_devices->device_list_mutex);
++}
+--- a/fs/btrfs/zoned.h
++++ b/fs/btrfs/zoned.h
+@@ -28,6 +28,7 @@ struct btrfs_zoned_device_info {
+       unsigned long *seq_zones;
+       unsigned long *empty_zones;
+       unsigned long *active_zones;
++      struct blk_zone *zone_cache;
+       struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
+ };
+@@ -35,7 +36,7 @@ struct btrfs_zoned_device_info {
+ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+                      struct blk_zone *zone);
+ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
+-int btrfs_get_dev_zone_info(struct btrfs_device *device);
++int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
+ void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
+ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
+ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
+@@ -76,6 +77,7 @@ bool btrfs_can_activate_zone(struct btrf
+ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+                            u64 length);
+ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
++void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
+ #else /* CONFIG_BLK_DEV_ZONED */
+ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+                                    struct blk_zone *zone)
+@@ -88,7 +90,8 @@ static inline int btrfs_get_dev_zone_inf
+       return 0;
+ }
+-static inline int btrfs_get_dev_zone_info(struct btrfs_device *device)
++static inline int btrfs_get_dev_zone_info(struct btrfs_device *device,
++                                        bool populate_cache)
+ {
+       return 0;
+ }
+@@ -232,6 +235,7 @@ static inline void btrfs_zone_finish_end
+ static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
++static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
+ #endif
+ static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
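
The zone-cache lookup added by the patch above boils down to two steps: derive the zone number
from the byte position via the zone size shift, and treat the cache as a hit only if every
requested entry has been populated (len != 0). The following userspace sketch shows that logic
under simplified assumptions; the struct, sizes, and function name are stand-ins, not the
kernel API.

#include <stdio.h>
#include <string.h>

struct zone { unsigned long long len; };

#define ZONE_SIZE_SHIFT 28U            /* 256 MiB zones */
#define NR_ZONES        16U

static struct zone zone_cache[NR_ZONES];

/* Returns 1 on a full cache hit, 0 if the caller must issue REPORT ZONES. */
static int lookup_zones(unsigned long long pos, unsigned int nr,
			struct zone *out)
{
	unsigned int zno = (unsigned int)(pos >> ZONE_SIZE_SHIFT);
	unsigned int i;

	if (nr > NR_ZONES - zno)       /* clamp at the end of the device */
		nr = NR_ZONES - zno;

	for (i = 0; i < nr; i++)
		if (!zone_cache[zno + i].len)
			return 0;      /* at least one entry missing */

	memcpy(out, &zone_cache[zno], sizeof(*out) * nr);
	return 1;
}

int main(void)
{
	struct zone out[4];

	zone_cache[2].len = 1ULL << ZONE_SIZE_SHIFT;  /* only zone 2 cached */
	printf("hit=%d\n", lookup_zones(2ULL << ZONE_SIZE_SHIFT, 4, out));
	return 0;
}
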
diff --git a/queue-5.16/btrfs-zoned-fix-chunk-allocation-condition-for-zoned-allocator.patch b/queue-5.16/btrfs-zoned-fix-chunk-allocation-condition-for-zoned-allocator.patch
new file mode 100644 (file)
index 0000000..f4353f7
--- /dev/null
@@ -0,0 +1,144 @@
+From 82187d2ecdfb22ab7ee05f388402a39236d31428 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Wed, 8 Dec 2021 00:35:49 +0900
+Subject: btrfs: zoned: fix chunk allocation condition for zoned allocator
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 82187d2ecdfb22ab7ee05f388402a39236d31428 upstream.
+
+The ZNS specification defines a limit on the number of "active"
+zones. That limit forces us to limit the number of block groups which
+can be used for an allocation at the same time. To avoid exceeding the
+limit, we reuse the existing active block groups as much as possible
+when we can't activate any other zones without sacrificing an already
+activated block group, as done in commit a85f05e59bc1 ("btrfs: zoned:
+avoid chunk allocation if active block group has enough space").
+
+However, the check is wrong in two ways. First, it checks the
+condition for every raid index (ffe_ctl->index). Even if it reaches
+the condition and "ffe_ctl->max_extent_size >=
+ffe_ctl->min_alloc_size" is met, there can be other block groups
+having enough space to hold ffe_ctl->num_bytes. (Actually, this won't
+happen in the current zoned code as it only supports SINGLE
+profile. But, it can happen once it enables other RAID types.)
+
+Second, it checks the active zone availability depending on the
+raid index. The raid index is just an index for
+space_info->block_groups, so it has nothing to do with chunk allocation.
+
+These mistakes cause a faulty allocation in a certain situation.
+Consider running zoned btrfs on a device whose max_active_zone == 0
+(no limit), and suppose no block group has room to fit
+ffe_ctl->num_bytes but some have room to meet ffe_ctl->min_alloc_size
+(i.e. max_extent_size > num_bytes >= min_alloc_size).
+
+In this situation, the following occur:
+
+- With SINGLE raid_index, it reaches the chunk allocation checking
+  code
+- The check returns true because we can activate a new zone (no limit)
+- But, before allocating the chunk, it iterates to the next raid index
+  (RAID5)
+- Since there are no RAID5 block groups on zoned mode, it again
+  reaches the check code
+- The check returns false because of btrfs_can_activate_zone()'s "if
+  (raid_index != BTRFS_RAID_SINGLE)" part
+- That results in returning -ENOSPC without allocating a new chunk
+
+As a result, we end up hitting -ENOSPC too early.
+
+Move the check to the right place in the can_allocate_chunk() hook,
+and do the active zone check depending on the allocation flag, not on
+the raid index.
+
+CC: stable@vger.kernel.org # 5.16
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent-tree.c |   21 +++++++++------------
+ fs/btrfs/zoned.c       |    5 ++---
+ fs/btrfs/zoned.h       |    5 ++---
+ 3 files changed, 13 insertions(+), 18 deletions(-)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -3966,6 +3966,15 @@ static bool can_allocate_chunk(struct bt
+       case BTRFS_EXTENT_ALLOC_CLUSTERED:
+               return true;
+       case BTRFS_EXTENT_ALLOC_ZONED:
++              /*
++               * If we have enough free space left in an already
++               * active block group and we can't activate any other
++               * zone now, do not allow allocating a new chunk and
++               * let find_free_extent() retry with a smaller size.
++               */
++              if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
++                  !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
++                      return false;
+               return true;
+       default:
+               BUG();
+@@ -4012,18 +4021,6 @@ static int find_free_extent_update_loop(
+               return 0;
+       }
+-      if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
+-          !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->index)) {
+-              /*
+-               * If we have enough free space left in an already active block
+-               * group and we can't activate any other zone now, retry the
+-               * active ones with a smaller allocation size.  Returning early
+-               * from here will tell btrfs_reserve_extent() to haven the
+-               * size.
+-               */
+-              return -ENOSPC;
+-      }
+-
+       if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
+               return 1;
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -1934,7 +1934,7 @@ int btrfs_zone_finish(struct btrfs_block
+       return ret;
+ }
+-bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index)
++bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
+ {
+       struct btrfs_device *device;
+       bool ret = false;
+@@ -1943,8 +1943,7 @@ bool btrfs_can_activate_zone(struct btrf
+               return true;
+       /* Non-single profiles are not supported yet */
+-      if (raid_index != BTRFS_RAID_SINGLE)
+-              return false;
++      ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0);
+       /* Check if there is a device with active zones left */
+       mutex_lock(&fs_devices->device_list_mutex);
+--- a/fs/btrfs/zoned.h
++++ b/fs/btrfs/zoned.h
+@@ -72,8 +72,7 @@ struct btrfs_device *btrfs_zoned_get_dev
+                                           u64 logical, u64 length);
+ bool btrfs_zone_activate(struct btrfs_block_group *block_group);
+ int btrfs_zone_finish(struct btrfs_block_group *block_group);
+-bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+-                           int raid_index);
++bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
+ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+                            u64 length);
+ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
+@@ -225,7 +224,7 @@ static inline int btrfs_zone_finish(stru
+ }
+ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+-                                         int raid_index)
++                                         u64 flags)
+ {
+       return true;
+ }
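
Combined with the previous patch, the decision now lives in the allocation hook: whether a new
chunk may be allocated depends only on the allocation policy and the target profile, not on
which raid index the caller happens to be iterating. The sketch below restates that combined
logic as a small self-contained C program; the enum, struct, and helper are simplified
stand-ins for the kernel types, and can_activate_zone() is hard-wired for the example.

#include <stdbool.h>
#include <stdio.h>

enum alloc_policy { ALLOC_CLUSTERED, ALLOC_ZONED };

struct ffe_ctl {
	enum alloc_policy policy;
	unsigned long long max_extent_size;
	unsigned long long min_alloc_size;
};

/* Stand-in for btrfs_can_activate_zone(): are any active zones left? */
static bool can_activate_zone(void)
{
	return false;
}

static bool can_allocate_chunk(const struct ffe_ctl *ffe_ctl)
{
	switch (ffe_ctl->policy) {
	case ALLOC_CLUSTERED:
		return true;
	case ALLOC_ZONED:
		/*
		 * Enough space left in an already active block group and no
		 * new zone can be activated: retry with a smaller size
		 * instead of allocating a new chunk.
		 */
		if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
		    !can_activate_zone())
			return false;
		return true;
	}
	return false;
}

int main(void)
{
	struct ffe_ctl ctl = { ALLOC_ZONED, 1 << 20, 1 << 16 };

	printf("allocate new chunk: %s\n",
	       can_allocate_chunk(&ctl) ? "yes" : "no");
	return 0;
}
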
diff --git a/queue-5.16/btrfs-zoned-unset-dedicated-block-group-on-allocation-failure.patch b/queue-5.16/btrfs-zoned-unset-dedicated-block-group-on-allocation-failure.patch
new file mode 100644 (file)
index 0000000..ed2dc2d
--- /dev/null
@@ -0,0 +1,72 @@
+From 1ada69f61c88abb75a1038ee457633325658a183 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Wed, 8 Dec 2021 00:35:47 +0900
+Subject: btrfs: zoned: unset dedicated block group on allocation failure
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 1ada69f61c88abb75a1038ee457633325658a183 upstream.
+
+Allocating an extent from a block group can fail for various reasons.
+When an allocation from a dedicated block group (for tree-log or
+relocation data) fails, we need to unregister it as a dedicated one so
+that we can allocate a new block group for the dedicated one.
+
+However, we are returning early in case the block group is read-only,
+fully used, or the zone cannot be activated. As a result, we keep the
+unusable block group as a dedicated one, leading to further allocation
+failures. With many block groups, the allocator will iterate a hopeless
+loop to find a free extent, which results in a hung task.
+
+Fix the issue by delaying the return and doing the proper cleanups.
+
+CC: stable@vger.kernel.org # 5.16
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent-tree.c |   20 ++++++++++++++++----
+ 1 file changed, 16 insertions(+), 4 deletions(-)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -3790,23 +3790,35 @@ static int do_allocation_zoned(struct bt
+       spin_unlock(&fs_info->relocation_bg_lock);
+       if (skip)
+               return 1;
++
+       /* Check RO and no space case before trying to activate it */
+       spin_lock(&block_group->lock);
+       if (block_group->ro ||
+           block_group->alloc_offset == block_group->zone_capacity) {
+-              spin_unlock(&block_group->lock);
+-              return 1;
++              ret = 1;
++              /*
++               * May need to clear fs_info->{treelog,data_reloc}_bg.
++               * Return the error after taking the locks.
++               */
+       }
+       spin_unlock(&block_group->lock);
+-      if (!btrfs_zone_activate(block_group))
+-              return 1;
++      if (!ret && !btrfs_zone_activate(block_group)) {
++              ret = 1;
++              /*
++               * May need to clear fs_info->{treelog,data_reloc}_bg.
++               * Return the error after taking the locks.
++               */
++      }
+       spin_lock(&space_info->lock);
+       spin_lock(&block_group->lock);
+       spin_lock(&fs_info->treelog_bg_lock);
+       spin_lock(&fs_info->relocation_bg_lock);
++      if (ret)
++              goto out;
++
+       ASSERT(!ffe_ctl->for_treelog ||
+              block_group->start == fs_info->treelog_bg ||
+              fs_info->treelog_bg == 0);
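
The control-flow change above follows a common pattern: instead of returning as soon as the
block group turns out to be unusable, remember the outcome, take the locks needed for cleanup,
clear the "dedicated" pointer, and only then return. Here is a stripped-down userspace sketch
of that pattern under invented names and types; it is not the kernel code, only the shape of
the fix.

#include <pthread.h>
#include <stdio.h>

struct fs_info {
	pthread_mutex_t treelog_bg_lock;
	unsigned long long treelog_bg;  /* dedicated tree-log block group */
};

struct block_group { int ro; unsigned long long start; };

static int do_allocation(struct fs_info *fs, struct block_group *bg)
{
	int ret = 0;

	if (bg->ro)
		ret = 1;   /* unusable, but do NOT return yet */

	/* Cleanup must happen under the same lock the fast path uses. */
	pthread_mutex_lock(&fs->treelog_bg_lock);
	if (ret && fs->treelog_bg == bg->start)
		fs->treelog_bg = 0;  /* unregister the dedicated block group */
	pthread_mutex_unlock(&fs->treelog_bg_lock);

	return ret;
}

int main(void)
{
	struct fs_info fs = { PTHREAD_MUTEX_INITIALIZER, 4096 };
	struct block_group bg = { .ro = 1, .start = 4096 };

	printf("ret=%d treelog_bg=%llu\n", do_allocation(&fs, &bg), fs.treelog_bg);
	return 0;
}
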
diff --git a/queue-5.16/ext4-fix-a-possible-abba-deadlock-due-to-busy-pa.patch b/queue-5.16/ext4-fix-a-possible-abba-deadlock-due-to-busy-pa.patch
new file mode 100644 (file)
index 0000000..f347f0d
--- /dev/null
@@ -0,0 +1,154 @@
+From 8c80fb312d7abf8bcd66cca1d843a80318a2c522 Mon Sep 17 00:00:00 2001
+From: Chunguang Xu <brookxu@tencent.com>
+Date: Tue, 23 Nov 2021 09:17:57 +0800
+Subject: ext4: fix a possible ABBA deadlock due to busy PA
+
+From: Chunguang Xu <brookxu@tencent.com>
+
+commit 8c80fb312d7abf8bcd66cca1d843a80318a2c522 upstream.
+
+We found on an older kernel (3.10) that in the scenario of insufficient
+disk space, the system may trigger an ABBA deadlock. It seems that this
+problem still exists in the latest kernel, so try to fix it here. The
+deadlock works as follows: task A occupies the PA and waits for the jbd2
+transaction to finish, the jbd2 transaction waits for the completion of
+task B's IO (plug_list), but task B waits for task A to release the PA
+so it can finish the discard, which indirectly forms an ABBA deadlock.
+The related calltrace is as follows:
+
+    Task A
+    vfs_write
+    ext4_mb_new_blocks()
+    ext4_mb_mark_diskspace_used()       JBD2
+    jbd2_journal_get_write_access()  -> jbd2_journal_commit_transaction()
+  ->schedule()                          filemap_fdatawait()
+ |                                              |
+ | Task B                                       |
+ | do_unlinkat()                                |
+ | ext4_evict_inode()                           |
+ | jbd2_journal_begin_ordered_truncate()        |
+ | filemap_fdatawrite_range()                   |
+ | ext4_mb_new_blocks()                         |
+  -ext4_mb_discard_group_preallocations() <-----
+
+Here, try to cancel ext4_mb_discard_group_preallocations() internal
+retry due to PA busy, and do a limited number of retries inside
+ext4_mb_discard_preallocations(), which can circumvent the above
+problems, but also has some advantages:
+
+1. Since the PA is in a busy state, if other groups have free PAs,
+   keeping the current PA may help to reduce fragmentation.
+2. Continue to traverse forward instead of waiting for the current
+   group PA to be released. In most scenarios, the PA discard time
+   can be reduced.
+
+However, in the case of smaller free space, if only a few groups have
+space, then due to multiple traversals of the group, it may increase
+CPU overhead. But in contrast, I feel that the overall benefit is
+better than the cost.
+
+Signed-off-by: Chunguang Xu <brookxu@tencent.com>
+Reported-by: kernel test robot <lkp@intel.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Link: https://lore.kernel.org/r/1637630277-23496-1-git-send-email-brookxu.cn@gmail.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Cc: stable@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/mballoc.c |   40 ++++++++++++++++++----------------------
+ 1 file changed, 18 insertions(+), 22 deletions(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -4814,7 +4814,7 @@ ext4_mb_release_group_pa(struct ext4_bud
+  */
+ static noinline_for_stack int
+ ext4_mb_discard_group_preallocations(struct super_block *sb,
+-                                      ext4_group_t group, int needed)
++                                   ext4_group_t group, int *busy)
+ {
+       struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+       struct buffer_head *bitmap_bh = NULL;
+@@ -4822,8 +4822,7 @@ ext4_mb_discard_group_preallocations(str
+       struct list_head list;
+       struct ext4_buddy e4b;
+       int err;
+-      int busy = 0;
+-      int free, free_total = 0;
++      int free = 0;
+       mb_debug(sb, "discard preallocation for group %u\n", group);
+       if (list_empty(&grp->bb_prealloc_list))
+@@ -4846,19 +4845,14 @@ ext4_mb_discard_group_preallocations(str
+               goto out_dbg;
+       }
+-      if (needed == 0)
+-              needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
+-
+       INIT_LIST_HEAD(&list);
+-repeat:
+-      free = 0;
+       ext4_lock_group(sb, group);
+       list_for_each_entry_safe(pa, tmp,
+                               &grp->bb_prealloc_list, pa_group_list) {
+               spin_lock(&pa->pa_lock);
+               if (atomic_read(&pa->pa_count)) {
+                       spin_unlock(&pa->pa_lock);
+-                      busy = 1;
++                      *busy = 1;
+                       continue;
+               }
+               if (pa->pa_deleted) {
+@@ -4898,22 +4892,13 @@ repeat:
+               call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+       }
+-      free_total += free;
+-
+-      /* if we still need more blocks and some PAs were used, try again */
+-      if (free_total < needed && busy) {
+-              ext4_unlock_group(sb, group);
+-              cond_resched();
+-              busy = 0;
+-              goto repeat;
+-      }
+       ext4_unlock_group(sb, group);
+       ext4_mb_unload_buddy(&e4b);
+       put_bh(bitmap_bh);
+ out_dbg:
+       mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
+-               free_total, group, grp->bb_free);
+-      return free_total;
++               free, group, grp->bb_free);
++      return free;
+ }
+ /*
+@@ -5455,13 +5440,24 @@ static int ext4_mb_discard_preallocation
+ {
+       ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+       int ret;
+-      int freed = 0;
++      int freed = 0, busy = 0;
++      int retry = 0;
+       trace_ext4_mb_discard_preallocations(sb, needed);
++
++      if (needed == 0)
++              needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
++ repeat:
+       for (i = 0; i < ngroups && needed > 0; i++) {
+-              ret = ext4_mb_discard_group_preallocations(sb, i, needed);
++              ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
+               freed += ret;
+               needed -= ret;
++              cond_resched();
++      }
++
++      if (needed > 0 && busy && ++retry < 3) {
++              busy = 0;
++              goto repeat;
+       }
+       return freed;
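
The retry structure after the patch is worth spelling out: each per-group scan no longer loops
on its own busy PAs; it only reports "busy" to the caller, and the caller re-walks all groups a
bounded number of times. The following is a small self-contained C sketch of that structure
with invented data and a pretend per-group discard function; it illustrates the control flow,
not the ext4 implementation.

#include <stdio.h>

#define NGROUPS 4

/* Pretend per-group discard: frees some blocks, may report busy PAs. */
static int discard_group(int group, int *busy)
{
	if (group == 2) {      /* group 2 has a PA still in use */
		*busy = 1;
		return 0;
	}
	return 8;              /* blocks freed in this group */
}

static int discard_preallocations(int needed)
{
	int freed = 0, busy = 0, retry = 0;

repeat:
	for (int i = 0; i < NGROUPS && needed > 0; i++) {
		int ret = discard_group(i, &busy);

		freed += ret;
		needed -= ret;
	}
	/* Bounded retry across all groups instead of spinning on one busy group. */
	if (needed > 0 && busy && ++retry < 3) {
		busy = 0;
		goto repeat;
	}
	return freed;
}

int main(void)
{
	printf("freed %d blocks\n", discard_preallocations(100));
	return 0;
}
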
diff --git a/queue-5.16/ext4-fix-fast-commit-may-miss-tracking-range-for-falloc_fl_zero_range.patch b/queue-5.16/ext4-fix-fast-commit-may-miss-tracking-range-for-falloc_fl_zero_range.patch
new file mode 100644 (file)
index 0000000..a8c3fac
--- /dev/null
@@ -0,0 +1,54 @@
+From 5e4d0eba1ccaf19f93222abdeda5a368be141785 Mon Sep 17 00:00:00 2001
+From: Xin Yin <yinxin.x@bytedance.com>
+Date: Tue, 21 Dec 2021 10:28:39 +0800
+Subject: ext4: fix fast commit may miss tracking range for FALLOC_FL_ZERO_RANGE
+
+From: Xin Yin <yinxin.x@bytedance.com>
+
+commit 5e4d0eba1ccaf19f93222abdeda5a368be141785 upstream.
+
+When calling fallocate with FALLOC_FL_ZERO_RANGE to set an already
+initialized range to unwritten, and the range is aligned to the block
+size, fast commit will not track the range for this change.
+
+Also track the range for unwritten extents in ext4_map_blocks().
+
+Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
+Reviewed-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+Link: https://lore.kernel.org/r/20211221022839.374606-1-yinxin.x@bytedance.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Cc: stable@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/extents.c |    2 --
+ fs/ext4/inode.c   |    7 ++++---
+ 2 files changed, 4 insertions(+), 5 deletions(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -4647,8 +4647,6 @@ static long ext4_zero_range(struct file
+       ret = ext4_mark_inode_dirty(handle, inode);
+       if (unlikely(ret))
+               goto out_handle;
+-      ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits,
+-                      (offset + len - 1) >> inode->i_sb->s_blocksize_bits);
+       /* Zero out partial block at the edges of the range */
+       ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+       if (ret >= 0)
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -741,10 +741,11 @@ out_sem:
+                       if (ret)
+                               return ret;
+               }
+-              ext4_fc_track_range(handle, inode, map->m_lblk,
+-                          map->m_lblk + map->m_len - 1);
+       }
+-
++      if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
++                              map->m_flags & EXT4_MAP_MAPPED))
++              ext4_fc_track_range(handle, inode, map->m_lblk,
++                                      map->m_lblk + map->m_len - 1);
+       if (retval < 0)
+               ext_debug(inode, "failed with err %d\n", retval);
+       return retval;
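
The range tracked for fast commit is expressed in filesystem blocks, and the conversion used in
the hunks above is simple shift arithmetic: the start byte and the last byte of the range are
each shifted right by the block size bits to get an inclusive block range. A tiny illustration
with made-up values:

#include <stdio.h>

int main(void)
{
	unsigned int blocksize_bits = 12;          /* 4 KiB blocks */
	unsigned long long offset = 8192;          /* byte offset of the range */
	unsigned long long len = 10000;            /* byte length of the range */

	unsigned long long first_blk = offset >> blocksize_bits;
	unsigned long long last_blk = (offset + len - 1) >> blocksize_bits;

	/* Bytes 8192..18191 cover blocks 2..4 with 4 KiB blocks. */
	printf("track blocks %llu..%llu\n", first_blk, last_blk);
	return 0;
}
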
diff --git a/queue-5.16/ext4-initialize-err_blk-before-calling-__ext4_get_inode_loc.patch b/queue-5.16/ext4-initialize-err_blk-before-calling-__ext4_get_inode_loc.patch
new file mode 100644 (file)
index 0000000..3c19045
--- /dev/null
@@ -0,0 +1,43 @@
+From c27c29c6af4f3f4ce925a2111c256733c5a5b430 Mon Sep 17 00:00:00 2001
+From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+Date: Wed, 1 Dec 2021 08:34:21 -0800
+Subject: ext4: initialize err_blk before calling __ext4_get_inode_loc
+
+From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+
+commit c27c29c6af4f3f4ce925a2111c256733c5a5b430 upstream.
+
+It is not guaranteed that __ext4_get_inode_loc will set the err_blk
+pointer when it returns EIO. To avoid using an uninitialized variable,
+let's first set err_blk to 0.
+
+Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
+Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+Link: https://lore.kernel.org/r/20211201163421.2631661-1-harshads@google.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Cc: stable@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/inode.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -4523,7 +4523,7 @@ has_buffer:
+ static int __ext4_get_inode_loc_noinmem(struct inode *inode,
+                                       struct ext4_iloc *iloc)
+ {
+-      ext4_fsblk_t err_blk;
++      ext4_fsblk_t err_blk = 0;
+       int ret;
+       ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc,
+@@ -4538,7 +4538,7 @@ static int __ext4_get_inode_loc_noinmem(
+ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
+ {
+-      ext4_fsblk_t err_blk;
++      ext4_fsblk_t err_blk = 0;
+       int ret;
+       ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc,
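
The bug class fixed here is easy to demonstrate outside the kernel: an out-parameter that the
callee only sets on some error paths must be initialized by the caller, otherwise the caller
may read an indeterminate value. The functions and values below are invented purely for the
illustration.

#include <stdio.h>

/* Sets *err_blk only for one kind of failure, like __ext4_get_inode_loc(). */
static int get_inode_loc(int fail_mode, unsigned long long *err_blk)
{
	if (fail_mode == 1) {
		*err_blk = 12345;  /* block that failed to read */
		return -5;         /* -EIO */
	}
	if (fail_mode == 2)
		return -5;         /* -EIO, but *err_blk is left untouched */
	return 0;
}

int main(void)
{
	unsigned long long err_blk = 0;  /* the fix: start from a known value */
	int ret = get_inode_loc(2, &err_blk);

	if (ret)
		printf("read failed, err_blk=%llu\n", err_blk); /* prints 0, not garbage */
	return 0;
}
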
diff --git a/queue-5.16/ext4-make-sure-quota-gets-properly-shutdown-on-error.patch b/queue-5.16/ext4-make-sure-quota-gets-properly-shutdown-on-error.patch
new file mode 100644 (file)
index 0000000..19993a9
--- /dev/null
@@ -0,0 +1,51 @@
+From 15fc69bbbbbc8c72e5f6cc4e1be0f51283c5448e Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Thu, 7 Oct 2021 17:53:35 +0200
+Subject: ext4: make sure quota gets properly shutdown on error
+
+From: Jan Kara <jack@suse.cz>
+
+commit 15fc69bbbbbc8c72e5f6cc4e1be0f51283c5448e upstream.
+
+When we hit an error when enabling quotas and setting inode flags, we do
+not properly shut down the quota subsystem despite returning an error
+from the Q_QUOTAON quotactl. This can lead to some odd situations, like
+the kernel using the quota file while it is still writeable by
+userspace. Make sure we properly clean up the quota subsystem in case
+of error.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Cc: stable@kernel.org
+Link: https://lore.kernel.org/r/20211007155336.12493-2-jack@suse.cz
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/super.c |   10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -6275,10 +6275,7 @@ static int ext4_quota_on(struct super_bl
+       lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
+       err = dquot_quota_on(sb, type, format_id, path);
+-      if (err) {
+-              lockdep_set_quota_inode(path->dentry->d_inode,
+-                                           I_DATA_SEM_NORMAL);
+-      } else {
++      if (!err) {
+               struct inode *inode = d_inode(path->dentry);
+               handle_t *handle;
+@@ -6298,7 +6295,12 @@ static int ext4_quota_on(struct super_bl
+               ext4_journal_stop(handle);
+       unlock_inode:
+               inode_unlock(inode);
++              if (err)
++                      dquot_quota_off(sb, type);
+       }
++      if (err)
++              lockdep_set_quota_inode(path->dentry->d_inode,
++                                           I_DATA_SEM_NORMAL);
+       return err;
+ }
diff --git a/queue-5.16/ext4-make-sure-to-reset-inode-lockdep-class-when-quota-enabling-fails.patch b/queue-5.16/ext4-make-sure-to-reset-inode-lockdep-class-when-quota-enabling-fails.patch
new file mode 100644 (file)
index 0000000..16123e1
--- /dev/null
@@ -0,0 +1,49 @@
+From 4013d47a5307fdb5c13370b5392498b00fedd274 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Thu, 7 Oct 2021 17:53:36 +0200
+Subject: ext4: make sure to reset inode lockdep class when quota enabling fails
+
+From: Jan Kara <jack@suse.cz>
+
+commit 4013d47a5307fdb5c13370b5392498b00fedd274 upstream.
+
+When we succeed in enabling some quota type but fail to enable another
+one with the quota feature, we correctly disable all enabled quota types.
+However we forget to reset i_data_sem lockdep class. When the inode gets
+freed and reused, it will inherit this lockdep class (i_data_sem is
+initialized only when a slab is created) and thus eventually lockdep
+barfs about possible deadlocks.
+
+Reported-and-tested-by: syzbot+3b6f9218b1301ddda3e2@syzkaller.appspotmail.com
+Signed-off-by: Jan Kara <jack@suse.cz>
+Cc: stable@kernel.org
+Link: https://lore.kernel.org/r/20211007155336.12493-3-jack@suse.cz
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/super.c |   13 ++++++++++++-
+ 1 file changed, 12 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -6361,8 +6361,19 @@ int ext4_enable_quotas(struct super_bloc
+                                       "Failed to enable quota tracking "
+                                       "(type=%d, err=%d). Please run "
+                                       "e2fsck to fix.", type, err);
+-                              for (type--; type >= 0; type--)
++                              for (type--; type >= 0; type--) {
++                                      struct inode *inode;
++
++                                      inode = sb_dqopt(sb)->files[type];
++                                      if (inode)
++                                              inode = igrab(inode);
+                                       dquot_quota_off(sb, type);
++                                      if (inode) {
++                                              lockdep_set_quota_inode(inode,
++                                                      I_DATA_SEM_NORMAL);
++                                              iput(inode);
++                                      }
++                              }
+                               return err;
+                       }
diff --git a/queue-5.16/series b/queue-5.16/series
index 3eb224c967cb72697350831889e4820bbf72e8d8..fdcb0a26cada9e23c72776926567586804ccd03f 100644 (file)
@@ -879,3 +879,15 @@ pci-pci-bridge-emul-correctly-set-pcie-capabilities.patch
 pci-pci-bridge-emul-set-pci_status_cap_list-for-pcie-device.patch
 xfrm-fix-policy-lookup-for-ipv6-gre-packets.patch
 xfrm-fix-dflt-policy-check-when-there-is-no-policy-configured.patch
+btrfs-fix-deadlock-between-quota-enable-and-other-quota-operations.patch
+btrfs-zoned-cache-reported-zone-during-mount.patch
+btrfs-check-the-root-node-for-uptodate-before-returning-it.patch
+btrfs-add-extent-allocator-hook-to-decide-to-allocate-chunk-or-not.patch
+btrfs-zoned-unset-dedicated-block-group-on-allocation-failure.patch
+btrfs-zoned-fix-chunk-allocation-condition-for-zoned-allocator.patch
+btrfs-respect-the-max-size-in-the-header-when-activating-swap-file.patch
+ext4-make-sure-to-reset-inode-lockdep-class-when-quota-enabling-fails.patch
+ext4-make-sure-quota-gets-properly-shutdown-on-error.patch
+ext4-fix-a-possible-abba-deadlock-due-to-busy-pa.patch
+ext4-initialize-err_blk-before-calling-__ext4_get_inode_loc.patch
+ext4-fix-fast-commit-may-miss-tracking-range-for-falloc_fl_zero_range.patch