--- /dev/null
+From 50475cd57706359d6cc652be88369dace7a4c2eb Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Wed, 8 Dec 2021 00:35:48 +0900
+Subject: btrfs: add extent allocator hook to decide to allocate chunk or not
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 50475cd57706359d6cc652be88369dace7a4c2eb upstream.
+
+Introduce a new hook for the extent allocator policy. With the new
+hook, a policy can decide whether or not to allocate a new block
+group. If not, find_free_extent() returns -ENOSPC, so
+btrfs_reserve_extent() will cut the allocation size in half and retry
+the allocation if min_alloc_size is large enough.
+
+The hook has a placeholder implementation, which will be replaced with
+the real one in the next patch.
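+
+As a rough sketch (not the literal kernel code; the wrapper name and
+details are illustrative only), the retry in btrfs_reserve_extent()
+that consumes the -ENOSPC behaves like:
+
+  u64 cur = num_bytes;
+
+  for (;;) {
+      ret = try_find_free_extent(cur);   /* hypothetical wrapper */
+      if (ret != -ENOSPC || cur <= min_alloc_size)
+          break;
+      /* Halve the request, but never go below min_alloc_size */
+      cur = max_t(u64, cur >> 1, min_alloc_size);
+  }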
+
+CC: stable@vger.kernel.org # 5.16
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent-tree.c | 17 +++++++++++++++++
+ 1 file changed, 17 insertions(+)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -3947,6 +3947,19 @@ static void found_extent(struct find_fre
+ }
+ }
+
++static bool can_allocate_chunk(struct btrfs_fs_info *fs_info,
++ struct find_free_extent_ctl *ffe_ctl)
++{
++ switch (ffe_ctl->policy) {
++ case BTRFS_EXTENT_ALLOC_CLUSTERED:
++ return true;
++ case BTRFS_EXTENT_ALLOC_ZONED:
++ return true;
++ default:
++ BUG();
++ }
++}
++
+ static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
+ {
+ switch (ffe_ctl->policy) {
+@@ -4034,6 +4047,10 @@ static int find_free_extent_update_loop(
+ struct btrfs_trans_handle *trans;
+ int exist = 0;
+
++ /* Check if allocation policy allows to create a new chunk */
++ if (!can_allocate_chunk(fs_info, ffe_ctl))
++ return -ENOSPC;
++
+ trans = current->journal_info;
+ if (trans)
+ exist = 1;
--- /dev/null
+From 120de408e4b97504a2d9b5ca534b383de2c73d49 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 24 Nov 2021 14:14:24 -0500
+Subject: btrfs: check the root node for uptodate before returning it
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 120de408e4b97504a2d9b5ca534b383de2c73d49 upstream.
+
+Now that we clear the extent buffer's uptodate flag if we fail to write
+it out, we need to check whether our root node is uptodate before we
+search down it. Otherwise we could return stale data (or potentially
+corrupt data that was caught by the write verification step) and think
+that the path is OK to search down.
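+
+Since the helper now hands back an error pointer, its caller
+(btrfs_search_slot()) consumes the result with the usual
+IS_ERR()/PTR_ERR() idiom, roughly (illustrative sketch, caller details
+elided):
+
+  b = btrfs_search_slot_get_root(root, p, write_lock_level);
+  if (IS_ERR(b))
+      return PTR_ERR(b);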
+
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ctree.c | 19 +++++++++++++++----
+ 1 file changed, 15 insertions(+), 4 deletions(-)
+
+--- a/fs/btrfs/ctree.c
++++ b/fs/btrfs/ctree.c
+@@ -1570,12 +1570,9 @@ static struct extent_buffer *btrfs_searc
+ {
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_buffer *b;
+- int root_lock;
++ int root_lock = 0;
+ int level = 0;
+
+- /* We try very hard to do read locks on the root */
+- root_lock = BTRFS_READ_LOCK;
+-
+ if (p->search_commit_root) {
+ /*
+ * The commit roots are read only so we always do read locks,
+@@ -1613,6 +1610,9 @@ static struct extent_buffer *btrfs_searc
+ goto out;
+ }
+
++ /* We try very hard to do read locks on the root */
++ root_lock = BTRFS_READ_LOCK;
++
+ /*
+ * If the level is set to maximum, we can skip trying to get the read
+ * lock.
+@@ -1639,6 +1639,17 @@ static struct extent_buffer *btrfs_searc
+ level = btrfs_header_level(b);
+
+ out:
++ /*
++ * The root may have failed to write out at some point, and thus is no
++ * longer valid, return an error in this case.
++ */
++ if (!extent_buffer_uptodate(b)) {
++ if (root_lock)
++ btrfs_tree_unlock_rw(b, root_lock);
++ free_extent_buffer(b);
++ return ERR_PTR(-EIO);
++ }
++
+ p->nodes[level] = b;
+ if (!p->skip_locking)
+ p->locks[level] = root_lock;
--- /dev/null
+From 232796df8c1437c41d308d161007f0715bac0a54 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 27 Oct 2021 18:30:25 +0100
+Subject: btrfs: fix deadlock between quota enable and other quota operations
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 232796df8c1437c41d308d161007f0715bac0a54 upstream.
+
+When enabling quotas, we attempt to commit a transaction while holding the
+mutex fs_info->qgroup_ioctl_lock. This can result in a deadlock with other
+quota operations such as:
+
+- qgroup creation and deletion, ioctl BTRFS_IOC_QGROUP_CREATE;
+
+- adding and removing qgroup relations, ioctl BTRFS_IOC_QGROUP_ASSIGN.
+
+This is because these operations join a transaction and after that they
+attempt to lock the mutex fs_info->qgroup_ioctl_lock. Acquiring that mutex
+after joining or starting a transaction is a pattern followed everywhere
+in qgroups, so the quota enablement operation is the one at fault here,
+and should not commit a transaction while holding that mutex.
+
+Fix this by making the transaction commit while not holding the mutex.
+We are safe from two concurrent tasks trying to enable quotas because
+we are serialized by the rw semaphore fs_info->subvol_sem at
+btrfs_ioctl_quota_ctl(), which is the only call site for enabling
+quotas.
+
+When this deadlock happens, it produces a trace like the following:
+
+ INFO: task syz-executor:25604 blocked for more than 143 seconds.
+ Not tainted 5.15.0-rc6 #4
+ "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+ task:syz-executor state:D stack:24800 pid:25604 ppid: 24873 flags:0x00004004
+ Call Trace:
+ context_switch kernel/sched/core.c:4940 [inline]
+ __schedule+0xcd9/0x2530 kernel/sched/core.c:6287
+ schedule+0xd3/0x270 kernel/sched/core.c:6366
+ btrfs_commit_transaction+0x994/0x2e90 fs/btrfs/transaction.c:2201
+ btrfs_quota_enable+0x95c/0x1790 fs/btrfs/qgroup.c:1120
+ btrfs_ioctl_quota_ctl fs/btrfs/ioctl.c:4229 [inline]
+ btrfs_ioctl+0x637e/0x7b70 fs/btrfs/ioctl.c:5010
+ vfs_ioctl fs/ioctl.c:51 [inline]
+ __do_sys_ioctl fs/ioctl.c:874 [inline]
+ __se_sys_ioctl fs/ioctl.c:860 [inline]
+ __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860
+ do_syscall_x64 arch/x86/entry/common.c:50 [inline]
+ do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
+ entry_SYSCALL_64_after_hwframe+0x44/0xae
+ RIP: 0033:0x7f86920b2c4d
+ RSP: 002b:00007f868f61ac58 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
+ RAX: ffffffffffffffda RBX: 00007f86921d90a0 RCX: 00007f86920b2c4d
+ RDX: 0000000020005e40 RSI: 00000000c0109428 RDI: 0000000000000008
+ RBP: 00007f869212bd80 R08: 0000000000000000 R09: 0000000000000000
+ R10: 0000000000000000 R11: 0000000000000246 R12: 00007f86921d90a0
+ R13: 00007fff6d233e4f R14: 00007fff6d233ff0 R15: 00007f868f61adc0
+ INFO: task syz-executor:25628 blocked for more than 143 seconds.
+ Not tainted 5.15.0-rc6 #4
+ "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+ task:syz-executor state:D stack:29080 pid:25628 ppid: 24873 flags:0x00004004
+ Call Trace:
+ context_switch kernel/sched/core.c:4940 [inline]
+ __schedule+0xcd9/0x2530 kernel/sched/core.c:6287
+ schedule+0xd3/0x270 kernel/sched/core.c:6366
+ schedule_preempt_disabled+0xf/0x20 kernel/sched/core.c:6425
+ __mutex_lock_common kernel/locking/mutex.c:669 [inline]
+ __mutex_lock+0xc96/0x1680 kernel/locking/mutex.c:729
+ btrfs_remove_qgroup+0xb7/0x7d0 fs/btrfs/qgroup.c:1548
+ btrfs_ioctl_qgroup_create fs/btrfs/ioctl.c:4333 [inline]
+ btrfs_ioctl+0x683c/0x7b70 fs/btrfs/ioctl.c:5014
+ vfs_ioctl fs/ioctl.c:51 [inline]
+ __do_sys_ioctl fs/ioctl.c:874 [inline]
+ __se_sys_ioctl fs/ioctl.c:860 [inline]
+ __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860
+ do_syscall_x64 arch/x86/entry/common.c:50 [inline]
+ do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
+ entry_SYSCALL_64_after_hwframe+0x44/0xae
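+
+Condensed, the ordering used by the fix below is simply:
+
+  mutex_unlock(&fs_info->qgroup_ioctl_lock);
+  ret = btrfs_commit_transaction(trans);  /* may sleep waiting on other tasks */
+  mutex_lock(&fs_info->qgroup_ioctl_lock);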
+
+Reported-by: Hao Sun <sunhao.th@gmail.com>
+Link: https://lore.kernel.org/linux-btrfs/CACkBjsZQF19bQ1C6=yetF3BvL10OSORpFUcWXTP6HErshDB4dQ@mail.gmail.com/
+Fixes: 340f1aa27f36 ("btrfs: qgroups: Move transaction management inside btrfs_quota_enable/disable")
+CC: stable@vger.kernel.org # 4.19
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c | 19 +++++++++++++++++++
+ 1 file changed, 19 insertions(+)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -940,6 +940,14 @@ int btrfs_quota_enable(struct btrfs_fs_i
+ int ret = 0;
+ int slot;
+
++ /*
++ * We need to have subvol_sem write locked, to prevent races between
++ * concurrent tasks trying to enable quotas, because we will unlock
++ * and relock qgroup_ioctl_lock before setting fs_info->quota_root
++ * and before setting BTRFS_FS_QUOTA_ENABLED.
++ */
++ lockdep_assert_held_write(&fs_info->subvol_sem);
++
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (fs_info->quota_root)
+ goto out;
+@@ -1117,8 +1125,19 @@ out_add_root:
+ goto out_free_path;
+ }
+
++ mutex_unlock(&fs_info->qgroup_ioctl_lock);
++ /*
++ * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
++ * a deadlock with tasks concurrently doing other qgroup operations, such
++ * as adding/removing qgroups or adding/deleting qgroup relations for example,
++ * because all qgroup operations first start or join a transaction and then
++ * lock the qgroup_ioctl_lock mutex.
++ * We are safe from a concurrent task trying to enable quotas, by calling
++ * this function, since we are serialized by fs_info->subvol_sem.
++ */
+ ret = btrfs_commit_transaction(trans);
+ trans = NULL;
++ mutex_lock(&fs_info->qgroup_ioctl_lock);
+ if (ret)
+ goto out_free_path;
+
--- /dev/null
+From c2f822635df873c510bda6fb7fd1b10b7c31be2d Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 16 Dec 2021 15:00:32 +0000
+Subject: btrfs: respect the max size in the header when activating swap file
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit c2f822635df873c510bda6fb7fd1b10b7c31be2d upstream.
+
+If we extend the size of a swapfile after its header was created (by the
+mkswap utility) and then try to activate it, we will map the entire file
+when activating the swap file, instead of limiting the mapping to the max
+size defined in the swap file's header.
+
+Currently test case generic/643 from fstests fails because we do not
+respect that size limit defined in the swap file's header.
+
+So fix this by not mapping file ranges beyond the max size defined in the
+swap header.
+
+This is the same type of bug that iomap used to have, and was fixed in
+commit 36ca7943ac18ae ("mm/swap: consider max pages in
+iomap_swapfile_add_extent").
+
+Fixes: ed46ff3d423780 ("Btrfs: support swap files")
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-and-tested-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/inode.c | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -10595,9 +10595,19 @@ static int btrfs_add_swap_extent(struct
+ struct btrfs_swap_info *bsi)
+ {
+ unsigned long nr_pages;
++ unsigned long max_pages;
+ u64 first_ppage, first_ppage_reported, next_ppage;
+ int ret;
+
++ /*
++ * Our swapfile may have had its size extended after the swap header was
++ * written. In that case activating the swapfile should not go beyond
++ * the max size set in the swap header.
++ */
++ if (bsi->nr_pages >= sis->max)
++ return 0;
++
++ max_pages = sis->max - bsi->nr_pages;
+ first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
+ next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
+ PAGE_SIZE) >> PAGE_SHIFT;
+@@ -10605,6 +10615,7 @@ static int btrfs_add_swap_extent(struct
+ if (first_ppage >= next_ppage)
+ return 0;
+ nr_pages = next_ppage - first_ppage;
++ nr_pages = min(nr_pages, max_pages);
+
+ first_ppage_reported = first_ppage;
+ if (bsi->start == 0)
--- /dev/null
+From 16beac87e95e2fb278b552397c8260637f8a63f7 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Thu, 11 Nov 2021 14:14:38 +0900
+Subject: btrfs: zoned: cache reported zone during mount
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 16beac87e95e2fb278b552397c8260637f8a63f7 upstream.
+
+When mounting a device, we are reporting the zones twice: once for
+checking the zone attributes in btrfs_get_dev_zone_info and once for
+loading block groups' zone info in
+btrfs_load_block_group_zone_info(). With a lot of block groups, that
+leads to a lot of REPORT ZONE commands and slows down the mount
+process.
+
+This patch introduces a zone info cache in struct
+btrfs_zoned_device_info. The cache is populated in
+btrfs_get_dev_zone_info() and used by
+btrfs_load_block_group_zone_info() to reduce the number of REPORT ZONE
+commands. The zone cache is then released after the block groups are
+loaded, as it is not very effective at run time.
+
+Benchmark: Mount an HDD with 57,007 block groups
+Before patch: 171.368 seconds
+After patch: 64.064 seconds
+
+While it still takes about a minute due to the slowness of loading all
+the block groups, the patch reduces the mount time to roughly 1/3 of
+what it was.
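+
+Condensed, the lookup in btrfs_get_dev_zones() roughly becomes the
+following (cache_covers() is a hypothetical stand-in for the per-zone
+validity loop spelled out in the diff below):
+
+  if (zinfo->zone_cache && cache_covers(zinfo, zno, *nr_zones)) {
+      /* cache hit: return the cached blk_zone entries */
+      memcpy(zones, zinfo->zone_cache + zno,
+             sizeof(*zinfo->zone_cache) * *nr_zones);
+      return 0;
+  }
+  ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
+                            copy_zone_info_cb, zones);
+  if (zinfo->zone_cache)  /* populate the cache for later callers */
+      memcpy(zinfo->zone_cache + zno, zones,
+             sizeof(*zinfo->zone_cache) * *nr_zones);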
+
+Link: https://lore.kernel.org/linux-btrfs/CAHQ7scUiLtcTqZOMMY5kbWUBOhGRwKo6J6wYPT5WY+C=cD49nQ@mail.gmail.com/
+Fixes: 5b316468983d ("btrfs: get zone information of zoned block devices")
+CC: stable@vger.kernel.org
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/dev-replace.c | 2 -
+ fs/btrfs/disk-io.c | 2 +
+ fs/btrfs/volumes.c | 2 -
+ fs/btrfs/zoned.c | 86 +++++++++++++++++++++++++++++++++++++++++++------
+ fs/btrfs/zoned.h | 8 +++-
+ 5 files changed, 87 insertions(+), 13 deletions(-)
+
+--- a/fs/btrfs/dev-replace.c
++++ b/fs/btrfs/dev-replace.c
+@@ -322,7 +322,7 @@ static int btrfs_init_dev_replace_tgtdev
+ set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+ device->fs_devices = fs_info->fs_devices;
+
+- ret = btrfs_get_dev_zone_info(device);
++ ret = btrfs_get_dev_zone_info(device, false);
+ if (ret)
+ goto error;
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -3571,6 +3571,8 @@ int __cold open_ctree(struct super_block
+ goto fail_sysfs;
+ }
+
++ btrfs_free_zone_cache(fs_info);
++
+ if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
+ !btrfs_check_rw_degradable(fs_info, NULL)) {
+ btrfs_warn(fs_info,
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -2643,7 +2643,7 @@ int btrfs_init_new_device(struct btrfs_f
+ device->fs_info = fs_info;
+ device->bdev = bdev;
+
+- ret = btrfs_get_dev_zone_info(device);
++ ret = btrfs_get_dev_zone_info(device, false);
+ if (ret)
+ goto error_free_device;
+
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -5,6 +5,7 @@
+ #include <linux/blkdev.h>
+ #include <linux/sched/mm.h>
+ #include <linux/atomic.h>
++#include <linux/vmalloc.h>
+ #include "ctree.h"
+ #include "volumes.h"
+ #include "zoned.h"
+@@ -213,6 +214,8 @@ static int emulate_report_zones(struct b
+ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zones, unsigned int *nr_zones)
+ {
++ struct btrfs_zoned_device_info *zinfo = device->zone_info;
++ u32 zno;
+ int ret;
+
+ if (!*nr_zones)
+@@ -224,6 +227,34 @@ static int btrfs_get_dev_zones(struct bt
+ return 0;
+ }
+
++ /* Check cache */
++ if (zinfo->zone_cache) {
++ unsigned int i;
++
++ ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
++ zno = pos >> zinfo->zone_size_shift;
++ /*
++ * We cannot report zones beyond the zone end. So, it is OK to
++ * cap *nr_zones to at the end.
++ */
++ *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
++
++ for (i = 0; i < *nr_zones; i++) {
++ struct blk_zone *zone_info;
++
++ zone_info = &zinfo->zone_cache[zno + i];
++ if (!zone_info->len)
++ break;
++ }
++
++ if (i == *nr_zones) {
++ /* Cache hit on all the zones */
++ memcpy(zones, zinfo->zone_cache + zno,
++ sizeof(*zinfo->zone_cache) * *nr_zones);
++ return 0;
++ }
++ }
++
+ ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
+ copy_zone_info_cb, zones);
+ if (ret < 0) {
+@@ -237,6 +268,11 @@ static int btrfs_get_dev_zones(struct bt
+ if (!ret)
+ return -EIO;
+
++ /* Populate cache */
++ if (zinfo->zone_cache)
++ memcpy(zinfo->zone_cache + zno, zones,
++ sizeof(*zinfo->zone_cache) * *nr_zones);
++
+ return 0;
+ }
+
+@@ -300,7 +336,7 @@ int btrfs_get_dev_zone_info_all_devices(
+ if (!device->bdev)
+ continue;
+
+- ret = btrfs_get_dev_zone_info(device);
++ ret = btrfs_get_dev_zone_info(device, true);
+ if (ret)
+ break;
+ }
+@@ -309,7 +345,7 @@ int btrfs_get_dev_zone_info_all_devices(
+ return ret;
+ }
+
+-int btrfs_get_dev_zone_info(struct btrfs_device *device)
++int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
+ {
+ struct btrfs_fs_info *fs_info = device->fs_info;
+ struct btrfs_zoned_device_info *zone_info = NULL;
+@@ -339,6 +375,8 @@ int btrfs_get_dev_zone_info(struct btrfs
+ if (!zone_info)
+ return -ENOMEM;
+
++ device->zone_info = zone_info;
++
+ if (!bdev_is_zoned(bdev)) {
+ if (!fs_info->zone_size) {
+ ret = calculate_emulated_zone_size(fs_info);
+@@ -407,6 +445,23 @@ int btrfs_get_dev_zone_info(struct btrfs
+ goto out;
+ }
+
++ /*
++ * Enable zone cache only for a zoned device. On a non-zoned device, we
++ * fill the zone info with emulated CONVENTIONAL zones, so no need to
++ * use the cache.
++ */
++ if (populate_cache && bdev_is_zoned(device->bdev)) {
++ zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
++ zone_info->nr_zones);
++ if (!zone_info->zone_cache) {
++ btrfs_err_in_rcu(device->fs_info,
++ "zoned: failed to allocate zone cache for %s",
++ rcu_str_deref(device->name));
++ ret = -ENOMEM;
++ goto out;
++ }
++ }
++
+ /* Get zones type */
+ nactive = 0;
+ while (sector < nr_sectors) {
+@@ -505,8 +560,6 @@ int btrfs_get_dev_zone_info(struct btrfs
+
+ kfree(zones);
+
+- device->zone_info = zone_info;
+-
+ switch (bdev_zoned_model(bdev)) {
+ case BLK_ZONED_HM:
+ model = "host-managed zoned";
+@@ -539,11 +592,7 @@ int btrfs_get_dev_zone_info(struct btrfs
+ out:
+ kfree(zones);
+ out_free_zone_info:
+- bitmap_free(zone_info->active_zones);
+- bitmap_free(zone_info->empty_zones);
+- bitmap_free(zone_info->seq_zones);
+- kfree(zone_info);
+- device->zone_info = NULL;
++ btrfs_destroy_dev_zone_info(device);
+
+ return ret;
+ }
+@@ -558,6 +607,7 @@ void btrfs_destroy_dev_zone_info(struct
+ bitmap_free(zone_info->active_zones);
+ bitmap_free(zone_info->seq_zones);
+ bitmap_free(zone_info->empty_zones);
++ vfree(zone_info->zone_cache);
+ kfree(zone_info);
+ device->zone_info = NULL;
+ }
+@@ -1975,3 +2025,21 @@ void btrfs_clear_data_reloc_bg(struct bt
+ fs_info->data_reloc_bg = 0;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ }
++
++void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
++{
++ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
++ struct btrfs_device *device;
++
++ if (!btrfs_is_zoned(fs_info))
++ return;
++
++ mutex_lock(&fs_devices->device_list_mutex);
++ list_for_each_entry(device, &fs_devices->devices, dev_list) {
++ if (device->zone_info) {
++ vfree(device->zone_info->zone_cache);
++ device->zone_info->zone_cache = NULL;
++ }
++ }
++ mutex_unlock(&fs_devices->device_list_mutex);
++}
+--- a/fs/btrfs/zoned.h
++++ b/fs/btrfs/zoned.h
+@@ -28,6 +28,7 @@ struct btrfs_zoned_device_info {
+ unsigned long *seq_zones;
+ unsigned long *empty_zones;
+ unsigned long *active_zones;
++ struct blk_zone *zone_cache;
+ struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
+ };
+
+@@ -35,7 +36,7 @@ struct btrfs_zoned_device_info {
+ int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zone);
+ int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
+-int btrfs_get_dev_zone_info(struct btrfs_device *device);
++int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
+ void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
+ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
+ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
+@@ -76,6 +77,7 @@ bool btrfs_can_activate_zone(struct btrf
+ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
+ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
++void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
+ #else /* CONFIG_BLK_DEV_ZONED */
+ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zone)
+@@ -88,7 +90,8 @@ static inline int btrfs_get_dev_zone_inf
+ return 0;
+ }
+
+-static inline int btrfs_get_dev_zone_info(struct btrfs_device *device)
++static inline int btrfs_get_dev_zone_info(struct btrfs_device *device,
++ bool populate_cache)
+ {
+ return 0;
+ }
+@@ -232,6 +235,7 @@ static inline void btrfs_zone_finish_end
+
+ static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
+
++static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
+ #endif
+
+ static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
--- /dev/null
+From 82187d2ecdfb22ab7ee05f388402a39236d31428 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Wed, 8 Dec 2021 00:35:49 +0900
+Subject: btrfs: zoned: fix chunk allocation condition for zoned allocator
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 82187d2ecdfb22ab7ee05f388402a39236d31428 upstream.
+
+The ZNS specification defines a limit on the number of "active"
+zones. That limit forces us to limit the number of block groups which
+can be used for an allocation at the same time. To avoid exceeding the
+limit, commit a85f05e59bc1 ("btrfs: zoned: avoid chunk allocation if
+active block group has enough space") made us reuse the existing active
+block groups as much as possible when we can't activate any other zone
+without sacrificing an already activated block group.
+
+However, the check is wrong in two ways. First, it checks the
+condition for every raid index (ffe_ctl->index). Even if it reaches
+the condition and "ffe_ctl->max_extent_size >=
+ffe_ctl->min_alloc_size" is met, there can be other block groups
+having enough space to hold ffe_ctl->num_bytes. (Actually, this won't
+happen in the current zoned code as it only supports SINGLE
+profile. But, it can happen once it enables other RAID types.)
+
+Second, it checks the active zone availability depending on the
+raid index. The raid index is just an index for
+space_info->block_groups, so it has nothing to do with chunk allocation.
+
+These mistakes cause a faulty allocation in a certain situation.
+Consider running zoned btrfs on a device whose max_active_zones == 0
+(no limit), and suppose no block group has room to fit
+ffe_ctl->num_bytes but some have room to satisfy
+ffe_ctl->min_alloc_size (i.e. max_extent_size > num_bytes >=
+min_alloc_size).
+
+In this situation, the following occur:
+
+- With SINGLE raid_index, it reaches the chunk allocation checking
+ code
+- The check returns true because we can activate a new zone (no limit)
+- But, before allocating the chunk, it iterates to the next raid index
+ (RAID5)
+- Since there are no RAID5 block groups on zoned mode, it again
+ reaches the check code
+- The check returns false because of btrfs_can_activate_zone()'s "if
+ (raid_index != BTRFS_RAID_SINGLE)" part
+- That results in returning -ENOSPC without allocating a new chunk
+
+As a result, we end up hitting -ENOSPC too early.
+
+Move the check to the right place in the can_allocate_chunk() hook,
+and do the active zone check depending on the allocation flag, not on
+the raid index.
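+
+Condensed from the diff below, the zoned branch of can_allocate_chunk()
+becomes:
+
+  case BTRFS_EXTENT_ALLOC_ZONED:
+      if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
+          !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
+          return false;   /* retry with a smaller size instead */
+      return true;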
+
+CC: stable@vger.kernel.org # 5.16
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent-tree.c | 21 +++++++++------------
+ fs/btrfs/zoned.c | 5 ++---
+ fs/btrfs/zoned.h | 5 ++---
+ 3 files changed, 13 insertions(+), 18 deletions(-)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -3966,6 +3966,15 @@ static bool can_allocate_chunk(struct bt
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ return true;
+ case BTRFS_EXTENT_ALLOC_ZONED:
++ /*
++ * If we have enough free space left in an already
++ * active block group and we can't activate any other
++ * zone now, do not allow allocating a new chunk and
++ * let find_free_extent() retry with a smaller size.
++ */
++ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
++ !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
++ return false;
+ return true;
+ default:
+ BUG();
+@@ -4012,18 +4021,6 @@ static int find_free_extent_update_loop(
+ return 0;
+ }
+
+- if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
+- !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->index)) {
+- /*
+- * If we have enough free space left in an already active block
+- * group and we can't activate any other zone now, retry the
+- * active ones with a smaller allocation size. Returning early
+- * from here will tell btrfs_reserve_extent() to haven the
+- * size.
+- */
+- return -ENOSPC;
+- }
+-
+ if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
+ return 1;
+
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -1934,7 +1934,7 @@ int btrfs_zone_finish(struct btrfs_block
+ return ret;
+ }
+
+-bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index)
++bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
+ {
+ struct btrfs_device *device;
+ bool ret = false;
+@@ -1943,8 +1943,7 @@ bool btrfs_can_activate_zone(struct btrf
+ return true;
+
+ /* Non-single profiles are not supported yet */
+- if (raid_index != BTRFS_RAID_SINGLE)
+- return false;
++ ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0);
+
+ /* Check if there is a device with active zones left */
+ mutex_lock(&fs_devices->device_list_mutex);
+--- a/fs/btrfs/zoned.h
++++ b/fs/btrfs/zoned.h
+@@ -72,8 +72,7 @@ struct btrfs_device *btrfs_zoned_get_dev
+ u64 logical, u64 length);
+ bool btrfs_zone_activate(struct btrfs_block_group *block_group);
+ int btrfs_zone_finish(struct btrfs_block_group *block_group);
+-bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+- int raid_index);
++bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
+ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
+ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
+@@ -225,7 +224,7 @@ static inline int btrfs_zone_finish(stru
+ }
+
+ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
+- int raid_index)
++ u64 flags)
+ {
+ return true;
+ }
--- /dev/null
+From 1ada69f61c88abb75a1038ee457633325658a183 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Wed, 8 Dec 2021 00:35:47 +0900
+Subject: btrfs: zoned: unset dedicated block group on allocation failure
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 1ada69f61c88abb75a1038ee457633325658a183 upstream.
+
+Allocating an extent from a block group can fail for various reasons.
+When an allocation from a dedicated block group (for tree-log or
+relocation data) fails, we need to unregister it as a dedicated one so
+that we can allocate a new block group for the dedicated one.
+
+However, we return early in case the block group is read-only, fully
+used, or unable to activate its zone. As a result, we keep the unusable
+block group registered as a dedicated one, leading to further
+allocation failures. With many block groups, the allocator iterates a
+hopeless loop looking for a free extent, which results in a hung task.
+
+Fix the issue by delaying the return and doing the proper cleanups.
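+
+Condensed from the diff below, the failure paths now record the error
+and fall through to the common cleanup instead of returning directly
+(sketch; the cleanup done at the "out" label is elided):
+
+  if (block_group->ro ||
+      block_group->alloc_offset == block_group->zone_capacity)
+      ret = 1;            /* remember the failure, clean up later */
+  /* ... */
+  if (!ret && !btrfs_zone_activate(block_group))
+      ret = 1;
+  /* take the treelog/relocation_bg locks, then: */
+  if (ret)
+      goto out;           /* "out" unsets the dedicated block group */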
+
+CC: stable@vger.kernel.org # 5.16
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent-tree.c | 20 ++++++++++++++++----
+ 1 file changed, 16 insertions(+), 4 deletions(-)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -3790,23 +3790,35 @@ static int do_allocation_zoned(struct bt
+ spin_unlock(&fs_info->relocation_bg_lock);
+ if (skip)
+ return 1;
++
+ /* Check RO and no space case before trying to activate it */
+ spin_lock(&block_group->lock);
+ if (block_group->ro ||
+ block_group->alloc_offset == block_group->zone_capacity) {
+- spin_unlock(&block_group->lock);
+- return 1;
++ ret = 1;
++ /*
++ * May need to clear fs_info->{treelog,data_reloc}_bg.
++ * Return the error after taking the locks.
++ */
+ }
+ spin_unlock(&block_group->lock);
+
+- if (!btrfs_zone_activate(block_group))
+- return 1;
++ if (!ret && !btrfs_zone_activate(block_group)) {
++ ret = 1;
++ /*
++ * May need to clear fs_info->{treelog,data_reloc}_bg.
++ * Return the error after taking the locks.
++ */
++ }
+
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ spin_lock(&fs_info->treelog_bg_lock);
+ spin_lock(&fs_info->relocation_bg_lock);
+
++ if (ret)
++ goto out;
++
+ ASSERT(!ffe_ctl->for_treelog ||
+ block_group->start == fs_info->treelog_bg ||
+ fs_info->treelog_bg == 0);
--- /dev/null
+From 8c80fb312d7abf8bcd66cca1d843a80318a2c522 Mon Sep 17 00:00:00 2001
+From: Chunguang Xu <brookxu@tencent.com>
+Date: Tue, 23 Nov 2021 09:17:57 +0800
+Subject: ext4: fix a possible ABBA deadlock due to busy PA
+
+From: Chunguang Xu <brookxu@tencent.com>
+
+commit 8c80fb312d7abf8bcd66cca1d843a80318a2c522 upstream.
+
+We found on older kernel (3.10) that in the scenario of insufficient
+disk space, system may trigger an ABBA deadlock problem, it seems that
+this problem still exists in latest kernel, try to fix it here. The
+main process triggered by this problem is that task A occupies the PA
+and waits for the jbd2 transaction finish, the jbd2 transaction waits
+for the completion of task B's IO (plug_list), but task B waits for
+the release of PA by task A to finish discard, which indirectly forms
+an ABBA deadlock. The related calltrace is as follows:
+
+ Task A
+ vfs_write
+  ext4_mb_new_blocks()
+   ext4_mb_mark_diskspace_used()             JBD2
+    jbd2_journal_get_write_access()    ->    jbd2_journal_commit_transaction()
+     ->schedule()                             filemap_fdatawait()
+     |                                         |
+     | Task B                                  |
+     | do_unlinkat()                           |
+     |  ext4_evict_inode()                     |
+     |   jbd2_journal_begin_ordered_truncate() |
+     |    filemap_fdatawrite_range()           |
+     |     ext4_mb_new_blocks()                |
+      -ext4_mb_discard_group_preallocations() <-----
+
+Here, we cancel the internal retry in
+ext4_mb_discard_group_preallocations() when a PA is busy, and instead
+do a limited number of retries inside ext4_mb_discard_preallocations().
+This circumvents the above problem and also has some advantages:
+
+1. Since the PA is in a busy state, if other groups have free PAs,
+ keeping the current PA may help to reduce fragmentation.
+2. Continue to traverse forward instead of waiting for the current
+ group PA to be released. In most scenarios, the PA discard time
+ can be reduced.
+
+However, when free space is small and only a few groups have space,
+the repeated traversals of the groups may increase CPU overhead. But
+in contrast, I feel that the overall benefit is better than the cost.
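+
+Condensed from the diff below, the retry now lives in
+ext4_mb_discard_preallocations() and is bounded:
+
+  repeat:
+      for (i = 0; i < ngroups && needed > 0; i++) {
+          ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
+          freed += ret;
+          needed -= ret;
+          cond_resched();
+      }
+      if (needed > 0 && busy && ++retry < 3) {
+          busy = 0;
+          goto repeat;
+      }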
+
+Signed-off-by: Chunguang Xu <brookxu@tencent.com>
+Reported-by: kernel test robot <lkp@intel.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Link: https://lore.kernel.org/r/1637630277-23496-1-git-send-email-brookxu.cn@gmail.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Cc: stable@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/mballoc.c | 40 ++++++++++++++++++----------------------
+ 1 file changed, 18 insertions(+), 22 deletions(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -4814,7 +4814,7 @@ ext4_mb_release_group_pa(struct ext4_bud
+ */
+ static noinline_for_stack int
+ ext4_mb_discard_group_preallocations(struct super_block *sb,
+- ext4_group_t group, int needed)
++ ext4_group_t group, int *busy)
+ {
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+ struct buffer_head *bitmap_bh = NULL;
+@@ -4822,8 +4822,7 @@ ext4_mb_discard_group_preallocations(str
+ struct list_head list;
+ struct ext4_buddy e4b;
+ int err;
+- int busy = 0;
+- int free, free_total = 0;
++ int free = 0;
+
+ mb_debug(sb, "discard preallocation for group %u\n", group);
+ if (list_empty(&grp->bb_prealloc_list))
+@@ -4846,19 +4845,14 @@ ext4_mb_discard_group_preallocations(str
+ goto out_dbg;
+ }
+
+- if (needed == 0)
+- needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
+-
+ INIT_LIST_HEAD(&list);
+-repeat:
+- free = 0;
+ ext4_lock_group(sb, group);
+ list_for_each_entry_safe(pa, tmp,
+ &grp->bb_prealloc_list, pa_group_list) {
+ spin_lock(&pa->pa_lock);
+ if (atomic_read(&pa->pa_count)) {
+ spin_unlock(&pa->pa_lock);
+- busy = 1;
++ *busy = 1;
+ continue;
+ }
+ if (pa->pa_deleted) {
+@@ -4898,22 +4892,13 @@ repeat:
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ }
+
+- free_total += free;
+-
+- /* if we still need more blocks and some PAs were used, try again */
+- if (free_total < needed && busy) {
+- ext4_unlock_group(sb, group);
+- cond_resched();
+- busy = 0;
+- goto repeat;
+- }
+ ext4_unlock_group(sb, group);
+ ext4_mb_unload_buddy(&e4b);
+ put_bh(bitmap_bh);
+ out_dbg:
+ mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
+- free_total, group, grp->bb_free);
+- return free_total;
++ free, group, grp->bb_free);
++ return free;
+ }
+
+ /*
+@@ -5455,13 +5440,24 @@ static int ext4_mb_discard_preallocation
+ {
+ ext4_group_t i, ngroups = ext4_get_groups_count(sb);
+ int ret;
+- int freed = 0;
++ int freed = 0, busy = 0;
++ int retry = 0;
+
+ trace_ext4_mb_discard_preallocations(sb, needed);
++
++ if (needed == 0)
++ needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
++ repeat:
+ for (i = 0; i < ngroups && needed > 0; i++) {
+- ret = ext4_mb_discard_group_preallocations(sb, i, needed);
++ ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
+ freed += ret;
+ needed -= ret;
++ cond_resched();
++ }
++
++ if (needed > 0 && busy && ++retry < 3) {
++ busy = 0;
++ goto repeat;
+ }
+
+ return freed;
--- /dev/null
+From 5e4d0eba1ccaf19f93222abdeda5a368be141785 Mon Sep 17 00:00:00 2001
+From: Xin Yin <yinxin.x@bytedance.com>
+Date: Tue, 21 Dec 2021 10:28:39 +0800
+Subject: ext4: fix fast commit may miss tracking range for FALLOC_FL_ZERO_RANGE
+
+From: Xin Yin <yinxin.x@bytedance.com>
+
+commit 5e4d0eba1ccaf19f93222abdeda5a368be141785 upstream.
+
+When fallocate is called with FALLOC_FL_ZERO_RANGE to set an already
+initialized range to unwritten, and the range is aligned to the block
+size, fast commit will not track the range for this change.
+
+Also track the range for unwritten extents in ext4_map_blocks().
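+
+Condensed from the diff below, the tracking in ext4_map_blocks() now
+keys off the map flags:
+
+  if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
+                     map->m_flags & EXT4_MAP_MAPPED))
+      ext4_fc_track_range(handle, inode, map->m_lblk,
+                          map->m_lblk + map->m_len - 1);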
+
+Signed-off-by: Xin Yin <yinxin.x@bytedance.com>
+Reviewed-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+Link: https://lore.kernel.org/r/20211221022839.374606-1-yinxin.x@bytedance.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Cc: stable@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/extents.c | 2 --
+ fs/ext4/inode.c | 7 ++++---
+ 2 files changed, 4 insertions(+), 5 deletions(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -4647,8 +4647,6 @@ static long ext4_zero_range(struct file
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret))
+ goto out_handle;
+- ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits,
+- (offset + len - 1) >> inode->i_sb->s_blocksize_bits);
+ /* Zero out partial block at the edges of the range */
+ ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+ if (ret >= 0)
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -741,10 +741,11 @@ out_sem:
+ if (ret)
+ return ret;
+ }
+- ext4_fc_track_range(handle, inode, map->m_lblk,
+- map->m_lblk + map->m_len - 1);
+ }
+-
++ if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
++ map->m_flags & EXT4_MAP_MAPPED))
++ ext4_fc_track_range(handle, inode, map->m_lblk,
++ map->m_lblk + map->m_len - 1);
+ if (retval < 0)
+ ext_debug(inode, "failed with err %d\n", retval);
+ return retval;
--- /dev/null
+From c27c29c6af4f3f4ce925a2111c256733c5a5b430 Mon Sep 17 00:00:00 2001
+From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+Date: Wed, 1 Dec 2021 08:34:21 -0800
+Subject: ext4: initialize err_blk before calling __ext4_get_inode_loc
+
+From: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+
+commit c27c29c6af4f3f4ce925a2111c256733c5a5b430 upstream.
+
+It is not guaranteed that __ext4_get_inode_loc() will set the err_blk
+pointer when it returns EIO. To avoid using an uninitialized variable,
+let's first set err_blk to 0.
+
+Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
+Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
+Link: https://lore.kernel.org/r/20211201163421.2631661-1-harshads@google.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Cc: stable@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/inode.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -4523,7 +4523,7 @@ has_buffer:
+ static int __ext4_get_inode_loc_noinmem(struct inode *inode,
+ struct ext4_iloc *iloc)
+ {
+- ext4_fsblk_t err_blk;
++ ext4_fsblk_t err_blk = 0;
+ int ret;
+
+ ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc,
+@@ -4538,7 +4538,7 @@ static int __ext4_get_inode_loc_noinmem(
+
+ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
+ {
+- ext4_fsblk_t err_blk;
++ ext4_fsblk_t err_blk = 0;
+ int ret;
+
+ ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc,
--- /dev/null
+From 15fc69bbbbbc8c72e5f6cc4e1be0f51283c5448e Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Thu, 7 Oct 2021 17:53:35 +0200
+Subject: ext4: make sure quota gets properly shutdown on error
+
+From: Jan Kara <jack@suse.cz>
+
+commit 15fc69bbbbbc8c72e5f6cc4e1be0f51283c5448e upstream.
+
+When we hit an error while enabling quotas and setting inode flags, we
+do not properly shut down the quota subsystem despite returning an
+error from the Q_QUOTAON quotactl. This can lead to odd situations like
+the kernel using a quota file while it is still writable by userspace.
+Make sure we properly clean up the quota subsystem in case of error.
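+
+Condensed from the diff below, the error handling in ext4_quota_on()
+becomes (sketch; the inode flag update in the middle is elided):
+
+  err = dquot_quota_on(sb, type, format_id, path);
+  if (!err) {
+      /* ... set the journaled quota inode flags, may set err ... */
+      if (err)
+          dquot_quota_off(sb, type);
+  }
+  if (err)
+      lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_NORMAL);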
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Cc: stable@kernel.org
+Link: https://lore.kernel.org/r/20211007155336.12493-2-jack@suse.cz
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/super.c | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -6275,10 +6275,7 @@ static int ext4_quota_on(struct super_bl
+
+ lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
+ err = dquot_quota_on(sb, type, format_id, path);
+- if (err) {
+- lockdep_set_quota_inode(path->dentry->d_inode,
+- I_DATA_SEM_NORMAL);
+- } else {
++ if (!err) {
+ struct inode *inode = d_inode(path->dentry);
+ handle_t *handle;
+
+@@ -6298,7 +6295,12 @@ static int ext4_quota_on(struct super_bl
+ ext4_journal_stop(handle);
+ unlock_inode:
+ inode_unlock(inode);
++ if (err)
++ dquot_quota_off(sb, type);
+ }
++ if (err)
++ lockdep_set_quota_inode(path->dentry->d_inode,
++ I_DATA_SEM_NORMAL);
+ return err;
+ }
+
--- /dev/null
+From 4013d47a5307fdb5c13370b5392498b00fedd274 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Thu, 7 Oct 2021 17:53:36 +0200
+Subject: ext4: make sure to reset inode lockdep class when quota enabling fails
+
+From: Jan Kara <jack@suse.cz>
+
+commit 4013d47a5307fdb5c13370b5392498b00fedd274 upstream.
+
+When we succeed in enabling some quota type but fail to enable another
+one with the quota feature, we correctly disable all enabled quota
+types. However, we forget to reset the i_data_sem lockdep class. When
+the inode gets freed and reused, it will inherit this lockdep class
+(i_data_sem is initialized only when a slab is created) and thus
+lockdep eventually barfs about possible deadlocks.
+
+Reported-and-tested-by: syzbot+3b6f9218b1301ddda3e2@syzkaller.appspotmail.com
+Signed-off-by: Jan Kara <jack@suse.cz>
+Cc: stable@kernel.org
+Link: https://lore.kernel.org/r/20211007155336.12493-3-jack@suse.cz
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/super.c | 13 ++++++++++++-
+ 1 file changed, 12 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -6361,8 +6361,19 @@ int ext4_enable_quotas(struct super_bloc
+ "Failed to enable quota tracking "
+ "(type=%d, err=%d). Please run "
+ "e2fsck to fix.", type, err);
+- for (type--; type >= 0; type--)
++ for (type--; type >= 0; type--) {
++ struct inode *inode;
++
++ inode = sb_dqopt(sb)->files[type];
++ if (inode)
++ inode = igrab(inode);
+ dquot_quota_off(sb, type);
++ if (inode) {
++ lockdep_set_quota_inode(inode,
++ I_DATA_SEM_NORMAL);
++ iput(inode);
++ }
++ }
+
+ return err;
+ }
pci-pci-bridge-emul-set-pci_status_cap_list-for-pcie-device.patch
xfrm-fix-policy-lookup-for-ipv6-gre-packets.patch
xfrm-fix-dflt-policy-check-when-there-is-no-policy-configured.patch
+btrfs-fix-deadlock-between-quota-enable-and-other-quota-operations.patch
+btrfs-zoned-cache-reported-zone-during-mount.patch
+btrfs-check-the-root-node-for-uptodate-before-returning-it.patch
+btrfs-add-extent-allocator-hook-to-decide-to-allocate-chunk-or-not.patch
+btrfs-zoned-unset-dedicated-block-group-on-allocation-failure.patch
+btrfs-zoned-fix-chunk-allocation-condition-for-zoned-allocator.patch
+btrfs-respect-the-max-size-in-the-header-when-activating-swap-file.patch
+ext4-make-sure-to-reset-inode-lockdep-class-when-quota-enabling-fails.patch
+ext4-make-sure-quota-gets-properly-shutdown-on-error.patch
+ext4-fix-a-possible-abba-deadlock-due-to-busy-pa.patch
+ext4-initialize-err_blk-before-calling-__ext4_get_inode_loc.patch
+ext4-fix-fast-commit-may-miss-tracking-range-for-falloc_fl_zero_range.patch