From: Greg Kroah-Hartman Date: Sun, 23 Jan 2022 17:18:27 +0000 (+0100) Subject: 5.16-stable patches X-Git-Tag: v4.4.300~107 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c92e3b62e9d4f80fb4790ebde27822fa6365d392;p=thirdparty%2Fkernel%2Fstable-queue.git 5.16-stable patches added patches: btrfs-add-extent-allocator-hook-to-decide-to-allocate-chunk-or-not.patch btrfs-check-the-root-node-for-uptodate-before-returning-it.patch btrfs-fix-deadlock-between-quota-enable-and-other-quota-operations.patch btrfs-respect-the-max-size-in-the-header-when-activating-swap-file.patch btrfs-zoned-cache-reported-zone-during-mount.patch btrfs-zoned-fix-chunk-allocation-condition-for-zoned-allocator.patch btrfs-zoned-unset-dedicated-block-group-on-allocation-failure.patch ext4-fix-a-possible-abba-deadlock-due-to-busy-pa.patch ext4-fix-fast-commit-may-miss-tracking-range-for-falloc_fl_zero_range.patch ext4-initialize-err_blk-before-calling-__ext4_get_inode_loc.patch ext4-make-sure-quota-gets-properly-shutdown-on-error.patch ext4-make-sure-to-reset-inode-lockdep-class-when-quota-enabling-fails.patch --- diff --git a/queue-5.16/btrfs-add-extent-allocator-hook-to-decide-to-allocate-chunk-or-not.patch b/queue-5.16/btrfs-add-extent-allocator-hook-to-decide-to-allocate-chunk-or-not.patch new file mode 100644 index 00000000000..1d245070255 --- /dev/null +++ b/queue-5.16/btrfs-add-extent-allocator-hook-to-decide-to-allocate-chunk-or-not.patch @@ -0,0 +1,59 @@ +From 50475cd57706359d6cc652be88369dace7a4c2eb Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Wed, 8 Dec 2021 00:35:48 +0900 +Subject: btrfs: add extent allocator hook to decide to allocate chunk or not + +From: Naohiro Aota + +commit 50475cd57706359d6cc652be88369dace7a4c2eb upstream. + +Introduce a new hook for an extent allocator policy. With the new +hook, a policy can decide to allocate a new block group or not. If +not, it will return -ENOSPC, so btrfs_reserve_extent() will cut the +allocation size in half and retry the allocation if min_alloc_size is +large enough. + +The hook has a place holder and will be replaced with the real +implementation in the next patch. + +CC: stable@vger.kernel.org # 5.16 +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent-tree.c | 17 +++++++++++++++++ + 1 file changed, 17 insertions(+) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -3947,6 +3947,19 @@ static void found_extent(struct find_fre + } + } + ++static bool can_allocate_chunk(struct btrfs_fs_info *fs_info, ++ struct find_free_extent_ctl *ffe_ctl) ++{ ++ switch (ffe_ctl->policy) { ++ case BTRFS_EXTENT_ALLOC_CLUSTERED: ++ return true; ++ case BTRFS_EXTENT_ALLOC_ZONED: ++ return true; ++ default: ++ BUG(); ++ } ++} ++ + static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl) + { + switch (ffe_ctl->policy) { +@@ -4034,6 +4047,10 @@ static int find_free_extent_update_loop( + struct btrfs_trans_handle *trans; + int exist = 0; + ++ /*Check if allocation policy allows to create a new chunk */ ++ if (!can_allocate_chunk(fs_info, ffe_ctl)) ++ return -ENOSPC; ++ + trans = current->journal_info; + if (trans) + exist = 1; diff --git a/queue-5.16/btrfs-check-the-root-node-for-uptodate-before-returning-it.patch b/queue-5.16/btrfs-check-the-root-node-for-uptodate-before-returning-it.patch new file mode 100644 index 00000000000..cb978f9bc6c --- /dev/null +++ b/queue-5.16/btrfs-check-the-root-node-for-uptodate-before-returning-it.patch @@ -0,0 +1,68 @@ +From 120de408e4b97504a2d9b5ca534b383de2c73d49 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 24 Nov 2021 14:14:24 -0500 +Subject: btrfs: check the root node for uptodate before returning it + +From: Josef Bacik + +commit 120de408e4b97504a2d9b5ca534b383de2c73d49 upstream. + +Now that we clear the extent buffer uptodate if we fail to write it out +we need to check to see if our root node is uptodate before we search +down it. Otherwise we could return stale data (or potentially corrupt +data that was caught by the write verification step) and think that the +path is OK to search down. + +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Nikolay Borisov +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/ctree.c | 19 +++++++++++++++---- + 1 file changed, 15 insertions(+), 4 deletions(-) + +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -1570,12 +1570,9 @@ static struct extent_buffer *btrfs_searc + { + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *b; +- int root_lock; ++ int root_lock = 0; + int level = 0; + +- /* We try very hard to do read locks on the root */ +- root_lock = BTRFS_READ_LOCK; +- + if (p->search_commit_root) { + /* + * The commit roots are read only so we always do read locks, +@@ -1613,6 +1610,9 @@ static struct extent_buffer *btrfs_searc + goto out; + } + ++ /* We try very hard to do read locks on the root */ ++ root_lock = BTRFS_READ_LOCK; ++ + /* + * If the level is set to maximum, we can skip trying to get the read + * lock. +@@ -1639,6 +1639,17 @@ static struct extent_buffer *btrfs_searc + level = btrfs_header_level(b); + + out: ++ /* ++ * The root may have failed to write out at some point, and thus is no ++ * longer valid, return an error in this case. ++ */ ++ if (!extent_buffer_uptodate(b)) { ++ if (root_lock) ++ btrfs_tree_unlock_rw(b, root_lock); ++ free_extent_buffer(b); ++ return ERR_PTR(-EIO); ++ } ++ + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = root_lock; diff --git a/queue-5.16/btrfs-fix-deadlock-between-quota-enable-and-other-quota-operations.patch b/queue-5.16/btrfs-fix-deadlock-between-quota-enable-and-other-quota-operations.patch new file mode 100644 index 00000000000..ac9b5cfff84 --- /dev/null +++ b/queue-5.16/btrfs-fix-deadlock-between-quota-enable-and-other-quota-operations.patch @@ -0,0 +1,128 @@ +From 232796df8c1437c41d308d161007f0715bac0a54 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 27 Oct 2021 18:30:25 +0100 +Subject: btrfs: fix deadlock between quota enable and other quota operations + +From: Filipe Manana + +commit 232796df8c1437c41d308d161007f0715bac0a54 upstream. + +When enabling quotas, we attempt to commit a transaction while holding the +mutex fs_info->qgroup_ioctl_lock. This can result on a deadlock with other +quota operations such as: + +- qgroup creation and deletion, ioctl BTRFS_IOC_QGROUP_CREATE; + +- adding and removing qgroup relations, ioctl BTRFS_IOC_QGROUP_ASSIGN. + +This is because these operations join a transaction and after that they +attempt to lock the mutex fs_info->qgroup_ioctl_lock. Acquiring that mutex +after joining or starting a transaction is a pattern followed everywhere +in qgroups, so the quota enablement operation is the one at fault here, +and should not commit a transaction while holding that mutex. + +Fix this by making the transaction commit while not holding the mutex. +We are safe from two concurrent tasks trying to enable quotas because +we are serialized by the rw semaphore fs_info->subvol_sem at +btrfs_ioctl_quota_ctl(), which is the only call site for enabling +quotas. + +When this deadlock happens, it produces a trace like the following: + + INFO: task syz-executor:25604 blocked for more than 143 seconds. + Not tainted 5.15.0-rc6 #4 + "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. + task:syz-executor state:D stack:24800 pid:25604 ppid: 24873 flags:0x00004004 + Call Trace: + context_switch kernel/sched/core.c:4940 [inline] + __schedule+0xcd9/0x2530 kernel/sched/core.c:6287 + schedule+0xd3/0x270 kernel/sched/core.c:6366 + btrfs_commit_transaction+0x994/0x2e90 fs/btrfs/transaction.c:2201 + btrfs_quota_enable+0x95c/0x1790 fs/btrfs/qgroup.c:1120 + btrfs_ioctl_quota_ctl fs/btrfs/ioctl.c:4229 [inline] + btrfs_ioctl+0x637e/0x7b70 fs/btrfs/ioctl.c:5010 + vfs_ioctl fs/ioctl.c:51 [inline] + __do_sys_ioctl fs/ioctl.c:874 [inline] + __se_sys_ioctl fs/ioctl.c:860 [inline] + __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x44/0xae + RIP: 0033:0x7f86920b2c4d + RSP: 002b:00007f868f61ac58 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 + RAX: ffffffffffffffda RBX: 00007f86921d90a0 RCX: 00007f86920b2c4d + RDX: 0000000020005e40 RSI: 00000000c0109428 RDI: 0000000000000008 + RBP: 00007f869212bd80 R08: 0000000000000000 R09: 0000000000000000 + R10: 0000000000000000 R11: 0000000000000246 R12: 00007f86921d90a0 + R13: 00007fff6d233e4f R14: 00007fff6d233ff0 R15: 00007f868f61adc0 + INFO: task syz-executor:25628 blocked for more than 143 seconds. + Not tainted 5.15.0-rc6 #4 + "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. + task:syz-executor state:D stack:29080 pid:25628 ppid: 24873 flags:0x00004004 + Call Trace: + context_switch kernel/sched/core.c:4940 [inline] + __schedule+0xcd9/0x2530 kernel/sched/core.c:6287 + schedule+0xd3/0x270 kernel/sched/core.c:6366 + schedule_preempt_disabled+0xf/0x20 kernel/sched/core.c:6425 + __mutex_lock_common kernel/locking/mutex.c:669 [inline] + __mutex_lock+0xc96/0x1680 kernel/locking/mutex.c:729 + btrfs_remove_qgroup+0xb7/0x7d0 fs/btrfs/qgroup.c:1548 + btrfs_ioctl_qgroup_create fs/btrfs/ioctl.c:4333 [inline] + btrfs_ioctl+0x683c/0x7b70 fs/btrfs/ioctl.c:5014 + vfs_ioctl fs/ioctl.c:51 [inline] + __do_sys_ioctl fs/ioctl.c:874 [inline] + __se_sys_ioctl fs/ioctl.c:860 [inline] + __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:860 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x44/0xae + +Reported-by: Hao Sun +Link: https://lore.kernel.org/linux-btrfs/CACkBjsZQF19bQ1C6=yetF3BvL10OSORpFUcWXTP6HErshDB4dQ@mail.gmail.com/ +Fixes: 340f1aa27f36 ("btrfs: qgroups: Move transaction management inside btrfs_quota_enable/disable") +CC: stable@vger.kernel.org # 4.19 +Reviewed-by: Qu Wenruo +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/qgroup.c | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -940,6 +940,14 @@ int btrfs_quota_enable(struct btrfs_fs_i + int ret = 0; + int slot; + ++ /* ++ * We need to have subvol_sem write locked, to prevent races between ++ * concurrent tasks trying to enable quotas, because we will unlock ++ * and relock qgroup_ioctl_lock before setting fs_info->quota_root ++ * and before setting BTRFS_FS_QUOTA_ENABLED. ++ */ ++ lockdep_assert_held_write(&fs_info->subvol_sem); ++ + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (fs_info->quota_root) + goto out; +@@ -1117,8 +1125,19 @@ out_add_root: + goto out_free_path; + } + ++ mutex_unlock(&fs_info->qgroup_ioctl_lock); ++ /* ++ * Commit the transaction while not holding qgroup_ioctl_lock, to avoid ++ * a deadlock with tasks concurrently doing other qgroup operations, such ++ * adding/removing qgroups or adding/deleting qgroup relations for example, ++ * because all qgroup operations first start or join a transaction and then ++ * lock the qgroup_ioctl_lock mutex. ++ * We are safe from a concurrent task trying to enable quotas, by calling ++ * this function, since we are serialized by fs_info->subvol_sem. ++ */ + ret = btrfs_commit_transaction(trans); + trans = NULL; ++ mutex_lock(&fs_info->qgroup_ioctl_lock); + if (ret) + goto out_free_path; + diff --git a/queue-5.16/btrfs-respect-the-max-size-in-the-header-when-activating-swap-file.patch b/queue-5.16/btrfs-respect-the-max-size-in-the-header-when-activating-swap-file.patch new file mode 100644 index 00000000000..8d468e83810 --- /dev/null +++ b/queue-5.16/btrfs-respect-the-max-size-in-the-header-when-activating-swap-file.patch @@ -0,0 +1,65 @@ +From c2f822635df873c510bda6fb7fd1b10b7c31be2d Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 16 Dec 2021 15:00:32 +0000 +Subject: btrfs: respect the max size in the header when activating swap file + +From: Filipe Manana + +commit c2f822635df873c510bda6fb7fd1b10b7c31be2d upstream. + +If we extended the size of a swapfile after its header was created (by the +mkswap utility) and then try to activate it, we will map the entire file +when activating the swap file, instead of limiting to the max size defined +in the swap file's header. + +Currently test case generic/643 from fstests fails because we do not +respect that size limit defined in the swap file's header. + +So fix this by not mapping file ranges beyond the max size defined in the +swap header. + +This is the same type of bug that iomap used to have, and was fixed in +commit 36ca7943ac18ae ("mm/swap: consider max pages in +iomap_swapfile_add_extent"). + +Fixes: ed46ff3d423780 ("Btrfs: support swap files") +CC: stable@vger.kernel.org # 5.4+ +Reviewed-and-tested-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/inode.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -10595,9 +10595,19 @@ static int btrfs_add_swap_extent(struct + struct btrfs_swap_info *bsi) + { + unsigned long nr_pages; ++ unsigned long max_pages; + u64 first_ppage, first_ppage_reported, next_ppage; + int ret; + ++ /* ++ * Our swapfile may have had its size extended after the swap header was ++ * written. In that case activating the swapfile should not go beyond ++ * the max size set in the swap header. ++ */ ++ if (bsi->nr_pages >= sis->max) ++ return 0; ++ ++ max_pages = sis->max - bsi->nr_pages; + first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT; + next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len, + PAGE_SIZE) >> PAGE_SHIFT; +@@ -10605,6 +10615,7 @@ static int btrfs_add_swap_extent(struct + if (first_ppage >= next_ppage) + return 0; + nr_pages = next_ppage - first_ppage; ++ nr_pages = min(nr_pages, max_pages); + + first_ppage_reported = first_ppage; + if (bsi->start == 0) diff --git a/queue-5.16/btrfs-zoned-cache-reported-zone-during-mount.patch b/queue-5.16/btrfs-zoned-cache-reported-zone-during-mount.patch new file mode 100644 index 00000000000..c3fdb57ff85 --- /dev/null +++ b/queue-5.16/btrfs-zoned-cache-reported-zone-during-mount.patch @@ -0,0 +1,291 @@ +From 16beac87e95e2fb278b552397c8260637f8a63f7 Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Thu, 11 Nov 2021 14:14:38 +0900 +Subject: btrfs: zoned: cache reported zone during mount + +From: Naohiro Aota + +commit 16beac87e95e2fb278b552397c8260637f8a63f7 upstream. + +When mounting a device, we are reporting the zones twice: once for +checking the zone attributes in btrfs_get_dev_zone_info and once for +loading block groups' zone info in +btrfs_load_block_group_zone_info(). With a lot of block groups, that +leads to a lot of REPORT ZONE commands and slows down the mount +process. + +This patch introduces a zone info cache in struct +btrfs_zoned_device_info. The cache is populated while in +btrfs_get_dev_zone_info() and used for +btrfs_load_block_group_zone_info() to reduce the number of REPORT ZONE +commands. The zone cache is then released after loading the block +groups, as it will not be much effective during the run time. + +Benchmark: Mount an HDD with 57,007 block groups +Before patch: 171.368 seconds +After patch: 64.064 seconds + +While it still takes a minute due to the slowness of loading all the +block groups, the patch reduces the mount time by 1/3. + +Link: https://lore.kernel.org/linux-btrfs/CAHQ7scUiLtcTqZOMMY5kbWUBOhGRwKo6J6wYPT5WY+C=cD49nQ@mail.gmail.com/ +Fixes: 5b316468983d ("btrfs: get zone information of zoned block devices") +CC: stable@vger.kernel.org +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/dev-replace.c | 2 - + fs/btrfs/disk-io.c | 2 + + fs/btrfs/volumes.c | 2 - + fs/btrfs/zoned.c | 86 +++++++++++++++++++++++++++++++++++++++++++------ + fs/btrfs/zoned.h | 8 +++- + 5 files changed, 87 insertions(+), 13 deletions(-) + +--- a/fs/btrfs/dev-replace.c ++++ b/fs/btrfs/dev-replace.c +@@ -322,7 +322,7 @@ static int btrfs_init_dev_replace_tgtdev + set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); + device->fs_devices = fs_info->fs_devices; + +- ret = btrfs_get_dev_zone_info(device); ++ ret = btrfs_get_dev_zone_info(device, false); + if (ret) + goto error; + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -3571,6 +3571,8 @@ int __cold open_ctree(struct super_block + goto fail_sysfs; + } + ++ btrfs_free_zone_cache(fs_info); ++ + if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && + !btrfs_check_rw_degradable(fs_info, NULL)) { + btrfs_warn(fs_info, +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -2643,7 +2643,7 @@ int btrfs_init_new_device(struct btrfs_f + device->fs_info = fs_info; + device->bdev = bdev; + +- ret = btrfs_get_dev_zone_info(device); ++ ret = btrfs_get_dev_zone_info(device, false); + if (ret) + goto error_free_device; + +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + #include "ctree.h" + #include "volumes.h" + #include "zoned.h" +@@ -213,6 +214,8 @@ static int emulate_report_zones(struct b + static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, + struct blk_zone *zones, unsigned int *nr_zones) + { ++ struct btrfs_zoned_device_info *zinfo = device->zone_info; ++ u32 zno; + int ret; + + if (!*nr_zones) +@@ -224,6 +227,34 @@ static int btrfs_get_dev_zones(struct bt + return 0; + } + ++ /* Check cache */ ++ if (zinfo->zone_cache) { ++ unsigned int i; ++ ++ ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); ++ zno = pos >> zinfo->zone_size_shift; ++ /* ++ * We cannot report zones beyond the zone end. So, it is OK to ++ * cap *nr_zones to at the end. ++ */ ++ *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno); ++ ++ for (i = 0; i < *nr_zones; i++) { ++ struct blk_zone *zone_info; ++ ++ zone_info = &zinfo->zone_cache[zno + i]; ++ if (!zone_info->len) ++ break; ++ } ++ ++ if (i == *nr_zones) { ++ /* Cache hit on all the zones */ ++ memcpy(zones, zinfo->zone_cache + zno, ++ sizeof(*zinfo->zone_cache) * *nr_zones); ++ return 0; ++ } ++ } ++ + ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, + copy_zone_info_cb, zones); + if (ret < 0) { +@@ -237,6 +268,11 @@ static int btrfs_get_dev_zones(struct bt + if (!ret) + return -EIO; + ++ /* Populate cache */ ++ if (zinfo->zone_cache) ++ memcpy(zinfo->zone_cache + zno, zones, ++ sizeof(*zinfo->zone_cache) * *nr_zones); ++ + return 0; + } + +@@ -300,7 +336,7 @@ int btrfs_get_dev_zone_info_all_devices( + if (!device->bdev) + continue; + +- ret = btrfs_get_dev_zone_info(device); ++ ret = btrfs_get_dev_zone_info(device, true); + if (ret) + break; + } +@@ -309,7 +345,7 @@ int btrfs_get_dev_zone_info_all_devices( + return ret; + } + +-int btrfs_get_dev_zone_info(struct btrfs_device *device) ++int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) + { + struct btrfs_fs_info *fs_info = device->fs_info; + struct btrfs_zoned_device_info *zone_info = NULL; +@@ -339,6 +375,8 @@ int btrfs_get_dev_zone_info(struct btrfs + if (!zone_info) + return -ENOMEM; + ++ device->zone_info = zone_info; ++ + if (!bdev_is_zoned(bdev)) { + if (!fs_info->zone_size) { + ret = calculate_emulated_zone_size(fs_info); +@@ -407,6 +445,23 @@ int btrfs_get_dev_zone_info(struct btrfs + goto out; + } + ++ /* ++ * Enable zone cache only for a zoned device. On a non-zoned device, we ++ * fill the zone info with emulated CONVENTIONAL zones, so no need to ++ * use the cache. ++ */ ++ if (populate_cache && bdev_is_zoned(device->bdev)) { ++ zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) * ++ zone_info->nr_zones); ++ if (!zone_info->zone_cache) { ++ btrfs_err_in_rcu(device->fs_info, ++ "zoned: failed to allocate zone cache for %s", ++ rcu_str_deref(device->name)); ++ ret = -ENOMEM; ++ goto out; ++ } ++ } ++ + /* Get zones type */ + nactive = 0; + while (sector < nr_sectors) { +@@ -505,8 +560,6 @@ int btrfs_get_dev_zone_info(struct btrfs + + kfree(zones); + +- device->zone_info = zone_info; +- + switch (bdev_zoned_model(bdev)) { + case BLK_ZONED_HM: + model = "host-managed zoned"; +@@ -539,11 +592,7 @@ int btrfs_get_dev_zone_info(struct btrfs + out: + kfree(zones); + out_free_zone_info: +- bitmap_free(zone_info->active_zones); +- bitmap_free(zone_info->empty_zones); +- bitmap_free(zone_info->seq_zones); +- kfree(zone_info); +- device->zone_info = NULL; ++ btrfs_destroy_dev_zone_info(device); + + return ret; + } +@@ -558,6 +607,7 @@ void btrfs_destroy_dev_zone_info(struct + bitmap_free(zone_info->active_zones); + bitmap_free(zone_info->seq_zones); + bitmap_free(zone_info->empty_zones); ++ vfree(zone_info->zone_cache); + kfree(zone_info); + device->zone_info = NULL; + } +@@ -1975,3 +2025,21 @@ void btrfs_clear_data_reloc_bg(struct bt + fs_info->data_reloc_bg = 0; + spin_unlock(&fs_info->relocation_bg_lock); + } ++ ++void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) ++{ ++ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; ++ struct btrfs_device *device; ++ ++ if (!btrfs_is_zoned(fs_info)) ++ return; ++ ++ mutex_lock(&fs_devices->device_list_mutex); ++ list_for_each_entry(device, &fs_devices->devices, dev_list) { ++ if (device->zone_info) { ++ vfree(device->zone_info->zone_cache); ++ device->zone_info->zone_cache = NULL; ++ } ++ } ++ mutex_unlock(&fs_devices->device_list_mutex); ++} +--- a/fs/btrfs/zoned.h ++++ b/fs/btrfs/zoned.h +@@ -28,6 +28,7 @@ struct btrfs_zoned_device_info { + unsigned long *seq_zones; + unsigned long *empty_zones; + unsigned long *active_zones; ++ struct blk_zone *zone_cache; + struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX]; + }; + +@@ -35,7 +36,7 @@ struct btrfs_zoned_device_info { + int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, + struct blk_zone *zone); + int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info); +-int btrfs_get_dev_zone_info(struct btrfs_device *device); ++int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache); + void btrfs_destroy_dev_zone_info(struct btrfs_device *device); + int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); + int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info); +@@ -76,6 +77,7 @@ bool btrfs_can_activate_zone(struct btrf + void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); + void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); ++void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); + #else /* CONFIG_BLK_DEV_ZONED */ + static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, + struct blk_zone *zone) +@@ -88,7 +90,8 @@ static inline int btrfs_get_dev_zone_inf + return 0; + } + +-static inline int btrfs_get_dev_zone_info(struct btrfs_device *device) ++static inline int btrfs_get_dev_zone_info(struct btrfs_device *device, ++ bool populate_cache) + { + return 0; + } +@@ -232,6 +235,7 @@ static inline void btrfs_zone_finish_end + + static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } + ++static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } + #endif + + static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/queue-5.16/btrfs-zoned-fix-chunk-allocation-condition-for-zoned-allocator.patch b/queue-5.16/btrfs-zoned-fix-chunk-allocation-condition-for-zoned-allocator.patch new file mode 100644 index 00000000000..f4353f770e6 --- /dev/null +++ b/queue-5.16/btrfs-zoned-fix-chunk-allocation-condition-for-zoned-allocator.patch @@ -0,0 +1,144 @@ +From 82187d2ecdfb22ab7ee05f388402a39236d31428 Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Wed, 8 Dec 2021 00:35:49 +0900 +Subject: btrfs: zoned: fix chunk allocation condition for zoned allocator + +From: Naohiro Aota + +commit 82187d2ecdfb22ab7ee05f388402a39236d31428 upstream. + +The ZNS specification defines a limit on the number of "active" +zones. That limit impose us to limit the number of block groups which +can be used for an allocation at the same time. Not to exceed the +limit, we reuse the existing active block groups as much as possible +when we can't activate any other zones without sacrificing an already +activated block group in commit a85f05e59bc1 ("btrfs: zoned: avoid +chunk allocation if active block group has enough space"). + +However, the check is wrong in two ways. First, it checks the +condition for every raid index (ffe_ctl->index). Even if it reaches +the condition and "ffe_ctl->max_extent_size >= +ffe_ctl->min_alloc_size" is met, there can be other block groups +having enough space to hold ffe_ctl->num_bytes. (Actually, this won't +happen in the current zoned code as it only supports SINGLE +profile. But, it can happen once it enables other RAID types.) + +Second, it checks the active zone availability depending on the +raid index. The raid index is just an index for +space_info->block_groups, so it has nothing to do with chunk allocation. + +These mistakes are causing a faulty allocation in a certain +situation. Consider we are running zoned btrfs on a device whose +max_active_zone == 0 (no limit). And, suppose no block group have a +room to fit ffe_ctl->num_bytes but some room to meet +ffe_ctl->min_alloc_size (i.e. max_extent_size > num_bytes >= +min_alloc_size). + +In this situation, the following occur: + +- With SINGLE raid_index, it reaches the chunk allocation checking + code +- The check returns true because we can activate a new zone (no limit) +- But, before allocating the chunk, it iterates to the next raid index + (RAID5) +- Since there are no RAID5 block groups on zoned mode, it again + reaches the check code +- The check returns false because of btrfs_can_activate_zone()'s "if + (raid_index != BTRFS_RAID_SINGLE)" part +- That results in returning -ENOSPC without allocating a new chunk + +As a result, we end up hitting -ENOSPC too early. + +Move the check to the right place in the can_allocate_chunk() hook, +and do the active zone check depending on the allocation flag, not on +the raid index. + +CC: stable@vger.kernel.org # 5.16 +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent-tree.c | 21 +++++++++------------ + fs/btrfs/zoned.c | 5 ++--- + fs/btrfs/zoned.h | 5 ++--- + 3 files changed, 13 insertions(+), 18 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -3966,6 +3966,15 @@ static bool can_allocate_chunk(struct bt + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return true; + case BTRFS_EXTENT_ALLOC_ZONED: ++ /* ++ * If we have enough free space left in an already ++ * active block group and we can't activate any other ++ * zone now, do not allow allocating a new chunk and ++ * let find_free_extent() retry with a smaller size. ++ */ ++ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && ++ !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) ++ return false; + return true; + default: + BUG(); +@@ -4012,18 +4021,6 @@ static int find_free_extent_update_loop( + return 0; + } + +- if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && +- !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->index)) { +- /* +- * If we have enough free space left in an already active block +- * group and we can't activate any other zone now, retry the +- * active ones with a smaller allocation size. Returning early +- * from here will tell btrfs_reserve_extent() to haven the +- * size. +- */ +- return -ENOSPC; +- } +- + if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) + return 1; + +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -1934,7 +1934,7 @@ int btrfs_zone_finish(struct btrfs_block + return ret; + } + +-bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index) ++bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) + { + struct btrfs_device *device; + bool ret = false; +@@ -1943,8 +1943,7 @@ bool btrfs_can_activate_zone(struct btrf + return true; + + /* Non-single profiles are not supported yet */ +- if (raid_index != BTRFS_RAID_SINGLE) +- return false; ++ ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0); + + /* Check if there is a device with active zones left */ + mutex_lock(&fs_devices->device_list_mutex); +--- a/fs/btrfs/zoned.h ++++ b/fs/btrfs/zoned.h +@@ -72,8 +72,7 @@ struct btrfs_device *btrfs_zoned_get_dev + u64 logical, u64 length); + bool btrfs_zone_activate(struct btrfs_block_group *block_group); + int btrfs_zone_finish(struct btrfs_block_group *block_group); +-bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, +- int raid_index); ++bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); + void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); + void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); +@@ -225,7 +224,7 @@ static inline int btrfs_zone_finish(stru + } + + static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, +- int raid_index) ++ u64 flags) + { + return true; + } diff --git a/queue-5.16/btrfs-zoned-unset-dedicated-block-group-on-allocation-failure.patch b/queue-5.16/btrfs-zoned-unset-dedicated-block-group-on-allocation-failure.patch new file mode 100644 index 00000000000..ed2dc2d4e77 --- /dev/null +++ b/queue-5.16/btrfs-zoned-unset-dedicated-block-group-on-allocation-failure.patch @@ -0,0 +1,72 @@ +From 1ada69f61c88abb75a1038ee457633325658a183 Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Wed, 8 Dec 2021 00:35:47 +0900 +Subject: btrfs: zoned: unset dedicated block group on allocation failure + +From: Naohiro Aota + +commit 1ada69f61c88abb75a1038ee457633325658a183 upstream. + +Allocating an extent from a block group can fail for various reasons. +When an allocation from a dedicated block group (for tree-log or +relocation data) fails, we need to unregister it as a dedicated one so +that we can allocate a new block group for the dedicated one. + +However, we are returning early when the block group in case it is +read-only, fully used, or not be able to activate the zone. As a result, +we keep the non-usable block group as a dedicated one, leading to +further allocation failure. With many block groups, the allocator will +iterate hopeless loop to find a free extent, results in a hung task. + +Fix the issue by delaying the return and doing the proper cleanups. + +CC: stable@vger.kernel.org # 5.16 +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent-tree.c | 20 ++++++++++++++++---- + 1 file changed, 16 insertions(+), 4 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -3790,23 +3790,35 @@ static int do_allocation_zoned(struct bt + spin_unlock(&fs_info->relocation_bg_lock); + if (skip) + return 1; ++ + /* Check RO and no space case before trying to activate it */ + spin_lock(&block_group->lock); + if (block_group->ro || + block_group->alloc_offset == block_group->zone_capacity) { +- spin_unlock(&block_group->lock); +- return 1; ++ ret = 1; ++ /* ++ * May need to clear fs_info->{treelog,data_reloc}_bg. ++ * Return the error after taking the locks. ++ */ + } + spin_unlock(&block_group->lock); + +- if (!btrfs_zone_activate(block_group)) +- return 1; ++ if (!ret && !btrfs_zone_activate(block_group)) { ++ ret = 1; ++ /* ++ * May need to clear fs_info->{treelog,data_reloc}_bg. ++ * Return the error after taking the locks. ++ */ ++ } + + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + spin_lock(&fs_info->treelog_bg_lock); + spin_lock(&fs_info->relocation_bg_lock); + ++ if (ret) ++ goto out; ++ + ASSERT(!ffe_ctl->for_treelog || + block_group->start == fs_info->treelog_bg || + fs_info->treelog_bg == 0); diff --git a/queue-5.16/ext4-fix-a-possible-abba-deadlock-due-to-busy-pa.patch b/queue-5.16/ext4-fix-a-possible-abba-deadlock-due-to-busy-pa.patch new file mode 100644 index 00000000000..f347f0d37dd --- /dev/null +++ b/queue-5.16/ext4-fix-a-possible-abba-deadlock-due-to-busy-pa.patch @@ -0,0 +1,154 @@ +From 8c80fb312d7abf8bcd66cca1d843a80318a2c522 Mon Sep 17 00:00:00 2001 +From: Chunguang Xu +Date: Tue, 23 Nov 2021 09:17:57 +0800 +Subject: ext4: fix a possible ABBA deadlock due to busy PA + +From: Chunguang Xu + +commit 8c80fb312d7abf8bcd66cca1d843a80318a2c522 upstream. + +We found on older kernel (3.10) that in the scenario of insufficient +disk space, system may trigger an ABBA deadlock problem, it seems that +this problem still exists in latest kernel, try to fix it here. The +main process triggered by this problem is that task A occupies the PA +and waits for the jbd2 transaction finish, the jbd2 transaction waits +for the completion of task B's IO (plug_list), but task B waits for +the release of PA by task A to finish discard, which indirectly forms +an ABBA deadlock. The related calltrace is as follows: + + Task A + vfs_write + ext4_mb_new_blocks() + ext4_mb_mark_diskspace_used() JBD2 + jbd2_journal_get_write_access() -> jbd2_journal_commit_transaction() + ->schedule() filemap_fdatawait() + | | + | Task B | + | do_unlinkat() | + | ext4_evict_inode() | + | jbd2_journal_begin_ordered_truncate() | + | filemap_fdatawrite_range() | + | ext4_mb_new_blocks() | + -ext4_mb_discard_group_preallocations() <----- + +Here, try to cancel ext4_mb_discard_group_preallocations() internal +retry due to PA busy, and do a limited number of retries inside +ext4_mb_discard_preallocations(), which can circumvent the above +problems, but also has some advantages: + +1. Since the PA is in a busy state, if other groups have free PAs, + keeping the current PA may help to reduce fragmentation. +2. Continue to traverse forward instead of waiting for the current + group PA to be released. In most scenarios, the PA discard time + can be reduced. + +However, in the case of smaller free space, if only a few groups have +space, then due to multiple traversals of the group, it may increase +CPU overhead. But in contrast, I feel that the overall benefit is +better than the cost. + +Signed-off-by: Chunguang Xu +Reported-by: kernel test robot +Reviewed-by: Jan Kara +Link: https://lore.kernel.org/r/1637630277-23496-1-git-send-email-brookxu.cn@gmail.com +Signed-off-by: Theodore Ts'o +Cc: stable@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 40 ++++++++++++++++++---------------------- + 1 file changed, 18 insertions(+), 22 deletions(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -4814,7 +4814,7 @@ ext4_mb_release_group_pa(struct ext4_bud + */ + static noinline_for_stack int + ext4_mb_discard_group_preallocations(struct super_block *sb, +- ext4_group_t group, int needed) ++ ext4_group_t group, int *busy) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct buffer_head *bitmap_bh = NULL; +@@ -4822,8 +4822,7 @@ ext4_mb_discard_group_preallocations(str + struct list_head list; + struct ext4_buddy e4b; + int err; +- int busy = 0; +- int free, free_total = 0; ++ int free = 0; + + mb_debug(sb, "discard preallocation for group %u\n", group); + if (list_empty(&grp->bb_prealloc_list)) +@@ -4846,19 +4845,14 @@ ext4_mb_discard_group_preallocations(str + goto out_dbg; + } + +- if (needed == 0) +- needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; +- + INIT_LIST_HEAD(&list); +-repeat: +- free = 0; + ext4_lock_group(sb, group); + list_for_each_entry_safe(pa, tmp, + &grp->bb_prealloc_list, pa_group_list) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + spin_unlock(&pa->pa_lock); +- busy = 1; ++ *busy = 1; + continue; + } + if (pa->pa_deleted) { +@@ -4898,22 +4892,13 @@ repeat: + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } + +- free_total += free; +- +- /* if we still need more blocks and some PAs were used, try again */ +- if (free_total < needed && busy) { +- ext4_unlock_group(sb, group); +- cond_resched(); +- busy = 0; +- goto repeat; +- } + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + put_bh(bitmap_bh); + out_dbg: + mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", +- free_total, group, grp->bb_free); +- return free_total; ++ free, group, grp->bb_free); ++ return free; + } + + /* +@@ -5455,13 +5440,24 @@ static int ext4_mb_discard_preallocation + { + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + int ret; +- int freed = 0; ++ int freed = 0, busy = 0; ++ int retry = 0; + + trace_ext4_mb_discard_preallocations(sb, needed); ++ ++ if (needed == 0) ++ needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; ++ repeat: + for (i = 0; i < ngroups && needed > 0; i++) { +- ret = ext4_mb_discard_group_preallocations(sb, i, needed); ++ ret = ext4_mb_discard_group_preallocations(sb, i, &busy); + freed += ret; + needed -= ret; ++ cond_resched(); ++ } ++ ++ if (needed > 0 && busy && ++retry < 3) { ++ busy = 0; ++ goto repeat; + } + + return freed; diff --git a/queue-5.16/ext4-fix-fast-commit-may-miss-tracking-range-for-falloc_fl_zero_range.patch b/queue-5.16/ext4-fix-fast-commit-may-miss-tracking-range-for-falloc_fl_zero_range.patch new file mode 100644 index 00000000000..a8c3facd3aa --- /dev/null +++ b/queue-5.16/ext4-fix-fast-commit-may-miss-tracking-range-for-falloc_fl_zero_range.patch @@ -0,0 +1,54 @@ +From 5e4d0eba1ccaf19f93222abdeda5a368be141785 Mon Sep 17 00:00:00 2001 +From: Xin Yin +Date: Tue, 21 Dec 2021 10:28:39 +0800 +Subject: ext4: fix fast commit may miss tracking range for FALLOC_FL_ZERO_RANGE + +From: Xin Yin + +commit 5e4d0eba1ccaf19f93222abdeda5a368be141785 upstream. + +when call falloc with FALLOC_FL_ZERO_RANGE, to set an range to unwritten, +which has been already initialized. If the range is align to blocksize, +fast commit will not track range for this change. + +Also track range for unwritten range in ext4_map_blocks(). + +Signed-off-by: Xin Yin +Reviewed-by: Harshad Shirwadkar +Link: https://lore.kernel.org/r/20211221022839.374606-1-yinxin.x@bytedance.com +Signed-off-by: Theodore Ts'o +Cc: stable@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/extents.c | 2 -- + fs/ext4/inode.c | 7 ++++--- + 2 files changed, 4 insertions(+), 5 deletions(-) + +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -4647,8 +4647,6 @@ static long ext4_zero_range(struct file + ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) + goto out_handle; +- ext4_fc_track_range(handle, inode, offset >> inode->i_sb->s_blocksize_bits, +- (offset + len - 1) >> inode->i_sb->s_blocksize_bits); + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + if (ret >= 0) +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -741,10 +741,11 @@ out_sem: + if (ret) + return ret; + } +- ext4_fc_track_range(handle, inode, map->m_lblk, +- map->m_lblk + map->m_len - 1); + } +- ++ if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || ++ map->m_flags & EXT4_MAP_MAPPED)) ++ ext4_fc_track_range(handle, inode, map->m_lblk, ++ map->m_lblk + map->m_len - 1); + if (retval < 0) + ext_debug(inode, "failed with err %d\n", retval); + return retval; diff --git a/queue-5.16/ext4-initialize-err_blk-before-calling-__ext4_get_inode_loc.patch b/queue-5.16/ext4-initialize-err_blk-before-calling-__ext4_get_inode_loc.patch new file mode 100644 index 00000000000..3c19045d2ed --- /dev/null +++ b/queue-5.16/ext4-initialize-err_blk-before-calling-__ext4_get_inode_loc.patch @@ -0,0 +1,43 @@ +From c27c29c6af4f3f4ce925a2111c256733c5a5b430 Mon Sep 17 00:00:00 2001 +From: Harshad Shirwadkar +Date: Wed, 1 Dec 2021 08:34:21 -0800 +Subject: ext4: initialize err_blk before calling __ext4_get_inode_loc + +From: Harshad Shirwadkar + +commit c27c29c6af4f3f4ce925a2111c256733c5a5b430 upstream. + +It is not guaranteed that __ext4_get_inode_loc will definitely set +err_blk pointer when it returns EIO. To avoid using uninitialized +variables, let's first set err_blk to 0. + +Reported-by: Dan Carpenter +Signed-off-by: Harshad Shirwadkar +Link: https://lore.kernel.org/r/20211201163421.2631661-1-harshads@google.com +Signed-off-by: Theodore Ts'o +Cc: stable@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -4523,7 +4523,7 @@ has_buffer: + static int __ext4_get_inode_loc_noinmem(struct inode *inode, + struct ext4_iloc *iloc) + { +- ext4_fsblk_t err_blk; ++ ext4_fsblk_t err_blk = 0; + int ret; + + ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc, +@@ -4538,7 +4538,7 @@ static int __ext4_get_inode_loc_noinmem( + + int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) + { +- ext4_fsblk_t err_blk; ++ ext4_fsblk_t err_blk = 0; + int ret; + + ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc, diff --git a/queue-5.16/ext4-make-sure-quota-gets-properly-shutdown-on-error.patch b/queue-5.16/ext4-make-sure-quota-gets-properly-shutdown-on-error.patch new file mode 100644 index 00000000000..19993a915d5 --- /dev/null +++ b/queue-5.16/ext4-make-sure-quota-gets-properly-shutdown-on-error.patch @@ -0,0 +1,51 @@ +From 15fc69bbbbbc8c72e5f6cc4e1be0f51283c5448e Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 7 Oct 2021 17:53:35 +0200 +Subject: ext4: make sure quota gets properly shutdown on error + +From: Jan Kara + +commit 15fc69bbbbbc8c72e5f6cc4e1be0f51283c5448e upstream. + +When we hit an error when enabling quotas and setting inode flags, we do +not properly shutdown quota subsystem despite returning error from +Q_QUOTAON quotactl. This can lead to some odd situations like kernel +using quota file while it is still writeable for userspace. Make sure we +properly cleanup the quota subsystem in case of error. + +Signed-off-by: Jan Kara +Cc: stable@kernel.org +Link: https://lore.kernel.org/r/20211007155336.12493-2-jack@suse.cz +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/super.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -6275,10 +6275,7 @@ static int ext4_quota_on(struct super_bl + + lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); + err = dquot_quota_on(sb, type, format_id, path); +- if (err) { +- lockdep_set_quota_inode(path->dentry->d_inode, +- I_DATA_SEM_NORMAL); +- } else { ++ if (!err) { + struct inode *inode = d_inode(path->dentry); + handle_t *handle; + +@@ -6298,7 +6295,12 @@ static int ext4_quota_on(struct super_bl + ext4_journal_stop(handle); + unlock_inode: + inode_unlock(inode); ++ if (err) ++ dquot_quota_off(sb, type); + } ++ if (err) ++ lockdep_set_quota_inode(path->dentry->d_inode, ++ I_DATA_SEM_NORMAL); + return err; + } + diff --git a/queue-5.16/ext4-make-sure-to-reset-inode-lockdep-class-when-quota-enabling-fails.patch b/queue-5.16/ext4-make-sure-to-reset-inode-lockdep-class-when-quota-enabling-fails.patch new file mode 100644 index 00000000000..16123e1e4b7 --- /dev/null +++ b/queue-5.16/ext4-make-sure-to-reset-inode-lockdep-class-when-quota-enabling-fails.patch @@ -0,0 +1,49 @@ +From 4013d47a5307fdb5c13370b5392498b00fedd274 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 7 Oct 2021 17:53:36 +0200 +Subject: ext4: make sure to reset inode lockdep class when quota enabling fails + +From: Jan Kara + +commit 4013d47a5307fdb5c13370b5392498b00fedd274 upstream. + +When we succeed in enabling some quota type but fail to enable another +one with quota feature, we correctly disable all enabled quota types. +However we forget to reset i_data_sem lockdep class. When the inode gets +freed and reused, it will inherit this lockdep class (i_data_sem is +initialized only when a slab is created) and thus eventually lockdep +barfs about possible deadlocks. + +Reported-and-tested-by: syzbot+3b6f9218b1301ddda3e2@syzkaller.appspotmail.com +Signed-off-by: Jan Kara +Cc: stable@kernel.org +Link: https://lore.kernel.org/r/20211007155336.12493-3-jack@suse.cz +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/super.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -6361,8 +6361,19 @@ int ext4_enable_quotas(struct super_bloc + "Failed to enable quota tracking " + "(type=%d, err=%d). Please run " + "e2fsck to fix.", type, err); +- for (type--; type >= 0; type--) ++ for (type--; type >= 0; type--) { ++ struct inode *inode; ++ ++ inode = sb_dqopt(sb)->files[type]; ++ if (inode) ++ inode = igrab(inode); + dquot_quota_off(sb, type); ++ if (inode) { ++ lockdep_set_quota_inode(inode, ++ I_DATA_SEM_NORMAL); ++ iput(inode); ++ } ++ } + + return err; + } diff --git a/queue-5.16/series b/queue-5.16/series index 3eb224c967c..fdcb0a26cad 100644 --- a/queue-5.16/series +++ b/queue-5.16/series @@ -879,3 +879,15 @@ pci-pci-bridge-emul-correctly-set-pcie-capabilities.patch pci-pci-bridge-emul-set-pci_status_cap_list-for-pcie-device.patch xfrm-fix-policy-lookup-for-ipv6-gre-packets.patch xfrm-fix-dflt-policy-check-when-there-is-no-policy-configured.patch +btrfs-fix-deadlock-between-quota-enable-and-other-quota-operations.patch +btrfs-zoned-cache-reported-zone-during-mount.patch +btrfs-check-the-root-node-for-uptodate-before-returning-it.patch +btrfs-add-extent-allocator-hook-to-decide-to-allocate-chunk-or-not.patch +btrfs-zoned-unset-dedicated-block-group-on-allocation-failure.patch +btrfs-zoned-fix-chunk-allocation-condition-for-zoned-allocator.patch +btrfs-respect-the-max-size-in-the-header-when-activating-swap-file.patch +ext4-make-sure-to-reset-inode-lockdep-class-when-quota-enabling-fails.patch +ext4-make-sure-quota-gets-properly-shutdown-on-error.patch +ext4-fix-a-possible-abba-deadlock-due-to-busy-pa.patch +ext4-initialize-err_blk-before-calling-__ext4_get_inode_loc.patch +ext4-fix-fast-commit-may-miss-tracking-range-for-falloc_fl_zero_range.patch