From 32db5295d06796e49d362fdbcd41a98982ab4f0a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 6 Jun 2022 19:02:20 +0200 Subject: [PATCH] 5.10-stable patches added patches: net-ipa-fix-page-free-in-ipa_endpoint_replenish_one.patch net-ipa-fix-page-free-in-ipa_endpoint_trans_release.patch xfs-assert-in-xfs_btree_del_cursor-should-take-into-account-error.patch xfs-consider-shutdown-in-bmapbt-cursor-delete-assert.patch xfs-fix-chown-leaking-delalloc-quota-blocks-when-fssetxattr-fails.patch xfs-fix-incorrect-root-dquot-corruption-error-when-switching-group-project-quota-types.patch xfs-force-log-and-push-ail-to-clear-pinned-inodes-when-aborting-mount.patch xfs-restore-shutdown-check-in-mapped-write-fault-path.patch xfs-set-inode-size-after-creating-symlink.patch xfs-sync-lazy-sb-accounting-on-quiesce-of-read-only-mounts.patch --- ...e-free-in-ipa_endpoint_replenish_one.patch | 38 ++++ ...e-free-in-ipa_endpoint_trans_release.patch | 38 ++++ queue-5.10/series | 10 ++ ...ursor-should-take-into-account-error.patch | 82 +++++++++ ...tdown-in-bmapbt-cursor-delete-assert.patch | 88 +++++++++ ...c-quota-blocks-when-fssetxattr-fails.patch | 170 ++++++++++++++++++ ...-switching-group-project-quota-types.patch | 95 ++++++++++ ...ar-pinned-inodes-when-aborting-mount.patch | 155 ++++++++++++++++ ...own-check-in-mapped-write-fault-path.patch | 53 ++++++ ...et-inode-size-after-creating-symlink.patch | 43 +++++ ...nting-on-quiesce-of-read-only-mounts.patch | 124 +++++++++++++ 11 files changed, 896 insertions(+) create mode 100644 queue-5.10/net-ipa-fix-page-free-in-ipa_endpoint_replenish_one.patch create mode 100644 queue-5.10/net-ipa-fix-page-free-in-ipa_endpoint_trans_release.patch create mode 100644 queue-5.10/xfs-assert-in-xfs_btree_del_cursor-should-take-into-account-error.patch create mode 100644 queue-5.10/xfs-consider-shutdown-in-bmapbt-cursor-delete-assert.patch create mode 100644 queue-5.10/xfs-fix-chown-leaking-delalloc-quota-blocks-when-fssetxattr-fails.patch 
create mode 100644 queue-5.10/xfs-fix-incorrect-root-dquot-corruption-error-when-switching-group-project-quota-types.patch create mode 100644 queue-5.10/xfs-force-log-and-push-ail-to-clear-pinned-inodes-when-aborting-mount.patch create mode 100644 queue-5.10/xfs-restore-shutdown-check-in-mapped-write-fault-path.patch create mode 100644 queue-5.10/xfs-set-inode-size-after-creating-symlink.patch create mode 100644 queue-5.10/xfs-sync-lazy-sb-accounting-on-quiesce-of-read-only-mounts.patch diff --git a/queue-5.10/net-ipa-fix-page-free-in-ipa_endpoint_replenish_one.patch b/queue-5.10/net-ipa-fix-page-free-in-ipa_endpoint_replenish_one.patch new file mode 100644 index 00000000000..f160cd7f705 --- /dev/null +++ b/queue-5.10/net-ipa-fix-page-free-in-ipa_endpoint_replenish_one.patch @@ -0,0 +1,38 @@ +From 70132763d5d2e94cd185e3aa92ac6a3ba89068fa Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Thu, 26 May 2022 10:23:14 -0500 +Subject: net: ipa: fix page free in ipa_endpoint_replenish_one() + +From: Alex Elder + +commit 70132763d5d2e94cd185e3aa92ac6a3ba89068fa upstream. + +Currently the (possibly compound) pages used for receive buffers are +freed using __free_pages(). But according to this comment above the +definition of that function, that's wrong: + If you want to use the page's reference count to decide + when to free the allocation, you should allocate a compound + page, and use put_page() instead of __free_pages(). + +Convert the call to __free_pages() in ipa_endpoint_replenish_one() +to use put_page() instead. 
+ +Fixes: 6a606b90153b8 ("net: ipa: allocate transaction in replenish loop") +Signed-off-by: Alex Elder +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ipa/ipa_endpoint.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ipa/ipa_endpoint.c ++++ b/drivers/net/ipa/ipa_endpoint.c +@@ -884,7 +884,7 @@ static int ipa_endpoint_replenish_one(st + err_trans_free: + gsi_trans_free(trans); + err_free_pages: +- __free_pages(page, get_order(IPA_RX_BUFFER_SIZE)); ++ put_page(page); + + return -ENOMEM; + } diff --git a/queue-5.10/net-ipa-fix-page-free-in-ipa_endpoint_trans_release.patch b/queue-5.10/net-ipa-fix-page-free-in-ipa_endpoint_trans_release.patch new file mode 100644 index 00000000000..adbd9828f45 --- /dev/null +++ b/queue-5.10/net-ipa-fix-page-free-in-ipa_endpoint_trans_release.patch @@ -0,0 +1,38 @@ +From 155c0c90bca918de6e4327275dfc1d97fd604115 Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Thu, 26 May 2022 10:23:13 -0500 +Subject: net: ipa: fix page free in ipa_endpoint_trans_release() + +From: Alex Elder + +commit 155c0c90bca918de6e4327275dfc1d97fd604115 upstream. + +Currently the (possibly compound) pages used for receive buffers are +freed using __free_pages(). But according to this comment above the +definition of that function, that's wrong: + If you want to use the page's reference count to decide when + to free the allocation, you should allocate a compound page, + and use put_page() instead of __free_pages(). + +Convert the call to __free_pages() in ipa_endpoint_trans_release() +to use put_page() instead. 
+ +Fixes: ed23f02680caa ("net: ipa: define per-endpoint receive buffer size") +Signed-off-by: Alex Elder +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ipa/ipa_endpoint.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ipa/ipa_endpoint.c ++++ b/drivers/net/ipa/ipa_endpoint.c +@@ -1179,7 +1179,7 @@ void ipa_endpoint_trans_release(struct i + struct page *page = trans->data; + + if (page) +- __free_pages(page, get_order(IPA_RX_BUFFER_SIZE)); ++ put_page(page); + } + } + diff --git a/queue-5.10/series b/queue-5.10/series index 6ca435b1897..9f1f357df0d 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -425,3 +425,13 @@ vdpasim-allow-to-enable-a-vq-repeatedly.patch blk-iolatency-fix-inflight-count-imbalances-and-io-hangs-on-offline.patch coresight-core-fix-coresight-device-probe-failure-issue.patch phy-qcom-qmp-fix-reset-controller-leak-on-probe-errors.patch +net-ipa-fix-page-free-in-ipa_endpoint_trans_release.patch +net-ipa-fix-page-free-in-ipa_endpoint_replenish_one.patch +xfs-set-inode-size-after-creating-symlink.patch +xfs-sync-lazy-sb-accounting-on-quiesce-of-read-only-mounts.patch +xfs-fix-chown-leaking-delalloc-quota-blocks-when-fssetxattr-fails.patch +xfs-fix-incorrect-root-dquot-corruption-error-when-switching-group-project-quota-types.patch +xfs-restore-shutdown-check-in-mapped-write-fault-path.patch +xfs-force-log-and-push-ail-to-clear-pinned-inodes-when-aborting-mount.patch +xfs-consider-shutdown-in-bmapbt-cursor-delete-assert.patch +xfs-assert-in-xfs_btree_del_cursor-should-take-into-account-error.patch diff --git a/queue-5.10/xfs-assert-in-xfs_btree_del_cursor-should-take-into-account-error.patch b/queue-5.10/xfs-assert-in-xfs_btree_del_cursor-should-take-into-account-error.patch new file mode 100644 index 00000000000..58d90ffef17 --- /dev/null +++ b/queue-5.10/xfs-assert-in-xfs_btree_del_cursor-should-take-into-account-error.patch @@ -0,0 +1,82 @@ +From foo@baz Mon Jun 6 07:00:47 PM 
CEST 2022 +From: Amir Goldstein +Date: Mon, 6 Jun 2022 17:32:55 +0300 +Subject: xfs: assert in xfs_btree_del_cursor should take into account error +To: Greg Kroah-Hartman +Cc: Sasha Levin , Dave Chinner , "Darrick J . Wong" , Christoph Hellwig , Brian Foster , Christian Brauner , Luis Chamberlain , Leah Rumancik , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner +Message-ID: <20220606143255.685988-9-amir73il@gmail.com> + +From: Dave Chinner + +commit 56486f307100e8fc66efa2ebd8a71941fa10bf6f upstream. + +xfs/538 on a 1kB block filesystem failed with this assert: + +XFS: Assertion failed: cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || xfs_is_shutdown(cur->bc_mp), file: fs/xfs/libxfs/xfs_btree.c, line: 448 + +The problem was that an allocation failed unexpectedly in +xfs_bmbt_alloc_block() after roughly 150,000 minlen allocation error +injections, resulting in an EFSCORRUPTED error being returned to +xfs_bmapi_write(). The error occurred on extent-to-btree format +conversion allocating the new root block: + + RIP: 0010:xfs_bmbt_alloc_block+0x177/0x210 + Call Trace: + + xfs_btree_new_iroot+0xdf/0x520 + xfs_btree_make_block_unfull+0x10d/0x1c0 + xfs_btree_insrec+0x364/0x790 + xfs_btree_insert+0xaa/0x210 + xfs_bmap_add_extent_hole_real+0x1fe/0x9a0 + xfs_bmapi_allocate+0x34c/0x420 + xfs_bmapi_write+0x53c/0x9c0 + xfs_alloc_file_space+0xee/0x320 + xfs_file_fallocate+0x36b/0x450 + vfs_fallocate+0x148/0x340 + __x64_sys_fallocate+0x3c/0x70 + do_syscall_64+0x35/0x80 + entry_SYSCALL_64_after_hwframe+0x44/0xa + +Why the allocation failed at this point is unknown, but is likely +that we ran the transaction out of reserved space and filesystem out +of space with bmbt blocks because of all the minlen allocations +being done causing worst case fragmentation of a large allocation. + +Regardless of the cause, we've then called xfs_bmapi_finish() which +calls xfs_btree_del_cursor(cur, error) to tear down the cursor. 
+ +So we have a failed operation, error != 0, cur->bc_ino.allocated > 0 +and the filesystem is still up. The assert fails to take into +account that allocation can fail with an error and the transaction +teardown will shut the filesystem down if necessary. i.e. the +assert needs to check "|| error != 0" as well, because at this point +shutdown is pending because the current transaction is dirty.... + +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Dave Chinner +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_btree.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_btree.c ++++ b/fs/xfs/libxfs/xfs_btree.c +@@ -372,8 +372,14 @@ xfs_btree_del_cursor( + break; + } + ++ /* ++ * If we are doing a BMBT update, the number of unaccounted blocks ++ * allocated during this cursor life time should be zero. If it's not ++ * zero, then we should be shut down or on our way to shutdown due to ++ * cancelling a dirty transaction on error. ++ */ + ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || +- XFS_FORCED_SHUTDOWN(cur->bc_mp)); ++ XFS_FORCED_SHUTDOWN(cur->bc_mp) || error != 0); + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) + kmem_free(cur->bc_ops); + kmem_cache_free(xfs_btree_cur_zone, cur); diff --git a/queue-5.10/xfs-consider-shutdown-in-bmapbt-cursor-delete-assert.patch b/queue-5.10/xfs-consider-shutdown-in-bmapbt-cursor-delete-assert.patch new file mode 100644 index 00000000000..7eafcf4b688 --- /dev/null +++ b/queue-5.10/xfs-consider-shutdown-in-bmapbt-cursor-delete-assert.patch @@ -0,0 +1,88 @@ +From foo@baz Mon Jun 6 07:00:47 PM CEST 2022 +From: Amir Goldstein +Date: Mon, 6 Jun 2022 17:32:54 +0300 +Subject: xfs: consider shutdown in bmapbt cursor delete assert +To: Greg Kroah-Hartman +Cc: Sasha Levin , Dave Chinner , "Darrick J . 
Wong" , Christoph Hellwig , Brian Foster , Christian Brauner , Luis Chamberlain , Leah Rumancik , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org +Message-ID: <20220606143255.685988-8-amir73il@gmail.com> + +From: Brian Foster + +commit 1cd738b13ae9b29e03d6149f0246c61f76e81fcf upstream. + +The assert in xfs_btree_del_cursor() checks that the bmapbt block +allocation field has been handled correctly before the cursor is +freed. This field is used for accurate calculation of indirect block +reservation requirements (for delayed allocations), for example. +generic/019 reproduces a scenario where this assert fails because +the filesystem has shutdown while in the middle of a bmbt record +insertion. This occurs after a bmbt block has been allocated via the +cursor but before the higher level bmap function (i.e. +xfs_bmap_add_extent_hole_real()) completes and resets the field. + +Update the assert to accommodate the transient state if the +filesystem has shutdown. While here, clean up the indentation and +comments in the function. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_btree.c | 33 ++++++++++++--------------------- + 1 file changed, 12 insertions(+), 21 deletions(-) + +--- a/fs/xfs/libxfs/xfs_btree.c ++++ b/fs/xfs/libxfs/xfs_btree.c +@@ -353,20 +353,17 @@ xfs_btree_free_block( + */ + void + xfs_btree_del_cursor( +- xfs_btree_cur_t *cur, /* btree cursor */ +- int error) /* del because of error */ ++ struct xfs_btree_cur *cur, /* btree cursor */ ++ int error) /* del because of error */ + { +- int i; /* btree level */ ++ int i; /* btree level */ + + /* +- * Clear the buffer pointers, and release the buffers. +- * If we're doing this in the face of an error, we +- * need to make sure to inspect all of the entries +- * in the bc_bufs array for buffers to be unlocked. 
+- * This is because some of the btree code works from +- * level n down to 0, and if we get an error along +- * the way we won't have initialized all the entries +- * down to 0. ++ * Clear the buffer pointers and release the buffers. If we're doing ++ * this because of an error, inspect all of the entries in the bc_bufs ++ * array for buffers to be unlocked. This is because some of the btree ++ * code works from level n down to 0, and if we get an error along the ++ * way we won't have initialized all the entries down to 0. + */ + for (i = 0; i < cur->bc_nlevels; i++) { + if (cur->bc_bufs[i]) +@@ -374,17 +371,11 @@ xfs_btree_del_cursor( + else if (!error) + break; + } +- /* +- * Can't free a bmap cursor without having dealt with the +- * allocated indirect blocks' accounting. +- */ +- ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || +- cur->bc_ino.allocated == 0); +- /* +- * Free the cursor. +- */ ++ ++ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || ++ XFS_FORCED_SHUTDOWN(cur->bc_mp)); + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) +- kmem_free((void *)cur->bc_ops); ++ kmem_free(cur->bc_ops); + kmem_cache_free(xfs_btree_cur_zone, cur); + } + diff --git a/queue-5.10/xfs-fix-chown-leaking-delalloc-quota-blocks-when-fssetxattr-fails.patch b/queue-5.10/xfs-fix-chown-leaking-delalloc-quota-blocks-when-fssetxattr-fails.patch new file mode 100644 index 00000000000..1597f941ac4 --- /dev/null +++ b/queue-5.10/xfs-fix-chown-leaking-delalloc-quota-blocks-when-fssetxattr-fails.patch @@ -0,0 +1,170 @@ +From foo@baz Mon Jun 6 07:00:47 PM CEST 2022 +From: Amir Goldstein +Date: Mon, 6 Jun 2022 17:32:50 +0300 +Subject: xfs: fix chown leaking delalloc quota blocks when fssetxattr fails +To: Greg Kroah-Hartman +Cc: Sasha Levin , Dave Chinner , "Darrick J . 
Wong" , Christoph Hellwig , Brian Foster , Christian Brauner , Luis Chamberlain , Leah Rumancik , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org +Message-ID: <20220606143255.685988-4-amir73il@gmail.com> + +From: "Darrick J. Wong" + +commit 1aecf3734a95f3c167d1495550ca57556d33f7ec upstream. + +While refactoring the quota code to create a function to allocate inode +change transactions, I noticed that xfs_qm_vop_chown_reserve does more +than just make reservations: it also *modifies* the incore counts +directly to handle the owner id change for the delalloc blocks. + +I then observed that the fssetxattr code continues validating input +arguments after making the quota reservation but before dirtying the +transaction. If the routine decides to error out, it fails to undo the +accounting switch! This leads to incorrect quota reservation and +failure down the line. + +We can fix this by making the reservation function do only that -- for +the new dquot, it reserves ondisk and delalloc blocks to the +transaction, and the old dquot hangs on to its incore reservation for +now. Once we actually switch the dquots, we can then update the incore +reservations because we've dirtied the transaction and it's too late to +turn back now. + +No fixes tag because this has been broken since the start of git. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Reviewed-by: Brian Foster +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_qm.c | 92 +++++++++++++++++++++----------------------------------- + 1 file changed, 35 insertions(+), 57 deletions(-) + +--- a/fs/xfs/xfs_qm.c ++++ b/fs/xfs/xfs_qm.c +@@ -1786,6 +1786,29 @@ xfs_qm_vop_chown( + xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); + + /* ++ * Back when we made quota reservations for the chown, we reserved the ++ * ondisk blocks + delalloc blocks with the new dquot. 
Now that we've ++ * switched the dquots, decrease the new dquot's block reservation ++ * (having already bumped up the real counter) so that we don't have ++ * any reservation to give back when we commit. ++ */ ++ xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_RES_BLKS, ++ -ip->i_delayed_blks); ++ ++ /* ++ * Give the incore reservation for delalloc blocks back to the old ++ * dquot. We don't normally handle delalloc quota reservations ++ * transactionally, so just lock the dquot and subtract from the ++ * reservation. Dirty the transaction because it's too late to turn ++ * back now. ++ */ ++ tp->t_flags |= XFS_TRANS_DIRTY; ++ xfs_dqlock(prevdq); ++ ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks); ++ prevdq->q_blk.reserved -= ip->i_delayed_blks; ++ xfs_dqunlock(prevdq); ++ ++ /* + * Take an extra reference, because the inode is going to keep + * this dquot pointer even after the trans_commit. + */ +@@ -1807,84 +1830,39 @@ xfs_qm_vop_chown_reserve( + uint flags) + { + struct xfs_mount *mp = ip->i_mount; +- uint64_t delblks; + unsigned int blkflags; +- struct xfs_dquot *udq_unres = NULL; +- struct xfs_dquot *gdq_unres = NULL; +- struct xfs_dquot *pdq_unres = NULL; + struct xfs_dquot *udq_delblks = NULL; + struct xfs_dquot *gdq_delblks = NULL; + struct xfs_dquot *pdq_delblks = NULL; +- int error; +- + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + +- delblks = ip->i_delayed_blks; + blkflags = XFS_IS_REALTIME_INODE(ip) ? + XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; + + if (XFS_IS_UQUOTA_ON(mp) && udqp && +- i_uid_read(VFS_I(ip)) != udqp->q_id) { ++ i_uid_read(VFS_I(ip)) != udqp->q_id) + udq_delblks = udqp; +- /* +- * If there are delayed allocation blocks, then we have to +- * unreserve those from the old dquot, and add them to the +- * new dquot. 
+- */ +- if (delblks) { +- ASSERT(ip->i_udquot); +- udq_unres = ip->i_udquot; +- } +- } ++ + if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && +- i_gid_read(VFS_I(ip)) != gdqp->q_id) { ++ i_gid_read(VFS_I(ip)) != gdqp->q_id) + gdq_delblks = gdqp; +- if (delblks) { +- ASSERT(ip->i_gdquot); +- gdq_unres = ip->i_gdquot; +- } +- } + + if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && +- ip->i_d.di_projid != pdqp->q_id) { ++ ip->i_d.di_projid != pdqp->q_id) + pdq_delblks = pdqp; +- if (delblks) { +- ASSERT(ip->i_pdquot); +- pdq_unres = ip->i_pdquot; +- } +- } +- +- error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, +- udq_delblks, gdq_delblks, pdq_delblks, +- ip->i_d.di_nblocks, 1, flags | blkflags); +- if (error) +- return error; + + /* +- * Do the delayed blks reservations/unreservations now. Since, these +- * are done without the help of a transaction, if a reservation fails +- * its previous reservations won't be automatically undone by trans +- * code. So, we have to do it manually here. ++ * Reserve enough quota to handle blocks on disk and reserved for a ++ * delayed allocation. We'll actually transfer the delalloc ++ * reservation between dquots at chown time, even though that part is ++ * only semi-transactional. + */ +- if (delblks) { +- /* +- * Do the reservations first. Unreservation can't fail. 
+- */ +- ASSERT(udq_delblks || gdq_delblks || pdq_delblks); +- ASSERT(udq_unres || gdq_unres || pdq_unres); +- error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, +- udq_delblks, gdq_delblks, pdq_delblks, +- (xfs_qcnt_t)delblks, 0, flags | blkflags); +- if (error) +- return error; +- xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, +- udq_unres, gdq_unres, pdq_unres, +- -((xfs_qcnt_t)delblks), 0, blkflags); +- } +- +- return 0; ++ return xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, udq_delblks, ++ gdq_delblks, pdq_delblks, ++ ip->i_d.di_nblocks + ip->i_delayed_blks, ++ 1, blkflags | flags); + } + + int diff --git a/queue-5.10/xfs-fix-incorrect-root-dquot-corruption-error-when-switching-group-project-quota-types.patch b/queue-5.10/xfs-fix-incorrect-root-dquot-corruption-error-when-switching-group-project-quota-types.patch new file mode 100644 index 00000000000..b04b4fdadd2 --- /dev/null +++ b/queue-5.10/xfs-fix-incorrect-root-dquot-corruption-error-when-switching-group-project-quota-types.patch @@ -0,0 +1,95 @@ +From foo@baz Mon Jun 6 07:00:47 PM CEST 2022 +From: Amir Goldstein +Date: Mon, 6 Jun 2022 17:32:51 +0300 +Subject: xfs: fix incorrect root dquot corruption error when switching group/project quota types +To: Greg Kroah-Hartman +Cc: Sasha Levin , Dave Chinner , "Darrick J . Wong" , Christoph Hellwig , Brian Foster , Christian Brauner , Luis Chamberlain , Leah Rumancik , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org, Chandan Babu R +Message-ID: <20220606143255.685988-5-amir73il@gmail.com> + +From: "Darrick J. Wong" + +commit 45068063efb7dd0a8d115c106aa05d9ab0946257 upstream. + +While writing up a regression test for broken behavior when a chprojid +request fails, I noticed that we were logging corruption notices about +the root dquot of the group/project quota file at mount time when +testing V4 filesystems. 
+ +In commit afeda6000b0c, I was trying to improve ondisk dquot validation +by making sure that when we load an ondisk dquot into memory on behalf +of an incore dquot, the dquot id and type matches. Unfortunately, I +forgot that V4 filesystems only have two quota files, and can switch +that file between group and project quota types at mount time. When we +perform that switch, we'll try to load the default quota limits from the +root dquot prior to running quotacheck and log a corruption error when +the types don't match. + +This is inconsequential because quotacheck will reset the second quota +file as part of doing the switch, but we shouldn't leave scary messages +in the kernel log. + +Fixes: afeda6000b0c ("xfs: validate ondisk/incore dquot flags") +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Reviewed-by: Chandan Babu R +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_dquot.c | 39 +++++++++++++++++++++++++++++++++++++-- + 1 file changed, 37 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_dquot.c ++++ b/fs/xfs/xfs_dquot.c +@@ -500,6 +500,42 @@ xfs_dquot_alloc( + return dqp; + } + ++/* Check the ondisk dquot's id and type match what the incore dquot expects. */ ++static bool ++xfs_dquot_check_type( ++ struct xfs_dquot *dqp, ++ struct xfs_disk_dquot *ddqp) ++{ ++ uint8_t ddqp_type; ++ uint8_t dqp_type; ++ ++ ddqp_type = ddqp->d_type & XFS_DQTYPE_REC_MASK; ++ dqp_type = xfs_dquot_type(dqp); ++ ++ if (be32_to_cpu(ddqp->d_id) != dqp->q_id) ++ return false; ++ ++ /* ++ * V5 filesystems always expect an exact type match. V4 filesystems ++ * expect an exact match for user dquots and for non-root group and ++ * project dquots. ++ */ ++ if (xfs_sb_version_hascrc(&dqp->q_mount->m_sb) || ++ dqp_type == XFS_DQTYPE_USER || dqp->q_id != 0) ++ return ddqp_type == dqp_type; ++ ++ /* ++ * V4 filesystems support either group or project quotas, but not both ++ * at the same time. 
The non-user quota file can be switched between ++ * group and project quota uses depending on the mount options, which ++ * means that we can encounter the other type when we try to load quota ++ * defaults. Quotacheck will soon reset the entire quota file ++ * (including the root dquot) anyway, but don't log scary corruption ++ * reports to dmesg. ++ */ ++ return ddqp_type == XFS_DQTYPE_GROUP || ddqp_type == XFS_DQTYPE_PROJ; ++} ++ + /* Copy the in-core quota fields in from the on-disk buffer. */ + STATIC int + xfs_dquot_from_disk( +@@ -512,8 +548,7 @@ xfs_dquot_from_disk( + * Ensure that we got the type and ID we were looking for. + * Everything else was checked by the dquot buffer verifier. + */ +- if ((ddqp->d_type & XFS_DQTYPE_REC_MASK) != xfs_dquot_type(dqp) || +- be32_to_cpu(ddqp->d_id) != dqp->q_id) { ++ if (!xfs_dquot_check_type(dqp, ddqp)) { + xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR, + "Metadata corruption detected at %pS, quota %u", + __this_address, dqp->q_id); diff --git a/queue-5.10/xfs-force-log-and-push-ail-to-clear-pinned-inodes-when-aborting-mount.patch b/queue-5.10/xfs-force-log-and-push-ail-to-clear-pinned-inodes-when-aborting-mount.patch new file mode 100644 index 00000000000..13e592eb754 --- /dev/null +++ b/queue-5.10/xfs-force-log-and-push-ail-to-clear-pinned-inodes-when-aborting-mount.patch @@ -0,0 +1,155 @@ +From foo@baz Mon Jun 6 07:00:47 PM CEST 2022 +From: Amir Goldstein +Date: Mon, 6 Jun 2022 17:32:53 +0300 +Subject: xfs: force log and push AIL to clear pinned inodes when aborting mount +To: Greg Kroah-Hartman +Cc: Sasha Levin , Dave Chinner , "Darrick J . Wong" , Christoph Hellwig , Brian Foster , Christian Brauner , Luis Chamberlain , Leah Rumancik , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org, Dave Chinner +Message-ID: <20220606143255.685988-7-amir73il@gmail.com> + +From: "Darrick J. Wong" + +commit d336f7ebc65007f5831e2297e6f3383ae8dbf8ed upstream. 
+ +If we allocate quota inodes in the process of mounting a filesystem but +then decide to abort the mount, it's possible that the quota inodes are +sitting around pinned by the log. Now that inode reclaim relies on the +AIL to flush inodes, we have to force the log and push the AIL in +between releasing the quota inodes and kicking off reclaim to tear down +all the incore inodes. Do this by extracting the bits we need from the +unmount path and reusing them. As an added bonus, failed writes during +a failed mount will not retry forever now. + +This was originally found during a fuzz test of metadata directories +(xfs/1546), but the actual symptom was that reclaim hung up on the quota +inodes. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Reviewed-by: Dave Chinner +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_mount.c | 90 +++++++++++++++++++++++++---------------------------- + 1 file changed, 44 insertions(+), 46 deletions(-) + +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -632,6 +632,47 @@ xfs_check_summary_counts( + } + + /* ++ * Flush and reclaim dirty inodes in preparation for unmount. Inodes and ++ * internal inode structures can be sitting in the CIL and AIL at this point, ++ * so we need to unpin them, write them back and/or reclaim them before unmount ++ * can proceed. ++ * ++ * An inode cluster that has been freed can have its buffer still pinned in ++ * memory because the transaction is still sitting in a iclog. The stale inodes ++ * on that buffer will be pinned to the buffer until the transaction hits the ++ * disk and the callbacks run. Pushing the AIL will skip the stale inodes and ++ * may never see the pinned buffer, so nothing will push out the iclog and ++ * unpin the buffer. ++ * ++ * Hence we need to force the log to unpin everything first. 
However, log ++ * forces don't wait for the discards they issue to complete, so we have to ++ * explicitly wait for them to complete here as well. ++ * ++ * Then we can tell the world we are unmounting so that error handling knows ++ * that the filesystem is going away and we should error out anything that we ++ * have been retrying in the background. This will prevent never-ending ++ * retries in AIL pushing from hanging the unmount. ++ * ++ * Finally, we can push the AIL to clean all the remaining dirty objects, then ++ * reclaim the remaining inodes that are still in memory at this point in time. ++ */ ++static void ++xfs_unmount_flush_inodes( ++ struct xfs_mount *mp) ++{ ++ xfs_log_force(mp, XFS_LOG_SYNC); ++ xfs_extent_busy_wait_all(mp); ++ flush_workqueue(xfs_discard_wq); ++ ++ mp->m_flags |= XFS_MOUNT_UNMOUNTING; ++ ++ xfs_ail_push_all_sync(mp->m_ail); ++ cancel_delayed_work_sync(&mp->m_reclaim_work); ++ xfs_reclaim_inodes(mp); ++ xfs_health_unmount(mp); ++} ++ ++/* + * This function does the following on an initial mount of a file system: + * - reads the superblock from disk and init the mount struct + * - if we're a 32-bit kernel, do a size check on the superblock +@@ -1005,7 +1046,7 @@ xfs_mountfs( + /* Clean out dquots that might be in memory after quotacheck. */ + xfs_qm_unmount(mp); + /* +- * Cancel all delayed reclaim work and reclaim the inodes directly. ++ * Flush all inode reclamation work and flush the log. + * We have to do this /after/ rtunmount and qm_unmount because those + * two will have scheduled delayed reclaim for the rt/quota inodes. + * +@@ -1015,11 +1056,8 @@ xfs_mountfs( + * qm_unmount_quotas and therefore rely on qm_unmount to release the + * quota inodes. 
+ */ +- cancel_delayed_work_sync(&mp->m_reclaim_work); +- xfs_reclaim_inodes(mp); +- xfs_health_unmount(mp); ++ xfs_unmount_flush_inodes(mp); + out_log_dealloc: +- mp->m_flags |= XFS_MOUNT_UNMOUNTING; + xfs_log_mount_cancel(mp); + out_fail_wait: + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) +@@ -1060,47 +1098,7 @@ xfs_unmountfs( + xfs_rtunmount_inodes(mp); + xfs_irele(mp->m_rootip); + +- /* +- * We can potentially deadlock here if we have an inode cluster +- * that has been freed has its buffer still pinned in memory because +- * the transaction is still sitting in a iclog. The stale inodes +- * on that buffer will be pinned to the buffer until the +- * transaction hits the disk and the callbacks run. Pushing the AIL will +- * skip the stale inodes and may never see the pinned buffer, so +- * nothing will push out the iclog and unpin the buffer. Hence we +- * need to force the log here to ensure all items are flushed into the +- * AIL before we go any further. +- */ +- xfs_log_force(mp, XFS_LOG_SYNC); +- +- /* +- * Wait for all busy extents to be freed, including completion of +- * any discard operation. +- */ +- xfs_extent_busy_wait_all(mp); +- flush_workqueue(xfs_discard_wq); +- +- /* +- * We now need to tell the world we are unmounting. This will allow +- * us to detect that the filesystem is going away and we should error +- * out anything that we have been retrying in the background. This will +- * prevent neverending retries in AIL pushing from hanging the unmount. +- */ +- mp->m_flags |= XFS_MOUNT_UNMOUNTING; +- +- /* +- * Flush all pending changes from the AIL. +- */ +- xfs_ail_push_all_sync(mp->m_ail); +- +- /* +- * Reclaim all inodes. At this point there should be no dirty inodes and +- * none should be pinned or locked. Stop background inode reclaim here +- * if it is still running. 
+- */ +- cancel_delayed_work_sync(&mp->m_reclaim_work); +- xfs_reclaim_inodes(mp); +- xfs_health_unmount(mp); ++ xfs_unmount_flush_inodes(mp); + + xfs_qm_unmount(mp); + diff --git a/queue-5.10/xfs-restore-shutdown-check-in-mapped-write-fault-path.patch b/queue-5.10/xfs-restore-shutdown-check-in-mapped-write-fault-path.patch new file mode 100644 index 00000000000..b828c2e1a9a --- /dev/null +++ b/queue-5.10/xfs-restore-shutdown-check-in-mapped-write-fault-path.patch @@ -0,0 +1,53 @@ +From foo@baz Mon Jun 6 07:00:47 PM CEST 2022 +From: Amir Goldstein +Date: Mon, 6 Jun 2022 17:32:52 +0300 +Subject: xfs: restore shutdown check in mapped write fault path +To: Greg Kroah-Hartman +Cc: Sasha Levin , Dave Chinner , "Darrick J . Wong" , Christoph Hellwig , Brian Foster , Christian Brauner , Luis Chamberlain , Leah Rumancik , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org, Eric Sandeen +Message-ID: <20220606143255.685988-6-amir73il@gmail.com> + +From: Brian Foster + +commit e4826691cc7e5458bcb659935d0092bcf3f08c20 upstream. + +XFS triggers an iomap warning in the write fault path due to a +!PageUptodate() page if a write fault happens to occur on a page +that recently failed writeback. The iomap writeback error handling +code can clear the Uptodate flag if no portion of the page is +submitted for I/O. This is reproduced by fstest generic/019, which +combines various forms of I/O with simulated disk failures that +inevitably lead to filesystem shutdown (which then unconditionally +fails page writeback). + +This is a regression introduced by commit f150b4234397 ("xfs: split +the iomap ops for buffered vs direct writes") due to the removal of +a shutdown check and explicit error return in the ->iomap_begin() +path used by the write fault path. The explicit error return +historically translated to a SIGBUS, but now carries on with iomap +processing where it complains about the unexpected state. 
Restore +the shutdown check to xfs_buffered_write_iomap_begin() to restore +historical behavior. + +Fixes: f150b4234397 ("xfs: split the iomap ops for buffered vs direct writes") +Signed-off-by: Brian Foster +Reviewed-by: Eric Sandeen +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_iomap.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -870,6 +870,9 @@ xfs_buffered_write_iomap_begin( + int allocfork = XFS_DATA_FORK; + int error = 0; + ++ if (XFS_FORCED_SHUTDOWN(mp)) ++ return -EIO; ++ + /* we can't use delayed allocations when using extent size hints */ + if (xfs_get_extsz_hint(ip)) + return xfs_direct_write_iomap_begin(inode, offset, count, diff --git a/queue-5.10/xfs-set-inode-size-after-creating-symlink.patch b/queue-5.10/xfs-set-inode-size-after-creating-symlink.patch new file mode 100644 index 00000000000..b59ed4b7c7b --- /dev/null +++ b/queue-5.10/xfs-set-inode-size-after-creating-symlink.patch @@ -0,0 +1,43 @@ +From foo@baz Mon Jun 6 07:00:47 PM CEST 2022 +From: Amir Goldstein +Date: Mon, 6 Jun 2022 17:32:48 +0300 +Subject: xfs: set inode size after creating symlink +To: Greg Kroah-Hartman +Cc: Sasha Levin , Dave Chinner , "Darrick J . Wong" , Christoph Hellwig , Brian Foster , Christian Brauner , Luis Chamberlain , Leah Rumancik , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org, Jeffrey Mitchell +Message-ID: <20220606143255.685988-2-amir73il@gmail.com> + +From: Jeffrey Mitchell + +commit 8aa921a95335d0a8c8e2be35a44467e7c91ec3e4 upstream. + +When XFS creates a new symlink, it writes its size to disk but not to the +VFS inode. This causes i_size_read() to return 0 for that symlink until +it is re-read from disk, for example when the system is rebooted. + +I found this inconsistency while protecting directories with eCryptFS. 
+The command "stat path/to/symlink/in/ecryptfs" will report "Size: 0" if +the symlink was created after the last reboot on an XFS root. + +Call i_size_write() in xfs_symlink() + +Signed-off-by: Jeffrey Mitchell +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Reviewed-by: Brian Foster +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_symlink.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/xfs/xfs_symlink.c ++++ b/fs/xfs/xfs_symlink.c +@@ -300,6 +300,7 @@ xfs_symlink( + } + ASSERT(pathlen == 0); + } ++ i_size_write(VFS_I(ip), ip->i_d.di_size); + + /* + * Create the directory entry for the symlink. diff --git a/queue-5.10/xfs-sync-lazy-sb-accounting-on-quiesce-of-read-only-mounts.patch b/queue-5.10/xfs-sync-lazy-sb-accounting-on-quiesce-of-read-only-mounts.patch new file mode 100644 index 00000000000..5ccffd28ae5 --- /dev/null +++ b/queue-5.10/xfs-sync-lazy-sb-accounting-on-quiesce-of-read-only-mounts.patch @@ -0,0 +1,124 @@ +From foo@baz Mon Jun 6 07:00:47 PM CEST 2022 +From: Amir Goldstein +Date: Mon, 6 Jun 2022 17:32:49 +0300 +Subject: xfs: sync lazy sb accounting on quiesce of read-only mounts +To: Greg Kroah-Hartman +Cc: Sasha Levin , Dave Chinner , "Darrick J . Wong" , Christoph Hellwig , Brian Foster , Christian Brauner , Luis Chamberlain , Leah Rumancik , Adam Manzanares , linux-xfs@vger.kernel.org, stable@vger.kernel.org, Gao Xiang , Allison Henderson , "Darrick J . Wong" , Bill O'Donnell +Message-ID: <20220606143255.685988-3-amir73il@gmail.com> + +From: Brian Foster + +commit 50d25484bebe94320c49dd1347d3330c7063bbdb upstream. + +xfs_log_sbcount() syncs the superblock specifically to accumulate +the in-core percpu superblock counters and commit them to disk. 
This +is required to maintain filesystem consistency across quiesce +(freeze, read-only mount/remount) or unmount when lazy superblock +accounting is enabled because individual transactions do not update +the superblock directly. + +This mechanism works as expected for writable mounts, but +xfs_log_sbcount() skips the update for read-only mounts. Read-only +mounts otherwise still allow log recovery and write out an unmount +record during log quiesce. If a read-only mount performs log +recovery, it can modify the in-core superblock counters and write an +unmount record when the filesystem unmounts without ever syncing the +in-core counters. This leaves the filesystem with a clean log but in +an inconsistent state with regard to lazy sb counters. + +Update xfs_log_sbcount() to use the same logic +xfs_log_unmount_write() uses to determine when to write an unmount +record. This ensures that lazy accounting is always synced before +the log is cleaned. Refactor this logic into a new helper to +distinguish between a writable filesystem and a writable log. +Specifically, the log is writable unless the filesystem is mounted +with the norecovery mount option, the underlying log device is +read-only, or the filesystem is shutdown. Drop the freeze state +check because the update is already allowed during the freezing +process and no context calls this function on an already frozen fs. +Also, retain the shutdown check in xfs_log_unmount_write() to catch +the case where the preceding log force might have triggered a +shutdown. + +Signed-off-by: Brian Foster +Reviewed-by: Gao Xiang +Reviewed-by: Allison Henderson +Reviewed-by: Darrick J. Wong +Reviewed-by: Bill O'Donnell +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Amir Goldstein +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log.c | 28 ++++++++++++++++++++-------- + fs/xfs/xfs_log.h | 1 + + fs/xfs/xfs_mount.c | 3 +-- + 3 files changed, 22 insertions(+), 10 deletions(-) + +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -347,6 +347,25 @@ xlog_tic_add_region(xlog_ticket_t *tic, + tic->t_res_num++; + } + ++bool ++xfs_log_writable( ++ struct xfs_mount *mp) ++{ ++ /* ++ * Never write to the log on norecovery mounts, if the block device is ++ * read-only, or if the filesystem is shutdown. Read-only mounts still ++ * allow internal writes for log recovery and unmount purposes, so don't ++ * restrict that case here. ++ */ ++ if (mp->m_flags & XFS_MOUNT_NORECOVERY) ++ return false; ++ if (xfs_readonly_buftarg(mp->m_log->l_targ)) ++ return false; ++ if (XFS_FORCED_SHUTDOWN(mp)) ++ return false; ++ return true; ++} ++ + /* + * Replenish the byte reservation required by moving the grant write head. + */ +@@ -886,15 +905,8 @@ xfs_log_unmount_write( + { + struct xlog *log = mp->m_log; + +- /* +- * Don't write out unmount record on norecovery mounts or ro devices. +- * Or, if we are doing a forced umount (typically because of IO errors). 
+- */ +- if (mp->m_flags & XFS_MOUNT_NORECOVERY || +- xfs_readonly_buftarg(log->l_targ)) { +- ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); ++ if (!xfs_log_writable(mp)) + return; +- } + + xfs_log_force(mp, XFS_LOG_SYNC); + +--- a/fs/xfs/xfs_log.h ++++ b/fs/xfs/xfs_log.h +@@ -127,6 +127,7 @@ int xfs_log_reserve(struct xfs_mount * + int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); + void xfs_log_unmount(struct xfs_mount *mp); + int xfs_log_force_umount(struct xfs_mount *mp, int logerror); ++bool xfs_log_writable(struct xfs_mount *mp); + + struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); + void xfs_log_ticket_put(struct xlog_ticket *ticket); +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -1176,8 +1176,7 @@ xfs_fs_writable( + int + xfs_log_sbcount(xfs_mount_t *mp) + { +- /* allow this to proceed during the freeze sequence... */ +- if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) ++ if (!xfs_log_writable(mp)) + return 0; + + /* -- 2.47.3