From: Greg Kroah-Hartman Date: Fri, 11 Nov 2022 09:04:47 +0000 (+0100) Subject: 5.4-stable patches X-Git-Tag: v5.10.155~56 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=3ad4b0cdf8551d0d352c9fcbc83cf7ce254f2359;p=thirdparty%2Fkernel%2Fstable-queue.git 5.4-stable patches added patches: xfs-drain-the-buf-delwri-queue-before-xfsaild-idles.patch xfs-preserve-inode-versioning-across-remounts.patch xfs-preserve-rmapbt-swapext-block-reservation-from-freed-blocks.patch xfs-redesign-the-reflink-remap-loop-to-fix-blkres-depletion-crash.patch xfs-rename-xfs_bmap_is_real_extent-to-is_written_extent.patch xfs-use-mmaplock-around-filemap_map_pages.patch --- diff --git a/queue-5.10/series b/queue-5.10/series new file mode 100644 index 00000000000..e69de29bb2d diff --git a/queue-5.4/series b/queue-5.4/series new file mode 100644 index 00000000000..25176f2fb29 --- /dev/null +++ b/queue-5.4/series @@ -0,0 +1,6 @@ +xfs-preserve-rmapbt-swapext-block-reservation-from-freed-blocks.patch +xfs-rename-xfs_bmap_is_real_extent-to-is_written_extent.patch +xfs-redesign-the-reflink-remap-loop-to-fix-blkres-depletion-crash.patch +xfs-use-mmaplock-around-filemap_map_pages.patch +xfs-preserve-inode-versioning-across-remounts.patch +xfs-drain-the-buf-delwri-queue-before-xfsaild-idles.patch diff --git a/queue-5.4/xfs-drain-the-buf-delwri-queue-before-xfsaild-idles.patch b/queue-5.4/xfs-drain-the-buf-delwri-queue-before-xfsaild-idles.patch new file mode 100644 index 00000000000..169694614ac --- /dev/null +++ b/queue-5.4/xfs-drain-the-buf-delwri-queue-before-xfsaild-idles.patch @@ -0,0 +1,101 @@ +From stable-owner@vger.kernel.org Fri Nov 11 05:11:27 2022 +From: Chandan Babu R +Date: Fri, 11 Nov 2022 09:40:25 +0530 +Subject: xfs: drain the buf delwri queue before xfsaild idles +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20221111041025.87704-7-chandan.babu@oracle.com> + +From: Brian Foster + +commit f376b45e861d8b7b34bf0eceeecfdd00dbe65cde upstream. + +xfsaild is racy with respect to transaction abort and shutdown in +that the task can idle or exit with an empty AIL but buffers still +on the delwri queue. This was partly addressed by cancelling the +delwri queue before the task exits to prevent memory leaks, but it's +also possible for xfsaild to empty and idle with buffers on the +delwri queue. For example, a transaction that pins a buffer that +also happens to sit on the AIL delwri queue will explicitly remove +the associated log item from the AIL if the transaction aborts. The +side effect of this is an unmount hang in xfs_wait_buftarg() as the +associated buffers remain held by the delwri queue indefinitely. +This is reproduced on repeated runs of generic/531 with an fs format +(-mrmapbt=1 -bsize=1k) that happens to also reproduce transaction +aborts. + +Update xfsaild to not idle until both the AIL and associated delwri +queue are empty and update the push code to continue delwri queue +submission attempts even when the AIL is empty. This allows the AIL +to eventually release aborted buffers stranded on the delwri queue +when they are unlocked by the associated transaction. This should +have no significant effect on normal runtime behavior because the +xfsaild currently idles only when the AIL is empty and in practice +the AIL is rarely empty with a populated delwri queue. 
The items
+must be AIL resident to land in the queue in the first place and
+generally aren't removed until writeback completes.
+
+Note that the pre-existing delwri queue cancel logic in the exit
+path is retained because task stop is external, could technically
+come at any point, and xfsaild is still responsible for releasing its
+buffer references before it exits.
+
+Signed-off-by: Brian Foster
+Reviewed-by: Christoph Hellwig
+Reviewed-by: Darrick J. Wong
+Signed-off-by: Darrick J. Wong
+Acked-by: Darrick J. Wong
+Signed-off-by: Chandan Babu R
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/xfs/xfs_trans_ail.c |   16 ++++++----------
+ 1 file changed, 6 insertions(+), 10 deletions(-)
+
+--- a/fs/xfs/xfs_trans_ail.c
++++ b/fs/xfs/xfs_trans_ail.c
+@@ -402,16 +402,10 @@ xfsaild_push(
+ 	target = ailp->ail_target;
+ 	ailp->ail_target_prev = target;
+ 
++	/* we're done if the AIL is empty or our push has reached the end */
+ 	lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn);
+-	if (!lip) {
+-		/*
+-		 * If the AIL is empty or our push has reached the end we are
+-		 * done now.
+-		 */
+-		xfs_trans_ail_cursor_done(&cur);
+-		spin_unlock(&ailp->ail_lock);
++	if (!lip)
+ 		goto out_done;
+-	}
+ 
+ 	XFS_STATS_INC(mp, xs_push_ail);
+ 
+@@ -493,6 +487,8 @@ xfsaild_push(
+ 			break;
+ 		lsn = lip->li_lsn;
+ 	}
++
++out_done:
+ 	xfs_trans_ail_cursor_done(&cur);
+ 	spin_unlock(&ailp->ail_lock);
+ 
+@@ -500,7 +496,6 @@ xfsaild_push(
+ 		ailp->ail_log_flush++;
+ 
+ 	if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
+-out_done:
+ 		/*
+ 		 * We reached the target or the AIL is empty, so wait a bit
+ 		 * longer for I/O to complete and remove pushed items from the
+@@ -592,7 +587,8 @@ xfsaild(
+ 		 */
+ 		smp_rmb();
+ 		if (!xfs_ail_min(ailp) &&
+-		    ailp->ail_target == ailp->ail_target_prev) {
++		    ailp->ail_target == ailp->ail_target_prev &&
++		    list_empty(&ailp->ail_buf_list)) {
+ 			spin_unlock(&ailp->ail_lock);
+ 			freezable_schedule();
+ 			tout = 0;
diff --git a/queue-5.4/xfs-preserve-inode-versioning-across-remounts.patch b/queue-5.4/xfs-preserve-inode-versioning-across-remounts.patch
new file mode 100644
index 00000000000..11e687b3acf
--- /dev/null
+++ b/queue-5.4/xfs-preserve-inode-versioning-across-remounts.patch
@@ -0,0 +1,57 @@
+From stable-owner@vger.kernel.org Fri Nov 11 05:14:16 2022
+From: Chandan Babu R
+Date: Fri, 11 Nov 2022 09:40:24 +0530
+Subject: xfs: preserve inode versioning across remounts
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20221111041025.87704-6-chandan.babu@oracle.com>
+
+From: Eric Sandeen
+
+commit 4750a171c3290f9bbebca16c6372db723a4cfa3b upstream.
+
+[ For 5.4.y, SB_I_VERSION should be set in xfs_fs_remount() ]
+
+The MS_I_VERSION mount flag is exposed via the VFS, as documented
+in the mount manpages etc; see the iversion and noiversion mount
+options in mount(8).
+
+As a result, mount -o remount looks for this option in /proc/mounts
+and will only send the I_VERSION flag back in during remount if it
+is present. Since it's not there, a remount will /remove/ the
+I_VERSION flag at the vfs level, and iversion functionality is lost.
+
+xfs v5 superblocks intend to always have i_version enabled; it is
+set as a default at mount time, but is lost during remount for the
+reasons above.
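+
+To make the flag loss concrete, here is a minimal user-space model of
+the flag reconstruction described above (illustrative only; the
+function name and the option handling are invented, not kernel code):
+
+	#include <string.h>
+
+	#define SB_I_VERSION	0x1u
+
+	/*
+	 * mount(8) re-sends only the options it could read back from
+	 * /proc/mounts. Since "iversion" is never shown there, the
+	 * recomputed flag word comes back without SB_I_VERSION and the
+	 * remount clears it. (A real parser would also have to handle
+	 * "noiversion"; omitted for brevity.)
+	 */
+	static unsigned int remount_sb_flags(const char *opts)
+	{
+		unsigned int flags = 0;
+
+		if (strstr(opts, "iversion") != NULL)
+			flags |= SB_I_VERSION;
+		return flags;	/* "iversion" absent: the bit is dropped */
+	}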
+
+The generic fix would be to expose this documented option in
+/proc/mounts, but since that was rejected, fix it up again in the
+xfs remount path instead, so that at least xfs won't suffer from
+this misbehavior.
+
+Signed-off-by: Eric Sandeen
+Reviewed-by: Dave Chinner
+Reviewed-by: Christoph Hellwig
+Signed-off-by: Darrick J. Wong
+Reviewed-by: Darrick J. Wong
+Acked-by: Darrick J. Wong
+Signed-off-by: Chandan Babu R
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/xfs/xfs_super.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1228,6 +1228,10 @@ xfs_fs_remount(
+ 	char			*p;
+ 	int			error;
+ 
++	/* version 5 superblocks always support version counters. */
++	if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
++		*flags |= SB_I_VERSION;
++
+ 	/* First, check for complete junk; i.e. invalid options */
+ 	error = xfs_test_remount_options(sb, options);
+ 	if (error)
diff --git a/queue-5.4/xfs-preserve-rmapbt-swapext-block-reservation-from-freed-blocks.patch b/queue-5.4/xfs-preserve-rmapbt-swapext-block-reservation-from-freed-blocks.patch
new file mode 100644
index 00000000000..20470aea0e1
--- /dev/null
+++ b/queue-5.4/xfs-preserve-rmapbt-swapext-block-reservation-from-freed-blocks.patch
@@ -0,0 +1,144 @@
+From stable-owner@vger.kernel.org Fri Nov 11 05:14:16 2022
+From: Chandan Babu R
+Date: Fri, 11 Nov 2022 09:40:20 +0530
+Subject: xfs: preserve rmapbt swapext block reservation from freed blocks
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20221111041025.87704-2-chandan.babu@oracle.com>
+
+From: Brian Foster
+
+commit f74681ba2006434be195402e0b15fc5763cddd7e upstream.
+
+[Slightly modify xfs_trans_alloc() to fix a merge conflict due to missing
+ "atomic_inc(&mp->m_active_trans)" statement in v5.9 kernel]
+
+The rmapbt extent swap algorithm remaps individual extents between
+the source inode and the target to trigger reverse mapping metadata
+updates. If either inode straddles a format or other bmap allocation
+boundary, the individual unmap and map cycles can trigger repeated
+bmap block allocations and frees as the extent count bounces back
+and forth across the boundary. While net block usage is bound across
+the swap operation, this behavior can prematurely exhaust the
+transaction block reservation because it continuously drains as the
+transaction rolls. Each allocation accounts against the reservation
+and each free returns to global free space on transaction roll.
+
+The previous workaround to this problem attempted to detect this
+boundary condition and provide surplus block reservation to
+accommodate it. This is insufficient because more remaps can occur
+than the extent counts imply, for example when start offset
+boundaries are not aligned between the two inodes.
+
+To address this problem more generically and dynamically, add a
+transaction accounting mode that returns freed blocks to the
+transaction reservation instead of the superblock counters on
+transaction roll and use it when the rmapbt based algorithm is
+active. This allows the chain of remap transactions to preserve the
+block reservation based on its own frees and prevent premature
+exhaustion regardless of the remap pattern. Note that this is only
+safe for superblocks with lazy sb accounting, but the latter is
+required for v5 supers and the rmap feature depends on v5.
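+
+The accounting mode boils down to a small change in how a positive
+free-block delta is applied on transaction roll. As a hedged,
+self-contained sketch (simplified stand-in types and names, not the
+kernel code in the diff below):
+
+	#include <stdint.h>
+
+	#define TRANS_RES_FDBLKS	0x80u	/* stand-in for XFS_TRANS_RES_FDBLKS */
+
+	struct trans {
+		unsigned int	flags;
+		unsigned int	blk_res;	/* this transaction's block reservation */
+		int64_t		fdblocks_delta;	/* pending global free-space delta */
+	};
+
+	/* delta > 0 means blocks were freed; delta < 0 means allocated */
+	static void trans_mod_fdblocks(struct trans *tp, int64_t delta)
+	{
+		if (delta > 0 && (tp->flags & TRANS_RES_FDBLKS)) {
+			/* refill the reservation, clamped so it cannot overflow */
+			int64_t take = delta;
+
+			if (take > (int64_t)(UINT32_MAX - tp->blk_res))
+				take = UINT32_MAX - tp->blk_res;
+			tp->blk_res += (unsigned int)take;
+			delta -= take;
+		}
+		/* whatever is left goes back to the global free space pool */
+		tp->fdblocks_delta += delta;
+	}
+
+Repeated free/alloc cycles across transaction rolls then consume and
+refill the same reservation instead of steadily draining it.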
+ +Fixes: b3fed434822d0 ("xfs: account format bouncing into rmapbt swapext tx reservation") +Root-caused-by: Darrick J. Wong +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Acked-by: Darrick J. Wong +Signed-off-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_shared.h | 1 + + fs/xfs/xfs_bmap_util.c | 18 +++++++++--------- + fs/xfs/xfs_trans.c | 19 ++++++++++++++++++- + 3 files changed, 28 insertions(+), 10 deletions(-) + +--- a/fs/xfs/libxfs/xfs_shared.h ++++ b/fs/xfs/libxfs/xfs_shared.h +@@ -65,6 +65,7 @@ void xfs_log_get_max_trans_res(struct xf + #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ + #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ + #define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ ++#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */ + /* + * LOWMODE is used by the allocator to activate the lowspace algorithm - when + * free space is running low the extent allocator may choose to allocate an +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -1740,6 +1740,7 @@ xfs_swap_extents( + int lock_flags; + uint64_t f; + int resblks = 0; ++ unsigned int flags = 0; + + /* + * Lock the inodes against other IO, page faults and truncate to +@@ -1795,17 +1796,16 @@ xfs_swap_extents( + resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w); + + /* +- * Handle the corner case where either inode might straddle the +- * btree format boundary. If so, the inode could bounce between +- * btree <-> extent format on unmap -> remap cycles, freeing and +- * allocating a bmapbt block each time. ++ * If either inode straddles a bmapbt block allocation boundary, ++ * the rmapbt algorithm triggers repeated allocs and frees as ++ * extents are remapped. This can exhaust the block reservation ++ * prematurely and cause shutdown. Return freed blocks to the ++ * transaction reservation to counter this behavior. 
+ */ +- if (ipnext == (XFS_IFORK_MAXEXT(ip, w) + 1)) +- resblks += XFS_IFORK_MAXEXT(ip, w); +- if (tipnext == (XFS_IFORK_MAXEXT(tip, w) + 1)) +- resblks += XFS_IFORK_MAXEXT(tip, w); ++ flags |= XFS_TRANS_RES_FDBLKS; + } +- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); ++ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags, ++ &tp); + if (error) + goto out_unlock; + +--- a/fs/xfs/xfs_trans.c ++++ b/fs/xfs/xfs_trans.c +@@ -107,7 +107,8 @@ xfs_trans_dup( + + ntp->t_flags = XFS_TRANS_PERM_LOG_RES | + (tp->t_flags & XFS_TRANS_RESERVE) | +- (tp->t_flags & XFS_TRANS_NO_WRITECOUNT); ++ (tp->t_flags & XFS_TRANS_NO_WRITECOUNT) | ++ (tp->t_flags & XFS_TRANS_RES_FDBLKS); + /* We gave our writer reference to the new transaction */ + tp->t_flags |= XFS_TRANS_NO_WRITECOUNT; + ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); +@@ -273,6 +274,8 @@ xfs_trans_alloc( + */ + WARN_ON(resp->tr_logres > 0 && + mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); ++ ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || ++ xfs_sb_version_haslazysbcount(&mp->m_sb)); + atomic_inc(&mp->m_active_trans); + + tp->t_magic = XFS_TRANS_HEADER_MAGIC; +@@ -368,6 +371,20 @@ xfs_trans_mod_sb( + tp->t_blk_res_used += (uint)-delta; + if (tp->t_blk_res_used > tp->t_blk_res) + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); ++ } else if (delta > 0 && (tp->t_flags & XFS_TRANS_RES_FDBLKS)) { ++ int64_t blkres_delta; ++ ++ /* ++ * Return freed blocks directly to the reservation ++ * instead of the global pool, being careful not to ++ * overflow the trans counter. This is used to preserve ++ * reservation across chains of transaction rolls that ++ * repeatedly free and allocate blocks. ++ */ ++ blkres_delta = min_t(int64_t, delta, ++ UINT_MAX - tp->t_blk_res); ++ tp->t_blk_res += blkres_delta; ++ delta -= blkres_delta; + } + tp->t_fdblocks_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) diff --git a/queue-5.4/xfs-redesign-the-reflink-remap-loop-to-fix-blkres-depletion-crash.patch b/queue-5.4/xfs-redesign-the-reflink-remap-loop-to-fix-blkres-depletion-crash.patch new file mode 100644 index 00000000000..e0312394564 --- /dev/null +++ b/queue-5.4/xfs-redesign-the-reflink-remap-loop-to-fix-blkres-depletion-crash.patch @@ -0,0 +1,473 @@ +From stable-owner@vger.kernel.org Fri Nov 11 05:11:12 2022 +From: Chandan Babu R +Date: Fri, 11 Nov 2022 09:40:22 +0530 +Subject: xfs: redesign the reflink remap loop to fix blkres depletion crash +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20221111041025.87704-4-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit 00fd1d56dd08a8ceaa9e4ee1a41fefd9f6c6bc7d upstream. + +The existing reflink remapping loop has some structural problems that +need addressing: + +The biggest problem is that we create one transaction for each extent in +the source file without accounting for the number of mappings there are +for the same range in the destination file. In other words, we don't +know the number of remap operations that will be necessary and we +therefore cannot guess the block reservation required. On highly +fragmented filesystems (e.g. ones with active dedupe) we guess wrong, +run out of block reservation, and fail. 
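+
+(As a concrete illustration: if a single source extent is shared by,
+say, a hundred separate mappings in a heavily deduped destination
+file, one remap request must roll through on the order of a hundred
+unmap/map cycles against a reservation that was sized for a single
+extent's worth of bmap work.)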
+ +The second problem is that we don't actually use the bmap intents to +their full potential -- instead of calling bunmapi directly and having +to deal with its backwards operation, we could call the deferred ops +xfs_bmap_unmap_extent and xfs_refcount_decrease_extent instead. This +makes the frontend loop much simpler. + +Solve all of these problems by refactoring the remapping loops so that +we only perform one remapping operation per transaction, and each +operation only tries to remap a single extent from source to dest. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Reported-by: Edwin Török +Tested-by: Edwin Török +Acked-by: Darrick J. Wong +Signed-off-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.h | 13 +- + fs/xfs/xfs_reflink.c | 242 +++++++++++++++++++++++++---------------------- + fs/xfs/xfs_trace.h | 52 ---------- + 3 files changed, 143 insertions(+), 164 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.h ++++ b/fs/xfs/libxfs/xfs_bmap.h +@@ -158,6 +158,13 @@ static inline int xfs_bmapi_whichfork(in + { BMAP_ATTRFORK, "ATTR" }, \ + { BMAP_COWFORK, "COW" } + ++/* Return true if the extent is an allocated extent, written or not. */ ++static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) ++{ ++ return irec->br_startblock != HOLESTARTBLOCK && ++ irec->br_startblock != DELAYSTARTBLOCK && ++ !isnullstartblock(irec->br_startblock); ++} + + /* + * Return true if the extent is a real, allocated extent, or false if it is a +@@ -165,10 +172,8 @@ static inline int xfs_bmapi_whichfork(in + */ + static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec) + { +- return irec->br_state != XFS_EXT_UNWRITTEN && +- irec->br_startblock != HOLESTARTBLOCK && +- irec->br_startblock != DELAYSTARTBLOCK && +- !isnullstartblock(irec->br_startblock); ++ return xfs_bmap_is_real_extent(irec) && ++ irec->br_state != XFS_EXT_UNWRITTEN; + } + + /* +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -986,41 +986,28 @@ xfs_reflink_ag_has_free_space( + } + + /* +- * Unmap a range of blocks from a file, then map other blocks into the hole. +- * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount). +- * The extent irec is mapped into dest at irec->br_startoff. ++ * Remap the given extent into the file. The dmap blockcount will be set to ++ * the number of blocks that were actually remapped. 
+ */ + STATIC int + xfs_reflink_remap_extent( + struct xfs_inode *ip, +- struct xfs_bmbt_irec *irec, +- xfs_fileoff_t destoff, ++ struct xfs_bmbt_irec *dmap, + xfs_off_t new_isize) + { ++ struct xfs_bmbt_irec smap; + struct xfs_mount *mp = ip->i_mount; +- bool real_extent = xfs_bmap_is_written_extent(irec); + struct xfs_trans *tp; +- unsigned int resblks; +- struct xfs_bmbt_irec uirec; +- xfs_filblks_t rlen; +- xfs_filblks_t unmap_len; + xfs_off_t newlen; +- int64_t qres; ++ int64_t qres, qdelta; ++ unsigned int resblks; ++ bool smap_real; ++ bool dmap_written = xfs_bmap_is_written_extent(dmap); ++ int nimaps; + int error; + +- unmap_len = irec->br_startoff + irec->br_blockcount - destoff; +- trace_xfs_reflink_punch_range(ip, destoff, unmap_len); +- +- /* No reflinking if we're low on space */ +- if (real_extent) { +- error = xfs_reflink_ag_has_free_space(mp, +- XFS_FSB_TO_AGNO(mp, irec->br_startblock)); +- if (error) +- goto out; +- } +- + /* Start a rolling transaction to switch the mappings */ +- resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); ++ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + if (error) + goto out; +@@ -1029,92 +1016,121 @@ xfs_reflink_remap_extent( + xfs_trans_ijoin(tp, ip, 0); + + /* +- * Reserve quota for this operation. We don't know if the first unmap +- * in the dest file will cause a bmap btree split, so we always reserve +- * at least enough blocks for that split. If the extent being mapped +- * in is written, we need to reserve quota for that too. ++ * Read what's currently mapped in the destination file into smap. ++ * If smap isn't a hole, we will have to remove it before we can add ++ * dmap to the destination file. + */ +- qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); +- if (real_extent) +- qres += irec->br_blockcount; +- error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, +- XFS_QMOPT_RES_REGBLKS); ++ nimaps = 1; ++ error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount, ++ &smap, &nimaps, 0); + if (error) + goto out_cancel; ++ ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff); ++ smap_real = xfs_bmap_is_real_extent(&smap); + +- trace_xfs_reflink_remap(ip, irec->br_startoff, +- irec->br_blockcount, irec->br_startblock); ++ /* ++ * We can only remap as many blocks as the smaller of the two extent ++ * maps, because we can only remap one extent at a time. ++ */ ++ dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount); ++ ASSERT(dmap->br_blockcount == smap.br_blockcount); ++ ++ trace_xfs_reflink_remap_extent_dest(ip, &smap); + +- /* Unmap the old blocks in the data fork. */ +- rlen = unmap_len; +- while (rlen) { +- ASSERT(tp->t_firstblock == NULLFSBLOCK); +- error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1); ++ /* No reflinking if the AG of the dest mapping is low on space. */ ++ if (dmap_written) { ++ error = xfs_reflink_ag_has_free_space(mp, ++ XFS_FSB_TO_AGNO(mp, dmap->br_startblock)); + if (error) + goto out_cancel; ++ } + ++ /* ++ * Compute quota reservation if we think the quota block counter for ++ * this file could increase. ++ * ++ * We start by reserving enough blocks to handle a bmbt split. ++ * ++ * If we are mapping a written extent into the file, we need to have ++ * enough quota block count reservation to handle the blocks in that ++ * extent. 
++ * ++ * Note that if we're replacing a delalloc reservation with a written ++ * extent, we have to take the full quota reservation because removing ++ * the delalloc reservation gives the block count back to the quota ++ * count. This is suboptimal, but the VFS flushed the dest range ++ * before we started. That should have removed all the delalloc ++ * reservations, but we code defensively. ++ */ ++ qdelta = 0; ++ qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); ++ if (dmap_written) ++ qres += dmap->br_blockcount; ++ error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0, ++ XFS_QMOPT_RES_REGBLKS); ++ if (error) ++ goto out_cancel; ++ ++ if (smap_real) { + /* +- * Trim the extent to whatever got unmapped. +- * Remember, bunmapi works backwards. ++ * If the extent we're unmapping is backed by storage (written ++ * or not), unmap the extent and drop its refcount. + */ +- uirec.br_startblock = irec->br_startblock + rlen; +- uirec.br_startoff = irec->br_startoff + rlen; +- uirec.br_blockcount = unmap_len - rlen; +- uirec.br_state = irec->br_state; +- unmap_len = rlen; +- +- /* If this isn't a real mapping, we're done. */ +- if (!real_extent || uirec.br_blockcount == 0) +- goto next_extent; +- +- trace_xfs_reflink_remap(ip, uirec.br_startoff, +- uirec.br_blockcount, uirec.br_startblock); +- +- /* Update the refcount tree */ +- xfs_refcount_increase_extent(tp, &uirec); +- +- /* Map the new blocks into the data fork. */ +- xfs_bmap_map_extent(tp, ip, &uirec); +- +- /* Update quota accounting. */ +- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, +- uirec.br_blockcount); +- +- /* Update dest isize if needed. */ +- newlen = XFS_FSB_TO_B(mp, +- uirec.br_startoff + uirec.br_blockcount); +- newlen = min_t(xfs_off_t, newlen, new_isize); +- if (newlen > i_size_read(VFS_I(ip))) { +- trace_xfs_reflink_update_inode_size(ip, newlen); +- i_size_write(VFS_I(ip), newlen); +- ip->i_d.di_size = newlen; +- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +- } ++ xfs_bmap_unmap_extent(tp, ip, &smap); ++ xfs_refcount_decrease_extent(tp, &smap); ++ qdelta -= smap.br_blockcount; ++ } else if (smap.br_startblock == DELAYSTARTBLOCK) { ++ xfs_filblks_t len = smap.br_blockcount; + +-next_extent: +- /* Process all the deferred stuff. */ +- error = xfs_defer_finish(&tp); ++ /* ++ * If the extent we're unmapping is a delalloc reservation, ++ * we can use the regular bunmapi function to release the ++ * incore state. Dropping the delalloc reservation takes care ++ * of the quota reservation for us. ++ */ ++ error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1); + if (error) + goto out_cancel; ++ ASSERT(len == 0); ++ } ++ ++ /* ++ * If the extent we're sharing is backed by written storage, increase ++ * its refcount and map it into the file. ++ */ ++ if (dmap_written) { ++ xfs_refcount_increase_extent(tp, dmap); ++ xfs_bmap_map_extent(tp, ip, dmap); ++ qdelta += dmap->br_blockcount; ++ } ++ ++ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta); ++ ++ /* Update dest isize if needed. */ ++ newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount); ++ newlen = min_t(xfs_off_t, newlen, new_isize); ++ if (newlen > i_size_read(VFS_I(ip))) { ++ trace_xfs_reflink_update_inode_size(ip, newlen); ++ i_size_write(VFS_I(ip), newlen); ++ ip->i_d.di_size = newlen; ++ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } + ++ /* Commit everything and unlock. 
*/ + error = xfs_trans_commit(tp); +- xfs_iunlock(ip, XFS_ILOCK_EXCL); +- if (error) +- goto out; +- return 0; ++ goto out_unlock; + + out_cancel: + xfs_trans_cancel(tp); ++out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + out: +- trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); ++ if (error) ++ trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); + return error; + } + +-/* +- * Iteratively remap one file's extents (and holes) to another's. +- */ ++/* Remap a range of one file to the other. */ + int + xfs_reflink_remap_blocks( + struct xfs_inode *src, +@@ -1125,25 +1141,22 @@ xfs_reflink_remap_blocks( + loff_t *remapped) + { + struct xfs_bmbt_irec imap; +- xfs_fileoff_t srcoff; +- xfs_fileoff_t destoff; ++ struct xfs_mount *mp = src->i_mount; ++ xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in); ++ xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out); + xfs_filblks_t len; +- xfs_filblks_t range_len; + xfs_filblks_t remapped_len = 0; + xfs_off_t new_isize = pos_out + remap_len; + int nimaps; + int error = 0; + +- destoff = XFS_B_TO_FSBT(src->i_mount, pos_out); +- srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in); +- len = XFS_B_TO_FSB(src->i_mount, remap_len); +- +- /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ +- while (len) { +- uint lock_mode; ++ len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len), ++ XFS_MAX_FILEOFF); + +- trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, +- dest, destoff); ++ trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff); ++ ++ while (len > 0) { ++ unsigned int lock_mode; + + /* Read extent from the source file */ + nimaps = 1; +@@ -1152,18 +1165,25 @@ xfs_reflink_remap_blocks( + xfs_iunlock(src, lock_mode); + if (error) + break; +- ASSERT(nimaps == 1); ++ /* ++ * The caller supposedly flushed all dirty pages in the source ++ * file range, which means that writeback should have allocated ++ * or deleted all delalloc reservations in that range. If we ++ * find one, that's a good sign that something is seriously ++ * wrong here. ++ */ ++ ASSERT(nimaps == 1 && imap.br_startoff == srcoff); ++ if (imap.br_startblock == DELAYSTARTBLOCK) { ++ ASSERT(imap.br_startblock != DELAYSTARTBLOCK); ++ error = -EFSCORRUPTED; ++ break; ++ } + +- trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK, +- &imap); ++ trace_xfs_reflink_remap_extent_src(src, &imap); + +- /* Translate imap into the destination file. */ +- range_len = imap.br_startoff + imap.br_blockcount - srcoff; +- imap.br_startoff += destoff - srcoff; +- +- /* Clear dest from destoff to the end of imap and map it in. */ +- error = xfs_reflink_remap_extent(dest, &imap, destoff, +- new_isize); ++ /* Remap into the destination file at the given offset. 
*/ ++ imap.br_startoff = destoff; ++ error = xfs_reflink_remap_extent(dest, &imap, new_isize); + if (error) + break; + +@@ -1173,10 +1193,10 @@ xfs_reflink_remap_blocks( + } + + /* Advance drange/srange */ +- srcoff += range_len; +- destoff += range_len; +- len -= range_len; +- remapped_len += range_len; ++ srcoff += imap.br_blockcount; ++ destoff += imap.br_blockcount; ++ len -= imap.br_blockcount; ++ remapped_len += imap.br_blockcount; + } + + if (error) +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -3078,8 +3078,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, + DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); + DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); + DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); +-DEFINE_IMAP_EVENT(xfs_reflink_remap_imap); +-TRACE_EVENT(xfs_reflink_remap_blocks_loop, ++TRACE_EVENT(xfs_reflink_remap_blocks, + TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, + xfs_filblks_t len, struct xfs_inode *dest, + xfs_fileoff_t doffset), +@@ -3110,59 +3109,14 @@ TRACE_EVENT(xfs_reflink_remap_blocks_loo + __entry->dest_ino, + __entry->dest_lblk) + ); +-TRACE_EVENT(xfs_reflink_punch_range, +- TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, +- xfs_extlen_t len), +- TP_ARGS(ip, lblk, len), +- TP_STRUCT__entry( +- __field(dev_t, dev) +- __field(xfs_ino_t, ino) +- __field(xfs_fileoff_t, lblk) +- __field(xfs_extlen_t, len) +- ), +- TP_fast_assign( +- __entry->dev = VFS_I(ip)->i_sb->s_dev; +- __entry->ino = ip->i_ino; +- __entry->lblk = lblk; +- __entry->len = len; +- ), +- TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x", +- MAJOR(__entry->dev), MINOR(__entry->dev), +- __entry->ino, +- __entry->lblk, +- __entry->len) +-); +-TRACE_EVENT(xfs_reflink_remap, +- TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, +- xfs_extlen_t len, xfs_fsblock_t new_pblk), +- TP_ARGS(ip, lblk, len, new_pblk), +- TP_STRUCT__entry( +- __field(dev_t, dev) +- __field(xfs_ino_t, ino) +- __field(xfs_fileoff_t, lblk) +- __field(xfs_extlen_t, len) +- __field(xfs_fsblock_t, new_pblk) +- ), +- TP_fast_assign( +- __entry->dev = VFS_I(ip)->i_sb->s_dev; +- __entry->ino = ip->i_ino; +- __entry->lblk = lblk; +- __entry->len = len; +- __entry->new_pblk = new_pblk; +- ), +- TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu", +- MAJOR(__entry->dev), MINOR(__entry->dev), +- __entry->ino, +- __entry->lblk, +- __entry->len, +- __entry->new_pblk) +-); + DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); + DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); + DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); + DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); + DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error); + DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); ++DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src); ++DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest); + + /* dedupe tracepoints */ + DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); diff --git a/queue-5.4/xfs-rename-xfs_bmap_is_real_extent-to-is_written_extent.patch b/queue-5.4/xfs-rename-xfs_bmap_is_real_extent-to-is_written_extent.patch new file mode 100644 index 00000000000..934699d6c3a --- /dev/null +++ b/queue-5.4/xfs-rename-xfs_bmap_is_real_extent-to-is_written_extent.patch @@ -0,0 +1,90 @@ +From stable-owner@vger.kernel.org Fri Nov 11 05:11:10 2022 +From: Chandan Babu R +Date: Fri, 11 Nov 2022 09:40:21 +0530 +Subject: xfs: rename xfs_bmap_is_real_extent to is_written_extent +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, 
linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20221111041025.87704-3-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit 877f58f53684f14ca3202640f70592bf44890924 upstream. + +[ Slightly modify fs/xfs/libxfs/xfs_rtbitmap.c & fs/xfs/xfs_reflink.c to + resolve merge conflict ] + +The name of this predicate is a little misleading -- it decides if the +extent mapping is allocated and written. Change the name to be more +direct, as we're going to add a new predicate in the next patch. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Acked-by: Darrick J. Wong +Signed-off-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.h | 2 +- + fs/xfs/libxfs/xfs_rtbitmap.c | 2 +- + fs/xfs/xfs_reflink.c | 8 ++++---- + 3 files changed, 6 insertions(+), 6 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.h ++++ b/fs/xfs/libxfs/xfs_bmap.h +@@ -163,7 +163,7 @@ static inline int xfs_bmapi_whichfork(in + * Return true if the extent is a real, allocated extent, or false if it is a + * delayed allocation, and unwritten extent or a hole. + */ +-static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) ++static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec) + { + return irec->br_state != XFS_EXT_UNWRITTEN && + irec->br_startblock != HOLESTARTBLOCK && +--- a/fs/xfs/libxfs/xfs_rtbitmap.c ++++ b/fs/xfs/libxfs/xfs_rtbitmap.c +@@ -70,7 +70,7 @@ xfs_rtbuf_get( + if (error) + return error; + +- if (nmap == 0 || !xfs_bmap_is_real_extent(&map)) { ++ if (nmap == 0 || !xfs_bmap_is_written_extent(&map)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -181,7 +181,7 @@ xfs_reflink_trim_around_shared( + int error = 0; + + /* Holes, unwritten, and delalloc extents cannot be shared */ +- if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) { ++ if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) { + *shared = false; + return 0; + } +@@ -657,7 +657,7 @@ xfs_reflink_end_cow_extent( + * preallocations can leak into the range we are called upon, and we + * need to skip them. 
+ */ +- if (!xfs_bmap_is_real_extent(&got)) { ++ if (!xfs_bmap_is_written_extent(&got)) { + *end_fsb = del.br_startoff; + goto out_cancel; + } +@@ -998,7 +998,7 @@ xfs_reflink_remap_extent( + xfs_off_t new_isize) + { + struct xfs_mount *mp = ip->i_mount; +- bool real_extent = xfs_bmap_is_real_extent(irec); ++ bool real_extent = xfs_bmap_is_written_extent(irec); + struct xfs_trans *tp; + unsigned int resblks; + struct xfs_bmbt_irec uirec; +@@ -1427,7 +1427,7 @@ xfs_reflink_dirty_extents( + goto out; + if (nmaps == 0) + break; +- if (!xfs_bmap_is_real_extent(&map[0])) ++ if (!xfs_bmap_is_written_extent(&map[0])) + goto next; + + map[1] = map[0]; diff --git a/queue-5.4/xfs-use-mmaplock-around-filemap_map_pages.patch b/queue-5.4/xfs-use-mmaplock-around-filemap_map_pages.patch new file mode 100644 index 00000000000..0c2aeddaa34 --- /dev/null +++ b/queue-5.4/xfs-use-mmaplock-around-filemap_map_pages.patch @@ -0,0 +1,67 @@ +From stable-owner@vger.kernel.org Fri Nov 11 05:11:19 2022 +From: Chandan Babu R +Date: Fri, 11 Nov 2022 09:40:23 +0530 +Subject: xfs: use MMAPLOCK around filemap_map_pages() +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20221111041025.87704-5-chandan.babu@oracle.com> + +From: Dave Chinner + +commit cd647d5651c0b0deaa26c1acb9e1789437ba9bc7 upstream. + +The page faultround path ->map_pages is implemented in XFS via +filemap_map_pages(). This function checks that pages found in page +cache lookups have not raced with truncate based invalidation by +checking page->mapping is correct and page->index is within EOF. + +However, we've known for a long time that this is not sufficient to +protect against races with invalidations done by operations that do +not change EOF. e.g. hole punching and other fallocate() based +direct extent manipulations. The way we protect against these +races is we wrap the page fault operations in a XFS_MMAPLOCK_SHARED +lock so they serialise against fallocate and truncate before calling +into the filemap function that processes the fault. + +Do the same for XFS's ->map_pages implementation to close this +potential data corruption issue. + +Signed-off-by: Dave Chinner +Reviewed-by: Amir Goldstein +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Acked-by: Darrick J. Wong +Signed-off-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_file.c | 15 ++++++++++++++- + 1 file changed, 14 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -1267,10 +1267,23 @@ xfs_filemap_pfn_mkwrite( + return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); + } + ++static void ++xfs_filemap_map_pages( ++ struct vm_fault *vmf, ++ pgoff_t start_pgoff, ++ pgoff_t end_pgoff) ++{ ++ struct inode *inode = file_inode(vmf->vma->vm_file); ++ ++ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ++ filemap_map_pages(vmf, start_pgoff, end_pgoff); ++ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ++} ++ + static const struct vm_operations_struct xfs_file_vm_ops = { + .fault = xfs_filemap_fault, + .huge_fault = xfs_filemap_huge_fault, +- .map_pages = filemap_map_pages, ++ .map_pages = xfs_filemap_map_pages, + .page_mkwrite = xfs_filemap_page_mkwrite, + .pfn_mkwrite = xfs_filemap_pfn_mkwrite, + };
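
As a closing aside on the MMAPLOCK patch above: the serialization it
establishes is a classic reader/writer discipline, sketched here in
user space (illustrative only, with invented names; a pthread rwlock
stands in for the XFS MMAPLOCK):

	#include <pthread.h>

	static pthread_rwlock_t mmaplock = PTHREAD_RWLOCK_INITIALIZER;

	/* fault-around path: shared, like XFS_MMAPLOCK_SHARED */
	static void fault_around(void)
	{
		pthread_rwlock_rdlock(&mmaplock);
		/* ... filemap_map_pages()-style page table setup ... */
		pthread_rwlock_unlock(&mmaplock);
	}

	/* hole punch / fallocate path: exclusive, like XFS_MMAPLOCK_EXCL */
	static void punch_hole(void)
	{
		pthread_rwlock_wrlock(&mmaplock);
		/* ... invalidate page cache and manipulate extents ... */
		pthread_rwlock_unlock(&mmaplock);
	}

Because readers exclude the writer, no page can be mapped into page
tables while an extent manipulation is invalidating the same range,
which is the race the patch closes.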