From: Dave Chinner
Date: Fri, 4 Sep 2020 19:57:20 +0000 (-0400)
Subject: xfs: pin inode backing buffer to the inode log item
X-Git-Tag: v5.9.0-rc0~45
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=2b8ea8268481db3d3c8bc6e5529497029c64dfdd;p=thirdparty%2Fxfsprogs-dev.git

xfs: pin inode backing buffer to the inode log item

Source kernel commit: 298f7bec503f30bd98242ec02df6abe13b31a677

When we dirty an inode, we are going to have to write it to disk at
some point in the near future. This requires the inode cluster backing
buffer to be present in memory. Unfortunately, under severe memory
pressure we can reclaim the inode backing buffer while the inode is
dirty in memory, resulting in stalling the AIL pushing because it has
to do a read-modify-write cycle on the cluster buffer.

When we have no memory available, the read of the cluster buffer blocks
the AIL pushing process, and this causes all sorts of issues for memory
reclaim as it requires inode writeback to make forwards progress.
Allocating a cluster buffer causes more memory pressure, which causes
more cluster buffers to be reclaimed, resulting in more RMW cycles being
done in the AIL context and everything then backs up on AIL progress.
Only the synchronous inode cluster writeback in the inode reclaim code
provides some level of forwards progress guarantee that prevents
OOM-killer rampages in this situation.

Fix this by pinning the inode backing buffer to the inode log item when
the inode is first dirtied (i.e. in xfs_trans_log_inode()). This may
mean the first modification of an inode that has been held in cache for
a long time may block on a cluster buffer read, but we can do that in
transaction context and block safely until the buffer has been
allocated and read.

Once we have the cluster buffer, the inode log item takes a reference
to it, pinning it in memory, and attaches it to the log item for future
reference. This means we can always grab the cluster buffer from the
inode log item when we need it.

When the inode is finally cleaned and removed from the AIL, we can drop
the reference the inode log item holds on the cluster buffer. Once all
inodes on the cluster buffer are clean, the cluster buffer will be
unpinned and it will be available for memory reclaim to reclaim again.

This avoids the issues with needing to do RMW cycles in the AIL pushing
context, and hence allows complete non-blocking inode flushing to be
performed by the AIL pushing context.

Signed-off-by: Dave Chinner
Reviewed-by: Brian Foster
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong
Signed-off-by: Eric Sandeen
---
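
To make the reference counting below easier to follow, here is a
minimal, self-contained C sketch of the pinning scheme the patch
implements. The types and helpers in it (struct buf, struct log_item,
buf_read(), buf_hold(), buf_release(), item_first_dirty(), item_clean())
are illustrative stand-ins, not the real libxfs structures or API: the
log item takes its own hold on the backing buffer the first time the
inode is dirtied and drops it when the item is cleaned, so the buffer
cannot be reclaimed while the inode is dirty.

/*
 * Sketch only: models the li_buf pin/unpin lifecycle with a plain
 * refcounted buffer. None of these names exist in libxfs.
 */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct buf {
	int		refcount;	/* stands in for bp->b_node.cn_count */
};

struct log_item {
	struct buf	*li_buf;	/* pinned backing buffer, or NULL */
};

/* Stands in for xfs_imap_to_bp(): returns a buffer holding one reference. */
static struct buf *buf_read(void)
{
	struct buf *bp = calloc(1, sizeof(*bp));

	assert(bp != NULL);
	bp->refcount = 1;
	return bp;
}

/* Cf. xfs_buf_hold(): take an extra reference. */
static void buf_hold(struct buf *bp)
{
	bp->refcount++;
}

/* Cf. libxfs_buf_relse(): drop a reference; free when it hits zero. */
static void buf_release(struct buf *bp)
{
	if (--bp->refcount == 0) {
		free(bp);	/* buffer is now reclaimable */
		printf("buffer freed\n");
	}
}

/* First modification of the inode: pin the backing buffer to the item. */
static void item_first_dirty(struct log_item *lip)
{
	if (!lip->li_buf) {
		struct buf *bp = buf_read();	/* may block on I/O */

		buf_hold(bp);		/* reference owned by the log item */
		buf_release(bp);	/* drop the "transaction" reference */
		lip->li_buf = bp;
	}
}

/* Inode written back and cleaned: unpin the backing buffer. */
static void item_clean(struct log_item *lip)
{
	struct buf *bp = lip->li_buf;

	assert(bp != NULL);
	lip->li_buf = NULL;
	buf_release(bp);
}

int main(void)
{
	struct log_item ili = { .li_buf = NULL };

	item_first_dirty(&ili);	/* buffer pinned, refcount == 1 */
	item_clean(&ili);	/* last reference dropped, buffer freed */
	return 0;
}

The point the sketch mirrors is that the extra hold belongs to the log
item, not the transaction: the transaction reference can be released
immediately (xfs_trans_brelse() in the real patch) while the buffer
stays pinned until writeback cleans the inode.
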
diff --git a/include/xfs_trans.h b/include/xfs_trans.h
index 75ca3777b..84f29418c 100644
--- a/include/xfs_trans.h
+++ b/include/xfs_trans.h
@@ -22,6 +22,7 @@ typedef struct xfs_log_item {
 	struct xfs_mount	*li_mountp;	/* ptr to fs mount */
 	uint			li_type;	/* item type */
 	unsigned long		li_flags;	/* misc flags */
+	struct xfs_buf		*li_buf;	/* real buffer pointer */
 } xfs_log_item_t;
 
 #define XFS_LI_DIRTY	3	/* log item dirty in transaction */
diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h
index 34462f78a..8b67a57df 100644
--- a/libxfs/libxfs_io.h
+++ b/libxfs/libxfs_io.h
@@ -243,6 +243,12 @@ xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t len)
 	return 0;
 }
 
+static inline void
+xfs_buf_hold(struct xfs_buf *bp)
+{
+	bp->b_node.cn_count++;
+}
+
 int libxfs_buf_get_uncached(struct xfs_buftarg *targ, size_t bblen, int flags,
 		struct xfs_buf **bpp);
 int libxfs_buf_read_uncached(struct xfs_buftarg *targ, xfs_daddr_t daddr,
@@ -253,7 +259,7 @@ int libxfs_buf_read_uncached(struct xfs_buftarg *targ, xfs_daddr_t daddr,
 static inline bool
 xfs_buf_delwri_queue(struct xfs_buf *bp, struct list_head *buffer_list)
 {
-	bp->b_node.cn_count++;
+	xfs_buf_hold(bp);
 	list_add_tail(&bp->b_list, buffer_list);
 	return true;
 }
diff --git a/libxfs/trans.c b/libxfs/trans.c
index a61f803fe..5b6e59467 100644
--- a/libxfs/trans.c
+++ b/libxfs/trans.c
@@ -780,6 +780,8 @@ xfs_inode_item_put(
 {
 	struct xfs_inode	*ip = iip->ili_inode;
 
+	ASSERT(iip->ili_item.li_buf == NULL);
+
 	ip->i_itemp = NULL;
 	kmem_cache_free(xfs_ili_zone, iip);
 }
@@ -795,47 +797,35 @@ static void
 inode_item_done(
 	struct xfs_inode_log_item	*iip)
 {
-	xfs_dinode_t		*dip;
-	xfs_inode_t		*ip;
-	xfs_mount_t		*mp;
 	xfs_buf_t		*bp;
 	int			error;
 
-	ip = iip->ili_inode;
-	mp = iip->ili_item.li_mountp;
-	ASSERT(ip != NULL);
+	ASSERT(iip->ili_inode != NULL);
 
 	if (!(iip->ili_fields & XFS_ILOG_ALL))
-		goto free;
+		goto free_item;
 
-	/*
-	 * Get the buffer containing the on-disk inode.
-	 */
-	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, 0);
-	if (error) {
-		fprintf(stderr, _("%s: warning - imap_to_bp failed (%d)\n"),
-			progname, error);
-		goto free;
-	}
+	bp = iip->ili_item.li_buf;
+	iip->ili_item.li_buf = NULL;
 
 	/*
 	 * Flush the inode and disassociate it from the transaction regardless
 	 * of whether the flush succeed or not. If we fail the flush, make sure
 	 * we still release the buffer reference we currently hold.
 	 */
-	error = libxfs_iflush_int(ip, bp);
+	error = libxfs_iflush_int(iip->ili_inode, bp);
 	bp->b_transp = NULL;	/* remove xact ptr */
 
 	if (error) {
 		fprintf(stderr, _("%s: warning - iflush_int failed (%d)\n"),
 			progname, error);
-		libxfs_buf_relse(bp);
 		goto free;
 	}
 
 	libxfs_buf_mark_dirty(bp);
-	libxfs_buf_relse(bp);
 free:
+	libxfs_buf_relse(bp);
+free_item:
 	xfs_inode_item_put(iip);
 }
diff --git a/libxfs/xfs_inode_buf.c b/libxfs/xfs_inode_buf.c
index 6362ee442..896764403 100644
--- a/libxfs/xfs_inode_buf.c
+++ b/libxfs/xfs_inode_buf.c
@@ -173,7 +173,8 @@ xfs_imap_to_bp(
 	}
 
 	*bpp = bp;
-	*dipp = xfs_buf_offset(bp, imap->im_boffset);
+	if (dipp)
+		*dipp = xfs_buf_offset(bp, imap->im_boffset);
 	return 0;
 }
 
diff --git a/libxfs/xfs_trans_inode.c b/libxfs/xfs_trans_inode.c
index aaa310f78..4387660b9 100644
--- a/libxfs/xfs_trans_inode.c
+++ b/libxfs/xfs_trans_inode.c
@@ -8,6 +8,8 @@
 #include "xfs_shared.h"
 #include "xfs_format.h"
 #include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 
@@ -69,13 +71,19 @@ xfs_trans_ichgtime(
 }
 
 /*
- * This is called to mark the fields indicated in fieldmask as needing
- * to be logged when the transaction is committed. The inode must
- * already be associated with the given transaction.
+ * This is called to mark the fields indicated in fieldmask as needing to be
+ * logged when the transaction is committed. The inode must already be
+ * associated with the given transaction.
  *
- * The values for fieldmask are defined in xfs_inode_item.h. We always
- * log all of the core inode if any of it has changed, and we always log
- * all of the inline data/extents/b-tree root if any of them has changed.
+ * The values for fieldmask are defined in xfs_inode_item.h. We always log all
+ * of the core inode if any of it has changed, and we always log all of the
+ * inline data/extents/b-tree root if any of them has changed.
+ *
+ * Grab and pin the cluster buffer associated with this inode to avoid RMW
+ * cycles at inode writeback time. Avoid the need to add error handling to every
+ * xfs_trans_log_inode() call by shutting down on read error. This will cause
+ * transactions to fail and everything to error out, just like if we return a
+ * read error in a dirty transaction and cancel it.
  */
 void
 xfs_trans_log_inode(
@@ -128,6 +136,39 @@ xfs_trans_log_inode(
 	spin_lock(&iip->ili_lock);
 	iip->ili_fsync_fields |= flags;
 
+	if (!iip->ili_item.li_buf) {
+		struct xfs_buf	*bp;
+		int		error;
+
+		/*
+		 * We hold the ILOCK here, so this inode is not going to be
+		 * flushed while we are here. Further, because there is no
+		 * buffer attached to the item, we know that there is no IO in
+		 * progress, so nothing will clear the ili_fields while we read
+		 * in the buffer. Hence we can safely drop the spin lock and
+		 * read the buffer knowing that the state will not change from
+		 * here.
+		 */
+		spin_unlock(&iip->ili_lock);
+		error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, NULL,
+					&bp, 0);
+		if (error) {
+			xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR);
+			return;
+		}
+
+		/*
+		 * We need an explicit buffer reference for the log item but
+		 * don't want the buffer to remain attached to the transaction.
+		 * Hold the buffer but release the transaction reference.
+		 */
+		xfs_buf_hold(bp);
+		xfs_trans_brelse(tp, bp);
+
+		spin_lock(&iip->ili_lock);
+		iip->ili_item.li_buf = bp;
+	}
+
 	/*
 	 * Always OR in the bits from the ili_last_fields field. This is to
 	 * coordinate with the xfs_iflush() and xfs_iflush_done() routines in