From f3e817d4a59122622d92eb623d064002b74e8cf9 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 1 Apr 2017 18:41:03 +0200 Subject: [PATCH] 4.9-stable patches added patches: xfs-only-update-mount-resv-fields-on-success-in-__xfs_ag_resv_init.patch xfs-pull-up-iolock-from-xfs_free_eofblocks.patch xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch xfs-use-per-ag-reservations-for-the-finobt.patch --- queue-4.9/series | 4 + ...lds-on-success-in-__xfs_ag_resv_init.patch | 61 ++++ ...ll-up-iolock-from-xfs_free_eofblocks.patch | 246 ++++++++++++++ ...cans-under-iolock-are-livelock-prone.patch | 229 +++++++++++++ ...e-per-ag-reservations-for-the-finobt.patch | 321 ++++++++++++++++++ 5 files changed, 861 insertions(+) create mode 100644 queue-4.9/xfs-only-update-mount-resv-fields-on-success-in-__xfs_ag_resv_init.patch create mode 100644 queue-4.9/xfs-pull-up-iolock-from-xfs_free_eofblocks.patch create mode 100644 queue-4.9/xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch create mode 100644 queue-4.9/xfs-use-per-ag-reservations-for-the-finobt.patch diff --git a/queue-4.9/series b/queue-4.9/series index 76be2244cf8..db553e76a6f 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -1,2 +1,6 @@ libceph-force-gfp_noio-for-socket-allocations.patch xen-setup-don-t-relocate-p2m-over-existing-one.patch +xfs-only-update-mount-resv-fields-on-success-in-__xfs_ag_resv_init.patch +xfs-use-per-ag-reservations-for-the-finobt.patch +xfs-pull-up-iolock-from-xfs_free_eofblocks.patch +xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch diff --git a/queue-4.9/xfs-only-update-mount-resv-fields-on-success-in-__xfs_ag_resv_init.patch b/queue-4.9/xfs-only-update-mount-resv-fields-on-success-in-__xfs_ag_resv_init.patch new file mode 100644 index 00000000000..38738f50eb2 --- /dev/null +++ b/queue-4.9/xfs-only-update-mount-resv-fields-on-success-in-__xfs_ag_resv_init.patch @@ -0,0 +1,61 @@ +From 4dfa2b84118fd6c95202ae87e62adf5000ccd4d0 Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Wed, 25 Jan 2017 07:49:34 -0800 +Subject: xfs: only update mount/resv fields on success in __xfs_ag_resv_init + +From: Christoph Hellwig + +commit 4dfa2b84118fd6c95202ae87e62adf5000ccd4d0 upstream. + +Try to reserve the blocks first and only then update the fields in +or hanging off the mount structure. This way we can call __xfs_ag_resv_init +again after a previous failure. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_ag_resv.c | 23 ++++++++++++++--------- + 1 file changed, 14 insertions(+), 9 deletions(-) + +--- a/fs/xfs/libxfs/xfs_ag_resv.c ++++ b/fs/xfs/libxfs/xfs_ag_resv.c +@@ -200,22 +200,27 @@ __xfs_ag_resv_init( + struct xfs_mount *mp = pag->pag_mount; + struct xfs_ag_resv *resv; + int error; ++ xfs_extlen_t reserved; + +- resv = xfs_perag_resv(pag, type); + if (used > ask) + ask = used; +- resv->ar_asked = ask; +- resv->ar_reserved = resv->ar_orig_reserved = ask - used; +- mp->m_ag_max_usable -= ask; ++ reserved = ask - used; + +- trace_xfs_ag_resv_init(pag, type, ask); +- +- error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true); +- if (error) ++ error = xfs_mod_fdblocks(mp, -(int64_t)reserved, true); ++ if (error) { + trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, + error, _RET_IP_); ++ return error; ++ } + +- return error; ++ mp->m_ag_max_usable -= ask; ++ ++ resv = xfs_perag_resv(pag, type); ++ resv->ar_asked = ask; ++ resv->ar_reserved = resv->ar_orig_reserved = reserved; ++ ++ trace_xfs_ag_resv_init(pag, type, ask); ++ return 0; + } + + /* Create a per-AG block reservation. */ diff --git a/queue-4.9/xfs-pull-up-iolock-from-xfs_free_eofblocks.patch b/queue-4.9/xfs-pull-up-iolock-from-xfs_free_eofblocks.patch new file mode 100644 index 00000000000..0ac5c2d117a --- /dev/null +++ b/queue-4.9/xfs-pull-up-iolock-from-xfs_free_eofblocks.patch @@ -0,0 +1,246 @@ +From a36b926180cda375ac2ec89e1748b47137cfc51c Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Fri, 27 Jan 2017 23:22:55 -0800 +Subject: xfs: pull up iolock from xfs_free_eofblocks() + +From: Brian Foster + +commit a36b926180cda375ac2ec89e1748b47137cfc51c upstream. + +xfs_free_eofblocks() requires the IOLOCK_EXCL lock, but is called from +different contexts where the lock may or may not be held. The +need_iolock parameter exists for this reason, to indicate whether +xfs_free_eofblocks() must acquire the iolock itself before it can +proceed. + +This is ugly and confusing. Simplify the semantics of +xfs_free_eofblocks() to require the caller to acquire the iolock +appropriately and kill the need_iolock parameter. While here, the mp +param can be removed as well as the xfs_mount is accessible from the +xfs_inode structure. This patch does not change behavior. + +Signed-off-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_bmap_util.c | 41 +++++++++++++++------------------------ + fs/xfs/xfs_bmap_util.h | 3 -- + fs/xfs/xfs_icache.c | 24 ++++++++++++++--------- + fs/xfs/xfs_inode.c | 51 ++++++++++++++++++++++++++----------------------- + 4 files changed, 60 insertions(+), 59 deletions(-) + +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -917,17 +917,18 @@ xfs_can_free_eofblocks(struct xfs_inode + */ + int + xfs_free_eofblocks( +- xfs_mount_t *mp, +- xfs_inode_t *ip, +- bool need_iolock) ++ struct xfs_inode *ip) + { +- xfs_trans_t *tp; +- int error; +- xfs_fileoff_t end_fsb; +- xfs_fileoff_t last_fsb; +- xfs_filblks_t map_len; +- int nimaps; +- xfs_bmbt_irec_t imap; ++ struct xfs_trans *tp; ++ int error; ++ xfs_fileoff_t end_fsb; ++ xfs_fileoff_t last_fsb; ++ xfs_filblks_t map_len; ++ int nimaps; ++ struct xfs_bmbt_irec imap; ++ struct xfs_mount *mp = ip->i_mount; ++ ++ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + + /* + * Figure out if there are any blocks beyond the end +@@ -944,6 +945,10 @@ xfs_free_eofblocks( + error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + ++ /* ++ * If there are blocks after the end of file, truncate the file to its ++ * current size to free them up. ++ */ + if (!error && (nimaps != 0) && + (imap.br_startblock != HOLESTARTBLOCK || + ip->i_delayed_blks)) { +@@ -954,22 +959,10 @@ xfs_free_eofblocks( + if (error) + return error; + +- /* +- * There are blocks after the end of file. +- * Free them up now by truncating the file to +- * its current size. +- */ +- if (need_iolock) { +- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) +- return -EAGAIN; +- } +- + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, + &tp); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); +- if (need_iolock) +- xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + } + +@@ -997,8 +990,6 @@ xfs_free_eofblocks( + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); +- if (need_iolock) +- xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } + return error; + } +@@ -1415,7 +1406,7 @@ xfs_shift_file_space( + * into the accessible region of the file. + */ + if (xfs_can_free_eofblocks(ip, true)) { +- error = xfs_free_eofblocks(mp, ip, false); ++ error = xfs_free_eofblocks(ip); + if (error) + return error; + } +--- a/fs/xfs/xfs_bmap_util.h ++++ b/fs/xfs/xfs_bmap_util.h +@@ -63,8 +63,7 @@ int xfs_insert_file_space(struct xfs_ino + + /* EOF block manipulation functions */ + bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); +-int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, +- bool need_iolock); ++int xfs_free_eofblocks(struct xfs_inode *ip); + + int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, + struct xfs_swapext *sx); +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -1324,7 +1324,7 @@ xfs_inode_free_eofblocks( + int flags, + void *args) + { +- int ret; ++ int ret = 0; + struct xfs_eofblocks *eofb = args; + bool need_iolock = true; + int match; +@@ -1360,19 +1360,25 @@ xfs_inode_free_eofblocks( + return 0; + + /* +- * A scan owner implies we already hold the iolock. Skip it in +- * xfs_free_eofblocks() to avoid deadlock. This also eliminates +- * the possibility of EAGAIN being returned. ++ * A scan owner implies we already hold the iolock. Skip it here ++ * to avoid deadlock. + */ + if (eofb->eof_scan_owner == ip->i_ino) + need_iolock = false; + } + +- ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock); +- +- /* don't revisit the inode if we're not waiting */ +- if (ret == -EAGAIN && !(flags & SYNC_WAIT)) +- ret = 0; ++ /* ++ * If the caller is waiting, return -EAGAIN to keep the background ++ * scanner moving and revisit the inode in a subsequent pass. ++ */ ++ if (need_iolock && !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { ++ if (flags & SYNC_WAIT) ++ ret = -EAGAIN; ++ return ret; ++ } ++ ret = xfs_free_eofblocks(ip); ++ if (need_iolock) ++ xfs_iunlock(ip, XFS_IOLOCK_EXCL); + + return ret; + } +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -1701,32 +1701,34 @@ xfs_release( + if (xfs_can_free_eofblocks(ip, false)) { + + /* ++ * Check if the inode is being opened, written and closed ++ * frequently and we have delayed allocation blocks outstanding ++ * (e.g. streaming writes from the NFS server), truncating the ++ * blocks past EOF will cause fragmentation to occur. ++ * ++ * In this case don't do the truncation, but we have to be ++ * careful how we detect this case. Blocks beyond EOF show up as ++ * i_delayed_blks even when the inode is clean, so we need to ++ * truncate them away first before checking for a dirty release. ++ * Hence on the first dirty close we will still remove the ++ * speculative allocation, but after that we will leave it in ++ * place. ++ */ ++ if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) ++ return 0; ++ /* + * If we can't get the iolock just skip truncating the blocks + * past EOF because we could deadlock with the mmap_sem +- * otherwise. We'll get another chance to drop them once the ++ * otherwise. We'll get another chance to drop them once the + * last reference to the inode is dropped, so we'll never leak + * blocks permanently. +- * +- * Further, check if the inode is being opened, written and +- * closed frequently and we have delayed allocation blocks +- * outstanding (e.g. streaming writes from the NFS server), +- * truncating the blocks past EOF will cause fragmentation to +- * occur. +- * +- * In this case don't do the truncation, either, but we have to +- * be careful how we detect this case. Blocks beyond EOF show +- * up as i_delayed_blks even when the inode is clean, so we +- * need to truncate them away first before checking for a dirty +- * release. Hence on the first dirty close we will still remove +- * the speculative allocation, but after that we will leave it +- * in place. + */ +- if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) +- return 0; +- +- error = xfs_free_eofblocks(mp, ip, true); +- if (error && error != -EAGAIN) +- return error; ++ if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { ++ error = xfs_free_eofblocks(ip); ++ xfs_iunlock(ip, XFS_IOLOCK_EXCL); ++ if (error) ++ return error; ++ } + + /* delalloc blocks after truncation means it really is dirty */ + if (ip->i_delayed_blks) +@@ -1913,8 +1915,11 @@ xfs_inactive( + * cache. Post-eof blocks must be freed, lest we end up with + * broken free space accounting. + */ +- if (xfs_can_free_eofblocks(ip, true)) +- xfs_free_eofblocks(mp, ip, false); ++ if (xfs_can_free_eofblocks(ip, true)) { ++ xfs_ilock(ip, XFS_IOLOCK_EXCL); ++ xfs_free_eofblocks(ip); ++ xfs_iunlock(ip, XFS_IOLOCK_EXCL); ++ } + + return; + } diff --git a/queue-4.9/xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch b/queue-4.9/xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch new file mode 100644 index 00000000000..ec427ea3bfa --- /dev/null +++ b/queue-4.9/xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch @@ -0,0 +1,229 @@ +From c3155097ad89a956579bc305856a1f2878494e52 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Fri, 27 Jan 2017 23:22:56 -0800 +Subject: xfs: sync eofblocks scans under iolock are livelock prone + +From: Brian Foster + +commit c3155097ad89a956579bc305856a1f2878494e52 upstream. + +The xfs_eofblocks.eof_scan_owner field is an internal field to +facilitate invoking eofb scans from the kernel while under the iolock. +This is necessary because the eofb scan acquires the iolock of each +inode. Synchronous scans are invoked on certain buffered write failures +while under iolock. In such cases, the scan owner indicates that the +context for the scan already owns the particular iolock and prevents a +double lock deadlock. + +eofblocks scans while under iolock are still livelock prone in the event +of multiple parallel scans, however. If multiple buffered writes to +different inodes fail and invoke eofblocks scans at the same time, each +scan avoids a deadlock with its own inode by virtue of the +eof_scan_owner field, but will never be able to acquire the iolock of +the inode from the parallel scan. Because the low free space scans are +invoked with SYNC_WAIT, the scan will not return until it has processed +every tagged inode and thus both scans will spin indefinitely on the +iolock being held across the opposite scan. This problem can be +reproduced reliably by generic/224 on systems with higher cpu counts +(x16). + +To avoid this problem, simplify the semantics of eofblocks scans to +never invoke a scan while under iolock. This means that the buffered +write context must drop the iolock before the scan. It must reacquire +the lock before the write retry and also repeat the initial write +checks, as the original state might no longer be valid once the iolock +was dropped. + +Signed-off-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/xfs/xfs_file.c | 13 +++++++++---- + fs/xfs/xfs_icache.c | 45 +++++++-------------------------------------- + fs/xfs/xfs_icache.h | 2 -- + 3 files changed, 16 insertions(+), 44 deletions(-) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -675,8 +675,10 @@ xfs_file_buffered_aio_write( + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int enospc = 0; +- int iolock = XFS_IOLOCK_EXCL; ++ int iolock; + ++write_retry: ++ iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, iolock); + + ret = xfs_file_aio_write_checks(iocb, from, &iolock); +@@ -686,7 +688,6 @@ xfs_file_buffered_aio_write( + /* We can write back this queue in page reclaim */ + current->backing_dev_info = inode_to_bdi(inode); + +-write_retry: + trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); + ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); + if (likely(ret >= 0)) +@@ -702,18 +703,21 @@ write_retry: + * running at the same time. + */ + if (ret == -EDQUOT && !enospc) { ++ xfs_rw_iunlock(ip, iolock); + enospc = xfs_inode_free_quota_eofblocks(ip); + if (enospc) + goto write_retry; + enospc = xfs_inode_free_quota_cowblocks(ip); + if (enospc) + goto write_retry; ++ iolock = 0; + } else if (ret == -ENOSPC && !enospc) { + struct xfs_eofblocks eofb = {0}; + + enospc = 1; + xfs_flush_inodes(ip->i_mount); +- eofb.eof_scan_owner = ip->i_ino; /* for locking */ ++ ++ xfs_rw_iunlock(ip, iolock); + eofb.eof_flags = XFS_EOF_FLAGS_SYNC; + xfs_icache_free_eofblocks(ip->i_mount, &eofb); + goto write_retry; +@@ -721,7 +725,8 @@ write_retry: + + current->backing_dev_info = NULL; + out: +- xfs_rw_iunlock(ip, iolock); ++ if (iolock) ++ xfs_rw_iunlock(ip, iolock); + return ret; + } + +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -1326,11 +1326,8 @@ xfs_inode_free_eofblocks( + { + int ret = 0; + struct xfs_eofblocks *eofb = args; +- bool need_iolock = true; + int match; + +- ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); +- + if (!xfs_can_free_eofblocks(ip, false)) { + /* inode could be preallocated or append-only */ + trace_xfs_inode_free_eofblocks_invalid(ip); +@@ -1358,27 +1355,19 @@ xfs_inode_free_eofblocks( + if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return 0; +- +- /* +- * A scan owner implies we already hold the iolock. Skip it here +- * to avoid deadlock. +- */ +- if (eofb->eof_scan_owner == ip->i_ino) +- need_iolock = false; + } + + /* + * If the caller is waiting, return -EAGAIN to keep the background + * scanner moving and revisit the inode in a subsequent pass. + */ +- if (need_iolock && !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { ++ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + if (flags & SYNC_WAIT) + ret = -EAGAIN; + return ret; + } + ret = xfs_free_eofblocks(ip); +- if (need_iolock) +- xfs_iunlock(ip, XFS_IOLOCK_EXCL); ++ xfs_iunlock(ip, XFS_IOLOCK_EXCL); + + return ret; + } +@@ -1425,15 +1414,10 @@ __xfs_inode_free_quota_eofblocks( + struct xfs_eofblocks eofb = {0}; + struct xfs_dquot *dq; + +- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); +- + /* +- * Set the scan owner to avoid a potential livelock. Otherwise, the scan +- * can repeatedly trylock on the inode we're currently processing. We +- * run a sync scan to increase effectiveness and use the union filter to ++ * Run a sync scan to increase effectiveness and use the union filter to + * cover all applicable quotas in a single scan. + */ +- eofb.eof_scan_owner = ip->i_ino; + eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; + + if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { +@@ -1585,12 +1569,9 @@ xfs_inode_free_cowblocks( + { + int ret; + struct xfs_eofblocks *eofb = args; +- bool need_iolock = true; + int match; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + +- ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); +- + /* + * Just clear the tag if we have an empty cow fork or none at all. It's + * possible the inode was fully unshared since it was originally tagged. +@@ -1623,28 +1604,16 @@ xfs_inode_free_cowblocks( + if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return 0; +- +- /* +- * A scan owner implies we already hold the iolock. Skip it in +- * xfs_free_eofblocks() to avoid deadlock. This also eliminates +- * the possibility of EAGAIN being returned. +- */ +- if (eofb->eof_scan_owner == ip->i_ino) +- need_iolock = false; + } + + /* Free the CoW blocks */ +- if (need_iolock) { +- xfs_ilock(ip, XFS_IOLOCK_EXCL); +- xfs_ilock(ip, XFS_MMAPLOCK_EXCL); +- } ++ xfs_ilock(ip, XFS_IOLOCK_EXCL); ++ xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + + ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + +- if (need_iolock) { +- xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); +- xfs_iunlock(ip, XFS_IOLOCK_EXCL); +- } ++ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); ++ xfs_iunlock(ip, XFS_IOLOCK_EXCL); + + return ret; + } +--- a/fs/xfs/xfs_icache.h ++++ b/fs/xfs/xfs_icache.h +@@ -27,7 +27,6 @@ struct xfs_eofblocks { + kgid_t eof_gid; + prid_t eof_prid; + __u64 eof_min_file_size; +- xfs_ino_t eof_scan_owner; + }; + + #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ +@@ -102,7 +101,6 @@ xfs_fs_eofblocks_from_user( + dst->eof_flags = src->eof_flags; + dst->eof_prid = src->eof_prid; + dst->eof_min_file_size = src->eof_min_file_size; +- dst->eof_scan_owner = NULLFSINO; + + dst->eof_uid = INVALID_UID; + if (src->eof_flags & XFS_EOF_FLAGS_UID) { diff --git a/queue-4.9/xfs-use-per-ag-reservations-for-the-finobt.patch b/queue-4.9/xfs-use-per-ag-reservations-for-the-finobt.patch new file mode 100644 index 00000000000..7a17043901d --- /dev/null +++ b/queue-4.9/xfs-use-per-ag-reservations-for-the-finobt.patch @@ -0,0 +1,321 @@ +From 76d771b4cbe33c581bd6ca2710c120be51172440 Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Wed, 25 Jan 2017 07:49:35 -0800 +Subject: xfs: use per-AG reservations for the finobt + +From: Christoph Hellwig + +commit 76d771b4cbe33c581bd6ca2710c120be51172440 upstream. + +Currently we try to rely on the global reserved block pool for block +allocations for the free inode btree, but I have customer reports +(fairly complex workload, need to find an easier reproducer) where that +is not enough as the AG where we free an inode that requires a new +finobt block is entirely full. This causes us to cancel a dirty +transaction and thus a file system shutdown. + +I think the right way to guard against this is to treat the finot the same +way as the refcount btree and have a per-AG reservations for the possible +worst case size of it, and the patch below implements that. + +Note that this could increase mount times with large finobt trees. In +an ideal world we would have added a field for the number of finobt +fields to the AGI, similar to what we did for the refcount blocks. +We should do add it next time we rev the AGI or AGF format by adding +new fields. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_ag_resv.c | 47 +++++++++++++++++--- + fs/xfs/libxfs/xfs_ialloc_btree.c | 90 +++++++++++++++++++++++++++++++++++++-- + fs/xfs/libxfs/xfs_ialloc_btree.h | 3 + + fs/xfs/xfs_inode.c | 23 +++++---- + fs/xfs/xfs_mount.h | 1 + 5 files changed, 144 insertions(+), 20 deletions(-) + +--- a/fs/xfs/libxfs/xfs_ag_resv.c ++++ b/fs/xfs/libxfs/xfs_ag_resv.c +@@ -39,6 +39,7 @@ + #include "xfs_rmap_btree.h" + #include "xfs_btree.h" + #include "xfs_refcount_btree.h" ++#include "xfs_ialloc_btree.h" + + /* + * Per-AG Block Reservations +@@ -210,6 +211,9 @@ __xfs_ag_resv_init( + if (error) { + trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, + error, _RET_IP_); ++ xfs_warn(mp, ++"Per-AG reservation for AG %u failed. Filesystem may run out of space.", ++ pag->pag_agno); + return error; + } + +@@ -228,6 +232,8 @@ int + xfs_ag_resv_init( + struct xfs_perag *pag) + { ++ struct xfs_mount *mp = pag->pag_mount; ++ xfs_agnumber_t agno = pag->pag_agno; + xfs_extlen_t ask; + xfs_extlen_t used; + int error = 0; +@@ -236,23 +242,45 @@ xfs_ag_resv_init( + if (pag->pag_meta_resv.ar_asked == 0) { + ask = used = 0; + +- error = xfs_refcountbt_calc_reserves(pag->pag_mount, +- pag->pag_agno, &ask, &used); ++ error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used); + if (error) + goto out; + +- error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, +- ask, used); ++ error = xfs_finobt_calc_reserves(mp, agno, &ask, &used); + if (error) + goto out; ++ ++ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, ++ ask, used); ++ if (error) { ++ /* ++ * Because we didn't have per-AG reservations when the ++ * finobt feature was added we might not be able to ++ * reserve all needed blocks. Warn and fall back to the ++ * old and potentially buggy code in that case, but ++ * ensure we do have the reservation for the refcountbt. ++ */ ++ ask = used = 0; ++ ++ mp->m_inotbt_nores = true; ++ ++ error = xfs_refcountbt_calc_reserves(mp, agno, &ask, ++ &used); ++ if (error) ++ goto out; ++ ++ error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, ++ ask, used); ++ if (error) ++ goto out; ++ } + } + + /* Create the AGFL metadata reservation */ + if (pag->pag_agfl_resv.ar_asked == 0) { + ask = used = 0; + +- error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno, +- &ask, &used); ++ error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used); + if (error) + goto out; + +@@ -261,9 +289,16 @@ xfs_ag_resv_init( + goto out; + } + ++#ifdef DEBUG ++ /* need to read in the AGF for the ASSERT below to work */ ++ error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0); ++ if (error) ++ return error; ++ + ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + + xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <= + pag->pagf_freeblks + pag->pagf_flcount); ++#endif + out: + return error; + } +--- a/fs/xfs/libxfs/xfs_ialloc_btree.c ++++ b/fs/xfs/libxfs/xfs_ialloc_btree.c +@@ -82,11 +82,12 @@ xfs_finobt_set_root( + } + + STATIC int +-xfs_inobt_alloc_block( ++__xfs_inobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, +- int *stat) ++ int *stat, ++ enum xfs_ag_resv_type resv) + { + xfs_alloc_arg_t args; /* block allocation args */ + int error; /* error return value */ +@@ -103,6 +104,7 @@ xfs_inobt_alloc_block( + args.maxlen = 1; + args.prod = 1; + args.type = XFS_ALLOCTYPE_NEAR_BNO; ++ args.resv = resv; + + error = xfs_alloc_vextent(&args); + if (error) { +@@ -123,6 +125,27 @@ xfs_inobt_alloc_block( + } + + STATIC int ++xfs_inobt_alloc_block( ++ struct xfs_btree_cur *cur, ++ union xfs_btree_ptr *start, ++ union xfs_btree_ptr *new, ++ int *stat) ++{ ++ return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE); ++} ++ ++STATIC int ++xfs_finobt_alloc_block( ++ struct xfs_btree_cur *cur, ++ union xfs_btree_ptr *start, ++ union xfs_btree_ptr *new, ++ int *stat) ++{ ++ return __xfs_inobt_alloc_block(cur, start, new, stat, ++ XFS_AG_RESV_METADATA); ++} ++ ++STATIC int + xfs_inobt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +@@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_fi + + .dup_cursor = xfs_inobt_dup_cursor, + .set_root = xfs_finobt_set_root, +- .alloc_block = xfs_inobt_alloc_block, ++ .alloc_block = xfs_finobt_alloc_block, + .free_block = xfs_inobt_free_block, + .get_minrecs = xfs_inobt_get_minrecs, + .get_maxrecs = xfs_inobt_get_maxrecs, +@@ -478,3 +501,64 @@ xfs_inobt_rec_check_count( + return 0; + } + #endif /* DEBUG */ ++ ++static xfs_extlen_t ++xfs_inobt_max_size( ++ struct xfs_mount *mp) ++{ ++ /* Bail out if we're uninitialized, which can happen in mkfs. */ ++ if (mp->m_inobt_mxr[0] == 0) ++ return 0; ++ ++ return xfs_btree_calc_size(mp, mp->m_inobt_mnr, ++ (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / ++ XFS_INODES_PER_CHUNK); ++} ++ ++static int ++xfs_inobt_count_blocks( ++ struct xfs_mount *mp, ++ xfs_agnumber_t agno, ++ xfs_btnum_t btnum, ++ xfs_extlen_t *tree_blocks) ++{ ++ struct xfs_buf *agbp; ++ struct xfs_btree_cur *cur; ++ int error; ++ ++ error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); ++ if (error) ++ return error; ++ ++ cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum); ++ error = xfs_btree_count_blocks(cur, tree_blocks); ++ xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); ++ xfs_buf_relse(agbp); ++ ++ return error; ++} ++ ++/* ++ * Figure out how many blocks to reserve and how many are used by this btree. ++ */ ++int ++xfs_finobt_calc_reserves( ++ struct xfs_mount *mp, ++ xfs_agnumber_t agno, ++ xfs_extlen_t *ask, ++ xfs_extlen_t *used) ++{ ++ xfs_extlen_t tree_len = 0; ++ int error; ++ ++ if (!xfs_sb_version_hasfinobt(&mp->m_sb)) ++ return 0; ++ ++ error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len); ++ if (error) ++ return error; ++ ++ *ask += xfs_inobt_max_size(mp); ++ *used += tree_len; ++ return 0; ++} +--- a/fs/xfs/libxfs/xfs_ialloc_btree.h ++++ b/fs/xfs/libxfs/xfs_ialloc_btree.h +@@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs + #define xfs_inobt_rec_check_count(mp, rec) 0 + #endif /* DEBUG */ + ++int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, ++ xfs_extlen_t *ask, xfs_extlen_t *used); ++ + #endif /* __XFS_IALLOC_BTREE_H__ */ +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -1801,22 +1801,23 @@ xfs_inactive_ifree( + int error; + + /* +- * The ifree transaction might need to allocate blocks for record +- * insertion to the finobt. We don't want to fail here at ENOSPC, so +- * allow ifree to dip into the reserved block pool if necessary. +- * +- * Freeing large sets of inodes generally means freeing inode chunks, +- * directory and file data blocks, so this should be relatively safe. +- * Only under severe circumstances should it be possible to free enough +- * inodes to exhaust the reserve block pool via finobt expansion while +- * at the same time not creating free space in the filesystem. ++ * We try to use a per-AG reservation for any block needed by the finobt ++ * tree, but as the finobt feature predates the per-AG reservation ++ * support a degraded file system might not have enough space for the ++ * reservation at mount time. In that case try to dip into the reserved ++ * pool and pray. + * + * Send a warning if the reservation does happen to fail, as the inode + * now remains allocated and sits on the unlinked list until the fs is + * repaired. + */ +- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, +- XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); ++ if (unlikely(mp->m_inotbt_nores)) { ++ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, ++ XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, ++ &tp); ++ } else { ++ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp); ++ } + if (error) { + if (error == -ENOSPC) { + xfs_warn_ratelimited(mp, +--- a/fs/xfs/xfs_mount.h ++++ b/fs/xfs/xfs_mount.h +@@ -140,6 +140,7 @@ typedef struct xfs_mount { + int m_fixedfsid[2]; /* unchanged for life of FS */ + uint m_dmevmask; /* DMI events for this FS */ + __uint64_t m_flags; /* global mount flags */ ++ bool m_inotbt_nores; /* no per-AG finobt resv. */ + int m_ialloc_inos; /* inodes in inode allocation */ + int m_ialloc_blks; /* blocks in inode allocation */ + int m_ialloc_min_blks;/* min blocks in sparse inode -- 2.47.3