From: Greg Kroah-Hartman
Date: Sat, 1 Apr 2017 16:40:40 +0000 (+0200)
Subject: 4.10-stable patches
X-Git-Tag: v4.9.21~21
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=167931cea41b639e699cdc48749962eaf132afa9;p=thirdparty%2Fkernel%2Fstable-queue.git

4.10-stable patches

added patches:
      xfs-pull-up-iolock-from-xfs_free_eofblocks.patch
      xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch
---

diff --git a/queue-4.10/series b/queue-4.10/series
index 68ee3f217d8..2d2ac804b39 100644
--- a/queue-4.10/series
+++ b/queue-4.10/series
@@ -1,2 +1,4 @@
 libceph-force-gfp_noio-for-socket-allocations.patch
 kvm-nvmx-fix-nested-ept-detection.patch
+xfs-pull-up-iolock-from-xfs_free_eofblocks.patch
+xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch
diff --git a/queue-4.10/xfs-pull-up-iolock-from-xfs_free_eofblocks.patch b/queue-4.10/xfs-pull-up-iolock-from-xfs_free_eofblocks.patch
new file mode 100644
index 00000000000..3a06e7b4d02
--- /dev/null
+++ b/queue-4.10/xfs-pull-up-iolock-from-xfs_free_eofblocks.patch
@@ -0,0 +1,246 @@
+From a36b926180cda375ac2ec89e1748b47137cfc51c Mon Sep 17 00:00:00 2001
+From: Brian Foster
+Date: Fri, 27 Jan 2017 23:22:55 -0800
+Subject: xfs: pull up iolock from xfs_free_eofblocks()
+
+From: Brian Foster
+
+commit a36b926180cda375ac2ec89e1748b47137cfc51c upstream.
+
+xfs_free_eofblocks() requires the IOLOCK_EXCL lock, but is called from
+different contexts where the lock may or may not be held. The
+need_iolock parameter exists for this reason, to indicate whether
+xfs_free_eofblocks() must acquire the iolock itself before it can
+proceed.
+
+This is ugly and confusing. Simplify the semantics of
+xfs_free_eofblocks() to require the caller to acquire the iolock
+appropriately and kill the need_iolock parameter. While here, the mp
+param can be removed as well as the xfs_mount is accessible from the
+xfs_inode structure. This patch does not change behavior.
+
+Signed-off-by: Brian Foster
+Reviewed-by: Christoph Hellwig
+Reviewed-by: Darrick J. Wong
+Signed-off-by: Darrick J. Wong
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/xfs/xfs_bmap_util.c |   41 +++++++++++++++------------------------
+ fs/xfs/xfs_bmap_util.h |    3 --
+ fs/xfs/xfs_icache.c    |   24 ++++++++++++++---------
+ fs/xfs/xfs_inode.c     |   51 ++++++++++++++++++++++++++-----------------------
+ 4 files changed, 60 insertions(+), 59 deletions(-)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -917,17 +917,18 @@ xfs_can_free_eofblocks(struct xfs_inode
+  */
+ int
+ xfs_free_eofblocks(
+-	xfs_mount_t	*mp,
+-	xfs_inode_t	*ip,
+-	bool		need_iolock)
++	struct xfs_inode	*ip)
+ {
+-	xfs_trans_t	*tp;
+-	int		error;
+-	xfs_fileoff_t	end_fsb;
+-	xfs_fileoff_t	last_fsb;
+-	xfs_filblks_t	map_len;
+-	int		nimaps;
+-	xfs_bmbt_irec_t	imap;
++	struct xfs_trans	*tp;
++	int			error;
++	xfs_fileoff_t		end_fsb;
++	xfs_fileoff_t		last_fsb;
++	xfs_filblks_t		map_len;
++	int			nimaps;
++	struct xfs_bmbt_irec	imap;
++	struct xfs_mount	*mp = ip->i_mount;
++
++	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+ 
+ 	/*
+ 	 * Figure out if there are any blocks beyond the end
+@@ -944,6 +945,10 @@ xfs_free_eofblocks(
+ 	error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
+ 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ 
++	/*
++	 * If there are blocks after the end of file, truncate the file to its
++	 * current size to free them up.
++	 */
+ 	if (!error && (nimaps != 0) &&
+ 	    (imap.br_startblock != HOLESTARTBLOCK ||
+ 	    ip->i_delayed_blks)) {
+@@ -954,22 +959,10 @@ xfs_free_eofblocks(
+ 		if (error)
+ 			return error;
+ 
+-		/*
+-		 * There are blocks after the end of file.
+-		 * Free them up now by truncating the file to
+-		 * its current size.
+-		 */
+-		if (need_iolock) {
+-			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
+-				return -EAGAIN;
+-		}
+-
+ 		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
+ 				&tp);
+ 		if (error) {
+ 			ASSERT(XFS_FORCED_SHUTDOWN(mp));
+-			if (need_iolock)
+-				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ 			return error;
+ 		}
+ 
+@@ -997,8 +990,6 @@ xfs_free_eofblocks(
+ 		}
+ 
+ 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+-		if (need_iolock)
+-			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ 	}
+ 	return error;
+ }
+@@ -1415,7 +1406,7 @@ xfs_shift_file_space(
+ 	 * into the accessible region of the file.
+ 	 */
+ 	if (xfs_can_free_eofblocks(ip, true)) {
+-		error = xfs_free_eofblocks(mp, ip, false);
++		error = xfs_free_eofblocks(ip);
+ 		if (error)
+ 			return error;
+ 	}
+--- a/fs/xfs/xfs_bmap_util.h
++++ b/fs/xfs/xfs_bmap_util.h
+@@ -63,8 +63,7 @@ int xfs_insert_file_space(struct xfs_ino
+ 
+ /* EOF block manipulation functions */
+ bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
+-int	xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
+-			   bool need_iolock);
++int	xfs_free_eofblocks(struct xfs_inode *ip);
+ 
+ int	xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
+ 		struct xfs_swapext *sx);
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1322,7 +1322,7 @@ xfs_inode_free_eofblocks(
+ 	int		flags,
+ 	void		*args)
+ {
+-	int ret;
++	int ret = 0;
+ 	struct xfs_eofblocks *eofb = args;
+ 	bool need_iolock = true;
+ 	int match;
+@@ -1358,19 +1358,25 @@ xfs_inode_free_eofblocks(
+ 			return 0;
+ 
+ 		/*
+-		 * A scan owner implies we already hold the iolock. Skip it in
+-		 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
+-		 * the possibility of EAGAIN being returned.
++		 * A scan owner implies we already hold the iolock. Skip it here
++		 * to avoid deadlock.
+ 		 */
+ 		if (eofb->eof_scan_owner == ip->i_ino)
+ 			need_iolock = false;
+ 	}
+ 
+-	ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock);
+-
+-	/* don't revisit the inode if we're not waiting */
+-	if (ret == -EAGAIN && !(flags & SYNC_WAIT))
+-		ret = 0;
++	/*
++	 * If the caller is waiting, return -EAGAIN to keep the background
++	 * scanner moving and revisit the inode in a subsequent pass.
++	 */
++	if (need_iolock && !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
++		if (flags & SYNC_WAIT)
++			ret = -EAGAIN;
++		return ret;
++	}
++	ret = xfs_free_eofblocks(ip);
++	if (need_iolock)
++		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ 
+ 	return ret;
+ }
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1692,32 +1692,34 @@ xfs_release(
+ 	if (xfs_can_free_eofblocks(ip, false)) {
+ 
+ 		/*
++		 * Check if the inode is being opened, written and closed
++		 * frequently and we have delayed allocation blocks outstanding
++		 * (e.g. streaming writes from the NFS server), truncating the
++		 * blocks past EOF will cause fragmentation to occur.
++		 *
++		 * In this case don't do the truncation, but we have to be
++		 * careful how we detect this case. Blocks beyond EOF show up as
++		 * i_delayed_blks even when the inode is clean, so we need to
++		 * truncate them away first before checking for a dirty release.
++		 * Hence on the first dirty close we will still remove the
++		 * speculative allocation, but after that we will leave it in
++		 * place.
++		 */
++		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
++			return 0;
++		/*
+ 		 * If we can't get the iolock just skip truncating the blocks
+ 		 * past EOF because we could deadlock with the mmap_sem
+-		 * otherwise. We'll get another chance to drop them once the
++		 * otherwise. We'll get another chance to drop them once the
+ 		 * last reference to the inode is dropped, so we'll never leak
+ 		 * blocks permanently.
+-		 *
+-		 * Further, check if the inode is being opened, written and
+-		 * closed frequently and we have delayed allocation blocks
+-		 * outstanding (e.g. streaming writes from the NFS server),
+-		 * truncating the blocks past EOF will cause fragmentation to
+-		 * occur.
+-		 *
+-		 * In this case don't do the truncation, either, but we have to
+-		 * be careful how we detect this case. Blocks beyond EOF show
+-		 * up as i_delayed_blks even when the inode is clean, so we
+-		 * need to truncate them away first before checking for a dirty
+-		 * release. Hence on the first dirty close we will still remove
+-		 * the speculative allocation, but after that we will leave it
+-		 * in place.
+ 		 */
+-		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+-			return 0;
+-
+-		error = xfs_free_eofblocks(mp, ip, true);
+-		if (error && error != -EAGAIN)
+-			return error;
++		if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
++			error = xfs_free_eofblocks(ip);
++			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
++			if (error)
++				return error;
++		}
+ 
+ 		/* delalloc blocks after truncation means it really is dirty */
+ 		if (ip->i_delayed_blks)
+@@ -1904,8 +1906,11 @@ xfs_inactive(
+ 		 * cache. Post-eof blocks must be freed, lest we end up with
+ 		 * broken free space accounting.
+ 		 */
+-		if (xfs_can_free_eofblocks(ip, true))
+-			xfs_free_eofblocks(mp, ip, false);
++		if (xfs_can_free_eofblocks(ip, true)) {
++			xfs_ilock(ip, XFS_IOLOCK_EXCL);
++			xfs_free_eofblocks(ip);
++			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
++		}
+ 
+ 		return;
+ 	}
diff --git a/queue-4.10/xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch b/queue-4.10/xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch
new file mode 100644
index 00000000000..048a92ca5f0
--- /dev/null
+++ b/queue-4.10/xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch
@@ -0,0 +1,228 @@
+From c3155097ad89a956579bc305856a1f2878494e52 Mon Sep 17 00:00:00 2001
+From: Brian Foster
+Date: Fri, 27 Jan 2017 23:22:56 -0800
+Subject: xfs: sync eofblocks scans under iolock are livelock prone
+
+From: Brian Foster
+
+commit c3155097ad89a956579bc305856a1f2878494e52 upstream.
+
+The xfs_eofblocks.eof_scan_owner field is an internal field to
+facilitate invoking eofb scans from the kernel while under the iolock.
+This is necessary because the eofb scan acquires the iolock of each
+inode. Synchronous scans are invoked on certain buffered write failures
+while under iolock. In such cases, the scan owner indicates that the
+context for the scan already owns the particular iolock and prevents a
+double lock deadlock.
+
+eofblocks scans while under iolock are still livelock prone in the event
+of multiple parallel scans, however. If multiple buffered writes to
+different inodes fail and invoke eofblocks scans at the same time, each
+scan avoids a deadlock with its own inode by virtue of the
+eof_scan_owner field, but will never be able to acquire the iolock of
+the inode from the parallel scan. Because the low free space scans are
+invoked with SYNC_WAIT, the scan will not return until it has processed
+every tagged inode and thus both scans will spin indefinitely on the
+iolock being held across the opposite scan. This problem can be
+reproduced reliably by generic/224 on systems with higher cpu counts
+(x16).
+
+To avoid this problem, simplify the semantics of eofblocks scans to
+never invoke a scan while under iolock. This means that the buffered
+write context must drop the iolock before the scan. It must reacquire
+the lock before the write retry and also repeat the initial write
+checks, as the original state might no longer be valid once the iolock
+was dropped.
+
+Signed-off-by: Brian Foster
+Reviewed-by: Christoph Hellwig
+Reviewed-by: Darrick J. Wong
+Signed-off-by: Darrick J. Wong
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/xfs/xfs_file.c   |   13 +++++++++----
+ fs/xfs/xfs_icache.c |   45 +++++++--------------------------------------
+ fs/xfs/xfs_icache.h |    2 --
+ 3 files changed, 16 insertions(+), 44 deletions(-)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -614,8 +614,10 @@ xfs_file_buffered_aio_write(
+ 	struct xfs_inode	*ip = XFS_I(inode);
+ 	ssize_t			ret;
+ 	int			enospc = 0;
+-	int			iolock = XFS_IOLOCK_EXCL;
++	int			iolock;
+ 
++write_retry:
++	iolock = XFS_IOLOCK_EXCL;
+ 	xfs_ilock(ip, iolock);
+ 
+ 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
+@@ -625,7 +627,6 @@ xfs_file_buffered_aio_write(
+ 	/* We can write back this queue in page reclaim */
+ 	current->backing_dev_info = inode_to_bdi(inode);
+ 
+-write_retry:
+ 	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
+ 	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
+ 	if (likely(ret >= 0))
+@@ -641,18 +642,21 @@ write_retry:
+ 	 * running at the same time.
+ 	 */
+ 	if (ret == -EDQUOT && !enospc) {
++		xfs_iunlock(ip, iolock);
+ 		enospc = xfs_inode_free_quota_eofblocks(ip);
+ 		if (enospc)
+ 			goto write_retry;
+ 		enospc = xfs_inode_free_quota_cowblocks(ip);
+ 		if (enospc)
+ 			goto write_retry;
++		iolock = 0;
+ 	} else if (ret == -ENOSPC && !enospc) {
+ 		struct xfs_eofblocks eofb = {0};
+ 
+ 		enospc = 1;
+ 		xfs_flush_inodes(ip->i_mount);
+-		eofb.eof_scan_owner = ip->i_ino; /* for locking */
++
++		xfs_iunlock(ip, iolock);
+ 		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
+ 		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
+ 		goto write_retry;
+@@ -660,7 +664,8 @@ write_retry:
+ 
+ 	current->backing_dev_info = NULL;
+ out:
+-	xfs_iunlock(ip, iolock);
++	if (iolock)
++		xfs_iunlock(ip, iolock);
+ 	return ret;
+ }
+ 
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1324,11 +1324,8 @@ xfs_inode_free_eofblocks(
+ {
+ 	int ret = 0;
+ 	struct xfs_eofblocks *eofb = args;
+-	bool need_iolock = true;
+ 	int match;
+ 
+-	ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
+-
+ 	if (!xfs_can_free_eofblocks(ip, false)) {
+ 		/* inode could be preallocated or append-only */
+ 		trace_xfs_inode_free_eofblocks_invalid(ip);
+@@ -1356,27 +1353,19 @@ xfs_inode_free_eofblocks(
+ 		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
+ 		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
+ 			return 0;
+-
+-		/*
+-		 * A scan owner implies we already hold the iolock. Skip it here
+-		 * to avoid deadlock.
+-		 */
+-		if (eofb->eof_scan_owner == ip->i_ino)
+-			need_iolock = false;
+ 	}
+ 
+ 	/*
+ 	 * If the caller is waiting, return -EAGAIN to keep the background
+ 	 * scanner moving and revisit the inode in a subsequent pass.
+ 	 */
+-	if (need_iolock && !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
++	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+ 		if (flags & SYNC_WAIT)
+ 			ret = -EAGAIN;
+ 		return ret;
+ 	}
+ 	ret = xfs_free_eofblocks(ip);
+-	if (need_iolock)
+-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
++	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ 
+ 	return ret;
+ }
+@@ -1423,15 +1412,10 @@ __xfs_inode_free_quota_eofblocks(
+ 	struct xfs_eofblocks eofb = {0};
+ 	struct xfs_dquot *dq;
+ 
+-	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+-
+ 	/*
+-	 * Set the scan owner to avoid a potential livelock. Otherwise, the scan
+-	 * can repeatedly trylock on the inode we're currently processing. We
+-	 * run a sync scan to increase effectiveness and use the union filter to
++	 * Run a sync scan to increase effectiveness and use the union filter to
+ 	 * cover all applicable quotas in a single scan.
+ 	 */
+-	eofb.eof_scan_owner = ip->i_ino;
+ 	eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
+ 
+ 	if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
+@@ -1583,12 +1567,9 @@ xfs_inode_free_cowblocks(
+ {
+ 	int ret;
+ 	struct xfs_eofblocks *eofb = args;
+-	bool need_iolock = true;
+ 	int match;
+ 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ 
+-	ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
+-
+ 	/*
+ 	 * Just clear the tag if we have an empty cow fork or none at all. It's
+ 	 * possible the inode was fully unshared since it was originally tagged.
+@@ -1621,28 +1602,16 @@ xfs_inode_free_cowblocks(
+ 		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
+ 		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
+ 			return 0;
+-
+-		/*
+-		 * A scan owner implies we already hold the iolock. Skip it in
+-		 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
+-		 * the possibility of EAGAIN being returned.
+-		 */
+-		if (eofb->eof_scan_owner == ip->i_ino)
+-			need_iolock = false;
+ 	}
+ 
+ 	/* Free the CoW blocks */
+-	if (need_iolock) {
+-		xfs_ilock(ip, XFS_IOLOCK_EXCL);
+-		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+-	}
++	xfs_ilock(ip, XFS_IOLOCK_EXCL);
++	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ 
+ 	ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
+ 
+-	if (need_iolock) {
+-		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+-	}
++	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
++	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ 
+ 	return ret;
+ }
+--- a/fs/xfs/xfs_icache.h
++++ b/fs/xfs/xfs_icache.h
+@@ -27,7 +27,6 @@ struct xfs_eofblocks {
+ 	kgid_t		eof_gid;
+ 	prid_t		eof_prid;
+ 	__u64		eof_min_file_size;
+-	xfs_ino_t	eof_scan_owner;
+ };
+ 
+ #define SYNC_WAIT		0x0001	/* wait for i/o to complete */
+@@ -102,7 +101,6 @@ xfs_fs_eofblocks_from_user(
+ 	dst->eof_flags = src->eof_flags;
+ 	dst->eof_prid = src->eof_prid;
+ 	dst->eof_min_file_size = src->eof_min_file_size;
+-	dst->eof_scan_owner = NULLFSINO;
+ 
+ 	dst->eof_uid = INVALID_UID;
+ 	if (src->eof_flags & XFS_EOF_FLAGS_UID) {