From 138885ca44882168bb8def38d52b3b404c7363a2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 1 Apr 2017 19:33:18 +0200 Subject: [PATCH] 4.9-stable patches added patches: xfs-allow-unwritten-extents-in-the-cow-fork.patch xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch xfs-don-t-fail-xfs_extent_busy-allocation.patch xfs-don-t-reserve-blocks-for-right-shift-transactions.patch xfs-fail-_dir_open-when-readahead-fails.patch xfs-filter-out-obviously-bad-btree-pointers.patch xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch xfs-only-reclaim-unwritten-cow-extents-periodically.patch xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch xfs-split-indlen-reservations-fairly-when-under-reserved.patch xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch xfs-tune-down-agno-asserts-in-the-bmap-code.patch xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch xfs-verify-free-block-header-fields.patch --- queue-4.9/series | 23 ++ ...ow-unwritten-extents-in-the-cow-fork.patch | 278 ++++++++++++++ ...ly-bad-level-values-in-the-bmbt-root.patch | 55 +++ ...on-t-fail-xfs_extent_busy-allocation.patch | 45 +++ ...-blocks-for-right-shift-transactions.patch | 94 +++++ ...-fail-_dir_open-when-readahead-fails.patch | 75 ++++ ...ter-out-obviously-bad-btree-pointers.patch | 66 ++++ ...eamline-error-handling-in-xfs_end_io.patch | 
111 ++++++ ...with-file-extending-async-dio-writes.patch | 61 ++++ ...king-an-inode-to-access-the-data-map.patch | 48 +++ ...zed-variable-in-_reflink_convert_cow.patch | 31 ++ ...en-shortage-on-delalloc-extent-merge.patch | 76 ++++ ...-prealloc-cow-fork-extents-unwritten.patch | 339 ++++++++++++++++++ ...m-unwritten-cow-extents-periodically.patch | 159 ++++++++ ...ned-direct-writes-to-reflinked-files.patch | 125 +++++++ ...-clear-the-retry-status-of-xfs_buf_t.patch | 32 ++ ...ervations-fairly-when-under-reserved.patch | 118 ++++++ ...he-first-btree-block-when-reflinking.patch | 97 +++++ ...e-down-agno-asserts-in-the-bmap-code.patch | 83 +++++ ...-mtime-on-clone-destinatation-inodes.patch | 66 ++++ ...-for-newly-allocated-delalloc-blocks.patch | 150 ++++++++ ...sb-to-calculate-inode-alignment-mask.patch | 44 +++ ...b-to-calculate-inode-chunk-alignment.patch | 91 +++++ .../xfs-verify-free-block-header-fields.patch | 93 +++++ 24 files changed, 2360 insertions(+) create mode 100644 queue-4.9/xfs-allow-unwritten-extents-in-the-cow-fork.patch create mode 100644 queue-4.9/xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch create mode 100644 queue-4.9/xfs-don-t-fail-xfs_extent_busy-allocation.patch create mode 100644 queue-4.9/xfs-don-t-reserve-blocks-for-right-shift-transactions.patch create mode 100644 queue-4.9/xfs-fail-_dir_open-when-readahead-fails.patch create mode 100644 queue-4.9/xfs-filter-out-obviously-bad-btree-pointers.patch create mode 100644 queue-4.9/xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch create mode 100644 queue-4.9/xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch create mode 100644 queue-4.9/xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch create mode 100644 queue-4.9/xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch create mode 100644 queue-4.9/xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch create mode 100644 
queue-4.9/xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch create mode 100644 queue-4.9/xfs-only-reclaim-unwritten-cow-extents-periodically.patch create mode 100644 queue-4.9/xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch create mode 100644 queue-4.9/xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch create mode 100644 queue-4.9/xfs-split-indlen-reservations-fairly-when-under-reserved.patch create mode 100644 queue-4.9/xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch create mode 100644 queue-4.9/xfs-tune-down-agno-asserts-in-the-bmap-code.patch create mode 100644 queue-4.9/xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch create mode 100644 queue-4.9/xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch create mode 100644 queue-4.9/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch create mode 100644 queue-4.9/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch create mode 100644 queue-4.9/xfs-verify-free-block-header-fields.patch diff --git a/queue-4.9/series b/queue-4.9/series index db553e76a6f..1c97673490f 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -4,3 +4,26 @@ xfs-only-update-mount-resv-fields-on-success-in-__xfs_ag_resv_init.patch xfs-use-per-ag-reservations-for-the-finobt.patch xfs-pull-up-iolock-from-xfs_free_eofblocks.patch xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch +xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch +xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch +xfs-fail-_dir_open-when-readahead-fails.patch +xfs-filter-out-obviously-bad-btree-pointers.patch +xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch +xfs-verify-free-block-header-fields.patch +xfs-allow-unwritten-extents-in-the-cow-fork.patch +xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch 
+xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch +xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch +xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch +xfs-don-t-fail-xfs_extent_busy-allocation.patch +xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch +xfs-split-indlen-reservations-fairly-when-under-reserved.patch +xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch +xfs-don-t-reserve-blocks-for-right-shift-transactions.patch +xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch +xfs-tune-down-agno-asserts-in-the-bmap-code.patch +xfs-only-reclaim-unwritten-cow-extents-periodically.patch +xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch +xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch +xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch +xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch diff --git a/queue-4.9/xfs-allow-unwritten-extents-in-the-cow-fork.patch b/queue-4.9/xfs-allow-unwritten-extents-in-the-cow-fork.patch new file mode 100644 index 00000000000..900ce34164d --- /dev/null +++ b/queue-4.9/xfs-allow-unwritten-extents-in-the-cow-fork.patch @@ -0,0 +1,278 @@ +From 05a630d76bd3f39baf0eecfa305bed2820796dee Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 2 Feb 2017 15:14:01 -0800 +Subject: xfs: allow unwritten extents in the CoW fork + +From: Darrick J. Wong + +commit 05a630d76bd3f39baf0eecfa305bed2820796dee upstream. + +In the data fork, we only allow extents to perform the following state +transitions: + +delay -> real <-> unwritten + +There's no way to move directly from a delalloc reservation to an +/unwritten/ allocated extent. 
However, for the CoW fork we want to be +able to do the following to each extent: + +delalloc -> unwritten -> written -> remapped to data fork + +This will help us to avoid a race in the speculative CoW preallocation +code between a first thread that is allocating a CoW extent and a second +thread that is remapping part of a file after a write. In order to do +this, however, we need two things: first, we have to be able to +transition from da to unwritten, and second the function that converts +between real and unwritten has to be made aware of the cow fork. Do +both of those things. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 80 +++++++++++++++++++++++++++++------------------ + 1 file changed, 50 insertions(+), 30 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -1952,6 +1952,7 @@ xfs_bmap_add_extent_delay_real( + */ + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, new->br_startblock); ++ xfs_bmbt_set_state(ep, new->br_state); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + (*nextents)++; +@@ -2290,6 +2291,7 @@ STATIC int /* error */ + xfs_bmap_add_extent_unwritten_real( + struct xfs_trans *tp, + xfs_inode_t *ip, /* incore inode pointer */ ++ int whichfork, + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + xfs_bmbt_irec_t *new, /* new data to add to file extents */ +@@ -2309,12 +2311,14 @@ xfs_bmap_add_extent_unwritten_real( + /* left is 0, right is 1, prev is 2 */ + int rval=0; /* return value (logging flags) */ + int state = 0;/* state bits, accessed thru macros */ +- struct xfs_mount *mp = tp->t_mountp; ++ struct xfs_mount *mp = ip->i_mount; + + *logflagsp = 0; + + cur = *curp; +- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ++ ifp = XFS_IFORK_PTR(ip, whichfork); ++ if (whichfork == XFS_COW_FORK) ++ 
state |= BMAP_COWFORK; + + ASSERT(*idx >= 0); + ASSERT(*idx <= xfs_iext_count(ifp)); +@@ -2373,7 +2377,7 @@ xfs_bmap_add_extent_unwritten_real( + * Don't set contiguous if the combined extent would be too large. + * Also check for all-three-contiguous being too large. + */ +- if (*idx < xfs_iext_count(&ip->i_df) - 1) { ++ if (*idx < xfs_iext_count(ifp) - 1) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + if (isnullstartblock(RIGHT.br_startblock)) +@@ -2413,7 +2417,8 @@ xfs_bmap_add_extent_unwritten_real( + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 2, state); +- ip->i_d.di_nextents -= 2; ++ XFS_IFORK_NEXT_SET(ip, whichfork, ++ XFS_IFORK_NEXTENTS(ip, whichfork) - 2); + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { +@@ -2456,7 +2461,8 @@ xfs_bmap_add_extent_unwritten_real( + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); +- ip->i_d.di_nextents--; ++ XFS_IFORK_NEXT_SET(ip, whichfork, ++ XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { +@@ -2491,7 +2497,8 @@ xfs_bmap_add_extent_unwritten_real( + xfs_bmbt_set_state(ep, newext); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_remove(ip, *idx + 1, 1, state); +- ip->i_d.di_nextents--; ++ XFS_IFORK_NEXT_SET(ip, whichfork, ++ XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { +@@ -2603,7 +2610,8 @@ xfs_bmap_add_extent_unwritten_real( + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_insert(ip, *idx, 1, new, state); +- ip->i_d.di_nextents++; ++ XFS_IFORK_NEXT_SET(ip, whichfork, ++ XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { +@@ -2681,7 +2689,8 @@ xfs_bmap_add_extent_unwritten_real( + ++*idx; + xfs_iext_insert(ip, *idx, 1, new, state); + +- 
ip->i_d.di_nextents++; ++ XFS_IFORK_NEXT_SET(ip, whichfork, ++ XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { +@@ -2729,7 +2738,8 @@ xfs_bmap_add_extent_unwritten_real( + ++*idx; + xfs_iext_insert(ip, *idx, 2, &r[0], state); + +- ip->i_d.di_nextents += 2; ++ XFS_IFORK_NEXT_SET(ip, whichfork, ++ XFS_IFORK_NEXTENTS(ip, whichfork) + 2); + if (cur == NULL) + rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; + else { +@@ -2783,17 +2793,17 @@ xfs_bmap_add_extent_unwritten_real( + } + + /* update reverse mappings */ +- error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new); ++ error = xfs_rmap_convert_extent(mp, dfops, ip, whichfork, new); + if (error) + goto done; + + /* convert to a btree if necessary */ +- if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { ++ if (xfs_bmap_needs_btree(ip, whichfork)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur, +- 0, &tmp_logflags, XFS_DATA_FORK); ++ 0, &tmp_logflags, whichfork); + *logflagsp |= tmp_logflags; + if (error) + goto done; +@@ -2805,7 +2815,7 @@ xfs_bmap_add_extent_unwritten_real( + *curp = cur; + } + +- xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK); ++ xfs_bmap_check_leaf_extents(*curp, ip, whichfork); + done: + *logflagsp |= rval; + return error; +@@ -4458,10 +4468,16 @@ xfs_bmapi_allocate( + bma->got.br_state = XFS_EXT_NORM; + + /* +- * A wasdelay extent has been initialized, so shouldn't be flagged +- * as unwritten. ++ * In the data fork, a wasdelay extent has been initialized, so ++ * shouldn't be flagged as unwritten. ++ * ++ * For the cow fork, however, we convert delalloc reservations ++ * (extents allocated for speculative preallocation) to ++ * allocated unwritten extents, and only convert the unwritten ++ * extents to real extents when we're about to write the data. 
+ */ +- if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && ++ if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) && ++ (bma->flags & XFS_BMAPI_PREALLOC) && + xfs_sb_version_hasextflgbit(&mp->m_sb)) + bma->got.br_state = XFS_EXT_UNWRITTEN; + +@@ -4512,8 +4528,6 @@ xfs_bmapi_convert_unwritten( + (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) + return 0; + +- ASSERT(whichfork != XFS_COW_FORK); +- + /* + * Modify (by adding) the state flag, if writing. + */ +@@ -4538,8 +4552,8 @@ xfs_bmapi_convert_unwritten( + return error; + } + +- error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, +- &bma->cur, mval, bma->firstblock, bma->dfops, ++ error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork, ++ &bma->idx, &bma->cur, mval, bma->firstblock, bma->dfops, + &tmp_logflags); + /* + * Log the inode core unconditionally in the unwritten extent conversion +@@ -4548,8 +4562,12 @@ xfs_bmapi_convert_unwritten( + * in the transaction for the sake of fsync(), even if nothing has + * changed, because fsync() will not force the log for this transaction + * unless it sees the inode pinned. ++ * ++ * Note: If we're only converting cow fork extents, there aren't ++ * any on-disk updates to make, so we don't need to log anything. 
+ */ +- bma->logflags |= tmp_logflags | XFS_ILOG_CORE; ++ if (whichfork != XFS_COW_FORK) ++ bma->logflags |= tmp_logflags | XFS_ILOG_CORE; + if (error) + return error; + +@@ -4623,15 +4641,15 @@ xfs_bmapi_write( + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & XFS_BMAPI_IGSTATE)); +- ASSERT(tp != NULL); ++ ASSERT(tp != NULL || ++ (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) == ++ (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)); + ASSERT(len > 0); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK); + ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP)); + ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP)); +- ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK); +- ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK); + + /* zeroing is for currently only for data extents, not metadata */ + ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != +@@ -5653,8 +5671,8 @@ __xfs_bunmapi( + } + del.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent_unwritten_real(tp, ip, +- &lastx, &cur, &del, firstblock, dfops, +- &logflags); ++ whichfork, &lastx, &cur, &del, ++ firstblock, dfops, &logflags); + if (error) + goto error0; + goto nodelete; +@@ -5711,8 +5729,9 @@ __xfs_bunmapi( + prev.br_state = XFS_EXT_UNWRITTEN; + lastx--; + error = xfs_bmap_add_extent_unwritten_real(tp, +- ip, &lastx, &cur, &prev, +- firstblock, dfops, &logflags); ++ ip, whichfork, &lastx, &cur, ++ &prev, firstblock, dfops, ++ &logflags); + if (error) + goto error0; + goto nodelete; +@@ -5720,8 +5739,9 @@ __xfs_bunmapi( + ASSERT(del.br_state == XFS_EXT_NORM); + del.br_state = XFS_EXT_UNWRITTEN; + error = xfs_bmap_add_extent_unwritten_real(tp, +- ip, &lastx, &cur, &del, +- firstblock, dfops, &logflags); ++ ip, whichfork, &lastx, &cur, ++ &del, firstblock, dfops, ++ &logflags); + if 
(error) + goto error0; + goto nodelete; diff --git a/queue-4.9/xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch b/queue-4.9/xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch new file mode 100644 index 00000000000..5d1c3147e6e --- /dev/null +++ b/queue-4.9/xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch @@ -0,0 +1,55 @@ +From b3bf607d58520ea8c0666aeb4be60dbb724cd3a2 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 2 Feb 2017 15:13:59 -0800 +Subject: xfs: check for obviously bad level values in the bmbt root + +From: Darrick J. Wong + +commit b3bf607d58520ea8c0666aeb4be60dbb724cd3a2 upstream. + +We can't handle a bmbt that's taller than BTREE_MAXLEVELS, and there's +no such thing as a zero-level bmbt (for that we have extents format), +so if we see this, send back an error code. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_inode_fork.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_inode_fork.c ++++ b/fs/xfs/libxfs/xfs_inode_fork.c +@@ -26,6 +26,7 @@ + #include "xfs_inode.h" + #include "xfs_trans.h" + #include "xfs_inode_item.h" ++#include "xfs_btree.h" + #include "xfs_bmap_btree.h" + #include "xfs_bmap.h" + #include "xfs_error.h" +@@ -429,11 +430,13 @@ xfs_iformat_btree( + /* REFERENCED */ + int nrecs; + int size; ++ int level; + + ifp = XFS_IFORK_PTR(ip, whichfork); + dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); + size = XFS_BMAP_BROOT_SPACE(mp, dfp); + nrecs = be16_to_cpu(dfp->bb_numrecs); ++ level = be16_to_cpu(dfp->bb_level); + + /* + * blow out if -- fork has less extents than can fit in +@@ -446,7 +449,8 @@ xfs_iformat_btree( + XFS_IFORK_MAXEXT(ip, whichfork) || + XFS_BMDR_SPACE_CALC(nrecs) > + XFS_DFORK_SIZE(dip, mp, whichfork) || +- XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { ++ XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) || ++ level == 0 || 
level > XFS_BTREE_MAXLEVELS) { + xfs_warn(mp, "corrupt inode %Lu (btree).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, diff --git a/queue-4.9/xfs-don-t-fail-xfs_extent_busy-allocation.patch b/queue-4.9/xfs-don-t-fail-xfs_extent_busy-allocation.patch new file mode 100644 index 00000000000..4d007c3ef7e --- /dev/null +++ b/queue-4.9/xfs-don-t-fail-xfs_extent_busy-allocation.patch @@ -0,0 +1,45 @@ +From 5e30c23d13919a718b22d4921dc5c0accc59da27 Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Tue, 7 Feb 2017 14:06:46 -0800 +Subject: xfs: don't fail xfs_extent_busy allocation + +From: Christoph Hellwig + +commit 5e30c23d13919a718b22d4921dc5c0accc59da27 upstream. + +We don't just need the structure to track busy extents which can be +avoided with a synchronous transaction, but also to keep track of +pending discard. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_extent_busy.c | 13 +------------ + 1 file changed, 1 insertion(+), 12 deletions(-) + +--- a/fs/xfs/xfs_extent_busy.c ++++ b/fs/xfs/xfs_extent_busy.c +@@ -45,18 +45,7 @@ xfs_extent_busy_insert( + struct rb_node **rbp; + struct rb_node *parent = NULL; + +- new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL); +- if (!new) { +- /* +- * No Memory! Since it is now not possible to track the free +- * block, make this a synchronous transaction to insure that +- * the block is not reused before this transaction commits. 
+- */ +- trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len); +- xfs_trans_set_sync(tp); +- return; +- } +- ++ new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP); + new->agno = agno; + new->bno = bno; + new->length = len; diff --git a/queue-4.9/xfs-don-t-reserve-blocks-for-right-shift-transactions.patch b/queue-4.9/xfs-don-t-reserve-blocks-for-right-shift-transactions.patch new file mode 100644 index 00000000000..a7aa9fdfacc --- /dev/null +++ b/queue-4.9/xfs-don-t-reserve-blocks-for-right-shift-transactions.patch @@ -0,0 +1,94 @@ +From 48af96ab92bc68fb645068b978ce36df2379e076 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Wed, 15 Feb 2017 10:18:10 -0800 +Subject: xfs: don't reserve blocks for right shift transactions + +From: Brian Foster + +commit 48af96ab92bc68fb645068b978ce36df2379e076 upstream. + +The block reservation for the transaction allocated in +xfs_shift_file_space() is an artifact of the original collapse range +support. It exists to handle the case where a collapse range occurs, +the initial extent is left shifted into a location that forms a +contiguous boundary with the previous extent and thus the extents +are merged. This code was subsequently refactored and reused for +insert range (right shift) support. + +If an insert range occurs under low free space conditions, the +extent at the starting offset is split before the first shift +transaction is allocated. If the block reservation fails, this +leaves separate, but contiguous extents around in the inode. While +not a fatal problem, this is unexpected and will flag a warning on +subsequent insert range operations on the inode. This problem has +been reproduce intermittently by generic/270 running against a +ramdisk device. + +Since right shift does not create new extent boundaries in the +inode, a block reservation for extent merge is unnecessary. Update +xfs_shift_file_space() to conditionally reserve fs blocks for left +shift transactions only. 
This avoids the warning reproduced by +generic/270. + +Reported-by: Ross Zwisler +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_bmap_util.c | 20 ++++++++++---------- + 1 file changed, 10 insertions(+), 10 deletions(-) + +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -1387,10 +1387,16 @@ xfs_shift_file_space( + xfs_fileoff_t stop_fsb; + xfs_fileoff_t next_fsb; + xfs_fileoff_t shift_fsb; ++ uint resblks; + + ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); + + if (direction == SHIFT_LEFT) { ++ /* ++ * Reserve blocks to cover potential extent merges after left ++ * shift operations. ++ */ ++ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + next_fsb = XFS_B_TO_FSB(mp, offset + len); + stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); + } else { +@@ -1398,6 +1404,7 @@ xfs_shift_file_space( + * If right shift, delegate the work of initialization of + * next_fsb to xfs_bmap_shift_extent as it has ilock held. + */ ++ resblks = 0; + next_fsb = NULLFSBLOCK; + stop_fsb = XFS_B_TO_FSB(mp, offset); + } +@@ -1439,21 +1446,14 @@ xfs_shift_file_space( + } + + while (!error && !done) { +- /* +- * We would need to reserve permanent block for transaction. +- * This will come into picture when after shifting extent into +- * hole we found that adjacent extents can be merged which +- * may lead to freeing of a block during record update. 
+- */ +- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, +- XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); ++ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, ++ &tp); + if (error) + break; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, +- ip->i_gdquot, ip->i_pdquot, +- XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, ++ ip->i_gdquot, ip->i_pdquot, resblks, 0, + XFS_QMOPT_RES_REGBLKS); + if (error) + goto out_trans_cancel; diff --git a/queue-4.9/xfs-fail-_dir_open-when-readahead-fails.patch b/queue-4.9/xfs-fail-_dir_open-when-readahead-fails.patch new file mode 100644 index 00000000000..2e5501f482f --- /dev/null +++ b/queue-4.9/xfs-fail-_dir_open-when-readahead-fails.patch @@ -0,0 +1,75 @@ +From 7a652bbe366464267190c2792a32ce4fff5595ef Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 2 Feb 2017 15:13:58 -0800 +Subject: xfs: fail _dir_open when readahead fails + +From: Darrick J. Wong + +commit 7a652bbe366464267190c2792a32ce4fff5595ef upstream. + +When we open a directory, we try to readahead block 0 of the directory +on the assumption that we're going to need it soon. If the bmbt is +corrupt, the directory will never be usable and the readahead fails +immediately, so we might as well prevent the directory from being opened +at all. This prevents a subsequent read or modify operation from +hitting it and taking the fs offline. + +NOTE: We're only checking for early failures in the block mapping, not +the readahead directory block itself. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Eric Sandeen +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_da_btree.c | 6 ++---- + fs/xfs/libxfs/xfs_da_btree.h | 2 +- + fs/xfs/xfs_file.c | 4 ++-- + 3 files changed, 5 insertions(+), 7 deletions(-) + +--- a/fs/xfs/libxfs/xfs_da_btree.c ++++ b/fs/xfs/libxfs/xfs_da_btree.c +@@ -2633,7 +2633,7 @@ out_free: + /* + * Readahead the dir/attr block. 
+ */ +-xfs_daddr_t ++int + xfs_da_reada_buf( + struct xfs_inode *dp, + xfs_dablk_t bno, +@@ -2664,7 +2664,5 @@ out_free: + if (mapp != &map) + kmem_free(mapp); + +- if (error) +- return -1; +- return mappedbno; ++ return error; + } +--- a/fs/xfs/libxfs/xfs_da_btree.h ++++ b/fs/xfs/libxfs/xfs_da_btree.h +@@ -201,7 +201,7 @@ int xfs_da_read_buf(struct xfs_trans *tr + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp, int whichfork, + const struct xfs_buf_ops *ops); +-xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, ++int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, + xfs_daddr_t mapped_bno, int whichfork, + const struct xfs_buf_ops *ops); + int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -992,9 +992,9 @@ xfs_dir_open( + */ + mode = xfs_ilock_data_map_shared(ip); + if (ip->i_d.di_nextents > 0) +- xfs_dir3_data_readahead(ip, 0, -1); ++ error = xfs_dir3_data_readahead(ip, 0, -1); + xfs_iunlock(ip, mode); +- return 0; ++ return error; + } + + STATIC int diff --git a/queue-4.9/xfs-filter-out-obviously-bad-btree-pointers.patch b/queue-4.9/xfs-filter-out-obviously-bad-btree-pointers.patch new file mode 100644 index 00000000000..593fc6f95f7 --- /dev/null +++ b/queue-4.9/xfs-filter-out-obviously-bad-btree-pointers.patch @@ -0,0 +1,66 @@ +From d5a91baeb6033c3392121e4d5c011cdc08dfa9f7 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 2 Feb 2017 15:13:58 -0800 +Subject: xfs: filter out obviously bad btree pointers + +From: Darrick J. Wong + +commit d5a91baeb6033c3392121e4d5c011cdc08dfa9f7 upstream. + +Don't let anybody load an obviously bad btree pointer. Since the values +come from disk, we must return an error, not just ASSERT. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Eric Sandeen +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 5 +---- + fs/xfs/libxfs/xfs_btree.c | 3 ++- + fs/xfs/libxfs/xfs_btree.h | 2 +- + 3 files changed, 4 insertions(+), 6 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -1278,7 +1278,6 @@ xfs_bmap_read_extents( + /* REFERENCED */ + xfs_extnum_t room; /* number of entries there's room for */ + +- bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : +@@ -1291,9 +1290,7 @@ xfs_bmap_read_extents( + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); +- ASSERT(bno != NULLFSBLOCK); +- ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); +- ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); ++ + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. +--- a/fs/xfs/libxfs/xfs_btree.c ++++ b/fs/xfs/libxfs/xfs_btree.c +@@ -810,7 +810,8 @@ xfs_btree_read_bufl( + xfs_daddr_t d; /* real disk block address */ + int error; + +- ASSERT(fsbno != NULLFSBLOCK); ++ if (!XFS_FSB_SANITY_CHECK(mp, fsbno)) ++ return -EFSCORRUPTED; + d = XFS_FSB_TO_DADDR(mp, fsbno); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, lock, &bp, ops); +--- a/fs/xfs/libxfs/xfs_btree.h ++++ b/fs/xfs/libxfs/xfs_btree.h +@@ -491,7 +491,7 @@ static inline int xfs_btree_get_level(st + #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) + + #define XFS_FSB_SANITY_CHECK(mp,fsb) \ +- (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ ++ (fsb && XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ + XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) + + /* diff --git a/queue-4.9/xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch b/queue-4.9/xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch new file mode 100644 index 
00000000000..92637ba6d41 --- /dev/null +++ b/queue-4.9/xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch @@ -0,0 +1,111 @@ +From 787eb485509f9d58962bd8b4dbc6a5ac6e2034fe Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Thu, 2 Mar 2017 15:02:51 -0800 +Subject: xfs: fix and streamline error handling in xfs_end_io + +From: Christoph Hellwig + +commit 787eb485509f9d58962bd8b4dbc6a5ac6e2034fe upstream. + +There are two different cases of buffered I/O errors: + + - first we can have an already shutdown fs. In that case we should skip + any on-disk operations and just clean up the appen transaction if + present and destroy the ioend + - a real I/O error. In that case we should cleanup any lingering COW + blocks. This gets skipped in the current code and is fixed by this + patch. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_aops.c | 59 ++++++++++++++++++++++++------------------------------ + 1 file changed, 27 insertions(+), 32 deletions(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -279,54 +279,49 @@ xfs_end_io( + struct xfs_ioend *ioend = + container_of(work, struct xfs_ioend, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); ++ xfs_off_t offset = ioend->io_offset; ++ size_t size = ioend->io_size; + int error = ioend->io_bio->bi_error; + + /* +- * Set an error if the mount has shut down and proceed with end I/O +- * processing so it can perform whatever cleanups are necessary. ++ * Just clean up the in-memory strutures if the fs has been shut down. + */ +- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) ++ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + error = -EIO; ++ goto done; ++ } + + /* +- * For a CoW extent, we need to move the mapping from the CoW fork +- * to the data fork. If instead an error happened, just dump the +- * new blocks. ++ * Clean up any COW blocks on an I/O error. 
+ */ +- if (ioend->io_type == XFS_IO_COW) { +- if (error) +- goto done; +- if (ioend->io_bio->bi_error) { +- error = xfs_reflink_cancel_cow_range(ip, +- ioend->io_offset, ioend->io_size, true); +- goto done; ++ if (unlikely(error)) { ++ switch (ioend->io_type) { ++ case XFS_IO_COW: ++ xfs_reflink_cancel_cow_range(ip, offset, size, true); ++ break; + } +- error = xfs_reflink_end_cow(ip, ioend->io_offset, +- ioend->io_size); +- if (error) +- goto done; ++ ++ goto done; + } + + /* +- * For unwritten extents we need to issue transactions to convert a +- * range to normal written extens after the data I/O has finished. +- * Detecting and handling completion IO errors is done individually +- * for each case as different cleanup operations need to be performed +- * on error. ++ * Success: commit the COW or unwritten blocks if needed. + */ +- if (ioend->io_type == XFS_IO_UNWRITTEN) { +- if (error) +- goto done; +- error = xfs_iomap_write_unwritten(ip, ioend->io_offset, +- ioend->io_size); +- } else if (ioend->io_append_trans) { +- error = xfs_setfilesize_ioend(ioend, error); +- } else { +- ASSERT(!xfs_ioend_is_append(ioend) || +- ioend->io_type == XFS_IO_COW); ++ switch (ioend->io_type) { ++ case XFS_IO_COW: ++ error = xfs_reflink_end_cow(ip, offset, size); ++ break; ++ case XFS_IO_UNWRITTEN: ++ error = xfs_iomap_write_unwritten(ip, offset, size); ++ break; ++ default: ++ ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); ++ break; + } + + done: ++ if (ioend->io_append_trans) ++ error = xfs_setfilesize_ioend(ioend, error); + xfs_destroy_ioend(ioend, error); + } + diff --git a/queue-4.9/xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch b/queue-4.9/xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch new file mode 100644 index 00000000000..b67003b4eeb --- /dev/null +++ b/queue-4.9/xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch @@ -0,0 +1,61 @@ +From e4229d6b0bc9280f29624faf170cf76a9f1ca60e Mon Sep 17 00:00:00 2001 
+From: Brian Foster +Date: Fri, 27 Jan 2017 23:22:57 -0800 +Subject: xfs: fix eofblocks race with file extending async dio writes + +From: Brian Foster + +commit e4229d6b0bc9280f29624faf170cf76a9f1ca60e upstream. + +It's possible for post-eof blocks to end up being used for direct I/O +writes. dio write performs an upfront unwritten extent allocation, sends +the dio and then updates the inode size (if necessary) on write +completion. If a file release occurs while a file extending dio write is +in flight, it is possible to mistake the post-eof blocks for speculative +preallocation and incorrectly truncate them from the inode. This means +that the resulting dio write completion can discover a hole and allocate +new blocks rather than perform unwritten extent conversion. + +This requires a strange mix of I/O and is thus not likely to reproduce +in real world workloads. It is intermittently reproduced by generic/299. +The error manifests as an assert failure due to transaction overrun +because the aforementioned write completion transaction has only +reserved enough blocks for btree operations: + + XFS: Assertion failed: tp->t_blk_res_used <= tp->t_blk_res, \ + file: fs/xfs//xfs_trans.c, line: 309 + +The root cause is that xfs_free_eofblocks() uses i_size to truncate +post-eof blocks from the inode, but async, file extending direct writes +do not update i_size until write completion, long after inode locks are +dropped. Therefore, xfs_free_eofblocks() effectively truncates the inode +to the incorrect size. + +Update xfs_free_eofblocks() to serialize against dio similar to how +extending writes are serialized against i_size updates before post-eof +block zeroing. Specifically, wait on dio while under the iolock. This +ensures that dio write completions have updated i_size before post-eof +blocks are processed. + +Signed-off-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_bmap_util.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -959,6 +959,9 @@ xfs_free_eofblocks( + if (error) + return error; + ++ /* wait on dio to ensure i_size has settled */ ++ inode_dio_wait(VFS_I(ip)); ++ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, + &tp); + if (error) { diff --git a/queue-4.9/xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch b/queue-4.9/xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch new file mode 100644 index 00000000000..a35bb54161f --- /dev/null +++ b/queue-4.9/xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch @@ -0,0 +1,48 @@ +From 4b5bd5bf3fb182dc504b1b64e0331300f156e756 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 2 Feb 2017 15:13:57 -0800 +Subject: xfs: fix toctou race when locking an inode to access the data map + +From: Darrick J. Wong + +commit 4b5bd5bf3fb182dc504b1b64e0331300f156e756 upstream. + +We use di_format and if_flags to decide whether we're grabbing the ilock +in btree mode (btree extents not loaded) or shared mode (anything else), +but the state of those fields can be changed by other threads that are +also trying to load the btree extents -- IFEXTENTS gets set before the +_bmap_read_extents call and cleared if it fails. + +We don't actually need to have IFEXTENTS set until after the bmbt +records are successfully loaded and validated, which will fix the race +between multiple threads trying to read the same directory. The next +patch strengthens directory bmbt validation by refusing to open the +directory if reading the bmbt to start directory readahead fails. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_inode_fork.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/xfs/libxfs/xfs_inode_fork.c ++++ b/fs/xfs/libxfs/xfs_inode_fork.c +@@ -497,15 +497,14 @@ xfs_iread_extents( + * We know that the size is valid (it's checked in iformat_btree) + */ + ifp->if_bytes = ifp->if_real_bytes = 0; +- ifp->if_flags |= XFS_IFEXTENTS; + xfs_iext_add(ifp, 0, nextents); + error = xfs_bmap_read_extents(tp, ip, whichfork); + if (error) { + xfs_iext_destroy(ifp); +- ifp->if_flags &= ~XFS_IFEXTENTS; + return error; + } + xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); ++ ifp->if_flags |= XFS_IFEXTENTS; + return 0; + } + /* diff --git a/queue-4.9/xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch b/queue-4.9/xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch new file mode 100644 index 00000000000..aab604126c2 --- /dev/null +++ b/queue-4.9/xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch @@ -0,0 +1,31 @@ +From 93aaead52a9eebdc20dc8fa673c350e592a06949 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Mon, 13 Feb 2017 22:52:27 -0800 +Subject: xfs: fix uninitialized variable in _reflink_convert_cow + +From: Darrick J. Wong + +commit 93aaead52a9eebdc20dc8fa673c350e592a06949 upstream. + +Fix an uninitialize variable. + +Reported-by: Dan Carpenter +Reviewed-by: Brian Foster +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_reflink.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -363,7 +363,7 @@ xfs_reflink_convert_cow( + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); + xfs_extnum_t idx; + bool found; +- int error; ++ int error = 0; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + diff --git a/queue-4.9/xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch b/queue-4.9/xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch new file mode 100644 index 00000000000..d9817ff2c4c --- /dev/null +++ b/queue-4.9/xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch @@ -0,0 +1,76 @@ +From 0e339ef8556d9e567aa7925f8892c263d79430d9 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Mon, 13 Feb 2017 22:48:18 -0800 +Subject: xfs: handle indlen shortage on delalloc extent merge + +From: Brian Foster + +commit 0e339ef8556d9e567aa7925f8892c263d79430d9 upstream. + +When a delalloc extent is created, it can be merged with pre-existing, +contiguous, delalloc extents. When this occurs, +xfs_bmap_add_extent_hole_delay() merges the extents along with the +associated indirect block reservations. The expectation here is that the +combined worst case indlen reservation is always less than or equal to +the indlen reservation for the individual extents. + +This is not always the case, however, as existing extents can less than +the expected indlen reservation if the extent was previously split due +to a hole punch. If a new extent merges with such an extent, the total +indlen requirement may be larger than the sum of the indlen reservations +held by both extents. + +xfs_bmap_add_extent_hole_delay() assumes that the worst case indlen +reservation is always available and assigns it to the merged extent +without consideration for the indlen held by the pre-existing extent. 
As +a result, the subsequent xfs_mod_fdblocks() call can attempt an +unintentional allocation rather than a free (indicated by an ASSERT() +failure). Further, if the allocation happens to fail in this context, +the failure goes unhandled and creates a filesystem wide block +accounting inconsistency. + +Fix xfs_bmap_add_extent_hole_delay() to function as designed. Cap the +indlen reservation assigned to the merged extent to the sum of the +indlen reservations held by each of the individual extents. + +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -2907,7 +2907,8 @@ xfs_bmap_add_extent_hole_delay( + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock) + + startblockval(right.br_startblock); +- newlen = xfs_bmap_worst_indlen(ip, temp); ++ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), ++ oldlen); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + nullstartblock((int)newlen)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); +@@ -2928,7 +2929,8 @@ xfs_bmap_add_extent_hole_delay( + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock); +- newlen = xfs_bmap_worst_indlen(ip, temp); ++ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), ++ oldlen); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + nullstartblock((int)newlen)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); +@@ -2944,7 +2946,8 @@ xfs_bmap_add_extent_hole_delay( + temp = new->br_blockcount + right.br_blockcount; + oldlen = startblockval(new->br_startblock) + + startblockval(right.br_startblock); +- newlen = xfs_bmap_worst_indlen(ip, temp); ++ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), ++ 
oldlen); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + new->br_startoff, + nullstartblock((int)newlen), temp, right.br_state); diff --git a/queue-4.9/xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch b/queue-4.9/xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch new file mode 100644 index 00000000000..6ecba09fe89 --- /dev/null +++ b/queue-4.9/xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch @@ -0,0 +1,339 @@ +From 5eda43000064a69a39fb7869cc63c9571535ad29 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 2 Feb 2017 15:14:02 -0800 +Subject: xfs: mark speculative prealloc CoW fork extents unwritten + +From: Darrick J. Wong + +commit 5eda43000064a69a39fb7869cc63c9571535ad29 upstream. + +Christoph Hellwig pointed out that there's a potentially nasty race when +performing simultaneous nearby directio cow writes: + +"Thread 1 writes a range from B to c + +" B --------- C + p + +"a little later thread 2 writes from A to B + +" A --------- B + p + +[editor's note: the 'p' denote cowextsize boundaries, which I added to +make this more clear] + +"but the code preallocates beyond B into the range where thread +"1 has just written, but ->end_io hasn't been called yet. +"But once ->end_io is called thread 2 has already allocated +"up to the extent size hint into the write range of thread 1, +"so the end_io handler will splice the unintialized blocks from +"that preallocation back into the file right after B." + +We can avoid this race by ensuring that thread 1 cannot accidentally +remap the blocks that thread 2 allocated (as part of speculative +preallocation) as part of t2's write preparation in t1's end_io handler. +The way we make this happen is by taking advantage of the unwritten +extent flag as an intermediate step. 
+ +Recall that when we begin the process of writing data to shared blocks, +we create a delayed allocation extent in the CoW fork: + +D: --RRRRRRSSSRRRRRRRR--- +C: ------DDDDDDD--------- + +When a thread prepares to CoW some dirty data out to disk, it will now +convert the delalloc reservation into an /unwritten/ allocated extent in +the cow fork. The da conversion code tries to opportunistically +allocate as much of a (speculatively prealloc'd) extent as possible, so +we may end up allocating a larger extent than we're actually writing +out: + +D: --RRRRRRSSSRRRRRRRR--- +U: ------UUUUUUU--------- + +Next, we convert only the part of the extent that we're actively +planning to write to normal (i.e. not unwritten) status: + +D: --RRRRRRSSSRRRRRRRR--- +U: ------UURRUUU--------- + +If the write succeeds, the end_cow function will now scan the relevant +range of the CoW fork for real extents and remap only the real extents +into the data fork: + +D: --RRRRRRRRSRRRRRRRR--- +U: ------UU--UUU--------- + +This ensures that we never obliterate valid data fork extents with +unwritten blocks from the CoW fork. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_aops.c | 6 ++ + fs/xfs/xfs_iomap.c | 2 + fs/xfs/xfs_reflink.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++---- + fs/xfs/xfs_reflink.h | 2 + fs/xfs/xfs_trace.h | 8 ++- + 5 files changed, 123 insertions(+), 11 deletions(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -486,6 +486,12 @@ xfs_submit_ioend( + struct xfs_ioend *ioend, + int status) + { ++ /* Convert CoW extents to regular */ ++ if (!status && ioend->io_type == XFS_IO_COW) { ++ status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ++ ioend->io_offset, ioend->io_size); ++ } ++ + /* Reserve log space if we might write beyond the on-disk inode size. 
*/ + if (!status && + ioend->io_type != XFS_IO_UNWRITTEN && +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -685,7 +685,7 @@ xfs_iomap_write_allocate( + int nres; + + if (whichfork == XFS_COW_FORK) +- flags |= XFS_BMAPI_COWFORK; ++ flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; + + /* + * Make sure that the dquots are there. +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -82,11 +82,22 @@ + * mappings are a reservation against the free space in the filesystem; + * adjacent mappings can also be combined into fewer larger mappings. + * ++ * As an optimization, the CoW extent size hint (cowextsz) creates ++ * outsized aligned delalloc reservations in the hope of landing out of ++ * order nearby CoW writes in a single extent on disk, thereby reducing ++ * fragmentation and improving future performance. ++ * ++ * D: --RRRRRRSSSRRRRRRRR--- (data fork) ++ * C: ------DDDDDDD--------- (CoW fork) ++ * + * When dirty pages are being written out (typically in writepage), the +- * delalloc reservations are converted into real mappings by allocating +- * blocks and replacing the delalloc mapping with real ones. A delalloc +- * mapping can be replaced by several real ones if the free space is +- * fragmented. ++ * delalloc reservations are converted into unwritten mappings by ++ * allocating blocks and replacing the delalloc mapping with real ones. ++ * A delalloc mapping can be replaced by several unwritten ones if the ++ * free space is fragmented. ++ * ++ * D: --RRRRRRSSSRRRRRRRR--- ++ * C: ------UUUUUUU--------- + * + * We want to adapt the delalloc mechanism for copy-on-write, since the + * write paths are similar. The first two steps (creating the reservation +@@ -101,13 +112,29 @@ + * Block-aligned directio writes will use the same mechanism as buffered + * writes. 
+ * ++ * Just prior to submitting the actual disk write requests, we convert ++ * the extents representing the range of the file actually being written ++ * (as opposed to extra pieces created for the cowextsize hint) to real ++ * extents. This will become important in the next step: ++ * ++ * D: --RRRRRRSSSRRRRRRRR--- ++ * C: ------UUrrUUU--------- ++ * + * CoW remapping must be done after the data block write completes, + * because we don't want to destroy the old data fork map until we're sure + * the new block has been written. Since the new mappings are kept in a + * separate fork, we can simply iterate these mappings to find the ones + * that cover the file blocks that we just CoW'd. For each extent, simply + * unmap the corresponding range in the data fork, map the new range into +- * the data fork, and remove the extent from the CoW fork. ++ * the data fork, and remove the extent from the CoW fork. Because of ++ * the presence of the cowextsize hint, however, we must be careful ++ * only to remap the blocks that we've actually written out -- we must ++ * never remap delalloc reservations nor CoW staging blocks that have ++ * yet to be written. This corresponds exactly to the real extents in ++ * the CoW fork: ++ * ++ * D: --RRRRRRrrSRRRRRRRR--- ++ * C: ------UU--UUU--------- + * + * Since the remapping operation can be applied to an arbitrary file + * range, we record the need for the remap step as a flag in the ioend +@@ -296,6 +323,65 @@ xfs_reflink_reserve_cow( + return 0; + } + ++/* Convert part of an unwritten CoW extent to a real one. 
*/ ++STATIC int ++xfs_reflink_convert_cow_extent( ++ struct xfs_inode *ip, ++ struct xfs_bmbt_irec *imap, ++ xfs_fileoff_t offset_fsb, ++ xfs_filblks_t count_fsb, ++ struct xfs_defer_ops *dfops) ++{ ++ struct xfs_bmbt_irec irec = *imap; ++ xfs_fsblock_t first_block; ++ int nimaps = 1; ++ ++ if (imap->br_state == XFS_EXT_NORM) ++ return 0; ++ ++ xfs_trim_extent(&irec, offset_fsb, count_fsb); ++ trace_xfs_reflink_convert_cow(ip, &irec); ++ if (irec.br_blockcount == 0) ++ return 0; ++ return xfs_bmapi_write(NULL, ip, irec.br_startoff, irec.br_blockcount, ++ XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, &first_block, ++ 0, &irec, &nimaps, dfops); ++} ++ ++/* Convert all of the unwritten CoW extents in a file's range to real ones. */ ++int ++xfs_reflink_convert_cow( ++ struct xfs_inode *ip, ++ xfs_off_t offset, ++ xfs_off_t count) ++{ ++ struct xfs_bmbt_irec got; ++ struct xfs_defer_ops dfops; ++ struct xfs_mount *mp = ip->i_mount; ++ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ++ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); ++ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); ++ xfs_extnum_t idx; ++ bool found; ++ int error; ++ ++ xfs_ilock(ip, XFS_ILOCK_EXCL); ++ ++ /* Convert all the extents to real from unwritten. */ ++ for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got); ++ found && got.br_startoff < end_fsb; ++ found = xfs_iext_get_extent(ifp, ++idx, &got)) { ++ error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb, ++ end_fsb - offset_fsb, &dfops); ++ if (error) ++ break; ++ } ++ ++ /* Finish up. */ ++ xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ return error; ++} ++ + /* Allocate all CoW reservations covering a range of blocks in a file. */ + static int + __xfs_reflink_allocate_cow( +@@ -328,6 +414,7 @@ __xfs_reflink_allocate_cow( + goto out_unlock; + ASSERT(nimaps == 1); + ++ /* Make sure there's a CoW reservation for it. 
*/ + error = xfs_reflink_reserve_cow(ip, &imap, &shared); + if (error) + goto out_trans_cancel; +@@ -337,14 +424,16 @@ __xfs_reflink_allocate_cow( + goto out_trans_cancel; + } + ++ /* Allocate the entire reservation as unwritten blocks. */ + xfs_trans_ijoin(tp, ip, 0); + error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount, +- XFS_BMAPI_COWFORK, &first_block, ++ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block, + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), + &imap, &nimaps, &dfops); + if (error) + goto out_trans_cancel; + ++ /* Finish up. */ + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto out_trans_cancel; +@@ -389,11 +478,12 @@ xfs_reflink_allocate_cow_range( + if (error) { + trace_xfs_reflink_allocate_cow_range_error(ip, error, + _RET_IP_); +- break; ++ return error; + } + } + +- return error; ++ /* Convert the CoW extents to regular. */ ++ return xfs_reflink_convert_cow(ip, offset, count); + } + + /* +@@ -669,6 +759,16 @@ xfs_reflink_end_cow( + + ASSERT(!isnullstartblock(got.br_startblock)); + ++ /* ++ * Don't remap unwritten extents; these are ++ * speculatively preallocated CoW extents that have been ++ * allocated but have not yet been involved in a write. ++ */ ++ if (got.br_state == XFS_EXT_UNWRITTEN) { ++ idx--; ++ goto next_extent; ++ } ++ + /* Unmap the old blocks in the data fork. 
*/ + xfs_defer_init(&dfops, &firstfsb); + rlen = del.br_blockcount; +--- a/fs/xfs/xfs_reflink.h ++++ b/fs/xfs/xfs_reflink.h +@@ -30,6 +30,8 @@ extern int xfs_reflink_reserve_cow(struc + struct xfs_bmbt_irec *imap, bool *shared); + extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, + xfs_off_t offset, xfs_off_t count); ++extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, ++ xfs_off_t count); + extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, + struct xfs_bmbt_irec *imap, bool *need_alloc); + extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -3183,6 +3183,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class + __field(xfs_fileoff_t, lblk) + __field(xfs_extlen_t, len) + __field(xfs_fsblock_t, pblk) ++ __field(int, state) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; +@@ -3190,13 +3191,15 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; ++ __entry->state = irec->br_state; + ), +- TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu", ++ TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->lblk, + __entry->len, +- __entry->pblk) ++ __entry->pblk, ++ __entry->state) + ); + #define DEFINE_INODE_IREC_EVENT(name) \ + DEFINE_EVENT(xfs_inode_irec_class, name, \ +@@ -3345,6 +3348,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim + DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); + DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); + DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); ++DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); + + DEFINE_RW_EVENT(xfs_reflink_reserve_cow); + DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); diff --git a/queue-4.9/xfs-only-reclaim-unwritten-cow-extents-periodically.patch 
b/queue-4.9/xfs-only-reclaim-unwritten-cow-extents-periodically.patch new file mode 100644 index 00000000000..96dbfceafa9 --- /dev/null +++ b/queue-4.9/xfs-only-reclaim-unwritten-cow-extents-periodically.patch @@ -0,0 +1,159 @@ +From 3802a345321a08093ba2ddb1849e736f84e8d450 Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Tue, 7 Mar 2017 16:45:58 -0800 +Subject: xfs: only reclaim unwritten COW extents periodically + +From: Christoph Hellwig + +commit 3802a345321a08093ba2ddb1849e736f84e8d450 upstream. + +We only want to reclaim preallocations from our periodic work item. +Currently this is archived by looking for a dirty inode, but that check +is rather fragile. Instead add a flag to xfs_reflink_cancel_cow_* so +that the caller can ask for just cancelling unwritten extents in the COW +fork. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +[darrick: fix typos in commit message] +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_aops.c | 2 +- + fs/xfs/xfs_icache.c | 2 +- + fs/xfs/xfs_inode.c | 2 +- + fs/xfs/xfs_reflink.c | 23 ++++++++++++++++------- + fs/xfs/xfs_reflink.h | 4 ++-- + fs/xfs/xfs_super.c | 2 +- + 6 files changed, 22 insertions(+), 13 deletions(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -298,7 +298,7 @@ xfs_end_io( + goto done; + if (ioend->io_bio->bi_error) { + error = xfs_reflink_cancel_cow_range(ip, +- ioend->io_offset, ioend->io_size); ++ ioend->io_offset, ioend->io_size, true); + goto done; + } + error = xfs_reflink_end_cow(ip, ioend->io_offset, +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -1610,7 +1610,7 @@ xfs_inode_free_cowblocks( + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + +- ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); ++ ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); + + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ 
-1624,7 +1624,7 @@ xfs_itruncate_extents( + + /* Remove all pending CoW reservations. */ + error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block, +- last_block); ++ last_block, true); + if (error) + goto out; + +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -571,14 +571,18 @@ xfs_reflink_trim_irec_to_next_cow( + } + + /* +- * Cancel all pending CoW reservations for some block range of an inode. ++ * Cancel CoW reservations for some block range of an inode. ++ * ++ * If cancel_real is true this function cancels all COW fork extents for the ++ * inode; if cancel_real is false, real extents are not cleared. + */ + int + xfs_reflink_cancel_cow_blocks( + struct xfs_inode *ip, + struct xfs_trans **tpp, + xfs_fileoff_t offset_fsb, +- xfs_fileoff_t end_fsb) ++ xfs_fileoff_t end_fsb, ++ bool cancel_real) + { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got, prev, del; +@@ -605,7 +609,7 @@ xfs_reflink_cancel_cow_blocks( + &idx, &got, &del); + if (error) + break; +- } else { ++ } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { + xfs_trans_ijoin(*tpp, ip, 0); + xfs_defer_init(&dfops, &firstfsb); + +@@ -648,13 +652,17 @@ xfs_reflink_cancel_cow_blocks( + } + + /* +- * Cancel all pending CoW reservations for some byte range of an inode. ++ * Cancel CoW reservations for some byte range of an inode. ++ * ++ * If cancel_real is true this function cancels all COW fork extents for the ++ * inode; if cancel_real is false, real extents are not cleared. 
+ */ + int + xfs_reflink_cancel_cow_range( + struct xfs_inode *ip, + xfs_off_t offset, +- xfs_off_t count) ++ xfs_off_t count, ++ bool cancel_real) + { + struct xfs_trans *tp; + xfs_fileoff_t offset_fsb; +@@ -680,7 +688,8 @@ xfs_reflink_cancel_cow_range( + xfs_trans_ijoin(tp, ip, 0); + + /* Scrape out the old CoW reservations */ +- error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb); ++ error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb, ++ cancel_real); + if (error) + goto out_cancel; + +@@ -1686,7 +1695,7 @@ next: + * We didn't find any shared blocks so turn off the reflink flag. + * First, get rid of any leftover CoW mappings. + */ +- error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF); ++ error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); + if (error) + return error; + +--- a/fs/xfs/xfs_reflink.h ++++ b/fs/xfs/xfs_reflink.h +@@ -39,9 +39,9 @@ extern int xfs_reflink_trim_irec_to_next + + extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, + struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, +- xfs_fileoff_t end_fsb); ++ xfs_fileoff_t end_fsb, bool cancel_real); + extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, +- xfs_off_t count); ++ xfs_off_t count, bool cancel_real); + extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); + extern int xfs_reflink_recover_cow(struct xfs_mount *mp); +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -948,7 +948,7 @@ xfs_fs_destroy_inode( + XFS_STATS_INC(ip->i_mount, vn_remove); + + if (xfs_is_reflink_inode(ip)) { +- error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); ++ error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); + if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) + xfs_warn(ip->i_mount, + "Error %d while evicting CoW blocks for inode %llu.", diff --git a/queue-4.9/xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch 
b/queue-4.9/xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch new file mode 100644 index 00000000000..0b5b9a17bae --- /dev/null +++ b/queue-4.9/xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch @@ -0,0 +1,125 @@ +From 54a4ef8af4e0dc5c983d17fcb9cf5fd25666d94e Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Mon, 6 Feb 2017 13:00:54 -0800 +Subject: xfs: reject all unaligned direct writes to reflinked files + +From: Christoph Hellwig + +commit 54a4ef8af4e0dc5c983d17fcb9cf5fd25666d94e upstream. + +We currently fall back from direct to buffered writes if we detect a +remaining shared extent in the iomap_begin callback. But by the time +iomap_begin is called for the potentially unaligned end block we might +have already written most of the data to disk, which we'd now write +again using buffered I/O. To avoid this reject all writes to reflinked +files before starting I/O so that we are guaranteed to only write the +data once. + +The alternative would be to unshare the unaligned start and/or end block +before doing the I/O. I think that's doable, and will actually be +required to support reflinks on DAX file system. But it will take a +little more time and I'd rather get rid of the double write ASAP. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +[slight changes in context due to the new direct I/O code in 4.10+] +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_aops.c | 45 --------------------------------------------- + fs/xfs/xfs_file.c | 9 +++++++++ + fs/xfs/xfs_trace.h | 2 +- + 3 files changed, 10 insertions(+), 46 deletions(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -1263,44 +1263,6 @@ xfs_map_trim_size( + bh_result->b_size = mapping_size; + } + +-/* Bounce unaligned directio writes to the page cache. 
*/ +-static int +-xfs_bounce_unaligned_dio_write( +- struct xfs_inode *ip, +- xfs_fileoff_t offset_fsb, +- struct xfs_bmbt_irec *imap) +-{ +- struct xfs_bmbt_irec irec; +- xfs_fileoff_t delta; +- bool shared; +- bool x; +- int error; +- +- irec = *imap; +- if (offset_fsb > irec.br_startoff) { +- delta = offset_fsb - irec.br_startoff; +- irec.br_blockcount -= delta; +- irec.br_startblock += delta; +- irec.br_startoff = offset_fsb; +- } +- error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x); +- if (error) +- return error; +- +- /* +- * We're here because we're trying to do a directio write to a +- * region that isn't aligned to a filesystem block. If any part +- * of the extent is shared, fall back to buffered mode to handle +- * the RMW. This is done by returning -EREMCHG ("remote addr +- * changed"), which is caught further up the call stack. +- */ +- if (shared) { +- trace_xfs_reflink_bounce_dio_write(ip, imap); +- return -EREMCHG; +- } +- return 0; +-} +- + STATIC int + __xfs_get_blocks( + struct inode *inode, +@@ -1438,13 +1400,6 @@ __xfs_get_blocks( + if (imap.br_startblock != HOLESTARTBLOCK && + imap.br_startblock != DELAYSTARTBLOCK && + (create || !ISUNWRITTEN(&imap))) { +- if (create && direct && !is_cow) { +- error = xfs_bounce_unaligned_dio_write(ip, offset_fsb, +- &imap); +- if (error) +- return error; +- } +- + xfs_map_buffer(inode, bh_result, &imap, offset); + if (ISUNWRITTEN(&imap)) + set_buffer_unwritten(bh_result); +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -554,6 +554,15 @@ xfs_file_dio_aio_write( + if ((iocb->ki_pos & mp->m_blockmask) || + ((iocb->ki_pos + count) & mp->m_blockmask)) { + unaligned_io = 1; ++ ++ /* ++ * We can't properly handle unaligned direct I/O to reflink ++ * files yet, as we can't unshare a partial block. 
++ */ ++ if (xfs_is_reflink_inode(ip)) { ++ trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); ++ return -EREMCHG; ++ } + iolock = XFS_IOLOCK_EXCL; + } else { + iolock = XFS_IOLOCK_SHARED; +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -3353,7 +3353,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_conv + DEFINE_RW_EVENT(xfs_reflink_reserve_cow); + DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); + +-DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write); ++DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write); + DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping); + DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec); + diff --git a/queue-4.9/xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch b/queue-4.9/xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch new file mode 100644 index 00000000000..970cc93a618 --- /dev/null +++ b/queue-4.9/xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch @@ -0,0 +1,32 @@ +From 4dd2eb633598cb6a5a0be2fd9a2be0819f5eeb5f Mon Sep 17 00:00:00 2001 +From: Hou Tao +Date: Fri, 3 Feb 2017 14:39:07 -0800 +Subject: xfs: reset b_first_retry_time when clear the retry status of xfs_buf_t + +From: Hou Tao + +commit 4dd2eb633598cb6a5a0be2fd9a2be0819f5eeb5f upstream. + +After successful IO or permanent error, b_first_retry_time also +needs to be cleared, else the invalid first retry time will be +used by the next retry check. + +Signed-off-by: Hou Tao +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_buf_item.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/xfs/xfs_buf_item.c ++++ b/fs/xfs/xfs_buf_item.c +@@ -1162,6 +1162,7 @@ xfs_buf_iodone_callbacks( + */ + bp->b_last_error = 0; + bp->b_retries = 0; ++ bp->b_first_retry_time = 0; + + xfs_buf_do_callbacks(bp); + bp->b_fspriv = NULL; diff --git a/queue-4.9/xfs-split-indlen-reservations-fairly-when-under-reserved.patch b/queue-4.9/xfs-split-indlen-reservations-fairly-when-under-reserved.patch new file mode 100644 index 00000000000..b7b4fdb4128 --- /dev/null +++ b/queue-4.9/xfs-split-indlen-reservations-fairly-when-under-reserved.patch @@ -0,0 +1,118 @@ +From 75d65361cf3c0dae2af970c305e19c727b28a510 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Mon, 13 Feb 2017 22:48:30 -0800 +Subject: xfs: split indlen reservations fairly when under reserved + +From: Brian Foster + +commit 75d65361cf3c0dae2af970c305e19c727b28a510 upstream. + +Certain workoads that punch holes into speculative preallocation can +cause delalloc indirect reservation splits when the delalloc extent is +split in two. If further splits occur, an already short-handed extent +can be split into two in a manner that leaves zero indirect blocks for +one of the two new extents. This occurs because the shortage is large +enough that the xfs_bmap_split_indlen() algorithm completely drains the +requested indlen of one of the extents before it honors the existing +reservation. + +This ultimately results in a warning from xfs_bmap_del_extent(). This +has been observed during file copies of large, sparse files using 'cp +--sparse=always.' + +To avoid this problem, update xfs_bmap_split_indlen() to explicitly +apply the reservation shortage fairly between both extents. This smooths +out the overall indlen shortage and defers the situation where we end up +with a delalloc extent with zero indlen reservation to extreme +circumstances. 
+ +Reported-by: Patrick Dung +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 61 +++++++++++++++++++++++++++++++++-------------- + 1 file changed, 43 insertions(+), 18 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -4899,34 +4899,59 @@ xfs_bmap_split_indlen( + xfs_filblks_t len2 = *indlen2; + xfs_filblks_t nres = len1 + len2; /* new total res. */ + xfs_filblks_t stolen = 0; ++ xfs_filblks_t resfactor; + + /* + * Steal as many blocks as we can to try and satisfy the worst case + * indlen for both new extents. + */ +- while (nres > ores && avail) { +- nres--; +- avail--; +- stolen++; +- } ++ if (ores < nres && avail) ++ stolen = XFS_FILBLKS_MIN(nres - ores, avail); ++ ores += stolen; ++ ++ /* nothing else to do if we've satisfied the new reservation */ ++ if (ores >= nres) ++ return stolen; ++ ++ /* ++ * We can't meet the total required reservation for the two extents. ++ * Calculate the percent of the overall shortage between both extents ++ * and apply this percentage to each of the requested indlen values. ++ * This distributes the shortage fairly and reduces the chances that one ++ * of the two extents is left with nothing when extents are repeatedly ++ * split. ++ */ ++ resfactor = (ores * 100); ++ do_div(resfactor, nres); ++ len1 *= resfactor; ++ do_div(len1, 100); ++ len2 *= resfactor; ++ do_div(len2, 100); ++ ASSERT(len1 + len2 <= ores); ++ ASSERT(len1 < *indlen1 && len2 < *indlen2); + + /* +- * The only blocks available are those reserved for the original +- * extent and what we can steal from the extent being removed. +- * If this still isn't enough to satisfy the combined +- * requirements for the two new extents, skim blocks off of each +- * of the new reservations until they match what is available. ++ * Hand out the remainder to each extent. 
If one of the two reservations ++ * is zero, we want to make sure that one gets a block first. The loop ++ * below starts with len1, so hand len2 a block right off the bat if it ++ * is zero. + */ +- while (nres > ores) { +- if (len1) { +- len1--; +- nres--; ++ ores -= (len1 + len2); ++ ASSERT((*indlen1 - len1) + (*indlen2 - len2) >= ores); ++ if (ores && !len2 && *indlen2) { ++ len2++; ++ ores--; ++ } ++ while (ores) { ++ if (len1 < *indlen1) { ++ len1++; ++ ores--; + } +- if (nres == ores) ++ if (!ores) + break; +- if (len2) { +- len2--; +- nres--; ++ if (len2 < *indlen2) { ++ len2++; ++ ores--; + } + } + diff --git a/queue-4.9/xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch b/queue-4.9/xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch new file mode 100644 index 00000000000..2cd8d041a81 --- /dev/null +++ b/queue-4.9/xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch @@ -0,0 +1,97 @@ +From 2fcc319d2467a5f5b78f35f79fd6e22741a31b1e Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Wed, 8 Mar 2017 10:38:53 -0800 +Subject: xfs: try any AG when allocating the first btree block when reflinking + +From: Christoph Hellwig + +commit 2fcc319d2467a5f5b78f35f79fd6e22741a31b1e upstream. + +When a reflink operation causes the bmap code to allocate a btree block +we're currently doing single-AG allocations due to having ->firstblock +set and then try any higher AG due a little reflink quirk we've put in +when adding the reflink code. But given that we do not have a minleft +reservation of any kind in this AG we can still not have any space in +the same or higher AG even if the file system has enough free space. +To fix this use a XFS_ALLOCTYPE_FIRST_AG allocation in this fall back +path instead. + +[And yes, we need to redo this properly instead of piling hacks over + hacks. I'm working on that, but it's not going to be a small series. 
+ In the meantime this fixes the customer reported issue] + +Also add a warning for failing allocations to make it easier to debug. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 10 +++++++--- + fs/xfs/libxfs/xfs_bmap_btree.c | 6 +++--- + 2 files changed, 10 insertions(+), 6 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -769,8 +769,8 @@ xfs_bmap_extents_to_btree( + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); + } else if (dfops->dop_low) { +-try_another_ag: + args.type = XFS_ALLOCTYPE_START_BNO; ++try_another_ag: + args.fsbno = *firstblock; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; +@@ -796,13 +796,17 @@ try_another_ag: + if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && + args.fsbno == NULLFSBLOCK && + args.type == XFS_ALLOCTYPE_NEAR_BNO) { +- dfops->dop_low = true; ++ args.type = XFS_ALLOCTYPE_FIRST_AG; + goto try_another_ag; + } ++ if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { ++ xfs_iroot_realloc(ip, -1, whichfork); ++ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); ++ return -ENOSPC; ++ } + /* + * Allocation can't fail, the space was reserved. + */ +- ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(*firstblock == NULLFSBLOCK || + args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock)); + *firstblock = cur->bc_private.b.firstblock = args.fsbno; +--- a/fs/xfs/libxfs/xfs_bmap_btree.c ++++ b/fs/xfs/libxfs/xfs_bmap_btree.c +@@ -453,8 +453,8 @@ xfs_bmbt_alloc_block( + + if (args.fsbno == NULLFSBLOCK) { + args.fsbno = be64_to_cpu(start->l); +-try_another_ag: + args.type = XFS_ALLOCTYPE_START_BNO; ++try_another_ag: + /* + * Make sure there is sufficient room left in the AG to + * complete a full tree split for an extent insert. 
If +@@ -494,8 +494,8 @@ try_another_ag: + if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && + args.fsbno == NULLFSBLOCK && + args.type == XFS_ALLOCTYPE_NEAR_BNO) { +- cur->bc_private.b.dfops->dop_low = true; + args.fsbno = cur->bc_private.b.firstblock; ++ args.type = XFS_ALLOCTYPE_FIRST_AG; + goto try_another_ag; + } + +@@ -512,7 +512,7 @@ try_another_ag: + goto error0; + cur->bc_private.b.dfops->dop_low = true; + } +- if (args.fsbno == NULLFSBLOCK) { ++ if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; diff --git a/queue-4.9/xfs-tune-down-agno-asserts-in-the-bmap-code.patch b/queue-4.9/xfs-tune-down-agno-asserts-in-the-bmap-code.patch new file mode 100644 index 00000000000..60b561ed819 --- /dev/null +++ b/queue-4.9/xfs-tune-down-agno-asserts-in-the-bmap-code.patch @@ -0,0 +1,83 @@ +From 410d17f67e583559be3a922f8b6cc336331893f3 Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Thu, 16 Feb 2017 17:12:51 -0800 +Subject: xfs: tune down agno asserts in the bmap code + +From: Christoph Hellwig + +commit 410d17f67e583559be3a922f8b6cc336331893f3 upstream. + +In various places we currently assert that xfs_bmap_btalloc allocates +from the same as the firstblock value passed in, unless it's either +NULLAGNO or the dop_low flag is set. But the reflink code does not +fully follow this convention as it passes in firstblock purely as +a hint for the allocator without actually having previous allocations +in the transaction, and without having a minleft check on the current +AG, leading to the assert firing on a very full and heavily used +file system. As even the reflink code only allocates from equal or +higher AGs for now we can simply the check to always allow for equal +or higher AGs. + +Note that we need to eventually split the two meanings of the firstblock +value. At that point we can also allow the reflink code to allocate +from any AG instead of limiting it in any way. 
+ +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 22 ++++++---------------- + 1 file changed, 6 insertions(+), 16 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -804,9 +804,7 @@ try_another_ag: + */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(*firstblock == NULLFSBLOCK || +- args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || +- (dfops->dop_low && +- args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); ++ args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock)); + *firstblock = cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + ip->i_d.di_nblocks++; +@@ -3923,17 +3921,13 @@ xfs_bmap_btalloc( + * the first block that was allocated. + */ + ASSERT(*ap->firstblock == NULLFSBLOCK || +- XFS_FSB_TO_AGNO(mp, *ap->firstblock) == +- XFS_FSB_TO_AGNO(mp, args.fsbno) || +- (ap->dfops->dop_low && +- XFS_FSB_TO_AGNO(mp, *ap->firstblock) < +- XFS_FSB_TO_AGNO(mp, args.fsbno))); ++ XFS_FSB_TO_AGNO(mp, *ap->firstblock) <= ++ XFS_FSB_TO_AGNO(mp, args.fsbno)); + + ap->blkno = args.fsbno; + if (*ap->firstblock == NULLFSBLOCK) + *ap->firstblock = args.fsbno; +- ASSERT(nullfb || fb_agno == args.agno || +- (ap->dfops->dop_low && fb_agno < args.agno)); ++ ASSERT(nullfb || fb_agno <= args.agno); + ap->length = args.len; + if (!(ap->flags & XFS_BMAPI_COWFORK)) + ap->ip->i_d.di_nblocks += args.len; +@@ -4858,13 +4852,9 @@ error0: + if (bma.cur) { + if (!error) { + ASSERT(*firstblock == NULLFSBLOCK || +- XFS_FSB_TO_AGNO(mp, *firstblock) == ++ XFS_FSB_TO_AGNO(mp, *firstblock) <= + XFS_FSB_TO_AGNO(mp, +- bma.cur->bc_private.b.firstblock) || +- (dfops->dop_low && +- XFS_FSB_TO_AGNO(mp, *firstblock) < +- XFS_FSB_TO_AGNO(mp, +- bma.cur->bc_private.b.firstblock))); ++ bma.cur->bc_private.b.firstblock)); + *firstblock = bma.cur->bc_private.b.firstblock; + } + xfs_btree_del_cursor(bma.cur, diff --git 
a/queue-4.9/xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch b/queue-4.9/xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch new file mode 100644 index 00000000000..0097104f8c3 --- /dev/null +++ b/queue-4.9/xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch @@ -0,0 +1,66 @@ +From c5ecb42342852892f978572ddc6dca703460f25a Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Mon, 6 Feb 2017 17:45:51 -0800 +Subject: xfs: update ctime and mtime on clone destinatation inodes + +From: Christoph Hellwig + +commit c5ecb42342852892f978572ddc6dca703460f25a upstream. + +We're changing both metadata and data, so we need to update the +timestamps for clone operations. Dedupe on the other hand does +not change file data, and only changes invisible metadata so the +timestamps should not be updated. + +This follows existing btrfs behavior. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +[darrick: remove redundant is_dedupe test] +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_reflink.c | 12 +++++++++--- + 1 file changed, 9 insertions(+), 3 deletions(-) + +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -985,13 +985,14 @@ STATIC int + xfs_reflink_update_dest( + struct xfs_inode *dest, + xfs_off_t newlen, +- xfs_extlen_t cowextsize) ++ xfs_extlen_t cowextsize, ++ bool is_dedupe) + { + struct xfs_mount *mp = dest->i_mount; + struct xfs_trans *tp; + int error; + +- if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) ++ if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) + return 0; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); +@@ -1012,6 +1013,10 @@ xfs_reflink_update_dest( + dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + } + ++ if (!is_dedupe) { ++ xfs_trans_ichgtime(tp, dest, ++ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); ++ } + xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); + + error = xfs_trans_commit(tp); +@@ -1528,7 +1533,8 @@ xfs_reflink_remap_range( + !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) + cowextsize = src->i_d.di_cowextsize; + +- ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize); ++ ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, ++ is_dedupe); + + out_unlock: + xfs_iunlock(src, XFS_MMAPLOCK_EXCL); diff --git a/queue-4.9/xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch b/queue-4.9/xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch new file mode 100644 index 00000000000..f9c2b770391 --- /dev/null +++ b/queue-4.9/xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch @@ -0,0 +1,150 @@ +From f65e6fad293b3a5793b7fa2044800506490e7a2e Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Wed, 8 Mar 2017 09:58:08 -0800 +Subject: xfs: use iomap new flag for newly allocated delalloc blocks + +From: Brian Foster + +commit f65e6fad293b3a5793b7fa2044800506490e7a2e upstream. 
+ +Commit fa7f138 ("xfs: clear delalloc and cache on buffered write +failure") fixed one regression in the iomap error handling code and +exposed another. The fundamental problem is that if a buffered write +is a rewrite of preexisting delalloc blocks and the write fails, the +failure handling code can punch out preexisting blocks with valid +file data. + +This was reproduced directly by sub-block writes in the LTP +kernel/syscalls/write/write03 test. A first 100 byte write allocates +a single block in a file. A subsequent 100 byte write fails and +punches out the block, including the data successfully written by +the previous write. + +To address this problem, update the ->iomap_begin() handler to +distinguish newly allocated delalloc blocks from preexisting +delalloc blocks via the IOMAP_F_NEW flag. Use this flag in the +->iomap_end() handler to decide when a failed or short write should +punch out delalloc blocks. + +This introduces the subtle requirement that ->iomap_begin() should +never combine newly allocated delalloc blocks with existing blocks +in the resulting iomap descriptor. This can occur when a new +delalloc reservation merges with a neighboring extent that is part +of the current write, for example. Therefore, drop the +post-allocation extent lookup from xfs_bmapi_reserve_delalloc() and +just return the record inserted into the fork. This ensures only new +blocks are returned and thus that preexisting delalloc blocks are +always handled as "found" blocks and not punched out on a failed +rewrite. + +Reported-by: Xiong Zhou +Signed-off-by: Brian Foster +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_bmap.c | 24 ++++++++++++++---------- + fs/xfs/xfs_iomap.c | 16 +++++++++++----- + 2 files changed, 25 insertions(+), 15 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -4253,6 +4253,19 @@ xfs_bmapi_read( + return 0; + } + ++/* ++ * Add a delayed allocation extent to an inode. Blocks are reserved from the ++ * global pool and the extent inserted into the inode in-core extent tree. ++ * ++ * On entry, got refers to the first extent beyond the offset of the extent to ++ * allocate or eof is specified if no such extent exists. On return, got refers ++ * to the extent record that was inserted to the inode fork. ++ * ++ * Note that the allocated extent may have been merged with contiguous extents ++ * during insertion into the inode fork. Thus, got does not reflect the current ++ * state of the inode fork on return. If necessary, the caller can use lastx to ++ * look up the updated record in the inode fork. ++ */ + int + xfs_bmapi_reserve_delalloc( + struct xfs_inode *ip, +@@ -4339,13 +4352,8 @@ xfs_bmapi_reserve_delalloc( + got->br_startblock = nullstartblock(indlen); + got->br_blockcount = alen; + got->br_state = XFS_EXT_NORM; +- xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); + +- /* +- * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay +- * might have merged it into one of the neighbouring ones. +- */ +- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); ++ xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); + + /* + * Tag the inode if blocks were preallocated. 
Note that COW fork +@@ -4357,10 +4365,6 @@ xfs_bmapi_reserve_delalloc( + if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) + xfs_inode_set_cowblocks_tag(ip); + +- ASSERT(got->br_startoff <= aoff); +- ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); +- ASSERT(isnullstartblock(got->br_startblock)); +- ASSERT(got->br_state == XFS_EXT_NORM); + return 0; + + out_unreserve_blocks: +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -637,6 +637,11 @@ retry: + goto out_unlock; + } + ++ /* ++ * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch ++ * them out if the write happens to fail. ++ */ ++ iomap->flags = IOMAP_F_NEW; + trace_xfs_iomap_alloc(ip, offset, count, 0, &got); + done: + if (isnullstartblock(got.br_startblock)) +@@ -1061,7 +1066,8 @@ xfs_file_iomap_end_delalloc( + struct xfs_inode *ip, + loff_t offset, + loff_t length, +- ssize_t written) ++ ssize_t written, ++ struct iomap *iomap) + { + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t start_fsb; +@@ -1080,14 +1086,14 @@ xfs_file_iomap_end_delalloc( + end_fsb = XFS_B_TO_FSB(mp, offset + length); + + /* +- * Trim back delalloc blocks if we didn't manage to write the whole +- * range reserved. ++ * Trim delalloc blocks if they were allocated by this write and we ++ * didn't manage to write the whole range. + * + * We don't need to care about racing delalloc as we hold i_mutex + * across the reserve/allocate/unreserve calls. If there are delalloc + * blocks in the range, they are ours. 
+ */ +- if (start_fsb < end_fsb) { ++ if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) { + truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), + XFS_FSB_TO_B(mp, end_fsb) - 1); + +@@ -1117,7 +1123,7 @@ xfs_file_iomap_end( + { + if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) + return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, +- length, written); ++ length, written, iomap); + return 0; + } + diff --git a/queue-4.9/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch b/queue-4.9/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch new file mode 100644 index 00000000000..adfd695e2b2 --- /dev/null +++ b/queue-4.9/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch @@ -0,0 +1,44 @@ +From d5825712ee98d68a2c17bc89dad2c30276894cba Mon Sep 17 00:00:00 2001 +From: Chandan Rajendra +Date: Thu, 2 Mar 2017 15:06:33 -0800 +Subject: xfs: Use xfs_icluster_size_fsb() to calculate inode alignment mask + +From: Chandan Rajendra + +commit d5825712ee98d68a2c17bc89dad2c30276894cba upstream. + +When block size is larger than inode cluster size, the call to +XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size) returns 0. Also, mkfs.xfs +would have set xfs_sb->sb_inoalignmt to 0. Hence in +xfs_set_inoalignment(), xfs_mount->m_inoalign_mask gets initialized to +-1 instead of 0. However, xfs_mount->m_sinoalign would get correctly +intialized to 0 because for every positive value of xfs_mount->m_dalign, +the condition "!(mp->m_dalign & mp->m_inoalign_mask)" would evaluate to +false. + +Also, xfs_imap() worked fine even with xfs_mount->m_inoalign_mask having +-1 as the value because blks_per_cluster variable would have the value 1 +and hence we would never have a need to use xfs_mount->m_inoalign_mask +to compute the inode chunk's agbno and offset within the chunk. + +Signed-off-by: Chandan Rajendra +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_mount.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -502,8 +502,7 @@ STATIC void + xfs_set_inoalignment(xfs_mount_t *mp) + { + if (xfs_sb_version_hasalign(&mp->m_sb) && +- mp->m_sb.sb_inoalignmt >= +- XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) ++ mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp)) + mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1; + else + mp->m_inoalign_mask = 0; diff --git a/queue-4.9/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch b/queue-4.9/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch new file mode 100644 index 00000000000..6bde120baa6 --- /dev/null +++ b/queue-4.9/xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch @@ -0,0 +1,91 @@ +From 8ee9fdbebc84b39f1d1c201c5e32277c61d034aa Mon Sep 17 00:00:00 2001 +From: Chandan Rajendra +Date: Thu, 16 Feb 2017 17:12:16 -0800 +Subject: xfs: Use xfs_icluster_size_fsb() to calculate inode chunk alignment + +From: Chandan Rajendra + +commit 8ee9fdbebc84b39f1d1c201c5e32277c61d034aa upstream. + +On a ppc64 system, executing generic/256 test with 32k block size gives the following call trace, + +XFS: Assertion failed: args->maxlen > 0, file: /root/repos/linux/fs/xfs/libxfs/xfs_alloc.c, line: 2026 + +kernel BUG at /root/repos/linux/fs/xfs/xfs_message.c:113! 
+Oops: Exception in kernel mode, sig: 5 [#1] +SMP NR_CPUS=2048 +DEBUG_PAGEALLOC +NUMA +pSeries +Modules linked in: +CPU: 2 PID: 19361 Comm: mkdir Not tainted 4.10.0-rc5 #58 +task: c000000102606d80 task.stack: c0000001026b8000 +NIP: c0000000004ef798 LR: c0000000004ef798 CTR: c00000000082b290 +REGS: c0000001026bb090 TRAP: 0700 Not tainted (4.10.0-rc5) +MSR: 8000000000029032 +CR: 28004428 XER: 00000000 +CFAR: c0000000004ef180 SOFTE: 1 +GPR00: c0000000004ef798 c0000001026bb310 c000000001157300 ffffffffffffffea +GPR04: 000000000000000a c0000001026bb130 0000000000000000 ffffffffffffffc0 +GPR08: 00000000000000d1 0000000000000021 00000000ffffffd1 c000000000dd4990 +GPR12: 0000000022004444 c00000000fe00800 0000000020000000 0000000000000000 +GPR16: 0000000000000000 0000000043a606fc 0000000043a76c08 0000000043a1b3d0 +GPR20: 000001002a35cd60 c0000001026bbb80 0000000000000000 0000000000000001 +GPR24: 0000000000000240 0000000000000004 c00000062dc55000 0000000000000000 +GPR28: 0000000000000004 c00000062ecd9200 0000000000000000 c0000001026bb6c0 +NIP [c0000000004ef798] .assfail+0x28/0x30 +LR [c0000000004ef798] .assfail+0x28/0x30 +Call Trace: +[c0000001026bb310] [c0000000004ef798] .assfail+0x28/0x30 (unreliable) +[c0000001026bb380] [c000000000455d74] .xfs_alloc_space_available+0x194/0x1b0 +[c0000001026bb410] [c00000000045b914] .xfs_alloc_fix_freelist+0x144/0x480 +[c0000001026bb580] [c00000000045c368] .xfs_alloc_vextent+0x698/0xa90 +[c0000001026bb650] [c0000000004a6200] .xfs_ialloc_ag_alloc+0x170/0x820 +[c0000001026bb7c0] [c0000000004a9098] .xfs_dialloc+0x158/0x320 +[c0000001026bb8a0] [c0000000004e628c] .xfs_ialloc+0x7c/0x610 +[c0000001026bb990] [c0000000004e8138] .xfs_dir_ialloc+0xa8/0x2f0 +[c0000001026bbaa0] [c0000000004e8814] .xfs_create+0x494/0x790 +[c0000001026bbbf0] [c0000000004e5ebc] .xfs_generic_create+0x2bc/0x410 +[c0000001026bbce0] [c0000000002b4a34] .vfs_mkdir+0x154/0x230 +[c0000001026bbd70] [c0000000002bc444] .SyS_mkdirat+0x94/0x120 +[c0000001026bbe30] [c00000000000b760] 
system_call+0x38/0xfc +Instruction dump: +4e800020 60000000 7c0802a6 7c862378 3c82ffca 7ca72b78 38841c18 7c651b78 +38600000 f8010010 f821ff91 4bfff94d <0fe00000> 60000000 7c0802a6 7c892378 + +When block size is larger than inode cluster size, the call to +XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size) returns 0. Also, mkfs.xfs +would have set xfs_sb->sb_inoalignmt to 0. This causes +xfs_ialloc_cluster_alignment() to return 0. Due to this +args.minalignslop (in xfs_ialloc_ag_alloc()) gets the unsigned +equivalent of -1 assigned to it. This later causes alloc_len in +xfs_alloc_space_available() to have a value of 0. In such a scenario +when args.total is also 0, the assert statement "ASSERT(args->maxlen > +0);" fails. + +This commit fixes the bug by replacing the call to XFS_B_TO_FSBT() in +xfs_ialloc_cluster_alignment() with a call to xfs_icluster_size_fsb(). + +Suggested-by: Darrick J. Wong +Signed-off-by: Chandan Rajendra +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_ialloc.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/xfs/libxfs/xfs_ialloc.c ++++ b/fs/xfs/libxfs/xfs_ialloc.c +@@ -51,8 +51,7 @@ xfs_ialloc_cluster_alignment( + struct xfs_mount *mp) + { + if (xfs_sb_version_hasalign(&mp->m_sb) && +- mp->m_sb.sb_inoalignmt >= +- XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) ++ mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp)) + return mp->m_sb.sb_inoalignmt; + return 1; + } diff --git a/queue-4.9/xfs-verify-free-block-header-fields.patch b/queue-4.9/xfs-verify-free-block-header-fields.patch new file mode 100644 index 00000000000..7e995279e21 --- /dev/null +++ b/queue-4.9/xfs-verify-free-block-header-fields.patch @@ -0,0 +1,93 @@ +From de14c5f541e78c59006bee56f6c5c2ef1ca07272 Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Thu, 2 Feb 2017 15:14:00 -0800 +Subject: xfs: verify free block header fields + +From: Darrick J. 
Wong + +commit de14c5f541e78c59006bee56f6c5c2ef1ca07272 upstream. + +Perform basic sanity checking of the directory free block header +fields so that we avoid hanging the system on invalid data. + +(Granted that just means that now we shutdown on directory write, +but that seems better than hanging...) + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_dir2_node.c | 51 ++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 49 insertions(+), 2 deletions(-) + +--- a/fs/xfs/libxfs/xfs_dir2_node.c ++++ b/fs/xfs/libxfs/xfs_dir2_node.c +@@ -155,6 +155,42 @@ const struct xfs_buf_ops xfs_dir3_free_b + .verify_write = xfs_dir3_free_write_verify, + }; + ++/* Everything ok in the free block header? */ ++static bool ++xfs_dir3_free_header_check( ++ struct xfs_inode *dp, ++ xfs_dablk_t fbno, ++ struct xfs_buf *bp) ++{ ++ struct xfs_mount *mp = dp->i_mount; ++ unsigned int firstdb; ++ int maxbests; ++ ++ maxbests = dp->d_ops->free_max_bests(mp->m_dir_geo); ++ firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) - ++ xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) * ++ maxbests; ++ if (xfs_sb_version_hascrc(&mp->m_sb)) { ++ struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; ++ ++ if (be32_to_cpu(hdr3->firstdb) != firstdb) ++ return false; ++ if (be32_to_cpu(hdr3->nvalid) > maxbests) ++ return false; ++ if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) ++ return false; ++ } else { ++ struct xfs_dir2_free_hdr *hdr = bp->b_addr; ++ ++ if (be32_to_cpu(hdr->firstdb) != firstdb) ++ return false; ++ if (be32_to_cpu(hdr->nvalid) > maxbests) ++ return false; ++ if (be32_to_cpu(hdr->nvalid) < be32_to_cpu(hdr->nused)) ++ return false; ++ } ++ return true; ++} + + static int + __xfs_dir3_free_read( +@@ -168,11 +204,22 @@ __xfs_dir3_free_read( + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_free_buf_ops); ++ if (err || !*bpp) ++ return err; ++ ++ /* Check things that 
we can't do in the verifier. */ ++ if (!xfs_dir3_free_header_check(dp, fbno, *bpp)) { ++ xfs_buf_ioerror(*bpp, -EFSCORRUPTED); ++ xfs_verifier_error(*bpp); ++ xfs_trans_brelse(tp, *bpp); ++ return -EFSCORRUPTED; ++ } + + /* try read returns without an error or *bpp if it lands in a hole */ +- if (!err && tp && *bpp) ++ if (tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF); +- return err; ++ ++ return 0; + } + + int -- 2.47.3