xfs-use-per-ag-reservations-for-the-finobt.patch
xfs-pull-up-iolock-from-xfs_free_eofblocks.patch
xfs-sync-eofblocks-scans-under-iolock-are-livelock-prone.patch
+xfs-fix-eofblocks-race-with-file-extending-async-dio-writes.patch
+xfs-fix-toctou-race-when-locking-an-inode-to-access-the-data-map.patch
+xfs-fail-_dir_open-when-readahead-fails.patch
+xfs-filter-out-obviously-bad-btree-pointers.patch
+xfs-check-for-obviously-bad-level-values-in-the-bmbt-root.patch
+xfs-verify-free-block-header-fields.patch
+xfs-allow-unwritten-extents-in-the-cow-fork.patch
+xfs-mark-speculative-prealloc-cow-fork-extents-unwritten.patch
+xfs-reset-b_first_retry_time-when-clear-the-retry-status-of-xfs_buf_t.patch
+xfs-update-ctime-and-mtime-on-clone-destinatation-inodes.patch
+xfs-reject-all-unaligned-direct-writes-to-reflinked-files.patch
+xfs-don-t-fail-xfs_extent_busy-allocation.patch
+xfs-handle-indlen-shortage-on-delalloc-extent-merge.patch
+xfs-split-indlen-reservations-fairly-when-under-reserved.patch
+xfs-fix-uninitialized-variable-in-_reflink_convert_cow.patch
+xfs-don-t-reserve-blocks-for-right-shift-transactions.patch
+xfs-use-xfs_icluster_size_fsb-to-calculate-inode-chunk-alignment.patch
+xfs-tune-down-agno-asserts-in-the-bmap-code.patch
+xfs-only-reclaim-unwritten-cow-extents-periodically.patch
+xfs-fix-and-streamline-error-handling-in-xfs_end_io.patch
+xfs-use-xfs_icluster_size_fsb-to-calculate-inode-alignment-mask.patch
+xfs-use-iomap-new-flag-for-newly-allocated-delalloc-blocks.patch
+xfs-try-any-ag-when-allocating-the-first-btree-block-when-reflinking.patch
--- /dev/null
+From 05a630d76bd3f39baf0eecfa305bed2820796dee Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 2 Feb 2017 15:14:01 -0800
+Subject: xfs: allow unwritten extents in the CoW fork
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 05a630d76bd3f39baf0eecfa305bed2820796dee upstream.
+
+In the data fork, we only allow extents to perform the following state
+transitions:
+
+delay -> real <-> unwritten
+
+There's no way to move directly from a delalloc reservation to an
+/unwritten/ allocated extent. However, for the CoW fork we want to be
+able to do the following to each extent:
+
+delalloc -> unwritten -> written -> remapped to data fork
+
+This will help us to avoid a race in the speculative CoW preallocation
+code between a first thread that is allocating a CoW extent and a second
+thread that is remapping part of a file after a write. In order to do
+this, however, we need two things: first, we have to be able to
+transition from da to unwritten, and second the function that converts
+between real and unwritten has to be made aware of the cow fork. Do
+both of those things.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c | 80 +++++++++++++++++++++++++++++------------------
+ 1 file changed, 50 insertions(+), 30 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -1952,6 +1952,7 @@ xfs_bmap_add_extent_delay_real(
+ */
+ trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
+ xfs_bmbt_set_startblock(ep, new->br_startblock);
++ xfs_bmbt_set_state(ep, new->br_state);
+ trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+
+ (*nextents)++;
+@@ -2290,6 +2291,7 @@ STATIC int /* error */
+ xfs_bmap_add_extent_unwritten_real(
+ struct xfs_trans *tp,
+ xfs_inode_t *ip, /* incore inode pointer */
++ int whichfork,
+ xfs_extnum_t *idx, /* extent number to update/insert */
+ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
+ xfs_bmbt_irec_t *new, /* new data to add to file extents */
+@@ -2309,12 +2311,14 @@ xfs_bmap_add_extent_unwritten_real(
+ /* left is 0, right is 1, prev is 2 */
+ int rval=0; /* return value (logging flags) */
+ int state = 0;/* state bits, accessed thru macros */
+- struct xfs_mount *mp = tp->t_mountp;
++ struct xfs_mount *mp = ip->i_mount;
+
+ *logflagsp = 0;
+
+ cur = *curp;
+- ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
++ ifp = XFS_IFORK_PTR(ip, whichfork);
++ if (whichfork == XFS_COW_FORK)
++ state |= BMAP_COWFORK;
+
+ ASSERT(*idx >= 0);
+ ASSERT(*idx <= xfs_iext_count(ifp));
+@@ -2373,7 +2377,7 @@ xfs_bmap_add_extent_unwritten_real(
+ * Don't set contiguous if the combined extent would be too large.
+ * Also check for all-three-contiguous being too large.
+ */
+- if (*idx < xfs_iext_count(&ip->i_df) - 1) {
++ if (*idx < xfs_iext_count(ifp) - 1) {
+ state |= BMAP_RIGHT_VALID;
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
+ if (isnullstartblock(RIGHT.br_startblock))
+@@ -2413,7 +2417,8 @@ xfs_bmap_add_extent_unwritten_real(
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_remove(ip, *idx + 1, 2, state);
+- ip->i_d.di_nextents -= 2;
++ XFS_IFORK_NEXT_SET(ip, whichfork,
++ XFS_IFORK_NEXTENTS(ip, whichfork) - 2);
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+@@ -2456,7 +2461,8 @@ xfs_bmap_add_extent_unwritten_real(
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_remove(ip, *idx + 1, 1, state);
+- ip->i_d.di_nextents--;
++ XFS_IFORK_NEXT_SET(ip, whichfork,
++ XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+@@ -2491,7 +2497,8 @@ xfs_bmap_add_extent_unwritten_real(
+ xfs_bmbt_set_state(ep, newext);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_remove(ip, *idx + 1, 1, state);
+- ip->i_d.di_nextents--;
++ XFS_IFORK_NEXT_SET(ip, whichfork,
++ XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+@@ -2603,7 +2610,8 @@ xfs_bmap_add_extent_unwritten_real(
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_insert(ip, *idx, 1, new, state);
+- ip->i_d.di_nextents++;
++ XFS_IFORK_NEXT_SET(ip, whichfork,
++ XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+@@ -2681,7 +2689,8 @@ xfs_bmap_add_extent_unwritten_real(
+ ++*idx;
+ xfs_iext_insert(ip, *idx, 1, new, state);
+
+- ip->i_d.di_nextents++;
++ XFS_IFORK_NEXT_SET(ip, whichfork,
++ XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+@@ -2729,7 +2738,8 @@ xfs_bmap_add_extent_unwritten_real(
+ ++*idx;
+ xfs_iext_insert(ip, *idx, 2, &r[0], state);
+
+- ip->i_d.di_nextents += 2;
++ XFS_IFORK_NEXT_SET(ip, whichfork,
++ XFS_IFORK_NEXTENTS(ip, whichfork) + 2);
+ if (cur == NULL)
+ rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
+ else {
+@@ -2783,17 +2793,17 @@ xfs_bmap_add_extent_unwritten_real(
+ }
+
+ /* update reverse mappings */
+- error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new);
++ error = xfs_rmap_convert_extent(mp, dfops, ip, whichfork, new);
+ if (error)
+ goto done;
+
+ /* convert to a btree if necessary */
+- if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
++ if (xfs_bmap_needs_btree(ip, whichfork)) {
+ int tmp_logflags; /* partial log flag return val */
+
+ ASSERT(cur == NULL);
+ error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur,
+- 0, &tmp_logflags, XFS_DATA_FORK);
++ 0, &tmp_logflags, whichfork);
+ *logflagsp |= tmp_logflags;
+ if (error)
+ goto done;
+@@ -2805,7 +2815,7 @@ xfs_bmap_add_extent_unwritten_real(
+ *curp = cur;
+ }
+
+- xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
++ xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
+ done:
+ *logflagsp |= rval;
+ return error;
+@@ -4458,10 +4468,16 @@ xfs_bmapi_allocate(
+ bma->got.br_state = XFS_EXT_NORM;
+
+ /*
+- * A wasdelay extent has been initialized, so shouldn't be flagged
+- * as unwritten.
++ * In the data fork, a wasdelay extent has been initialized, so
++ * shouldn't be flagged as unwritten.
++ *
++ * For the cow fork, however, we convert delalloc reservations
++ * (extents allocated for speculative preallocation) to
++ * allocated unwritten extents, and only convert the unwritten
++ * extents to real extents when we're about to write the data.
+ */
+- if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
++ if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) &&
++ (bma->flags & XFS_BMAPI_PREALLOC) &&
+ xfs_sb_version_hasextflgbit(&mp->m_sb))
+ bma->got.br_state = XFS_EXT_UNWRITTEN;
+
+@@ -4512,8 +4528,6 @@ xfs_bmapi_convert_unwritten(
+ (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
+ return 0;
+
+- ASSERT(whichfork != XFS_COW_FORK);
+-
+ /*
+ * Modify (by adding) the state flag, if writing.
+ */
+@@ -4538,8 +4552,8 @@ xfs_bmapi_convert_unwritten(
+ return error;
+ }
+
+- error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
+- &bma->cur, mval, bma->firstblock, bma->dfops,
++ error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork,
++ &bma->idx, &bma->cur, mval, bma->firstblock, bma->dfops,
+ &tmp_logflags);
+ /*
+ * Log the inode core unconditionally in the unwritten extent conversion
+@@ -4548,8 +4562,12 @@ xfs_bmapi_convert_unwritten(
+ * in the transaction for the sake of fsync(), even if nothing has
+ * changed, because fsync() will not force the log for this transaction
+ * unless it sees the inode pinned.
++ *
++ * Note: If we're only converting cow fork extents, there aren't
++ * any on-disk updates to make, so we don't need to log anything.
+ */
+- bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
++ if (whichfork != XFS_COW_FORK)
++ bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
+ if (error)
+ return error;
+
+@@ -4623,15 +4641,15 @@ xfs_bmapi_write(
+ ASSERT(*nmap >= 1);
+ ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
+ ASSERT(!(flags & XFS_BMAPI_IGSTATE));
+- ASSERT(tp != NULL);
++ ASSERT(tp != NULL ||
++ (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
++ (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
+ ASSERT(len > 0);
+ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK);
+ ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP));
+ ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP));
+- ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK);
+- ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK);
+
+ /* zeroing is for currently only for data extents, not metadata */
+ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
+@@ -5653,8 +5671,8 @@ __xfs_bunmapi(
+ }
+ del.br_state = XFS_EXT_UNWRITTEN;
+ error = xfs_bmap_add_extent_unwritten_real(tp, ip,
+- &lastx, &cur, &del, firstblock, dfops,
+- &logflags);
++ whichfork, &lastx, &cur, &del,
++ firstblock, dfops, &logflags);
+ if (error)
+ goto error0;
+ goto nodelete;
+@@ -5711,8 +5729,9 @@ __xfs_bunmapi(
+ prev.br_state = XFS_EXT_UNWRITTEN;
+ lastx--;
+ error = xfs_bmap_add_extent_unwritten_real(tp,
+- ip, &lastx, &cur, &prev,
+- firstblock, dfops, &logflags);
++ ip, whichfork, &lastx, &cur,
++ &prev, firstblock, dfops,
++ &logflags);
+ if (error)
+ goto error0;
+ goto nodelete;
+@@ -5720,8 +5739,9 @@ __xfs_bunmapi(
+ ASSERT(del.br_state == XFS_EXT_NORM);
+ del.br_state = XFS_EXT_UNWRITTEN;
+ error = xfs_bmap_add_extent_unwritten_real(tp,
+- ip, &lastx, &cur, &del,
+- firstblock, dfops, &logflags);
++ ip, whichfork, &lastx, &cur,
++ &del, firstblock, dfops,
++ &logflags);
+ if (error)
+ goto error0;
+ goto nodelete;
--- /dev/null
+From b3bf607d58520ea8c0666aeb4be60dbb724cd3a2 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 2 Feb 2017 15:13:59 -0800
+Subject: xfs: check for obviously bad level values in the bmbt root
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit b3bf607d58520ea8c0666aeb4be60dbb724cd3a2 upstream.
+
+We can't handle a bmbt that's taller than BTREE_MAXLEVELS, and there's
+no such thing as a zero-level bmbt (for that we have extents format),
+so if we see this, send back an error code.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_inode_fork.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/libxfs/xfs_inode_fork.c
++++ b/fs/xfs/libxfs/xfs_inode_fork.c
+@@ -26,6 +26,7 @@
+ #include "xfs_inode.h"
+ #include "xfs_trans.h"
+ #include "xfs_inode_item.h"
++#include "xfs_btree.h"
+ #include "xfs_bmap_btree.h"
+ #include "xfs_bmap.h"
+ #include "xfs_error.h"
+@@ -429,11 +430,13 @@ xfs_iformat_btree(
+ /* REFERENCED */
+ int nrecs;
+ int size;
++ int level;
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
+ size = XFS_BMAP_BROOT_SPACE(mp, dfp);
+ nrecs = be16_to_cpu(dfp->bb_numrecs);
++ level = be16_to_cpu(dfp->bb_level);
+
+ /*
+ * blow out if -- fork has less extents than can fit in
+@@ -446,7 +449,8 @@ xfs_iformat_btree(
+ XFS_IFORK_MAXEXT(ip, whichfork) ||
+ XFS_BMDR_SPACE_CALC(nrecs) >
+ XFS_DFORK_SIZE(dip, mp, whichfork) ||
+- XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
++ XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) ||
++ level == 0 || level > XFS_BTREE_MAXLEVELS) {
+ xfs_warn(mp, "corrupt inode %Lu (btree).",
+ (unsigned long long) ip->i_ino);
+ XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
--- /dev/null
+From 5e30c23d13919a718b22d4921dc5c0accc59da27 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Tue, 7 Feb 2017 14:06:46 -0800
+Subject: xfs: don't fail xfs_extent_busy allocation
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 5e30c23d13919a718b22d4921dc5c0accc59da27 upstream.
+
+We don't just need the structure to track busy extents which can be
+avoided with a synchronous transaction, but also to keep track of
+pending discard.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_extent_busy.c | 13 +------------
+ 1 file changed, 1 insertion(+), 12 deletions(-)
+
+--- a/fs/xfs/xfs_extent_busy.c
++++ b/fs/xfs/xfs_extent_busy.c
+@@ -45,18 +45,7 @@ xfs_extent_busy_insert(
+ struct rb_node **rbp;
+ struct rb_node *parent = NULL;
+
+- new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL);
+- if (!new) {
+- /*
+- * No Memory! Since it is now not possible to track the free
+- * block, make this a synchronous transaction to insure that
+- * the block is not reused before this transaction commits.
+- */
+- trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len);
+- xfs_trans_set_sync(tp);
+- return;
+- }
+-
++ new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP);
+ new->agno = agno;
+ new->bno = bno;
+ new->length = len;
--- /dev/null
+From 48af96ab92bc68fb645068b978ce36df2379e076 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Wed, 15 Feb 2017 10:18:10 -0800
+Subject: xfs: don't reserve blocks for right shift transactions
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 48af96ab92bc68fb645068b978ce36df2379e076 upstream.
+
+The block reservation for the transaction allocated in
+xfs_shift_file_space() is an artifact of the original collapse range
+support. It exists to handle the case where a collapse range occurs,
+the initial extent is left shifted into a location that forms a
+contiguous boundary with the previous extent and thus the extents
+are merged. This code was subsequently refactored and reused for
+insert range (right shift) support.
+
+If an insert range occurs under low free space conditions, the
+extent at the starting offset is split before the first shift
+transaction is allocated. If the block reservation fails, this
+leaves separate, but contiguous extents around in the inode. While
+not a fatal problem, this is unexpected and will flag a warning on
+subsequent insert range operations on the inode. This problem has
+been reproduce intermittently by generic/270 running against a
+ramdisk device.
+
+Since right shift does not create new extent boundaries in the
+inode, a block reservation for extent merge is unnecessary. Update
+xfs_shift_file_space() to conditionally reserve fs blocks for left
+shift transactions only. This avoids the warning reproduced by
+generic/270.
+
+Reported-by: Ross Zwisler <ross.zwisler@linux.intel.com>
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_bmap_util.c | 20 ++++++++++----------
+ 1 file changed, 10 insertions(+), 10 deletions(-)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -1387,10 +1387,16 @@ xfs_shift_file_space(
+ xfs_fileoff_t stop_fsb;
+ xfs_fileoff_t next_fsb;
+ xfs_fileoff_t shift_fsb;
++ uint resblks;
+
+ ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
+
+ if (direction == SHIFT_LEFT) {
++ /*
++ * Reserve blocks to cover potential extent merges after left
++ * shift operations.
++ */
++ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ next_fsb = XFS_B_TO_FSB(mp, offset + len);
+ stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
+ } else {
+@@ -1398,6 +1404,7 @@ xfs_shift_file_space(
+ * If right shift, delegate the work of initialization of
+ * next_fsb to xfs_bmap_shift_extent as it has ilock held.
+ */
++ resblks = 0;
+ next_fsb = NULLFSBLOCK;
+ stop_fsb = XFS_B_TO_FSB(mp, offset);
+ }
+@@ -1439,21 +1446,14 @@ xfs_shift_file_space(
+ }
+
+ while (!error && !done) {
+- /*
+- * We would need to reserve permanent block for transaction.
+- * This will come into picture when after shifting extent into
+- * hole we found that adjacent extents can be merged which
+- * may lead to freeing of a block during record update.
+- */
+- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+- XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
++ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
++ &tp);
+ if (error)
+ break;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
+- ip->i_gdquot, ip->i_pdquot,
+- XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
++ ip->i_gdquot, ip->i_pdquot, resblks, 0,
+ XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto out_trans_cancel;
--- /dev/null
+From 7a652bbe366464267190c2792a32ce4fff5595ef Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 2 Feb 2017 15:13:58 -0800
+Subject: xfs: fail _dir_open when readahead fails
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 7a652bbe366464267190c2792a32ce4fff5595ef upstream.
+
+When we open a directory, we try to readahead block 0 of the directory
+on the assumption that we're going to need it soon. If the bmbt is
+corrupt, the directory will never be usable and the readahead fails
+immediately, so we might as well prevent the directory from being opened
+at all. This prevents a subsequent read or modify operation from
+hitting it and taking the fs offline.
+
+NOTE: We're only checking for early failures in the block mapping, not
+the readahead directory block itself.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Eric Sandeen <sandeen@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_da_btree.c | 6 ++----
+ fs/xfs/libxfs/xfs_da_btree.h | 2 +-
+ fs/xfs/xfs_file.c | 4 ++--
+ 3 files changed, 5 insertions(+), 7 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_da_btree.c
++++ b/fs/xfs/libxfs/xfs_da_btree.c
+@@ -2633,7 +2633,7 @@ out_free:
+ /*
+ * Readahead the dir/attr block.
+ */
+-xfs_daddr_t
++int
+ xfs_da_reada_buf(
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+@@ -2664,7 +2664,5 @@ out_free:
+ if (mapp != &map)
+ kmem_free(mapp);
+
+- if (error)
+- return -1;
+- return mappedbno;
++ return error;
+ }
+--- a/fs/xfs/libxfs/xfs_da_btree.h
++++ b/fs/xfs/libxfs/xfs_da_btree.h
+@@ -201,7 +201,7 @@ int xfs_da_read_buf(struct xfs_trans *tr
+ xfs_dablk_t bno, xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp, int whichfork,
+ const struct xfs_buf_ops *ops);
+-xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
++int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
+ xfs_daddr_t mapped_bno, int whichfork,
+ const struct xfs_buf_ops *ops);
+ int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -992,9 +992,9 @@ xfs_dir_open(
+ */
+ mode = xfs_ilock_data_map_shared(ip);
+ if (ip->i_d.di_nextents > 0)
+- xfs_dir3_data_readahead(ip, 0, -1);
++ error = xfs_dir3_data_readahead(ip, 0, -1);
+ xfs_iunlock(ip, mode);
+- return 0;
++ return error;
+ }
+
+ STATIC int
--- /dev/null
+From d5a91baeb6033c3392121e4d5c011cdc08dfa9f7 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 2 Feb 2017 15:13:58 -0800
+Subject: xfs: filter out obviously bad btree pointers
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit d5a91baeb6033c3392121e4d5c011cdc08dfa9f7 upstream.
+
+Don't let anybody load an obviously bad btree pointer. Since the values
+come from disk, we must return an error, not just ASSERT.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c | 5 +----
+ fs/xfs/libxfs/xfs_btree.c | 3 ++-
+ fs/xfs/libxfs/xfs_btree.h | 2 +-
+ 3 files changed, 4 insertions(+), 6 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -1278,7 +1278,6 @@ xfs_bmap_read_extents(
+ /* REFERENCED */
+ xfs_extnum_t room; /* number of entries there's room for */
+
+- bno = NULLFSBLOCK;
+ mp = ip->i_mount;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
+@@ -1291,9 +1290,7 @@ xfs_bmap_read_extents(
+ ASSERT(level > 0);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
+ bno = be64_to_cpu(*pp);
+- ASSERT(bno != NULLFSBLOCK);
+- ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+- ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
++
+ /*
+ * Go down the tree until leaf level is reached, following the first
+ * pointer (leftmost) at each level.
+--- a/fs/xfs/libxfs/xfs_btree.c
++++ b/fs/xfs/libxfs/xfs_btree.c
+@@ -810,7 +810,8 @@ xfs_btree_read_bufl(
+ xfs_daddr_t d; /* real disk block address */
+ int error;
+
+- ASSERT(fsbno != NULLFSBLOCK);
++ if (!XFS_FSB_SANITY_CHECK(mp, fsbno))
++ return -EFSCORRUPTED;
+ d = XFS_FSB_TO_DADDR(mp, fsbno);
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+ mp->m_bsize, lock, &bp, ops);
+--- a/fs/xfs/libxfs/xfs_btree.h
++++ b/fs/xfs/libxfs/xfs_btree.h
+@@ -491,7 +491,7 @@ static inline int xfs_btree_get_level(st
+ #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b))
+
+ #define XFS_FSB_SANITY_CHECK(mp,fsb) \
+- (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
++ (fsb && XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
+ XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks)
+
+ /*
--- /dev/null
+From 787eb485509f9d58962bd8b4dbc6a5ac6e2034fe Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Thu, 2 Mar 2017 15:02:51 -0800
+Subject: xfs: fix and streamline error handling in xfs_end_io
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 787eb485509f9d58962bd8b4dbc6a5ac6e2034fe upstream.
+
+There are two different cases of buffered I/O errors:
+
+ - first we can have an already shutdown fs. In that case we should skip
+ any on-disk operations and just clean up the appen transaction if
+ present and destroy the ioend
+ - a real I/O error. In that case we should cleanup any lingering COW
+ blocks. This gets skipped in the current code and is fixed by this
+ patch.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_aops.c | 59 ++++++++++++++++++++++++------------------------------
+ 1 file changed, 27 insertions(+), 32 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -279,54 +279,49 @@ xfs_end_io(
+ struct xfs_ioend *ioend =
+ container_of(work, struct xfs_ioend, io_work);
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
++ xfs_off_t offset = ioend->io_offset;
++ size_t size = ioend->io_size;
+ int error = ioend->io_bio->bi_error;
+
+ /*
+- * Set an error if the mount has shut down and proceed with end I/O
+- * processing so it can perform whatever cleanups are necessary.
++ * Just clean up the in-memory strutures if the fs has been shut down.
+ */
+- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
++ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ error = -EIO;
++ goto done;
++ }
+
+ /*
+- * For a CoW extent, we need to move the mapping from the CoW fork
+- * to the data fork. If instead an error happened, just dump the
+- * new blocks.
++ * Clean up any COW blocks on an I/O error.
+ */
+- if (ioend->io_type == XFS_IO_COW) {
+- if (error)
+- goto done;
+- if (ioend->io_bio->bi_error) {
+- error = xfs_reflink_cancel_cow_range(ip,
+- ioend->io_offset, ioend->io_size, true);
+- goto done;
++ if (unlikely(error)) {
++ switch (ioend->io_type) {
++ case XFS_IO_COW:
++ xfs_reflink_cancel_cow_range(ip, offset, size, true);
++ break;
+ }
+- error = xfs_reflink_end_cow(ip, ioend->io_offset,
+- ioend->io_size);
+- if (error)
+- goto done;
++
++ goto done;
+ }
+
+ /*
+- * For unwritten extents we need to issue transactions to convert a
+- * range to normal written extens after the data I/O has finished.
+- * Detecting and handling completion IO errors is done individually
+- * for each case as different cleanup operations need to be performed
+- * on error.
++ * Success: commit the COW or unwritten blocks if needed.
+ */
+- if (ioend->io_type == XFS_IO_UNWRITTEN) {
+- if (error)
+- goto done;
+- error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+- ioend->io_size);
+- } else if (ioend->io_append_trans) {
+- error = xfs_setfilesize_ioend(ioend, error);
+- } else {
+- ASSERT(!xfs_ioend_is_append(ioend) ||
+- ioend->io_type == XFS_IO_COW);
++ switch (ioend->io_type) {
++ case XFS_IO_COW:
++ error = xfs_reflink_end_cow(ip, offset, size);
++ break;
++ case XFS_IO_UNWRITTEN:
++ error = xfs_iomap_write_unwritten(ip, offset, size);
++ break;
++ default:
++ ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
++ break;
+ }
+
+ done:
++ if (ioend->io_append_trans)
++ error = xfs_setfilesize_ioend(ioend, error);
+ xfs_destroy_ioend(ioend, error);
+ }
+
--- /dev/null
+From e4229d6b0bc9280f29624faf170cf76a9f1ca60e Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Fri, 27 Jan 2017 23:22:57 -0800
+Subject: xfs: fix eofblocks race with file extending async dio writes
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit e4229d6b0bc9280f29624faf170cf76a9f1ca60e upstream.
+
+It's possible for post-eof blocks to end up being used for direct I/O
+writes. dio write performs an upfront unwritten extent allocation, sends
+the dio and then updates the inode size (if necessary) on write
+completion. If a file release occurs while a file extending dio write is
+in flight, it is possible to mistake the post-eof blocks for speculative
+preallocation and incorrectly truncate them from the inode. This means
+that the resulting dio write completion can discover a hole and allocate
+new blocks rather than perform unwritten extent conversion.
+
+This requires a strange mix of I/O and is thus not likely to reproduce
+in real world workloads. It is intermittently reproduced by generic/299.
+The error manifests as an assert failure due to transaction overrun
+because the aforementioned write completion transaction has only
+reserved enough blocks for btree operations:
+
+ XFS: Assertion failed: tp->t_blk_res_used <= tp->t_blk_res, \
+ file: fs/xfs//xfs_trans.c, line: 309
+
+The root cause is that xfs_free_eofblocks() uses i_size to truncate
+post-eof blocks from the inode, but async, file extending direct writes
+do not update i_size until write completion, long after inode locks are
+dropped. Therefore, xfs_free_eofblocks() effectively truncates the inode
+to the incorrect size.
+
+Update xfs_free_eofblocks() to serialize against dio similar to how
+extending writes are serialized against i_size updates before post-eof
+block zeroing. Specifically, wait on dio while under the iolock. This
+ensures that dio write completions have updated i_size before post-eof
+blocks are processed.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_bmap_util.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -959,6 +959,9 @@ xfs_free_eofblocks(
+ if (error)
+ return error;
+
++ /* wait on dio to ensure i_size has settled */
++ inode_dio_wait(VFS_I(ip));
++
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
+ &tp);
+ if (error) {
--- /dev/null
+From 4b5bd5bf3fb182dc504b1b64e0331300f156e756 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 2 Feb 2017 15:13:57 -0800
+Subject: xfs: fix toctou race when locking an inode to access the data map
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 4b5bd5bf3fb182dc504b1b64e0331300f156e756 upstream.
+
+We use di_format and if_flags to decide whether we're grabbing the ilock
+in btree mode (btree extents not loaded) or shared mode (anything else),
+but the state of those fields can be changed by other threads that are
+also trying to load the btree extents -- IFEXTENTS gets set before the
+_bmap_read_extents call and cleared if it fails.
+
+We don't actually need to have IFEXTENTS set until after the bmbt
+records are successfully loaded and validated, which will fix the race
+between multiple threads trying to read the same directory. The next
+patch strengthens directory bmbt validation by refusing to open the
+directory if reading the bmbt to start directory readahead fails.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_inode_fork.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_inode_fork.c
++++ b/fs/xfs/libxfs/xfs_inode_fork.c
+@@ -497,15 +497,14 @@ xfs_iread_extents(
+ * We know that the size is valid (it's checked in iformat_btree)
+ */
+ ifp->if_bytes = ifp->if_real_bytes = 0;
+- ifp->if_flags |= XFS_IFEXTENTS;
+ xfs_iext_add(ifp, 0, nextents);
+ error = xfs_bmap_read_extents(tp, ip, whichfork);
+ if (error) {
+ xfs_iext_destroy(ifp);
+- ifp->if_flags &= ~XFS_IFEXTENTS;
+ return error;
+ }
+ xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
++ ifp->if_flags |= XFS_IFEXTENTS;
+ return 0;
+ }
+ /*
--- /dev/null
+From 93aaead52a9eebdc20dc8fa673c350e592a06949 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Mon, 13 Feb 2017 22:52:27 -0800
+Subject: xfs: fix uninitialized variable in _reflink_convert_cow
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 93aaead52a9eebdc20dc8fa673c350e592a06949 upstream.
+
+Fix an uninitialize variable.
+
+Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_reflink.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -363,7 +363,7 @@ xfs_reflink_convert_cow(
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
+ xfs_extnum_t idx;
+ bool found;
+- int error;
++ int error = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
--- /dev/null
+From 0e339ef8556d9e567aa7925f8892c263d79430d9 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Mon, 13 Feb 2017 22:48:18 -0800
+Subject: xfs: handle indlen shortage on delalloc extent merge
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 0e339ef8556d9e567aa7925f8892c263d79430d9 upstream.
+
+When a delalloc extent is created, it can be merged with pre-existing,
+contiguous, delalloc extents. When this occurs,
+xfs_bmap_add_extent_hole_delay() merges the extents along with the
+associated indirect block reservations. The expectation here is that the
+combined worst case indlen reservation is always less than or equal to
+the indlen reservation for the individual extents.
+
+This is not always the case, however, as existing extents can be less than
+the expected indlen reservation if the extent was previously split due
+to a hole punch. If a new extent merges with such an extent, the total
+indlen requirement may be larger than the sum of the indlen reservations
+held by both extents.
+
+xfs_bmap_add_extent_hole_delay() assumes that the worst case indlen
+reservation is always available and assigns it to the merged extent
+without consideration for the indlen held by the pre-existing extent. As
+a result, the subsequent xfs_mod_fdblocks() call can attempt an
+unintentional allocation rather than a free (indicated by an ASSERT()
+failure). Further, if the allocation happens to fail in this context,
+the failure goes unhandled and creates a filesystem wide block
+accounting inconsistency.
+
+Fix xfs_bmap_add_extent_hole_delay() to function as designed. Cap the
+indlen reservation assigned to the merged extent to the sum of the
+indlen reservations held by each of the individual extents.
+
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c | 9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -2907,7 +2907,8 @@ xfs_bmap_add_extent_hole_delay(
+ oldlen = startblockval(left.br_startblock) +
+ startblockval(new->br_startblock) +
+ startblockval(right.br_startblock);
+- newlen = xfs_bmap_worst_indlen(ip, temp);
++ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
++ oldlen);
+ xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
+ nullstartblock((int)newlen));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+@@ -2928,7 +2929,8 @@ xfs_bmap_add_extent_hole_delay(
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
+ oldlen = startblockval(left.br_startblock) +
+ startblockval(new->br_startblock);
+- newlen = xfs_bmap_worst_indlen(ip, temp);
++ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
++ oldlen);
+ xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
+ nullstartblock((int)newlen));
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+@@ -2944,7 +2946,8 @@ xfs_bmap_add_extent_hole_delay(
+ temp = new->br_blockcount + right.br_blockcount;
+ oldlen = startblockval(new->br_startblock) +
+ startblockval(right.br_startblock);
+- newlen = xfs_bmap_worst_indlen(ip, temp);
++ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
++ oldlen);
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+ new->br_startoff,
+ nullstartblock((int)newlen), temp, right.br_state);
--- /dev/null
+From 5eda43000064a69a39fb7869cc63c9571535ad29 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 2 Feb 2017 15:14:02 -0800
+Subject: xfs: mark speculative prealloc CoW fork extents unwritten
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit 5eda43000064a69a39fb7869cc63c9571535ad29 upstream.
+
+Christoph Hellwig pointed out that there's a potentially nasty race when
+performing simultaneous nearby directio cow writes:
+
+"Thread 1 writes a range from B to c
+
+" B --------- C
+ p
+
+"a little later thread 2 writes from A to B
+
+" A --------- B
+ p
+
+[editor's note: the 'p' denote cowextsize boundaries, which I added to
+make this more clear]
+
+"but the code preallocates beyond B into the range where thread
+"1 has just written, but ->end_io hasn't been called yet.
+"But once ->end_io is called thread 2 has already allocated
+"up to the extent size hint into the write range of thread 1,
+"so the end_io handler will splice the unintialized blocks from
+"that preallocation back into the file right after B."
+
+We can avoid this race by ensuring that thread 1 cannot accidentally
+remap the blocks that thread 2 allocated (as part of speculative
+preallocation) as part of t2's write preparation in t1's end_io handler.
+The way we make this happen is by taking advantage of the unwritten
+extent flag as an intermediate step.
+
+Recall that when we begin the process of writing data to shared blocks,
+we create a delayed allocation extent in the CoW fork:
+
+D: --RRRRRRSSSRRRRRRRR---
+C: ------DDDDDDD---------
+
+When a thread prepares to CoW some dirty data out to disk, it will now
+convert the delalloc reservation into an /unwritten/ allocated extent in
+the cow fork. The da conversion code tries to opportunistically
+allocate as much of a (speculatively prealloc'd) extent as possible, so
+we may end up allocating a larger extent than we're actually writing
+out:
+
+D: --RRRRRRSSSRRRRRRRR---
+U: ------UUUUUUU---------
+
+Next, we convert only the part of the extent that we're actively
+planning to write to normal (i.e. not unwritten) status:
+
+D: --RRRRRRSSSRRRRRRRR---
+U: ------UURRUUU---------
+
+If the write succeeds, the end_cow function will now scan the relevant
+range of the CoW fork for real extents and remap only the real extents
+into the data fork:
+
+D: --RRRRRRRRSRRRRRRRR---
+U: ------UU--UUU---------
+
+This ensures that we never obliterate valid data fork extents with
+unwritten blocks from the CoW fork.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_aops.c | 6 ++
+ fs/xfs/xfs_iomap.c | 2
+ fs/xfs/xfs_reflink.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++----
+ fs/xfs/xfs_reflink.h | 2
+ fs/xfs/xfs_trace.h | 8 ++-
+ 5 files changed, 123 insertions(+), 11 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -486,6 +486,12 @@ xfs_submit_ioend(
+ struct xfs_ioend *ioend,
+ int status)
+ {
++ /* Convert CoW extents to regular */
++ if (!status && ioend->io_type == XFS_IO_COW) {
++ status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
++ ioend->io_offset, ioend->io_size);
++ }
++
+ /* Reserve log space if we might write beyond the on-disk inode size. */
+ if (!status &&
+ ioend->io_type != XFS_IO_UNWRITTEN &&
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -685,7 +685,7 @@ xfs_iomap_write_allocate(
+ int nres;
+
+ if (whichfork == XFS_COW_FORK)
+- flags |= XFS_BMAPI_COWFORK;
++ flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
+
+ /*
+ * Make sure that the dquots are there.
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -82,11 +82,22 @@
+ * mappings are a reservation against the free space in the filesystem;
+ * adjacent mappings can also be combined into fewer larger mappings.
+ *
++ * As an optimization, the CoW extent size hint (cowextsz) creates
++ * outsized aligned delalloc reservations in the hope of landing out of
++ * order nearby CoW writes in a single extent on disk, thereby reducing
++ * fragmentation and improving future performance.
++ *
++ * D: --RRRRRRSSSRRRRRRRR--- (data fork)
++ * C: ------DDDDDDD--------- (CoW fork)
++ *
+ * When dirty pages are being written out (typically in writepage), the
+- * delalloc reservations are converted into real mappings by allocating
+- * blocks and replacing the delalloc mapping with real ones. A delalloc
+- * mapping can be replaced by several real ones if the free space is
+- * fragmented.
++ * delalloc reservations are converted into unwritten mappings by
++ * allocating blocks and replacing the delalloc mapping with real ones.
++ * A delalloc mapping can be replaced by several unwritten ones if the
++ * free space is fragmented.
++ *
++ * D: --RRRRRRSSSRRRRRRRR---
++ * C: ------UUUUUUU---------
+ *
+ * We want to adapt the delalloc mechanism for copy-on-write, since the
+ * write paths are similar. The first two steps (creating the reservation
+@@ -101,13 +112,29 @@
+ * Block-aligned directio writes will use the same mechanism as buffered
+ * writes.
+ *
++ * Just prior to submitting the actual disk write requests, we convert
++ * the extents representing the range of the file actually being written
++ * (as opposed to extra pieces created for the cowextsize hint) to real
++ * extents. This will become important in the next step:
++ *
++ * D: --RRRRRRSSSRRRRRRRR---
++ * C: ------UUrrUUU---------
++ *
+ * CoW remapping must be done after the data block write completes,
+ * because we don't want to destroy the old data fork map until we're sure
+ * the new block has been written. Since the new mappings are kept in a
+ * separate fork, we can simply iterate these mappings to find the ones
+ * that cover the file blocks that we just CoW'd. For each extent, simply
+ * unmap the corresponding range in the data fork, map the new range into
+- * the data fork, and remove the extent from the CoW fork.
++ * the data fork, and remove the extent from the CoW fork. Because of
++ * the presence of the cowextsize hint, however, we must be careful
++ * only to remap the blocks that we've actually written out -- we must
++ * never remap delalloc reservations nor CoW staging blocks that have
++ * yet to be written. This corresponds exactly to the real extents in
++ * the CoW fork:
++ *
++ * D: --RRRRRRrrSRRRRRRRR---
++ * C: ------UU--UUU---------
+ *
+ * Since the remapping operation can be applied to an arbitrary file
+ * range, we record the need for the remap step as a flag in the ioend
+@@ -296,6 +323,65 @@ xfs_reflink_reserve_cow(
+ return 0;
+ }
+
++/* Convert part of an unwritten CoW extent to a real one. */
++STATIC int
++xfs_reflink_convert_cow_extent(
++ struct xfs_inode *ip,
++ struct xfs_bmbt_irec *imap,
++ xfs_fileoff_t offset_fsb,
++ xfs_filblks_t count_fsb,
++ struct xfs_defer_ops *dfops)
++{
++ struct xfs_bmbt_irec irec = *imap;
++ xfs_fsblock_t first_block;
++ int nimaps = 1;
++
++ if (imap->br_state == XFS_EXT_NORM)
++ return 0;
++
++ xfs_trim_extent(&irec, offset_fsb, count_fsb);
++ trace_xfs_reflink_convert_cow(ip, &irec);
++ if (irec.br_blockcount == 0)
++ return 0;
++ return xfs_bmapi_write(NULL, ip, irec.br_startoff, irec.br_blockcount,
++ XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, &first_block,
++ 0, &irec, &nimaps, dfops);
++}
++
++/* Convert all of the unwritten CoW extents in a file's range to real ones. */
++int
++xfs_reflink_convert_cow(
++ struct xfs_inode *ip,
++ xfs_off_t offset,
++ xfs_off_t count)
++{
++ struct xfs_bmbt_irec got;
++ struct xfs_defer_ops dfops;
++ struct xfs_mount *mp = ip->i_mount;
++ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
++ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
++ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
++ xfs_extnum_t idx;
++ bool found;
++ int error;
++
++ xfs_ilock(ip, XFS_ILOCK_EXCL);
++
++ /* Convert all the extents to real from unwritten. */
++ for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got);
++ found && got.br_startoff < end_fsb;
++ found = xfs_iext_get_extent(ifp, ++idx, &got)) {
++ error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb,
++ end_fsb - offset_fsb, &dfops);
++ if (error)
++ break;
++ }
++
++ /* Finish up. */
++ xfs_iunlock(ip, XFS_ILOCK_EXCL);
++ return error;
++}
++
+ /* Allocate all CoW reservations covering a range of blocks in a file. */
+ static int
+ __xfs_reflink_allocate_cow(
+@@ -328,6 +414,7 @@ __xfs_reflink_allocate_cow(
+ goto out_unlock;
+ ASSERT(nimaps == 1);
+
++ /* Make sure there's a CoW reservation for it. */
+ error = xfs_reflink_reserve_cow(ip, &imap, &shared);
+ if (error)
+ goto out_trans_cancel;
+@@ -337,14 +424,16 @@ __xfs_reflink_allocate_cow(
+ goto out_trans_cancel;
+ }
+
++ /* Allocate the entire reservation as unwritten blocks. */
+ xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount,
+- XFS_BMAPI_COWFORK, &first_block,
++ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block,
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK),
+ &imap, &nimaps, &dfops);
+ if (error)
+ goto out_trans_cancel;
+
++ /* Finish up. */
+ error = xfs_defer_finish(&tp, &dfops, NULL);
+ if (error)
+ goto out_trans_cancel;
+@@ -389,11 +478,12 @@ xfs_reflink_allocate_cow_range(
+ if (error) {
+ trace_xfs_reflink_allocate_cow_range_error(ip, error,
+ _RET_IP_);
+- break;
++ return error;
+ }
+ }
+
+- return error;
++ /* Convert the CoW extents to regular. */
++ return xfs_reflink_convert_cow(ip, offset, count);
+ }
+
+ /*
+@@ -669,6 +759,16 @@ xfs_reflink_end_cow(
+
+ ASSERT(!isnullstartblock(got.br_startblock));
+
++ /*
++ * Don't remap unwritten extents; these are
++ * speculatively preallocated CoW extents that have been
++ * allocated but have not yet been involved in a write.
++ */
++ if (got.br_state == XFS_EXT_UNWRITTEN) {
++ idx--;
++ goto next_extent;
++ }
++
+ /* Unmap the old blocks in the data fork. */
+ xfs_defer_init(&dfops, &firstfsb);
+ rlen = del.br_blockcount;
+--- a/fs/xfs/xfs_reflink.h
++++ b/fs/xfs/xfs_reflink.h
+@@ -30,6 +30,8 @@ extern int xfs_reflink_reserve_cow(struc
+ struct xfs_bmbt_irec *imap, bool *shared);
+ extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
+ xfs_off_t offset, xfs_off_t count);
++extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
++ xfs_off_t count);
+ extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
+ struct xfs_bmbt_irec *imap, bool *need_alloc);
+ extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -3183,6 +3183,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class
+ __field(xfs_fileoff_t, lblk)
+ __field(xfs_extlen_t, len)
+ __field(xfs_fsblock_t, pblk)
++ __field(int, state)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+@@ -3190,13 +3191,15 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class
+ __entry->lblk = irec->br_startoff;
+ __entry->len = irec->br_blockcount;
+ __entry->pblk = irec->br_startblock;
++ __entry->state = irec->br_state;
+ ),
+- TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu",
++ TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->lblk,
+ __entry->len,
+- __entry->pblk)
++ __entry->pblk,
++ __entry->state)
+ );
+ #define DEFINE_INODE_IREC_EVENT(name) \
+ DEFINE_EVENT(xfs_inode_irec_class, name, \
+@@ -3345,6 +3348,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim
+ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
+ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
+ DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
++DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
+
+ DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
+ DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range);
--- /dev/null
+From 3802a345321a08093ba2ddb1849e736f84e8d450 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Tue, 7 Mar 2017 16:45:58 -0800
+Subject: xfs: only reclaim unwritten COW extents periodically
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 3802a345321a08093ba2ddb1849e736f84e8d450 upstream.
+
+We only want to reclaim preallocations from our periodic work item.
+Currently this is achieved by looking for a dirty inode, but that check
+is rather fragile. Instead add a flag to xfs_reflink_cancel_cow_* so
+that the caller can ask for just cancelling unwritten extents in the COW
+fork.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+[darrick: fix typos in commit message]
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_aops.c | 2 +-
+ fs/xfs/xfs_icache.c | 2 +-
+ fs/xfs/xfs_inode.c | 2 +-
+ fs/xfs/xfs_reflink.c | 23 ++++++++++++++++-------
+ fs/xfs/xfs_reflink.h | 4 ++--
+ fs/xfs/xfs_super.c | 2 +-
+ 6 files changed, 22 insertions(+), 13 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -298,7 +298,7 @@ xfs_end_io(
+ goto done;
+ if (ioend->io_bio->bi_error) {
+ error = xfs_reflink_cancel_cow_range(ip,
+- ioend->io_offset, ioend->io_size);
++ ioend->io_offset, ioend->io_size, true);
+ goto done;
+ }
+ error = xfs_reflink_end_cow(ip, ioend->io_offset,
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1610,7 +1610,7 @@ xfs_inode_free_cowblocks(
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+
+- ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
++ ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
+
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1624,7 +1624,7 @@ xfs_itruncate_extents(
+
+ /* Remove all pending CoW reservations. */
+ error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block,
+- last_block);
++ last_block, true);
+ if (error)
+ goto out;
+
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -571,14 +571,18 @@ xfs_reflink_trim_irec_to_next_cow(
+ }
+
+ /*
+- * Cancel all pending CoW reservations for some block range of an inode.
++ * Cancel CoW reservations for some block range of an inode.
++ *
++ * If cancel_real is true this function cancels all COW fork extents for the
++ * inode; if cancel_real is false, real extents are not cleared.
+ */
+ int
+ xfs_reflink_cancel_cow_blocks(
+ struct xfs_inode *ip,
+ struct xfs_trans **tpp,
+ xfs_fileoff_t offset_fsb,
+- xfs_fileoff_t end_fsb)
++ xfs_fileoff_t end_fsb,
++ bool cancel_real)
+ {
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+ struct xfs_bmbt_irec got, prev, del;
+@@ -605,7 +609,7 @@ xfs_reflink_cancel_cow_blocks(
+ &idx, &got, &del);
+ if (error)
+ break;
+- } else {
++ } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
+ xfs_trans_ijoin(*tpp, ip, 0);
+ xfs_defer_init(&dfops, &firstfsb);
+
+@@ -648,13 +652,17 @@ xfs_reflink_cancel_cow_blocks(
+ }
+
+ /*
+- * Cancel all pending CoW reservations for some byte range of an inode.
++ * Cancel CoW reservations for some byte range of an inode.
++ *
++ * If cancel_real is true this function cancels all COW fork extents for the
++ * inode; if cancel_real is false, real extents are not cleared.
+ */
+ int
+ xfs_reflink_cancel_cow_range(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+- xfs_off_t count)
++ xfs_off_t count,
++ bool cancel_real)
+ {
+ struct xfs_trans *tp;
+ xfs_fileoff_t offset_fsb;
+@@ -680,7 +688,8 @@ xfs_reflink_cancel_cow_range(
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /* Scrape out the old CoW reservations */
+- error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb);
++ error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
++ cancel_real);
+ if (error)
+ goto out_cancel;
+
+@@ -1686,7 +1695,7 @@ next:
+ * We didn't find any shared blocks so turn off the reflink flag.
+ * First, get rid of any leftover CoW mappings.
+ */
+- error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF);
++ error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
+ if (error)
+ return error;
+
+--- a/fs/xfs/xfs_reflink.h
++++ b/fs/xfs/xfs_reflink.h
+@@ -39,9 +39,9 @@ extern int xfs_reflink_trim_irec_to_next
+
+ extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
+ struct xfs_trans **tpp, xfs_fileoff_t offset_fsb,
+- xfs_fileoff_t end_fsb);
++ xfs_fileoff_t end_fsb, bool cancel_real);
+ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
+- xfs_off_t count);
++ xfs_off_t count, bool cancel_real);
+ extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t count);
+ extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -948,7 +948,7 @@ xfs_fs_destroy_inode(
+ XFS_STATS_INC(ip->i_mount, vn_remove);
+
+ if (xfs_is_reflink_inode(ip)) {
+- error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
++ error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
+ if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount))
+ xfs_warn(ip->i_mount,
+ "Error %d while evicting CoW blocks for inode %llu.",
--- /dev/null
+From 54a4ef8af4e0dc5c983d17fcb9cf5fd25666d94e Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Mon, 6 Feb 2017 13:00:54 -0800
+Subject: xfs: reject all unaligned direct writes to reflinked files
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 54a4ef8af4e0dc5c983d17fcb9cf5fd25666d94e upstream.
+
+We currently fall back from direct to buffered writes if we detect a
+remaining shared extent in the iomap_begin callback. But by the time
+iomap_begin is called for the potentially unaligned end block we might
+have already written most of the data to disk, which we'd now write
+again using buffered I/O. To avoid this reject all writes to reflinked
+files before starting I/O so that we are guaranteed to only write the
+data once.
+
+The alternative would be to unshare the unaligned start and/or end block
+before doing the I/O. I think that's doable, and will actually be
+required to support reflinks on DAX file system. But it will take a
+little more time and I'd rather get rid of the double write ASAP.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+[slight changes in context due to the new direct I/O code in 4.10+]
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_aops.c | 45 ---------------------------------------------
+ fs/xfs/xfs_file.c | 9 +++++++++
+ fs/xfs/xfs_trace.h | 2 +-
+ 3 files changed, 10 insertions(+), 46 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -1263,44 +1263,6 @@ xfs_map_trim_size(
+ bh_result->b_size = mapping_size;
+ }
+
+-/* Bounce unaligned directio writes to the page cache. */
+-static int
+-xfs_bounce_unaligned_dio_write(
+- struct xfs_inode *ip,
+- xfs_fileoff_t offset_fsb,
+- struct xfs_bmbt_irec *imap)
+-{
+- struct xfs_bmbt_irec irec;
+- xfs_fileoff_t delta;
+- bool shared;
+- bool x;
+- int error;
+-
+- irec = *imap;
+- if (offset_fsb > irec.br_startoff) {
+- delta = offset_fsb - irec.br_startoff;
+- irec.br_blockcount -= delta;
+- irec.br_startblock += delta;
+- irec.br_startoff = offset_fsb;
+- }
+- error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x);
+- if (error)
+- return error;
+-
+- /*
+- * We're here because we're trying to do a directio write to a
+- * region that isn't aligned to a filesystem block. If any part
+- * of the extent is shared, fall back to buffered mode to handle
+- * the RMW. This is done by returning -EREMCHG ("remote addr
+- * changed"), which is caught further up the call stack.
+- */
+- if (shared) {
+- trace_xfs_reflink_bounce_dio_write(ip, imap);
+- return -EREMCHG;
+- }
+- return 0;
+-}
+-
+ STATIC int
+ __xfs_get_blocks(
+ struct inode *inode,
+@@ -1438,13 +1400,6 @@ __xfs_get_blocks(
+ if (imap.br_startblock != HOLESTARTBLOCK &&
+ imap.br_startblock != DELAYSTARTBLOCK &&
+ (create || !ISUNWRITTEN(&imap))) {
+- if (create && direct && !is_cow) {
+- error = xfs_bounce_unaligned_dio_write(ip, offset_fsb,
+- &imap);
+- if (error)
+- return error;
+- }
+-
+ xfs_map_buffer(inode, bh_result, &imap, offset);
+ if (ISUNWRITTEN(&imap))
+ set_buffer_unwritten(bh_result);
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -554,6 +554,15 @@ xfs_file_dio_aio_write(
+ if ((iocb->ki_pos & mp->m_blockmask) ||
+ ((iocb->ki_pos + count) & mp->m_blockmask)) {
+ unaligned_io = 1;
++
++ /*
++ * We can't properly handle unaligned direct I/O to reflink
++ * files yet, as we can't unshare a partial block.
++ */
++ if (xfs_is_reflink_inode(ip)) {
++ trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
++ return -EREMCHG;
++ }
+ iolock = XFS_IOLOCK_EXCL;
+ } else {
+ iolock = XFS_IOLOCK_SHARED;
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -3353,7 +3353,7 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_conv
+ DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
+ DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range);
+
+-DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write);
++DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
+ DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping);
+ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec);
+
--- /dev/null
+From 4dd2eb633598cb6a5a0be2fd9a2be0819f5eeb5f Mon Sep 17 00:00:00 2001
+From: Hou Tao <houtao1@huawei.com>
+Date: Fri, 3 Feb 2017 14:39:07 -0800
+Subject: xfs: reset b_first_retry_time when clear the retry status of xfs_buf_t
+
+From: Hou Tao <houtao1@huawei.com>
+
+commit 4dd2eb633598cb6a5a0be2fd9a2be0819f5eeb5f upstream.
+
+After successful IO or permanent error, b_first_retry_time also
+needs to be cleared, else the invalid first retry time will be
+used by the next retry check.
+
+Signed-off-by: Hou Tao <houtao1@huawei.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_buf_item.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -1162,6 +1162,7 @@ xfs_buf_iodone_callbacks(
+ */
+ bp->b_last_error = 0;
+ bp->b_retries = 0;
++ bp->b_first_retry_time = 0;
+
+ xfs_buf_do_callbacks(bp);
+ bp->b_fspriv = NULL;
--- /dev/null
+From 75d65361cf3c0dae2af970c305e19c727b28a510 Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Mon, 13 Feb 2017 22:48:30 -0800
+Subject: xfs: split indlen reservations fairly when under reserved
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 75d65361cf3c0dae2af970c305e19c727b28a510 upstream.
+
+Certain workloads that punch holes into speculative preallocation can
+cause delalloc indirect reservation splits when the delalloc extent is
+split in two. If further splits occur, an already short-handed extent
+can be split into two in a manner that leaves zero indirect blocks for
+one of the two new extents. This occurs because the shortage is large
+enough that the xfs_bmap_split_indlen() algorithm completely drains the
+requested indlen of one of the extents before it honors the existing
+reservation.
+
+This ultimately results in a warning from xfs_bmap_del_extent(). This
+has been observed during file copies of large, sparse files using 'cp
+--sparse=always.'
+
+To avoid this problem, update xfs_bmap_split_indlen() to explicitly
+apply the reservation shortage fairly between both extents. This smooths
+out the overall indlen shortage and defers the situation where we end up
+with a delalloc extent with zero indlen reservation to extreme
+circumstances.
+
+Reported-by: Patrick Dung <mpatdung@gmail.com>
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c | 61 +++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 43 insertions(+), 18 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -4899,34 +4899,59 @@ xfs_bmap_split_indlen(
+ xfs_filblks_t len2 = *indlen2;
+ xfs_filblks_t nres = len1 + len2; /* new total res. */
+ xfs_filblks_t stolen = 0;
++ xfs_filblks_t resfactor;
+
+ /*
+ * Steal as many blocks as we can to try and satisfy the worst case
+ * indlen for both new extents.
+ */
+- while (nres > ores && avail) {
+- nres--;
+- avail--;
+- stolen++;
+- }
++ if (ores < nres && avail)
++ stolen = XFS_FILBLKS_MIN(nres - ores, avail);
++ ores += stolen;
++
++ /* nothing else to do if we've satisfied the new reservation */
++ if (ores >= nres)
++ return stolen;
++
++ /*
++ * We can't meet the total required reservation for the two extents.
++ * Calculate the percent of the overall shortage between both extents
++ * and apply this percentage to each of the requested indlen values.
++ * This distributes the shortage fairly and reduces the chances that one
++ * of the two extents is left with nothing when extents are repeatedly
++ * split.
++ */
++ resfactor = (ores * 100);
++ do_div(resfactor, nres);
++ len1 *= resfactor;
++ do_div(len1, 100);
++ len2 *= resfactor;
++ do_div(len2, 100);
++ ASSERT(len1 + len2 <= ores);
++ ASSERT(len1 < *indlen1 && len2 < *indlen2);
+
+ /*
+- * The only blocks available are those reserved for the original
+- * extent and what we can steal from the extent being removed.
+- * If this still isn't enough to satisfy the combined
+- * requirements for the two new extents, skim blocks off of each
+- * of the new reservations until they match what is available.
++ * Hand out the remainder to each extent. If one of the two reservations
++ * is zero, we want to make sure that one gets a block first. The loop
++ * below starts with len1, so hand len2 a block right off the bat if it
++ * is zero.
+ */
+- while (nres > ores) {
+- if (len1) {
+- len1--;
+- nres--;
++ ores -= (len1 + len2);
++ ASSERT((*indlen1 - len1) + (*indlen2 - len2) >= ores);
++ if (ores && !len2 && *indlen2) {
++ len2++;
++ ores--;
++ }
++ while (ores) {
++ if (len1 < *indlen1) {
++ len1++;
++ ores--;
+ }
+- if (nres == ores)
++ if (!ores)
+ break;
+- if (len2) {
+- len2--;
+- nres--;
++ if (len2 < *indlen2) {
++ len2++;
++ ores--;
+ }
+ }
+
--- /dev/null
+From 2fcc319d2467a5f5b78f35f79fd6e22741a31b1e Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Wed, 8 Mar 2017 10:38:53 -0800
+Subject: xfs: try any AG when allocating the first btree block when reflinking
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 2fcc319d2467a5f5b78f35f79fd6e22741a31b1e upstream.
+
+When a reflink operation causes the bmap code to allocate a btree block
+we're currently doing single-AG allocations due to having ->firstblock
+set and then try any higher AG due a little reflink quirk we've put in
+when adding the reflink code. But given that we do not have a minleft
+reservation of any kind in this AG we can still not have any space in
+the same or higher AG even if the file system has enough free space.
+To fix this use a XFS_ALLOCTYPE_FIRST_AG allocation in this fall back
+path instead.
+
+[And yes, we need to redo this properly instead of piling hacks over
+ hacks. I'm working on that, but it's not going to be a small series.
+ In the meantime this fixes the customer reported issue]
+
+Also add a warning for failing allocations to make it easier to debug.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c | 10 +++++++---
+ fs/xfs/libxfs/xfs_bmap_btree.c | 6 +++---
+ 2 files changed, 10 insertions(+), 6 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -769,8 +769,8 @@ xfs_bmap_extents_to_btree(
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
+ } else if (dfops->dop_low) {
+-try_another_ag:
+ args.type = XFS_ALLOCTYPE_START_BNO;
++try_another_ag:
+ args.fsbno = *firstblock;
+ } else {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+@@ -796,13 +796,17 @@ try_another_ag:
+ if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
+ args.fsbno == NULLFSBLOCK &&
+ args.type == XFS_ALLOCTYPE_NEAR_BNO) {
+- dfops->dop_low = true;
++ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ goto try_another_ag;
+ }
++ if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
++ xfs_iroot_realloc(ip, -1, whichfork);
++ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
++ return -ENOSPC;
++ }
+ /*
+ * Allocation can't fail, the space was reserved.
+ */
+- ASSERT(args.fsbno != NULLFSBLOCK);
+ ASSERT(*firstblock == NULLFSBLOCK ||
+ args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock));
+ *firstblock = cur->bc_private.b.firstblock = args.fsbno;
+--- a/fs/xfs/libxfs/xfs_bmap_btree.c
++++ b/fs/xfs/libxfs/xfs_bmap_btree.c
+@@ -453,8 +453,8 @@ xfs_bmbt_alloc_block(
+
+ if (args.fsbno == NULLFSBLOCK) {
+ args.fsbno = be64_to_cpu(start->l);
+-try_another_ag:
+ args.type = XFS_ALLOCTYPE_START_BNO;
++try_another_ag:
+ /*
+ * Make sure there is sufficient room left in the AG to
+ * complete a full tree split for an extent insert. If
+@@ -494,8 +494,8 @@ try_another_ag:
+ if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
+ args.fsbno == NULLFSBLOCK &&
+ args.type == XFS_ALLOCTYPE_NEAR_BNO) {
+- cur->bc_private.b.dfops->dop_low = true;
+ args.fsbno = cur->bc_private.b.firstblock;
++ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ goto try_another_ag;
+ }
+
+@@ -512,7 +512,7 @@ try_another_ag:
+ goto error0;
+ cur->bc_private.b.dfops->dop_low = true;
+ }
+- if (args.fsbno == NULLFSBLOCK) {
++ if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
--- /dev/null
+From 410d17f67e583559be3a922f8b6cc336331893f3 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Thu, 16 Feb 2017 17:12:51 -0800
+Subject: xfs: tune down agno asserts in the bmap code
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 410d17f67e583559be3a922f8b6cc336331893f3 upstream.
+
+In various places we currently assert that xfs_bmap_btalloc allocates
+from the same as the firstblock value passed in, unless it's either
+NULLAGNO or the dop_low flag is set. But the reflink code does not
+fully follow this convention as it passes in firstblock purely as
+a hint for the allocator without actually having previous allocations
+in the transaction, and without having a minleft check on the current
+AG, leading to the assert firing on a very full and heavily used
+file system. As even the reflink code only allocates from equal or
+higher AGs for now we can simplify the check to always allow for equal
+or higher AGs.
+
+Note that we need to eventually split the two meanings of the firstblock
+value. At that point we can also allow the reflink code to allocate
+from any AG instead of limiting it in any way.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c | 22 ++++++----------------
+ 1 file changed, 6 insertions(+), 16 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -804,9 +804,7 @@ try_another_ag:
+ */
+ ASSERT(args.fsbno != NULLFSBLOCK);
+ ASSERT(*firstblock == NULLFSBLOCK ||
+- args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
+- (dfops->dop_low &&
+- args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
++ args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock));
+ *firstblock = cur->bc_private.b.firstblock = args.fsbno;
+ cur->bc_private.b.allocated++;
+ ip->i_d.di_nblocks++;
+@@ -3923,17 +3921,13 @@ xfs_bmap_btalloc(
+ * the first block that was allocated.
+ */
+ ASSERT(*ap->firstblock == NULLFSBLOCK ||
+- XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
+- XFS_FSB_TO_AGNO(mp, args.fsbno) ||
+- (ap->dfops->dop_low &&
+- XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
+- XFS_FSB_TO_AGNO(mp, args.fsbno)));
++ XFS_FSB_TO_AGNO(mp, *ap->firstblock) <=
++ XFS_FSB_TO_AGNO(mp, args.fsbno));
+
+ ap->blkno = args.fsbno;
+ if (*ap->firstblock == NULLFSBLOCK)
+ *ap->firstblock = args.fsbno;
+- ASSERT(nullfb || fb_agno == args.agno ||
+- (ap->dfops->dop_low && fb_agno < args.agno));
++ ASSERT(nullfb || fb_agno <= args.agno);
+ ap->length = args.len;
+ if (!(ap->flags & XFS_BMAPI_COWFORK))
+ ap->ip->i_d.di_nblocks += args.len;
+@@ -4858,13 +4852,9 @@ error0:
+ if (bma.cur) {
+ if (!error) {
+ ASSERT(*firstblock == NULLFSBLOCK ||
+- XFS_FSB_TO_AGNO(mp, *firstblock) ==
++ XFS_FSB_TO_AGNO(mp, *firstblock) <=
+ XFS_FSB_TO_AGNO(mp,
+- bma.cur->bc_private.b.firstblock) ||
+- (dfops->dop_low &&
+- XFS_FSB_TO_AGNO(mp, *firstblock) <
+- XFS_FSB_TO_AGNO(mp,
+- bma.cur->bc_private.b.firstblock)));
++ bma.cur->bc_private.b.firstblock));
+ *firstblock = bma.cur->bc_private.b.firstblock;
+ }
+ xfs_btree_del_cursor(bma.cur,
--- /dev/null
+From c5ecb42342852892f978572ddc6dca703460f25a Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Mon, 6 Feb 2017 17:45:51 -0800
+Subject: xfs: update ctime and mtime on clone destinatation inodes
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit c5ecb42342852892f978572ddc6dca703460f25a upstream.
+
+We're changing both metadata and data, so we need to update the
+timestamps for clone operations. Dedupe on the other hand does
+not change file data, and only changes invisible metadata so the
+timestamps should not be updated.
+
+This follows existing btrfs behavior.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+[darrick: remove redundant is_dedupe test]
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_reflink.c | 12 +++++++++---
+ 1 file changed, 9 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -985,13 +985,14 @@ STATIC int
+ xfs_reflink_update_dest(
+ struct xfs_inode *dest,
+ xfs_off_t newlen,
+- xfs_extlen_t cowextsize)
++ xfs_extlen_t cowextsize,
++ bool is_dedupe)
+ {
+ struct xfs_mount *mp = dest->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+- if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
++ if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
+ return 0;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+@@ -1012,6 +1013,10 @@ xfs_reflink_update_dest(
+ dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ }
+
++ if (!is_dedupe) {
++ xfs_trans_ichgtime(tp, dest,
++ XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
++ }
+ xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
+
+ error = xfs_trans_commit(tp);
+@@ -1528,7 +1533,8 @@ xfs_reflink_remap_range(
+ !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
+ cowextsize = src->i_d.di_cowextsize;
+
+- ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize);
++ ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
++ is_dedupe);
+
+ out_unlock:
+ xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
--- /dev/null
+From f65e6fad293b3a5793b7fa2044800506490e7a2e Mon Sep 17 00:00:00 2001
+From: Brian Foster <bfoster@redhat.com>
+Date: Wed, 8 Mar 2017 09:58:08 -0800
+Subject: xfs: use iomap new flag for newly allocated delalloc blocks
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit f65e6fad293b3a5793b7fa2044800506490e7a2e upstream.
+
+Commit fa7f138 ("xfs: clear delalloc and cache on buffered write
+failure") fixed one regression in the iomap error handling code and
+exposed another. The fundamental problem is that if a buffered write
+is a rewrite of preexisting delalloc blocks and the write fails, the
+failure handling code can punch out preexisting blocks with valid
+file data.
+
+This was reproduced directly by sub-block writes in the LTP
+kernel/syscalls/write/write03 test. A first 100 byte write allocates
+a single block in a file. A subsequent 100 byte write fails and
+punches out the block, including the data successfully written by
+the previous write.
+
+To address this problem, update the ->iomap_begin() handler to
+distinguish newly allocated delalloc blocks from preexisting
+delalloc blocks via the IOMAP_F_NEW flag. Use this flag in the
+->iomap_end() handler to decide when a failed or short write should
+punch out delalloc blocks.
+
+This introduces the subtle requirement that ->iomap_begin() should
+never combine newly allocated delalloc blocks with existing blocks
+in the resulting iomap descriptor. This can occur when a new
+delalloc reservation merges with a neighboring extent that is part
+of the current write, for example. Therefore, drop the
+post-allocation extent lookup from xfs_bmapi_reserve_delalloc() and
+just return the record inserted into the fork. This ensures only new
+blocks are returned and thus that preexisting delalloc blocks are
+always handled as "found" blocks and not punched out on a failed
+rewrite.
+
+Reported-by: Xiong Zhou <xzhou@redhat.com>
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_bmap.c | 24 ++++++++++++++----------
+ fs/xfs/xfs_iomap.c | 16 +++++++++++-----
+ 2 files changed, 25 insertions(+), 15 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -4253,6 +4253,19 @@ xfs_bmapi_read(
+ return 0;
+ }
+
++/*
++ * Add a delayed allocation extent to an inode. Blocks are reserved from the
++ * global pool and the extent inserted into the inode in-core extent tree.
++ *
++ * On entry, got refers to the first extent beyond the offset of the extent to
++ * allocate or eof is specified if no such extent exists. On return, got refers
++ * to the extent record that was inserted to the inode fork.
++ *
++ * Note that the allocated extent may have been merged with contiguous extents
++ * during insertion into the inode fork. Thus, got does not reflect the current
++ * state of the inode fork on return. If necessary, the caller can use lastx to
++ * look up the updated record in the inode fork.
++ */
+ int
+ xfs_bmapi_reserve_delalloc(
+ struct xfs_inode *ip,
+@@ -4339,13 +4352,8 @@ xfs_bmapi_reserve_delalloc(
+ got->br_startblock = nullstartblock(indlen);
+ got->br_blockcount = alen;
+ got->br_state = XFS_EXT_NORM;
+- xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got);
+
+- /*
+- * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
+- * might have merged it into one of the neighbouring ones.
+- */
+- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got);
++ xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got);
+
+ /*
+ * Tag the inode if blocks were preallocated. Note that COW fork
+@@ -4357,10 +4365,6 @@ xfs_bmapi_reserve_delalloc(
+ if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
+ xfs_inode_set_cowblocks_tag(ip);
+
+- ASSERT(got->br_startoff <= aoff);
+- ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen);
+- ASSERT(isnullstartblock(got->br_startblock));
+- ASSERT(got->br_state == XFS_EXT_NORM);
+ return 0;
+
+ out_unreserve_blocks:
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -637,6 +637,11 @@ retry:
+ goto out_unlock;
+ }
+
++ /*
++ * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
++ * them out if the write happens to fail.
++ */
++ iomap->flags = IOMAP_F_NEW;
+ trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
+ done:
+ if (isnullstartblock(got.br_startblock))
+@@ -1061,7 +1066,8 @@ xfs_file_iomap_end_delalloc(
+ struct xfs_inode *ip,
+ loff_t offset,
+ loff_t length,
+- ssize_t written)
++ ssize_t written,
++ struct iomap *iomap)
+ {
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t start_fsb;
+@@ -1080,14 +1086,14 @@ xfs_file_iomap_end_delalloc(
+ end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+ /*
+- * Trim back delalloc blocks if we didn't manage to write the whole
+- * range reserved.
++ * Trim delalloc blocks if they were allocated by this write and we
++ * didn't manage to write the whole range.
+ *
+ * We don't need to care about racing delalloc as we hold i_mutex
+ * across the reserve/allocate/unreserve calls. If there are delalloc
+ * blocks in the range, they are ours.
+ */
+- if (start_fsb < end_fsb) {
++ if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
+ truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
+ XFS_FSB_TO_B(mp, end_fsb) - 1);
+
+@@ -1117,7 +1123,7 @@ xfs_file_iomap_end(
+ {
+ if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
+ return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
+- length, written);
++ length, written, iomap);
+ return 0;
+ }
+
--- /dev/null
+From d5825712ee98d68a2c17bc89dad2c30276894cba Mon Sep 17 00:00:00 2001
+From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+Date: Thu, 2 Mar 2017 15:06:33 -0800
+Subject: xfs: Use xfs_icluster_size_fsb() to calculate inode alignment mask
+
+From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+
+commit d5825712ee98d68a2c17bc89dad2c30276894cba upstream.
+
+When block size is larger than inode cluster size, the call to
+XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size) returns 0. Also, mkfs.xfs
+would have set xfs_sb->sb_inoalignmt to 0. Hence in
+xfs_set_inoalignment(), xfs_mount->m_inoalign_mask gets initialized to
+-1 instead of 0. However, xfs_mount->m_sinoalign would get correctly
+initialized to 0 because for every positive value of xfs_mount->m_dalign,
+the condition "!(mp->m_dalign & mp->m_inoalign_mask)" would evaluate to
+false.
+
+Also, xfs_imap() worked fine even with xfs_mount->m_inoalign_mask having
+-1 as the value because blks_per_cluster variable would have the value 1
+and hence we would never have a need to use xfs_mount->m_inoalign_mask
+to compute the inode chunk's agbno and offset within the chunk.
+
+Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/xfs_mount.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -502,8 +502,7 @@ STATIC void
+ xfs_set_inoalignment(xfs_mount_t *mp)
+ {
+ if (xfs_sb_version_hasalign(&mp->m_sb) &&
+- mp->m_sb.sb_inoalignmt >=
+- XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
++ mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
+ mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
+ else
+ mp->m_inoalign_mask = 0;
--- /dev/null
+From 8ee9fdbebc84b39f1d1c201c5e32277c61d034aa Mon Sep 17 00:00:00 2001
+From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+Date: Thu, 16 Feb 2017 17:12:16 -0800
+Subject: xfs: Use xfs_icluster_size_fsb() to calculate inode chunk alignment
+
+From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+
+commit 8ee9fdbebc84b39f1d1c201c5e32277c61d034aa upstream.
+
+On a ppc64 system, executing generic/256 test with 32k block size gives the following call trace,
+
+XFS: Assertion failed: args->maxlen > 0, file: /root/repos/linux/fs/xfs/libxfs/xfs_alloc.c, line: 2026
+
+kernel BUG at /root/repos/linux/fs/xfs/xfs_message.c:113!
+Oops: Exception in kernel mode, sig: 5 [#1]
+SMP NR_CPUS=2048
+DEBUG_PAGEALLOC
+NUMA
+pSeries
+Modules linked in:
+CPU: 2 PID: 19361 Comm: mkdir Not tainted 4.10.0-rc5 #58
+task: c000000102606d80 task.stack: c0000001026b8000
+NIP: c0000000004ef798 LR: c0000000004ef798 CTR: c00000000082b290
+REGS: c0000001026bb090 TRAP: 0700 Not tainted (4.10.0-rc5)
+MSR: 8000000000029032 <SF,EE,ME,IR,DR,RI>
+CR: 28004428 XER: 00000000
+CFAR: c0000000004ef180 SOFTE: 1
+GPR00: c0000000004ef798 c0000001026bb310 c000000001157300 ffffffffffffffea
+GPR04: 000000000000000a c0000001026bb130 0000000000000000 ffffffffffffffc0
+GPR08: 00000000000000d1 0000000000000021 00000000ffffffd1 c000000000dd4990
+GPR12: 0000000022004444 c00000000fe00800 0000000020000000 0000000000000000
+GPR16: 0000000000000000 0000000043a606fc 0000000043a76c08 0000000043a1b3d0
+GPR20: 000001002a35cd60 c0000001026bbb80 0000000000000000 0000000000000001
+GPR24: 0000000000000240 0000000000000004 c00000062dc55000 0000000000000000
+GPR28: 0000000000000004 c00000062ecd9200 0000000000000000 c0000001026bb6c0
+NIP [c0000000004ef798] .assfail+0x28/0x30
+LR [c0000000004ef798] .assfail+0x28/0x30
+Call Trace:
+[c0000001026bb310] [c0000000004ef798] .assfail+0x28/0x30 (unreliable)
+[c0000001026bb380] [c000000000455d74] .xfs_alloc_space_available+0x194/0x1b0
+[c0000001026bb410] [c00000000045b914] .xfs_alloc_fix_freelist+0x144/0x480
+[c0000001026bb580] [c00000000045c368] .xfs_alloc_vextent+0x698/0xa90
+[c0000001026bb650] [c0000000004a6200] .xfs_ialloc_ag_alloc+0x170/0x820
+[c0000001026bb7c0] [c0000000004a9098] .xfs_dialloc+0x158/0x320
+[c0000001026bb8a0] [c0000000004e628c] .xfs_ialloc+0x7c/0x610
+[c0000001026bb990] [c0000000004e8138] .xfs_dir_ialloc+0xa8/0x2f0
+[c0000001026bbaa0] [c0000000004e8814] .xfs_create+0x494/0x790
+[c0000001026bbbf0] [c0000000004e5ebc] .xfs_generic_create+0x2bc/0x410
+[c0000001026bbce0] [c0000000002b4a34] .vfs_mkdir+0x154/0x230
+[c0000001026bbd70] [c0000000002bc444] .SyS_mkdirat+0x94/0x120
+[c0000001026bbe30] [c00000000000b760] system_call+0x38/0xfc
+Instruction dump:
+4e800020 60000000 7c0802a6 7c862378 3c82ffca 7ca72b78 38841c18 7c651b78
+38600000 f8010010 f821ff91 4bfff94d <0fe00000> 60000000 7c0802a6 7c892378
+
+When block size is larger than inode cluster size, the call to
+XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size) returns 0. Also, mkfs.xfs
+would have set xfs_sb->sb_inoalignmt to 0. This causes
+xfs_ialloc_cluster_alignment() to return 0. Due to this
+args.minalignslop (in xfs_ialloc_ag_alloc()) gets the unsigned
+equivalent of -1 assigned to it. This later causes alloc_len in
+xfs_alloc_space_available() to have a value of 0. In such a scenario
+when args.total is also 0, the assert statement "ASSERT(args->maxlen >
+0);" fails.
+
+This commit fixes the bug by replacing the call to XFS_B_TO_FSBT() in
+xfs_ialloc_cluster_alignment() with a call to xfs_icluster_size_fsb().
+
+Suggested-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_ialloc.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_ialloc.c
++++ b/fs/xfs/libxfs/xfs_ialloc.c
+@@ -51,8 +51,7 @@ xfs_ialloc_cluster_alignment(
+ struct xfs_mount *mp)
+ {
+ if (xfs_sb_version_hasalign(&mp->m_sb) &&
+- mp->m_sb.sb_inoalignmt >=
+- XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
++ mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
+ return mp->m_sb.sb_inoalignmt;
+ return 1;
+ }
--- /dev/null
+From de14c5f541e78c59006bee56f6c5c2ef1ca07272 Mon Sep 17 00:00:00 2001
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+Date: Thu, 2 Feb 2017 15:14:00 -0800
+Subject: xfs: verify free block header fields
+
+From: Darrick J. Wong <darrick.wong@oracle.com>
+
+commit de14c5f541e78c59006bee56f6c5c2ef1ca07272 upstream.
+
+Perform basic sanity checking of the directory free block header
+fields so that we avoid hanging the system on invalid data.
+
+(Granted that just means that now we shutdown on directory write,
+but that seems better than hanging...)
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/xfs/libxfs/xfs_dir2_node.c | 51 ++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 49 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_dir2_node.c
++++ b/fs/xfs/libxfs/xfs_dir2_node.c
+@@ -155,6 +155,42 @@ const struct xfs_buf_ops xfs_dir3_free_b
+ .verify_write = xfs_dir3_free_write_verify,
+ };
+
++/* Everything ok in the free block header? */
++static bool
++xfs_dir3_free_header_check(
++ struct xfs_inode *dp,
++ xfs_dablk_t fbno,
++ struct xfs_buf *bp)
++{
++ struct xfs_mount *mp = dp->i_mount;
++ unsigned int firstdb;
++ int maxbests;
++
++ maxbests = dp->d_ops->free_max_bests(mp->m_dir_geo);
++ firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) -
++ xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) *
++ maxbests;
++ if (xfs_sb_version_hascrc(&mp->m_sb)) {
++ struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
++
++ if (be32_to_cpu(hdr3->firstdb) != firstdb)
++ return false;
++ if (be32_to_cpu(hdr3->nvalid) > maxbests)
++ return false;
++ if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused))
++ return false;
++ } else {
++ struct xfs_dir2_free_hdr *hdr = bp->b_addr;
++
++ if (be32_to_cpu(hdr->firstdb) != firstdb)
++ return false;
++ if (be32_to_cpu(hdr->nvalid) > maxbests)
++ return false;
++ if (be32_to_cpu(hdr->nvalid) < be32_to_cpu(hdr->nused))
++ return false;
++ }
++ return true;
++}
+
+ static int
+ __xfs_dir3_free_read(
+@@ -168,11 +204,22 @@ __xfs_dir3_free_read(
+
+ err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
++ if (err || !*bpp)
++ return err;
++
++ /* Check things that we can't do in the verifier. */
++ if (!xfs_dir3_free_header_check(dp, fbno, *bpp)) {
++ xfs_buf_ioerror(*bpp, -EFSCORRUPTED);
++ xfs_verifier_error(*bpp);
++ xfs_trans_brelse(tp, *bpp);
++ return -EFSCORRUPTED;
++ }
+
+ /* try read returns without an error or *bpp if it lands in a hole */
+- if (!err && tp && *bpp)
++ if (tp)
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
+- return err;
++
++ return 0;
+ }
+
+ int